# Butina 클러스터링

- [Taylor-Butina](https://pubs.acs.org/doi/pdf/10.1021/ci9803381) 클러스터링 동작 설명

# import

In [1]:
!pip install rdkit mols2grid

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting mols2grid
  Downloading mols2grid-2.0.0-py3-none-any.whl.metadata (16 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets<8,>=7->mols2grid)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mols2grid-2.0.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: rdkit, jedi, mols2grid
Successfully installed jedi-0.19.1 mols2grid-2.0.0 rdkit-2024.3.5


In [2]:
import pickle

import mols2grid
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools, Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.ML.Cluster import Butina
from tqdm import tqdm

%config InlineBackend.figure_format = 'retina'

# 데이터

- [1-2] 예제에서 저장한 COCONUT SMILES 데이터 불러오기

In [3]:
# coconut database 다운로드 후 파일 압축 해제 및 파일명 확인
! wget https://coconut.s3.uni-jena.de/prod/downloads/2024-09/coconut-09-2024.csv.zip
!unzip ./coconut-09-2024.csv.zip

--2024-09-23 06:55:08--  https://coconut.s3.uni-jena.de/prod/downloads/2024-09/coconut-09-2024.csv.zip
Resolving coconut.s3.uni-jena.de (coconut.s3.uni-jena.de)... 141.35.104.25, 141.35.104.26, 2001:638:1558:2368::8d23:6819, ...
Connecting to coconut.s3.uni-jena.de (coconut.s3.uni-jena.de)|141.35.104.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88935618 (85M) [application/zip]
Saving to: ‘coconut-09-2024.csv.zip’


2024-09-23 06:55:14 (15.2 MB/s) - ‘coconut-09-2024.csv.zip’ saved [88935618/88935618]

Archive:  ./coconut-09-2024.csv.zip
  inflating: coconut-09-2024.csv     
  inflating: __MACOSX/._coconut-09-2024.csv  


In [4]:
# Read the extracted COCONUT CSV into a DataFrame and show it as the cell output
coconut = pd.read_csv('./coconut-09-2024.csv')
coconut

Unnamed: 0,standard_inchi,standard_inchi_key,canonical_smiles,identifier
0,InChI=1S/C43H53N9O14S2.Na/c1-5-22(3)35-36(57)4...,DRKUXFLLRIKRHH-QDVYGYDXSA-M,CC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CNC(...,CNP0437004.1
1,InChI=1S/C21H32O12/c1-9-14(23)16(25)18(27)21(3...,OXHVQSRYUNGYOK-NUASCYGXSA-N,COC1=CC=C(CCO[C@@H]2O[C@H](CO[C@@H]3O[C@@H](C)...,CNP0243002.1
2,InChI=1S/C36H61N5O7/c1-21(2)18-27-35(47)48-28(...,NEGZFRNAAJQQEG-NOFCQABOSA-N,C/C1=C\[C@@H](C(C)(C)C)OC(=O)[C@H](CC(C)C)N(C)...,CNP0458114.1
3,InChI=1S/C22H22O9/c1-28-12-4-2-11(3-5-12)15-9-...,DQIVYFNWBDHNFD-WHCFWRGISA-N,COC1=CC=C(C2=CC(=O)OC3=CC(O[C@@H]4O[C@H](CO)[C...,CNP0252086.2
4,InChI=1S/C32H41N5O4/c1-6-18(4)28-32(41)36-12-8...,HKVSEIVDIONNKB-QWNGKRCASA-N,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)N1C(=O)[C...,CNP0107934.1
...,...,...,...,...
695128,InChI=1S/C19H19N3O4/c1-26-13-8-6-12(7-9-13)20-...,HHSNDFVMRMIDBG-INIZCTEOSA-N,COC1=CC=C(NC(=O)CC[C@@H]2NC(=O)C3=CC=CC=C3NC2=...,CNP0395779.1
695129,InChI=1S/C30H30N2O10/c1-12-23(34)27(38)28(39)3...,VVPODVCQSZKNKL-RLOKSPFPSA-N,CC(=O)OC1=CC=C2C(=O)C3=C(O)C(CC4=CC=CC(C(N)N)=...,CNP0097600.1
695130,InChI=1S/C21H22O7/c1-11(2)4-5-13-15(23)7-6-14(...,LMFCHRAKSGPODM-OAQYLSRUSA-N,COC1=C([C@]2(O)COC3=CC(O)=CC(O)=C3C2=O)C=CC(O)...,CNP0212403.1
695131,InChI=1S/C20H30O7/c1-17(2)4-3-12(23)18-8-27-20...,IJWNAKYUVUUYTE-HMBONYETSA-N,CC1(C)CC[C@H](O)[C@]23COC(O)([C@@H](O)[C@H]12)...,CNP0494455.1


In [7]:
# Pull the SMILES strings and the COCONUT identifiers out into plain Python lists
coconut_smiles = coconut['canonical_smiles'].tolist()
coconut_cid = coconut['identifier'].tolist()

# Quick check: first three entries and total length of each list
print('cid_names :\t', coconut_cid[:3])
print('smiles :\t\t', coconut_smiles[:3])
print('cid_len :\t', len(coconut_cid))
print('smiles_len :\t', len(coconut_smiles))

cid_names :	 ['CNP0437004.1', 'CNP0243002.1', 'CNP0458114.1']
smiles :		 ['CC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CNC(=O)C2=CC=C(O)C=C2)CNC(=O)[C@H](CS(=O)(=O)[O-])NC(=O)/C=C/C2=CSC(=N2)[C@H](CC2=CC=C(O)C=C2)NC(=O)C(=O)[C@H]([C@@H](C)CC)NC1=O.[Na+]', 'COC1=CC=C(CCO[C@@H]2O[C@H](CO[C@@H]3O[C@@H](C)[C@H](O)[C@@H](O)[C@H]3O)[C@@H](O)[C@H](O)[C@H]2O)C=C1O', 'C/C1=C\\[C@@H](C(C)(C)C)OC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](C)N(C)C(=O)CNC(=O)[C@H](C(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](C)CC1']
cid_len :	 695133
smiles_len :	 695133


In [8]:
# Working table: the first 20,000 identifier / SMILES pairs
df = pd.DataFrame(
    {
        'ID': coconut_cid[:20000],
        'SMILES': coconut_smiles[:20000],
    }
)
df.head()

Unnamed: 0,ID,SMILES
0,CNP0437004.1,CC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CNC(...
1,CNP0243002.1,COC1=CC=C(CCO[C@@H]2O[C@H](CO[C@@H]3O[C@@H](C)...
2,CNP0458114.1,C/C1=C\[C@@H](C(C)(C)C)OC(=O)[C@H](CC(C)C)N(C)...
3,CNP0252086.2,COC1=CC=C(C2=CC(=O)OC3=CC(O[C@@H]4O[C@H](CO)[C...
4,CNP0107934.1,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)N1C(=O)[C...


In [9]:
# Show the ID / SMILES table with mols2grid
mols2grid.display(df)

Output hidden; open in https://colab.research.google.com to view.

## mol 객체 추가
- MolFromSmiles 함수 사용

In [10]:
# Enable tqdm for pandas objects
# so that progress_apply can be used in the following cells
tqdm.pandas()

In [11]:
# Convert every SMILES string with Chem.MolFromSmiles, showing a progress bar
df['mol'] = df['SMILES'].progress_apply(Chem.MolFromSmiles)
df.head(10)

100%|██████████| 20000/20000 [00:10<00:00, 1913.85it/s]


Unnamed: 0,ID,SMILES,mol
0,CNP0437004.1,CC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CNC(...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d37d0>
1,CNP0243002.1,COC1=CC=C(CCO[C@@H]2O[C@H](CO[C@@H]3O[C@@H](C)...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3840>
2,CNP0458114.1,C/C1=C\[C@@H](C(C)(C)C)OC(=O)[C@H](CC(C)C)N(C)...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d38b0>
3,CNP0252086.2,COC1=CC=C(C2=CC(=O)OC3=CC(O[C@@H]4O[C@H](CO)[C...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3920>
4,CNP0107934.1,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)N1C(=O)[C...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3990>
5,CNP0100225.1,CNCCN[C@@]12CC#CCOC3=CC4=C(O[C@@H](CCCOC)[C@@H...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3a70>
6,CNP0134983.2,C=C1CC[C@H]2[C@](C)(COC(=O)C(CC(=O)O)C(OC[C@H]...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3ae0>
7,CNP0366639.1,CC(=O)OC[C@@]12[C@H](O)C[C@]3(C)C(=CC[C@@H]4[C...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3b50>
8,CNP0366010.2,COC1=C2OCOC2=CC2=C1[C@H](C(=O)NC1=CC=C(C)C=C1)...,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3bc0>
9,CNP0151673.0,COC1=CC=C(OC)C(N)=C1,<rdkit.Chem.rdchem.Mol object at 0x7e21217d3c30>


## butina 클러스터링 구현

- 함수 butina_cluster 정의

> 1. 각 분자에 대해서 fingerprints 계산
> 2. 각 분자간의 (pair) 유사도를 계산
> 3. distance = 1 - similarity 로 거리 매트릭스를 계산
> 4. 거리가 가까운 분자들에게 같은 클러스터 번호 배정 (같은 클러스터에 속하는 cutoff 지정)



In [12]:
def butina_cluster(mol_list,cutoff=0.35):
    """Assign a Butina cluster number to every molecule in ``mol_list``.

    Pipeline: Morgan fingerprints (radius 3, 2048 bits) -> pairwise Tanimoto
    similarity -> distance = 1 - similarity -> ``Butina.ClusterData``.

    Args:
        mol_list: Iterable of RDKit molecule objects.
            NOTE(review): entries that failed SMILES parsing (``None``) are
            not filtered here; filter them upstream if they can occur.
        cutoff: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        list[int]: One cluster number per input molecule, in input order.
        Numbers start at 1 and follow the order in which
        ``Butina.ClusterData`` returns the clusters.
    """
    # A single reusable generator replaces the deprecated
    # rdmd.GetMorganFingerprintAsBitVect, which logs a deprecation warning
    # for every molecule and floods the cell output.
    fp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
    fp_list = [fp_gen.GetFingerprint(m) for m in mol_list]
    nfps = len(fp_list)

    # Flat list of pairwise distances: for each molecule i, the distances to
    # all molecules 0..i-1, appended in that order.
    dists = []
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        dists.extend(1 - s for s in sims)

    mol_clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)

    # Each cluster is a collection of indices into mol_list; translate that
    # into a per-molecule cluster number (1-based).
    cluster_id_list = [0] * nfps
    for idx, cluster in enumerate(mol_clusters, start=1):
        for member in cluster:
            cluster_id_list[member] = idx

    return cluster_id_list

# 클러스터링 수행

- 클러스터 번호 배정

In [13]:
# Cluster all molecules with a cutoff of 0.4 and store the cluster number per row
df['Cluster'] = butina_cluster(df['mol'].values, cutoff=0.4)
# Count how many molecules fall into each cluster
df['Cluster'].value_counts()

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m


Unnamed: 0_level_0,count
Cluster,Unnamed: 1_level_1
1,1246
2,184
14,82
4,81
17,46
...,...
13439,1
9538,1
2357,1
9537,1


In [14]:
# Show the table with mols2grid, passing "img", "ID" and "Cluster" as the subset
mols2grid.display(df,subset=["img","ID","Cluster"])

Output hidden; open in https://colab.research.google.com to view.

## LogP 값 보기

In [15]:
# Apply Descriptors.MolLogP to every molecule, showing a progress bar
df["logP"] = df["mol"].progress_apply(Descriptors.MolLogP)

100%|██████████| 20000/20000 [00:19<00:00, 1013.63it/s]


In [16]:
# Show the grid with the logP column formatted to two decimal places
mols2grid.display(
    df,
    subset=["img", "ID", "Cluster", "logP"],
    transform={"logP": lambda x: f"{x:.2f}"},
)

Output hidden; open in https://colab.research.google.com to view.

- **Cluster** 번호와 **logP** 값으로 소팅 수행

In [17]:
# Order rows by cluster number, then by logP within each cluster.
# The sorted frame is rebound to df rather than sorted with inplace=True,
# so the cell is a plain assignment and can be chained or re-run safely.
df = df.sort_values(["Cluster", "logP"])

In [18]:
# Show the sorted table; logP is formatted to two decimal places
mols2grid.display(
    df,
    subset=["img", "ID", "Cluster", "logP"],
    transform={"logP": lambda x: f"{x:.2f}"},
)

Output hidden; open in https://colab.research.google.com to view.

## 가장 작은 LogP를 갖는 샘플 찾기
- 앞에서 **Cluster**, **logP** 순으로 오름차순 정렬해 두었으므로 각 클러스터의 첫 번째 행이 LogP가 가장 작은 샘플이다. [drop_duplicates](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)는 두 번째 이후의 중복된 샘플을 제거하므로, 클러스터별로 가장 작은 LogP 샘플을 하나씩 얻는다

In [19]:
# Keep only the first row found for each cluster number
df_unique = df.drop_duplicates(subset="Cluster", keep="first")

In [20]:
# Show one representative per cluster; logP is formatted to two decimal places
mols2grid.display(
    df_unique,
    subset=["img", "ID", "Cluster", "logP"],
    transform={"logP": lambda x: f"{x:.2f}"},
)

Output hidden; open in https://colab.research.google.com to view.