# Butina 클러스터링

- [Taylor-Butina](https://pubs.acs.org/doi/pdf/10.1021/ci9803381) 클러스터링 동작 설명

# import

In [None]:
!pip install rdkit mols2grid

In [1]:
import pandas as pd
import pickle
from rdkit import Chem
from rdkit.Chem import PandasTools, Draw
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem import Descriptors
from tqdm import tqdm
import mols2grid

%config InlineBackend.figure_format = 'retina'

# 데이터

- [1-2] 예제에서 저장한 COCONUT SMILES 데이터 불러오기

In [2]:
# Load the COCONUT SMILES strings and their identifiers from two local
# pickle files; the prints below index and len() both objects, and the
# recorded output shows them to be equally long.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that were produced by a trusted source.
with open("./real_coconut_smiles.pkl","rb") as f:
    coconut_smiles = pickle.load(f)
    
with open("./real_coconut_cid.pkl","rb") as f:
    coconut_cid = pickle.load(f)

# Sanity check: show the first three entries of each sequence and confirm
# that both sequences have the same length.
print('cid_names :\t', coconut_cid[:3])
print('smiles :\t\t', coconut_smiles[:3])
print('cid_len :\t', len(coconut_cid))
print('smiles_len :\t', len(coconut_smiles))

cid_names :	 ['CNP0000002', 'CNP0000003', 'CNP0000003.1']
smiles :		 ['O=C(O)C=1C(=O)C(O)(CC(=O)C1N)C2OC(COC(=O)C)C(OC(=O)C(N=CS)=CC)C(OC3OC(C)C(O)C(OC)C3)C2O', 'O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C(OC)=CC=C(OC)C6C(=O)C5=C(OC)C=C4CC32C(O)C7=CC(=CC(OC)=C17)C', 'CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[C@H]([C@H]3OC2=O)O)C6=C(C(=C5)OC)C(=O)C7=C(C(=CC=C7OC)OC)C6=O)O)OC']
cid_len :	 895068
smiles_len :	 895068


In [3]:
# Work on the first 20,000 entries only; the clustering step further down
# computes all pairwise distances, so its cost grows quadratically with the
# number of rows kept here.
df = pd.DataFrame({'ID' : coconut_cid[:20000], 'SMILES' : coconut_smiles[:20000]})
# Preview the first five rows.
df[:5]

Unnamed: 0,ID,SMILES
0,CNP0000002,O=C(O)C=1C(=O)C(O)(CC(=O)C1N)C2OC(COC(=O)C)C(O...
1,CNP0000003,O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C(OC)=CC=C(OC)C6C...
2,CNP0000003.1,CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[...
3,CNP0000004,O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C=CC=C(O)C6C(=O)C...
4,CNP0000004.1,CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[...


In [None]:
# Render the DataFrame as an interactive molecule grid.
# NOTE(review): presumably mols2grid picks up the "SMILES" column by default
# to draw the structures -- confirm against the mols2grid documentation.
mols2grid.display(df)

## mol 객체 추가
- MolFromSmiles 함수 사용

In [4]:
# Register tqdm with pandas so that the progress_apply method used in the
# following cells is available and shows a progress bar.
tqdm.pandas()

In [5]:
# Parse every SMILES string into an RDKit molecule object, with a progress bar.
# NOTE(review): Chem.MolFromSmiles is expected to return None for SMILES it
# cannot parse -- verify there are no None entries before fingerprinting.
df['mol'] = df.SMILES.progress_apply(Chem.MolFromSmiles)
# Preview the first ten rows, now including the new 'mol' column.
df[:10]

100%|███████████████████████████████████| 20000/20000 [00:03<00:00, 6641.76it/s]


Unnamed: 0,ID,SMILES,mol
0,CNP0000002,O=C(O)C=1C(=O)C(O)(CC(=O)C1N)C2OC(COC(=O)C)C(O...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a420>
1,CNP0000003,O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C(OC)=CC=C(OC)C6C...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a490>
2,CNP0000003.1,CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a500>
3,CNP0000004,O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C=CC=C(O)C6C(=O)C...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a570>
4,CNP0000004.1,CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a5e0>
5,CNP0000005,O=C(OC1CC(C)(CCOC(=O)C(C)(C)C)C2CC(C)(C)CC2C1O...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a650>
6,CNP0000005.1,CC(C)(C)C(=O)OCC[C@@]1(C)C[C@@H]([C@H]([C@@H]2...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a6c0>
7,CNP0000006,O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C=CC=C(O)C6C(=O)C...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a730>
8,CNP0000006.1,CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[...,<rdkit.Chem.rdchem.Mol object at 0x7f652378a7a0>
9,CNP0000007,O=C1OC(C)C(O)C12C(O)C(O)CCC2C,<rdkit.Chem.rdchem.Mol object at 0x7f652378a810>


## butina 클러스터링 구현

- 함수 butina_cluster 정의

> 1. 각 분자에 대해서 fingerprints 계산
> 2. 각 분자간의 (pair) 유사도를 계산
> 3. distance = 1 - similarity 로 거리 매트릭스를 계산
> 4. 거리가 가까운 분자들에게 같은 클러스터 번호 배정 (같은 클러스터에 속하는 cutoff 지정)



In [6]:
def butina_cluster(mol_list, cutoff=0.35):
    """Assign a Butina cluster number to every entry of ``mol_list``.

    Each molecule is encoded as a 2048-bit Morgan fingerprint of radius 3.
    The Tanimoto distances (1 - similarity) of every molecule to all the
    molecules before it are appended row by row into one flat list, which is
    handed to ``Butina.ClusterData`` together with ``cutoff``.

    Args:
        mol_list: Iterable of RDKit molecule objects. ``None`` entries (for
            example SMILES that failed to parse) are skipped instead of
            crashing the fingerprint calculation.
        cutoff: Distance threshold forwarded to ``Butina.ClusterData``.

    Returns:
        A list with one integer per input entry, in input order. Clustered
        molecules receive the 1-based position of their cluster in the
        tuple returned by ``Butina.ClusterData``; ``None`` entries keep the
        placeholder value 0.
    """
    mols = list(mol_list)

    # Remember where each usable molecule sits in the input so that cluster
    # members (indices into fp_list) can be mapped back to input positions.
    valid_positions = [pos for pos, mol in enumerate(mols) if mol is not None]

    # NOTE(review): newer RDKit releases reportedly flag this function as
    # deprecated in favour of rdFingerprintGenerator -- confirm against the
    # installed version before switching.
    fp_list = [
        rdmd.GetMorganFingerprintAsBitVect(mols[pos], 3, nBits=2048)
        for pos in valid_positions
    ]
    nfps = len(fp_list)

    cluster_id_list = [0] * len(mols)
    if nfps == 0:
        # Nothing to cluster (empty input or only None entries).
        return cluster_id_list

    # The flat list ends up holding nfps * (nfps - 1) / 2 distances, so memory
    # use grows quadratically with the number of molecules.
    dists = []
    for i in range(1, nfps):
        # returnDistance=True makes RDKit return 1 - similarity directly,
        # avoiding an extra Python-level pass over every row.
        dists.extend(
            DataStructs.BulkTanimotoSimilarity(
                fp_list[i], fp_list[:i], returnDistance=True
            )
        )

    mol_clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)

    # Cluster numbering starts at 1 so that 0 stays reserved for entries
    # that were not clustered.
    for idx, cluster in enumerate(mol_clusters, start=1):
        for member in cluster:
            cluster_id_list[valid_positions[member]] = idx

    return cluster_id_list

# 클러스터링 수행

- 클러스터 번호 배정

In [7]:
# Cluster all molecules with a distance cutoff of 0.4 (overriding the
# function's default of 0.35) and store the cluster number per row.
df['Cluster'] = butina_cluster(df.mol.values, 0.4)
# Count how many molecules fall into each cluster.
df['Cluster'].value_counts()

Cluster
1       308
2       289
4       142
5        92
6        88
       ... 
4313      1
4314      1
4315      1
3345      1
5266      1
Name: count, Length: 5266, dtype: int64

In [None]:
# Show the molecules in a grid together with their ID and cluster number.
# NOTE(review): presumably `subset` selects the fields drawn in each grid
# cell, with "img" being the structure image -- confirm in the mols2grid docs.
mols2grid.display(df,subset=["img","ID","Cluster"])

## LogP 값 보기

In [8]:
# Compute Descriptors.MolLogP for every molecule (with a progress bar) and
# store the result in a new "logP" column.
df["logP"] = df.mol.progress_apply(Descriptors.MolLogP)

100%|███████████████████████████████████| 20000/20000 [00:06<00:00, 3016.03it/s]


In [None]:
# Show the grid again, now including logP formatted to two decimal places.
# NOTE(review): presumably `transform` is applied to the value only for
# display and leaves the DataFrame unchanged -- confirm in the mols2grid docs.
mols2grid.display(df,subset=["img","ID","Cluster","logP"],transform={"logP": lambda x: f"{x:.2f}"})

- **Cluster** 번호와 **logP** 값으로 소팅 수행

In [9]:
# Sort in place by cluster number, then by logP within each cluster
# (ascending, the pandas default), so the lowest-logP member of every
# cluster comes first.
df.sort_values(["Cluster","logP"],inplace=True)

In [None]:
# Show the sorted DataFrame as a grid, with logP formatted to two decimals.
mols2grid.display(df,subset=["img","ID","Cluster","logP"],
                  transform={"logP": lambda x: f"{x:.2f}"})

## 가장 작은 LogP를 갖는 샘플 찾기
- [drop_duplicates](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)는 기본적으로 중복된 값 중 첫 번째 행만 남기고 두 번째 이후의 행을 제거한다. 앞에서 Cluster, logP 순으로 오름차순 정렬했으므로 클러스터별로 LogP가 가장 작은 샘플이 하나씩 남는다

In [10]:
# Keep only the first row of each Cluster value. This relies on df having
# been sorted by ["Cluster", "logP"] in an earlier cell, so the row kept
# for each cluster is its lowest-logP member.
df_unique = df.drop_duplicates("Cluster")

In [None]:
# Show one representative molecule per cluster, with logP formatted to two
# decimal places.
mols2grid.display(df_unique,subset=["img","ID","Cluster","logP"],transform={"logP": lambda x: f"{x:.2f}"})