### This notebook processes the experimental data
1. Convert peptide sequences to SMILES format
2. Compute feature vectors of the peptides with Uni-Mol tools (`unimol_tools`)
3. Encode the solvents with one-hot encoding or Uni-Mol features

In [1]:
import os

import pandas as pd
import numpy  as np
from rdkit import  Chem
from rdkit.Chem import  AllChem
import utils
from tqdm import tqdm
import time

from unimol_tools import UniMolRepr

In [2]:
# Collect every .csv file in DATA_DIR and load the dataset to process.
DATA_DIR = '../Data'

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the positional selection below reproducible. endswith() (rather than a
# substring test) avoids matching names such as 'foo.csv.bak'.
data_path = [
    DATA_DIR + '/' + file
    for file in sorted(os.listdir(DATA_DIR))
    if file.endswith('.csv')
]

print(data_path)

# NOTE: the dataset is selected by its position in the sorted list, so adding
# or removing CSV files in DATA_DIR changes which file is loaded.
df = pd.read_csv(data_path[-3])
df.head(), df.shape

['../Data/data_140_ori.csv', '../Data/data_160_ori.csv', '../Data/data_180_ori.csv', '../Data/data_180_ori_1.csv', '../Data/data_180_ori_2.csv', '../Data/tetrapep-140.csv', '../Data/tetrapep-180.csv', '../Data/tetrapep-add.csv', '../Data/tetrapep-d-former.csv']


(  Peptide Solvent   Sol_smi    ee yields
 0    GPLL    MeCN      CC#N  58.1    low
 1    GPLL    MeOH        CO  64.2    low
 2    GPLL   iPrOH    CC(C)O  37.2    low
 3    GPLL     DCM   C(Cl)Cl  39.7    low
 4    GPLL     DCE  C(CCl)Cl  62.7    low,
 (180, 5))

In [3]:
# Insert a 'pep_smiles' column at position 1, but only if the dataframe does
# not have one yet. The guard keeps the cell re-runnable: DataFrame.insert()
# raises ValueError when the column name already exists.
if 'pep_smiles' not in df.columns:
    # Column 0 holds the peptide sequences. Series.map() applies
    # utils.pep_seq_transform to each of them and keeps the result aligned with
    # the dataframe's own index, so this no longer assumes a default 0..n-1
    # index (the previous loop used index labels as iloc positions).
    df.insert(1, column='pep_smiles', value=df.iloc[:, 0].map(utils.pep_seq_transform))

df.head()

Unnamed: 0,Peptide,pep_smiles,Solvent,Sol_smi,ee,yields
0,GPLL,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC...,MeCN,CC#N,58.1,low
1,GPLL,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC...,MeOH,CO,64.2,low
2,GPLL,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC...,iPrOH,CC(C)O,37.2,low
3,GPLL,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC...,DCM,C(Cl)Cl,39.7,low
4,GPLL,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H]1CC...,DCE,C(CCl)Cl,62.7,low


In [5]:
# Uni-Mol representations of the peptides.
#
# Alternative (disabled): concatenate pre-computed Uni-Mol representations
# looked up from a feature library.
# df = pd.read_csv('../Data/data_140_ori.csv',index_col=0)
# fea_lib = pd.read_csv('../Unimol_model/cluster/results_VPGLA/VPGLA_feature.csv', header=None)

# df_fea = pd.DataFrame(index=range(140),columns=range(512))

# for i in range(len(df['lib_index'])):
#     df_fea.loc[i] = fea_lib.loc[df['lib_index'][i]]

# df_fea.to_csv('../Data/UniMolRepr.csv')

# Optional: A convenient way to generate UniMol Representations of peptides
clf = UniMolRepr(data_type='molecule')
smi_list = df['pep_smiles'].values.tolist()
unimol_repr = clf.get_repr(smi_list, return_atomic_reprs=False)

# CLS token repr: convert once and reuse for both the shape check and the export
cls_repr = np.array(unimol_repr['cls_repr'])
print(cls_repr.shape)

# DataFrame.to_csv() returns None when it is given a path, so the frame is
# bound to df_repr first and written afterwards; df_repr then really holds the
# representations instead of None.
os.makedirs('../Reprs', exist_ok=True)  # to_csv() does not create missing directories
df_repr = pd.DataFrame(cls_repr)
df_repr.to_csv('../Reprs/UniMolRepr_180.csv')

2024-04-25 10:06:27 | unimol_tools/models/unimol.py | 146 | INFO | Uni-Mol(QSAR) | Loading pretrained weights from /home/troy/miniconda3/envs/unimol-tool/lib/python3.9/site-packages/unimol_tools-1.0.0-py3.9.egg/unimol_tools/weights/mol_pre_all_h_220816.pt
2024-04-25 10:06:27 | unimol_tools/data/conformer.py | 90 | INFO | Uni-Mol(QSAR) | Start generating conformers...
180it [00:01, 103.77it/s]
2024-04-25 10:06:29 | unimol_tools/data/conformer.py | 94 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2024-04-25 10:06:29 | unimol_tools/data/conformer.py | 96 | INFO | Uni-Mol(QSAR) | Failed to generate 3d conformers for 0.00% of molecules.
100%|██████████| 6/6 [00:04<00:00,  1.36it/s]

(180, 512)





In [6]:
# Generate representations of the solvent, variant 1: one-hot encoding.
# pd.get_dummies() turns every distinct value of the 'Solvent' column into its
# own integer (0/1) indicator column.
sol_df = df['Solvent']
sol_oh = pd.get_dummies(sol_df, dtype=int)

# to_csv() does not create missing directories, and this cell may be run
# without the (optional) peptide-representation cell, so make sure the output
# directory exists before writing.
os.makedirs('../Reprs', exist_ok=True)
sol_oh.to_csv('../Reprs/sol_oh_180.csv')

In [7]:
# Representations of the solvent, variant 2: Uni-Mol CLS-token representation
# computed from the solvent SMILES in the 'Sol_smi' column.
# NOTE: this cell rebinds `clf`, `unimol_repr` and `df_repr`, which the peptide
# cell also uses; after running it those names refer to the solvent results.
sol_smi = df['Sol_smi'].values.tolist()
clf = UniMolRepr(data_type='molecule')
unimol_repr = clf.get_repr(sol_smi, return_atomic_reprs=False)

# DataFrame.to_csv() returns None when it is given a path, so the frame is
# bound to df_repr first and written afterwards.
os.makedirs('../Reprs', exist_ok=True)  # to_csv() does not create missing directories
df_repr = pd.DataFrame(np.array(unimol_repr['cls_repr']))
df_repr.to_csv('../Reprs/Solvent_Repr_180.csv')

2024-04-25 10:06:34 | unimol_tools/models/unimol.py | 146 | INFO | Uni-Mol(QSAR) | Loading pretrained weights from /home/troy/miniconda3/envs/unimol-tool/lib/python3.9/site-packages/unimol_tools-1.0.0-py3.9.egg/unimol_tools/weights/mol_pre_all_h_220816.pt
2024-04-25 10:06:34 | unimol_tools/data/conformer.py | 90 | INFO | Uni-Mol(QSAR) | Start generating conformers...
180it [00:00, 2710.55it/s]
2024-04-25 10:06:34 | unimol_tools/data/conformer.py | 94 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2024-04-25 10:06:34 | unimol_tools/data/conformer.py | 96 | INFO | Uni-Mol(QSAR) | Failed to generate 3d conformers for 0.00% of molecules.
100%|██████████| 6/6 [00:00<00:00,  9.93it/s]
