### Binary prediction on untested peptides

1. This notebook builds binary classification models for yield and ee.
2. The trained models are then used to predict which untested peptides are likely to give high yield and high ee.

In [1]:
import os
import numpy as np
import pandas as pd

# Import relevant scikit-learn modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from unimol_tools import UniMolRepr

import ModelFits
import utils

In [2]:
# Locate the CSV inputs and load the training tables.
DATA_DIR = '../Data'

# Every directory entry whose name contains '.csv', as a DATA_DIR-relative path.
data_path = [
    DATA_DIR + '/' + entry
    for entry in os.listdir(DATA_DIR)
    if '.csv' in entry
]

# NOTE(review): os.listdir() returns entries in arbitrary order, so the
# positional indices below only select the intended files while the directory
# listing matches the one this notebook was written against — confirm, or
# switch to explicit file names. (An earlier, commented-out revision of this
# cell used indices 0, -1, 4 and 5 instead.)
# All four tables use their first CSV column as the row index.
df, unimol_repr, sol_repr, sol_oh = (
    pd.read_csv(data_path[position], index_col=0)
    for position in (1, 4, 6, -1)
)

In [3]:
# Training feature matrix: the two representation tables joined side by side.
fea_df = pd.concat([unimol_repr, sol_repr], axis='columns')

# Target columns taken from the reaction table.
# NOTE(review): the classifier cells further down read df['yields'] and
# df['ee'] directly, so `yields` and the class-labelled `ee` built here are
# not consumed by them — confirm whether that is intended.
yields = df['yields']
ee = utils.class_label(df['ee'])

## Calculate UniMolRepr of new peptides

In [5]:
# Library of peptide sequences to score with the trained models.
SEQ_LIB_PATH = './625seq_to_test.csv'
seq_lib = pd.read_csv(SEQ_LIB_PATH)

In [6]:
# Build the SMILES column from the sequences held in the first column of seq_lib.
pep_smiles = [utils.pep_seq_transform(seq) for seq in seq_lib.iloc[:, 0]]
# DataFrame.insert raises ValueError when the column already exists, so
# overwrite it on a re-run instead of inserting it a second time.
if 'pep_smiles' in seq_lib.columns:
    seq_lib['pep_smiles'] = pep_smiles
else:
    seq_lib.insert(1, column='pep_smiles', value=pep_smiles)

# Generate UniMol Representations of peptides
clf = UniMolRepr(data_type='molecule')
smi_list = seq_lib['pep_smiles'].values.tolist()
# Stored under its own name: the training-feature DataFrame is called
# `unimol_repr`, and rebinding that name here would break any re-run of the
# cell that builds `fea_df`.
pep_unimol_repr = clf.get_repr(smi_list, return_atomic_reprs=False)

# CLS token repr
cls_repr = np.array(pep_unimol_repr['cls_repr'])
print(cls_repr.shape)

# One row per peptide; columns get pandas' default integer labels.
pep_repr = pd.DataFrame(cls_repr)
pep_repr

2024-04-17 10:23:53 | unimol_tools/models/unimol.py | 146 | INFO | Uni-Mol(QSAR) | Loading pretrained weights from /home/troy/miniconda3/envs/unimol-tool/lib/python3.9/site-packages/unimol_tools-1.0.0-py3.9.egg/unimol_tools/weights/mol_pre_all_h_220816.pt
2024-04-17 10:23:54 | unimol_tools/data/conformer.py | 90 | INFO | Uni-Mol(QSAR) | Start generating conformers...
625it [00:05, 113.14it/s]
2024-04-17 10:24:00 | unimol_tools/data/conformer.py | 94 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2024-04-17 10:24:00 | unimol_tools/data/conformer.py | 96 | INFO | Uni-Mol(QSAR) | Failed to generate 3d conformers for 0.00% of molecules.
100%|██████████| 20/20 [00:14<00:00,  1.38it/s]

(625, 512)





Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.193767,-0.729341,-0.157339,-0.253636,-0.639269,-1.660264,0.720261,0.796953,-0.758289,1.654507,...,1.145198,0.268207,-0.001041,-2.631841,1.230597,0.477417,-0.949774,2.158044,0.462428,-2.111053
1,-0.203259,-0.759756,0.130137,-0.388195,-0.545061,-1.425789,0.806302,0.804812,-0.554840,1.610545,...,0.988741,0.389493,0.000335,-2.627378,1.273545,0.517709,-0.743800,2.171710,0.806785,-2.115437
2,-0.132372,-0.491167,-0.556315,-0.380504,-0.212901,-1.459108,0.693115,0.722283,-0.470556,1.507656,...,1.067975,0.319110,0.282265,-2.660875,1.246622,0.429329,-0.903593,2.069259,0.572939,-2.106420
3,-0.213776,-0.698337,-0.324626,-0.359181,-0.677339,-1.583930,0.700558,0.741412,-0.567932,1.591989,...,0.980365,0.469497,0.064009,-2.614589,1.277596,0.479632,-0.699818,2.119950,0.531556,-2.071246
4,-0.054285,-0.648697,-0.271408,-0.157969,-0.435600,-1.578865,0.618388,0.665994,-0.597277,1.500693,...,1.019903,0.451326,0.115999,-2.570061,1.243790,0.578808,-0.754288,2.123183,0.368282,-2.114172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,0.017210,-0.662277,-0.361206,-0.059502,-0.341837,-1.377011,0.525382,0.843277,-0.510994,1.325868,...,1.060690,0.317032,-0.011071,-2.574000,1.233315,0.701821,-1.035051,2.110108,0.708234,-2.103952
621,-0.257336,-0.676039,-0.372395,-0.209026,-0.400769,-1.200186,0.678697,0.893552,-0.450997,1.365652,...,0.966892,0.499098,-0.031221,-2.630594,1.267870,0.568557,-0.909159,2.100293,0.806213,-2.096262
622,-0.289984,-0.698834,-0.425091,-0.092799,-0.482267,-1.631833,0.761147,0.847009,-0.602706,1.299666,...,0.948559,0.481931,-0.070861,-2.596664,1.268055,0.408675,-0.783777,2.112125,0.686053,-2.135875
623,-0.008643,-0.663109,-0.399891,-0.150980,-0.276401,-1.341047,0.503627,0.741546,-0.584358,1.303920,...,1.160475,0.461803,-0.023883,-2.628439,1.234420,0.516692,-0.930415,2.129632,0.695689,-2.132756


In [8]:
# Solvent features shared by every test peptide: row 4 of the solvent
# representation table (presumably the DCE entry, going by the variable
# name — TODO confirm against the solvent table).
dce_repr = sol_repr.iloc[4].copy()

# Repeat that row once per peptide. The row count and index come from
# pep_repr rather than a hard-coded 625, so the two frames stay aligned
# for the column-wise concat even if the sequence library changes size.
n_peptides = len(pep_repr)
dce_df = pd.DataFrame(
    [dce_repr] * n_peptides,
    index=pep_repr.index,
    columns=sol_repr.columns,
)
dce_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
1,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
2,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
3,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
4,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
621,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
622,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838
623,1.046752,-0.654765,-0.624288,-0.709546,0.241608,-1.19603,1.033707,0.971329,-1.10554,0.868621,...,-0.032769,0.499183,0.588122,-2.801194,0.802722,-0.392907,0.52999,2.377181,0.318138,-1.967838


In [9]:
# Training matrix is the concatenated feature frame as-is.
X_train = fea_df

# Test matrix: peptide representation columns followed by the repeated
# solvent columns, mirroring the column order used for fea_df.
x_test = pd.concat([pep_repr, dce_df], axis='columns')
# pep_repr carries integer column labels; turn every label into a string
# (presumably to match the CSV-loaded training columns — TODO confirm).
x_test.columns = [str(label) for label in x_test.columns]

In [10]:
# Train a random-forest classifier on the yield labels and score the test peptides.
# The encoder gets its own name so a later cell that creates another
# LabelEncoder cannot silently replace the one needed to decode these
# predictions.
yield_encoder = LabelEncoder()
yield_le = yield_encoder.fit_transform(df['yields'])

yield_classifier = RandomForestClassifier(random_state=42)
yield_classifier.fit(X_train, yield_le)

# Predict integer classes, then map them back to the original label values.
yield_pred = yield_encoder.inverse_transform(yield_classifier.predict(x_test))

# DataFrame.insert raises ValueError if the column already exists; overwrite
# in place on a re-run so the cell is idempotent.
if 'yield_pred' in seq_lib.columns:
    seq_lib['yield_pred'] = yield_pred
else:
    seq_lib.insert(3, 'yield_pred', yield_pred)

In [11]:
# Train a random-forest classifier on the ee labels and score the test peptides.
# NOTE(review): this trains on the raw df['ee'] column, not on the `ee`
# labels produced by utils.class_label in the feature-building cell. If
# df['ee'] is numeric, LabelEncoder will treat every distinct value as its
# own class — confirm which target is intended.
# The encoder gets its own name so it cannot be confused with the encoder
# used for the yield model.
ee_encoder = LabelEncoder()
ee_le = ee_encoder.fit_transform(df['ee'])

ee_classifier = RandomForestClassifier(random_state=42)
ee_classifier.fit(X_train, ee_le)

# Predict integer classes, then map them back to the original label values.
ee_pred = ee_encoder.inverse_transform(ee_classifier.predict(x_test))

# DataFrame.insert raises ValueError if the column already exists; overwrite
# in place on a re-run so the cell is idempotent.
if 'ee_pred' in seq_lib.columns:
    seq_lib['ee_pred'] = ee_pred
else:
    seq_lib.insert(4, 'ee_pred', ee_pred)

In [12]:
# Write the sequence library together with the predicted labels.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs('./Pred', exist_ok=True)
seq_lib.to_csv('./Pred/Trained_on_140.csv')