# Oversampling on imbalanced datasets

Because the data is imbalanced, we will try an oversampling method.
We use a random forest model, with weighted training.
1. Duplicate the minority-class samples to balance the class ratio of the dataset.
2. Try normal distribution data structure.

In [21]:
import os
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd

# Import relevant scikit-learn modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import shuffle
from unimol_tools import UniMolRepr

import utils
from utils import oversample, class_label

In [22]:
# Gather the path of every entry in DATA_DIR whose name contains '.csv'
DATA_DIR = '../Reprs'

data_path = [
    DATA_DIR + '/' + name
    for name in os.listdir(DATA_DIR)
    if '.csv' in name
]

data_path

['../Reprs/Solvent_Repr.csv',
 '../Reprs/Solvent_Repr_160.csv',
 '../Reprs/Solvent_Repr_180.csv',
 '../Reprs/sol_oh.csv',
 '../Reprs/sol_oh_180.csv',
 '../Reprs/UniMolRepr.csv',
 '../Reprs/UniMolRepr_160.csv',
 '../Reprs/UniMolRepr_180.csv']

In [23]:
# Load the representation tables by explicit file name instead of by position
# in `data_path`: os.listdir returns entries in arbitrary order, so indices
# such as data_path[-1] / data_path[2] are not stable across machines.
df = pd.read_csv('../Data/data_180_ori_2.csv', index_col=0)  # original data
unimol_repr = pd.read_csv(DATA_DIR + '/UniMolRepr_180.csv', index_col=0)  # unimol repr of peptides
sol_repr = pd.read_csv(DATA_DIR + '/Solvent_Repr_180.csv', index_col=0)  # unimol repr of solvents

In [24]:
# Feature table: peptide representation columns followed by solvent representation columns
fea_df = pd.concat((unimol_repr, sol_repr), axis='columns')

# Pull out the label column as an independent copy
label_df = df.loc[:, 'yields'].copy()

In [25]:
# Encode the yield classes as integers: 'low' -> 2, 'medium' -> 1, anything else -> 0.
# The codes are derived from df['yields'] in one mapping step, so the cell gives the
# same result when re-run, and the integer cast is actually kept (a bare
# `label_df.astype(int)` returns a new Series and leaves `label_df` unchanged).
yield_codes = {'low': 2, 'medium': 1}
label_df = df['yields'].map(lambda level: yield_codes.get(level, 0)).astype(int)

label_df.value_counts()

yields
2    119
1     35
0     26
Name: count, dtype: int64

## Try oversampling on yields prediction

In [26]:
# Features and integer-coded labels for the yield task
X, y = fea_df, label_df.astype(int)

# Hold out 20% of the rows as the test split (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [27]:
# Oversample into new names so the split produced by the previous cell stays intact:
# rebinding X_train / y_train here would make a second run of this cell oversample
# data that had already been oversampled.
# NOTE(review): `oversample` is a project helper and also receives the full X / y —
# confirm it only draws from training rows.
X_train_os, y_train_os = oversample(X_train, y_train, X, y, high_scale=2, medium_scale=2)

rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the resampled training data
rf_classifier.fit(X_train_os, y_train_os)

# Predictions on the original test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[ 5  1  2]
 [ 0  4  2]
 [ 0  3 19]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.62      0.77         8
           1       0.50      0.67      0.57         6
           2       0.83      0.86      0.84        22

    accuracy                           0.78        36
   macro avg       0.78      0.72      0.73        36
weighted avg       0.81      0.78      0.78        36



In [28]:
# Repeat the oversampling experiment on 10 different random splits (no class weights)

X, y = fea_df, label_df.astype(int)

for seed in range(40, 50):
    # New 80/20 split for this seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )

    # Resample the training split with the project helper
    X_train, y_train = oversample(
        X_train, y_train, X, y, high_scale=8, medium_scale=4
    )
    # Flatten the labels to a 1-D array before fitting
    y_train = np.ravel(y_train.values)

    # Random forest with a fixed model seed, trained on the resampled data
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict the held-out test split
    y_pred = rf_classifier.predict(X_test)

    # Report
    print(f'******************************The {seed} seed run*********************************')
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:\n", report)

******************************The 40 seed run*********************************
Confusion Matrix:
 [[ 2  1  6]
 [ 0  5  3]
 [ 0  0 19]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.22      0.36         9
           1       0.83      0.62      0.71         8
           2       0.68      1.00      0.81        19

    accuracy                           0.72        36
   macro avg       0.84      0.62      0.63        36
weighted avg       0.79      0.72      0.68        36

******************************The 41 seed run*********************************
Confusion Matrix:
 [[ 4  1  1]
 [ 0  6  2]
 [ 0  3 19]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         6
           1       0.60      0.75      0.67         8
           2       0.86      0.86      0.86        22

    accuracy                           0.81        36
   macro avg       0.82    

## Try adjusting class weights on UniMolRepr

In [29]:
X, y = fea_df, label_df.astype(int)

# Class weights: 0 (high) counts most, 1 (medium) in between, 2 (low) counts least
class_weights = {0: 3, 1: 2, 2: 1}

for seed in range(40, 50):
    # New 80/20 split for this seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )

    # Resample the training split with the project helper
    X_train, y_train = oversample(
        X_train, y_train, X, y, high_scale=8, medium_scale=4
    )

    # Weighted random forest; here the model seed follows the split seed
    rf_classifier = RandomForestClassifier(class_weight=class_weights, random_state=seed)
    rf_classifier.fit(X_train, y_train)

    # Predict the held-out test split
    y_pred = rf_classifier.predict(X_test)

    # Report
    print(f'*********************Random State: {seed}**********************')
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:\n", report)
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 2  2  5]
 [ 1  4  3]
 [ 0  0 19]]

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.22      0.33         9
           1       0.67      0.50      0.57         8
           2       0.70      1.00      0.83        19

    accuracy                           0.69        36
   macro avg       0.68      0.57      0.58        36
weighted avg       0.69      0.69      0.65        36

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 4  0  2]
 [ 0  6  2]
 [ 0  2 20]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         6
           1       0.75      0.75      0.75         8
           2       0.83      0.91      0.87        22

    accuracy                           0.83        36
   

## Try on ee values

In [30]:
# Bin the ee values with the project helper, then encode the resulting classes as
# integers: 'low' -> 2, 'medium' -> 1, anything else -> 0.
# One mapping step replaces the element-by-element .iloc writes, and the integer
# cast is kept (a bare `label_df.astype(int)` discards its result).
ee_levels = class_label(df['ee'].copy())
ee_codes = {'low': 2, 'medium': 1}
label_df = ee_levels.map(lambda level: ee_codes.get(level, 0)).astype(int)

label_df.value_counts()

ee
1    107
2     54
0     19
Name: count, dtype: int64

In [31]:
# ee task: 10 random splits with oversampling, no class weights

X, y = fea_df, label_df.astype(int)

for seed in range(40, 50):
    # New 80/20 split for this seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )

    # Resample the training split with the project helper
    X_train, y_train = oversample(
        X_train, y_train, X, y, high_scale=4, low_scale=2
    )
    # Flatten the labels to a 1-D array before fitting
    y_train = np.ravel(y_train.values)

    # Unweighted random forest with a fixed model seed
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict the held-out test split
    y_pred = rf_classifier.predict(X_test)

    # Report
    print(f'******************************The {seed} seed run*********************************')
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:\n", report)

******************************The 40 seed run*********************************
Confusion Matrix:
 [[ 5  2  0]
 [ 1 13  4]
 [ 0  4  7]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.71      0.77         7
           1       0.68      0.72      0.70        18
           2       0.64      0.64      0.64        11

    accuracy                           0.69        36
   macro avg       0.72      0.69      0.70        36
weighted avg       0.70      0.69      0.70        36

******************************The 41 seed run*********************************
Confusion Matrix:
 [[ 3  1  0]
 [ 3 17  6]
 [ 1  2  3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.75      0.55         4
           1       0.85      0.65      0.74        26
           2       0.33      0.50      0.40         6

    accuracy                           0.64        36
   macro avg       0.54    

In [32]:
# ee task: 10 random splits with oversampling AND class weights

X, y = fea_df, label_df.astype(int)

# Class 0 is weighted most heavily, class 2 least
class_weights = {0: 5, 1: 2, 2: 1}

for seed in range(40, 50):
    # New 80/20 split for this seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=seed, test_size=0.2
    )

    # Resample the training split with the project helper.
    # NOTE(review): low_scale=0 is passed here — confirm what 0 means inside `oversample`.
    X_train, y_train = oversample(
        X_train, y_train, X, y, high_scale=4, low_scale=0
    )
    # Flatten the labels to a 1-D array before fitting
    y_train = np.ravel(y_train.values)

    # Weighted random forest with a fixed model seed
    rf_classifier = RandomForestClassifier(class_weight=class_weights, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict the held-out test split
    y_pred = rf_classifier.predict(X_test)

    # Report
    print(f'******************************The {seed} seed run*********************************')
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:\n", report)

******************************The 40 seed run*********************************
Confusion Matrix:
 [[ 2  5  0]
 [ 0 13  5]
 [ 0  4  7]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.29      0.44         7
           1       0.59      0.72      0.65        18
           2       0.58      0.64      0.61        11

    accuracy                           0.61        36
   macro avg       0.72      0.55      0.57        36
weighted avg       0.67      0.61      0.60        36

******************************The 41 seed run*********************************
Confusion Matrix:
 [[ 3  1  0]
 [ 2 17  7]
 [ 1  2  3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.75      0.60         4
           1       0.85      0.65      0.74        26
           2       0.30      0.50      0.37         6

    accuracy                           0.64        36
   macro avg       0.55    

# Make Predictions on Library

In [33]:
def _encode_levels(levels):
    """Return integer class codes for a Series of class names.

    'low' maps to 2, 'medium' maps to 1 and every other value maps to 0.
    A new integer Series is returned; the input is not modified.
    """
    codes = {'low': 2, 'medium': 1}
    return levels.map(lambda level: codes.get(level, 0)).astype(int)


# Training features: peptide representation columns followed by solvent columns
fea_df = pd.concat([unimol_repr, sol_repr], axis=1)

# Integer-coded yield classes
yields = _encode_levels(df['yields'])

# ee values are first binned by the project helper, then integer-coded the same way
ee = _encode_levels(class_label(df['ee'].copy()))

In [34]:
# load library
seq_lib = pd.read_csv('./1296seq_to_test.csv')

# Insert SMILES column
seq_lib.insert(1, column='pep_smiles', value='')
for i in seq_lib.index.tolist():
    seq_lib.iloc[i,1] = utils.pep_seq_transform(seq_lib.iloc[i,0])
# Generate UniMol Representations of peptides
clf = UniMolRepr(data_type='molecule')
smi_list = seq_lib['pep_smiles'].values.tolist()
unimol_repr = clf.get_repr(smi_list, return_atomic_reprs=False)
# CLS token repr
print(np.array(unimol_repr['cls_repr']).shape)

pep_repr = pd.DataFrame(np.array(unimol_repr['cls_repr']))
# pep_repr = pd.read_csv('../Unimol_model/cluster/results_VPGLA/VPGLA_feature.csv',header=None)

pep_repr

2024-04-25 10:07:27 | unimol_tools/models/unimol.py | 146 | INFO | Uni-Mol(QSAR) | Loading pretrained weights from /home/troy/miniconda3/envs/unimol-tool/lib/python3.9/site-packages/unimol_tools-1.0.0-py3.9.egg/unimol_tools/weights/mol_pre_all_h_220816.pt
2024-04-25 10:07:27 | unimol_tools/data/conformer.py | 90 | INFO | Uni-Mol(QSAR) | Start generating conformers...
1296it [00:11, 110.28it/s]
2024-04-25 10:07:39 | unimol_tools/data/conformer.py | 94 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2024-04-25 10:07:39 | unimol_tools/data/conformer.py | 96 | INFO | Uni-Mol(QSAR) | Failed to generate 3d conformers for 0.00% of molecules.
100%|██████████| 41/41 [00:29<00:00,  1.38it/s]

(1296, 512)





Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.187755,-0.757094,-0.230801,-0.263155,-0.378224,-1.489209,0.830283,0.751223,-0.411963,1.606491,...,0.975573,0.280888,0.151558,-2.630810,1.290600,0.540199,-0.783475,2.103269,0.628125,-2.140517
1,-0.147666,-0.721462,-0.427698,-0.452070,-0.341001,-1.340670,0.710630,0.798275,-0.462686,1.552800,...,0.992516,0.208612,0.193683,-2.656662,1.284773,0.515903,-0.736329,2.127645,0.620427,-2.132180
2,-0.112590,-0.782315,0.031480,-0.393942,-0.552017,-1.477100,0.822358,0.815920,-0.622068,1.670280,...,1.192423,0.282513,0.102725,-2.643875,1.256638,0.515042,-0.887655,2.158556,0.722484,-2.132981
3,-0.195905,-0.708368,-0.438068,-0.039942,-0.576910,-2.054896,0.743676,0.733531,-0.591675,1.497057,...,1.008958,0.267344,0.132643,-2.577739,1.266693,0.442742,-0.628048,2.121150,0.302955,-2.106254
4,-0.252859,-0.642398,-0.366567,-0.161933,-0.604376,-1.946272,0.809062,0.753072,-0.668638,1.584506,...,1.106878,0.349753,0.110153,-2.584254,1.268440,0.510725,-0.682220,2.122981,0.447711,-2.095988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,-0.251603,-0.666749,-0.353784,-0.113625,-0.695911,-1.973817,0.884866,0.887493,-0.557792,1.553054,...,0.825795,0.481325,0.153730,-2.653371,1.296592,0.352911,-0.581045,2.229146,0.502051,-2.088052
1292,-0.256360,-0.853312,-0.156872,-0.193501,-0.632463,-1.576671,0.971514,0.999443,-0.584494,1.470471,...,0.798278,0.626916,0.139105,-2.687439,1.343616,0.418711,-0.539786,2.281996,0.935813,-2.075972
1293,-0.167522,-0.795294,-0.521734,0.089814,-0.657123,-2.063783,0.897399,0.927285,-0.476006,1.265823,...,0.859283,0.344413,0.174654,-2.592965,1.277332,0.410306,-0.425973,2.205443,0.385370,-2.108131
1294,-0.187342,-0.751482,-0.416212,-0.010506,-0.699113,-2.133993,0.851930,0.911150,-0.507752,1.368326,...,0.855938,0.395879,0.205777,-2.620315,1.279013,0.320983,-0.544632,2.227513,0.447895,-2.109630


In [35]:
# Take the solvent representation at row position 4 of sol_repr and repeat it once
# per library peptide, so it can be placed side by side with pep_repr.
# NOTE(review): the variable names suggest row 4 is DCE — confirm against sol_repr's index.
dce_repr = sol_repr.iloc[4].copy()
n_library = len(pep_repr)  # derived from the library instead of hard-coding 1296
dce_df = pd.DataFrame([dce_repr] * n_library, index=range(n_library), columns=sol_repr.columns)
dce_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
1,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
2,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
3,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
4,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
1292,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
1293,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838
1294,1.046752,-0.654765,-0.624287,-0.709546,0.241607,-1.19603,1.033706,0.971329,-1.105541,0.868621,...,-0.032768,0.499183,0.588123,-2.801194,0.802723,-0.392907,0.52999,2.377182,0.318138,-1.967838


In [36]:
# Training features: the full peptide + solvent feature table
X_train = fea_df

# Library features: peptide representation columns followed by the repeated solvent columns
x_test = pd.concat((pep_repr, dce_df), axis='columns')

# Convert every column label to a string
x_test.columns = x_test.columns.astype(str)


In [37]:
# Class weights for the yield model: class 0 weighted most, class 2 least
class_weights = {0: 3, 1: 2, 2: 1}
yield_classifier = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Keep the oversampled labels under a new name. Rebinding `yields` meant that a
# re-run passed already-oversampled labels back in together with the original X_train.
yield_train, yield_labels = oversample(X_train, yields, X_train, yields, high_scale=8, medium_scale=4)
yield_classifier.fit(yield_train, yield_labels)
yield_pred = yield_classifier.predict(x_test)

# DataFrame.insert raises if the column already exists; drop a column left by an
# earlier run so the cell can be executed again.
if 'yield_pred' in seq_lib.columns:
    seq_lib = seq_lib.drop(columns='yield_pred')
seq_lib.insert(3, 'yield_pred', yield_pred)

In [38]:
# Class weights for the ee model: class 0 weighted most, class 2 least
class_weights = {0: 4, 1: 2, 2: 1}
ee_classifier = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Keep the oversampled labels under a new name. Rebinding `ee` meant that a
# re-run passed already-oversampled labels back in together with the original X_train.
ee_train, ee_labels = oversample(X_train, ee, X_train, ee, high_scale=2, low_scale=1)
ee_classifier.fit(ee_train, ee_labels)
ee_pred = ee_classifier.predict(x_test)

# DataFrame.insert raises if the column already exists; drop a column left by an
# earlier run so the cell can be executed again.
if 'ee_pred' in seq_lib.columns:
    seq_lib = seq_lib.drop(columns='ee_pred')
seq_lib.insert(4, 'ee_pred', ee_pred)


In [39]:
# Create the output directory first: to_csv does not create missing directories
os.makedirs('./Pred', exist_ok=True)
seq_lib.to_csv('./Pred/Trained_on_180_oversample_2.csv')