### Test aldol reaction ee value with classification models

1. check the data structure, make it less imbalanced.
2. try uninol repr peptide and onehot solvent 
3. try unimolrepr peptide and solvent(Best performance)
4. try classes weights and XGBoosting

In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd

# Import relevant scikit-learn modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import ModelFits

In [2]:
# read .csv format data
Data_path = '../Data/data_160_ori.csv'
REPR_DIR = '../Reprs'

repr_path= []
for file in os.listdir(REPR_DIR):
    if '.csv' in file:
        path = REPR_DIR+ '/' + file
        repr_path.append(path)

repr_path

['../Reprs/Solvent_Repr.csv',
 '../Reprs/Solvent_Repr_160.csv',
 '../Reprs/Solvent_Repr_180.csv',
 '../Reprs/sol_oh.csv',
 '../Reprs/sol_oh_180.csv',
 '../Reprs/UniMolRepr.csv',
 '../Reprs/UniMolRepr_160.csv',
 '../Reprs/UniMolRepr_180.csv']

In [3]:
df = pd.read_csv(Data_path, index_col=0)
unimol_repr = pd.read_csv(repr_path[-2],index_col=0)
sol_repr = pd.read_csv(repr_path[1],index_col=0)
sol_oh = pd.read_csv(repr_path[3],index_col=0)
sol_oh = sol_oh[0:160]

Check Data Sturcture and distribute to 3 level

In [4]:
ee = df['ee'].copy()

low = []
med = []
high = []

for i in range(len(ee)):
    if ee[i] >= 80:
        high.append(i)
        ee.iloc[i] = 'high'
    elif ee[i] <= 20:
        low.append(i)
        ee.iloc[i] = 'low'
    else:
        med.append(i)
        ee.iloc[i] = 'medium'

len(low),len(med),len(high)

(48, 100, 12)

In [5]:
# concat required features
fea_df = pd.concat([unimol_repr,sol_repr], axis=1)

# Extract label colunm
label_df = ee

In [6]:
# label Encoding
le = LabelEncoder()
label_le = le.fit_transform(label_df) # 2:Medium; 1: Low; 0:High

# one-hot Encoding
label_oh = pd.get_dummies(label_df, dtype=int)

In [7]:
label_le

array([2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1,
       2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       0, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 0, 0, 0, 2, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 0, 2, 2, 2, 2, 1, 2, 2, 2, 1,
       2, 1, 1, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2])

## Try UniMOlRepr feature of peptide and solvent

In [8]:
X = fea_df
y = label_le
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the resampled training data
rf_classifier.fit(X_train, y_train)

# Predictions on the original test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[ 2  0  1]
 [ 0  4  6]
 [ 0  3 16]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.57      0.40      0.47        10
           2       0.70      0.84      0.76        19

    accuracy                           0.69        32
   macro avg       0.76      0.64      0.68        32
weighted avg       0.69      0.69      0.67        32



## Try OneHot peptide feature and OneHot solvent 

In [15]:
# Onehot coding peptide
fea_oh = pd.get_dummies(df['Peptide'], dtype=int)
fea_oh = pd.concat([fea_oh,sol_oh], axis=1)
fea_oh


Unnamed: 0,AGVL,APGG,APGP,GAPV,GGGP,GGLG,GLAA,GLGG,GLGP,GLLL,...,PPVL,VAPL,VGLG,VPAL,VPGL,DCE,DCM,MeCN,MeOH,iPrOH
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [19]:
X = fea_oh
y = label_le

for i in range (40,50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    rf_classifier = RandomForestClassifier(random_state=42)

    # Train the classifier on the sampled training data
    rf_classifier.fit(X_train, y_train)

    # Predictions on the original test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 0  0  2]
 [ 0  7  3]
 [ 0  2 18]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.78      0.70      0.74        10
           2       0.78      0.90      0.84        20

    accuracy                           0.78        32
   macro avg       0.52      0.53      0.52        32
weighted avg       0.73      0.78      0.75        32

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 0  0  0]
 [ 0  3  6]
 [ 1  2 20]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.60      0.33      0.43         9
           2       0.77      0.87      0.82        23

    accuracy                           0.72        32
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Random State: 43**********************
Confusion Matrix:
 [[ 1  0  1]
 [ 0  7  5]
 [ 0  0 18]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       1.00      0.58      0.74        12
           2       0.75      1.00      0.86        18

    accuracy                           0.81        32
   macro avg       0.92      0.69      0.75        32
weighted avg       0.86      0.81      0.80        32

-------------------------------------------------------------
*********************Random State: 44**********************
Confusion Matrix:
 [[ 1  0  1]
 [ 0  6  2]
 [ 2  4 16]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.60      0.75      0.67         8
           2       0.84      0.73      0.78        22

    accuracy                           0.72        32
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Random State: 47**********************
Confusion Matrix:
 [[ 1  0  3]
 [ 0  6  3]
 [ 0  0 19]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       1.00      0.67      0.80         9
           2       0.76      1.00      0.86        19

    accuracy                           0.81        32
   macro avg       0.92      0.64      0.69        32
weighted avg       0.86      0.81      0.79        32

-------------------------------------------------------------
*********************Random State: 48**********************
Confusion Matrix:
 [[ 3  0  0]
 [ 0  6  4]
 [ 0  0 19]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      0.60      0.75        10
           2       0.83      1.00      0.90        19

    accuracy                           0.88        32
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Try UnimolRepr peptide feature and OneHot solvent 

In [21]:
# concat feature
fea_Uni_Oh = pd.concat([unimol_repr,sol_oh],axis=1)

X = fea_Uni_Oh
y = label_le
for i in range(40,50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    rf_classifier = RandomForestClassifier(random_state=42)

    # Train the classifier on the sampled training data
    rf_classifier.fit(X_train, y_train)

    # Predictions on the original test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 0  0  2]
 [ 0  5  5]
 [ 0  2 18]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.71      0.50      0.59        10
           2       0.72      0.90      0.80        20

    accuracy                           0.72        32
   macro avg       0.48      0.47      0.46        32
weighted avg       0.67      0.72      0.68        32

-------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Random State: 41**********************
Confusion Matrix:
 [[ 0  0  0]
 [ 1  5  3]
 [ 2  4 17]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.56      0.56      0.56         9
           2       0.85      0.74      0.79        23

    accuracy                           0.69        32
   macro avg       0.47      0.43      0.45        32
weighted avg       0.77      0.69      0.72        32

-------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Random State: 42**********************
Confusion Matrix:
 [[ 2  0  1]
 [ 0  3  7]
 [ 0  3 16]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.50      0.30      0.37        10
           2       0.67      0.84      0.74        19

    accuracy                           0.66        32
   macro avg       0.72      0.60      0.64        32
weighted avg       0.65      0.66      0.63        32

-------------------------------------------------------------
*********************Random State: 43**********************
Confusion Matrix:
 [[ 1  0  1]
 [ 0  8  4]
 [ 0  2 16]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.80      0.67      0.73        12
           2       0.76      0.89      0.82        18

    accuracy                           0.78        32
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Try adjust classes weights on UniMolRepr

In [22]:

X = fea_df
y = label_le

for i in range(40, 50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    class_weights = {0: 4, 1: 1, 2: 2} # O:High More important; 1: Low Last important; 2: Medium: less important
    rf_classifier = RandomForestClassifier(random_state=42,class_weight=class_weights) # class_weight=class_weights

    # Train the classifier on the sampled training data
    rf_classifier.fit(X_train, y_train)

    # Predictions on the original test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 0  0  2]
 [ 0  6  4]
 [ 0  3 17]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.67      0.60      0.63        10
           2       0.74      0.85      0.79        20

    accuracy                           0.72        32
   macro avg       0.47      0.48      0.47        32
weighted avg       0.67      0.72      0.69        32

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 0  0  0]
 [ 1  5  3]
 [ 3  2 18]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.71      0.56      0.63         9
           2       0.86      0.78      0.82        23

    accuracy                           0.72        32
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Random State: 42**********************
Confusion Matrix:
 [[ 2  0  1]
 [ 0  6  4]
 [ 0  3 16]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      0.60      0.63        10
           2       0.76      0.84      0.80        19

    accuracy                           0.75        32
   macro avg       0.81      0.70      0.74        32
weighted avg       0.75      0.75      0.75        32

-------------------------------------------------------------
*********************Random State: 43**********************
Confusion Matrix:
 [[ 1  0  1]
 [ 0  7  5]
 [ 0  4 14]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.64      0.58      0.61        12
           2       0.70      0.78      0.74        18

    accuracy                           0.69        32
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Try different models

In [8]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


models = [RidgeClassifier(),
          SGDClassifier(),
          KNeighborsClassifier(), # use k = 7 as in papers
          SVC(),
          MLPClassifier(hidden_layer_sizes=(100), # 5-neurons are used in the initial
                       activation='logistic',  # release of the paper
                       solver='lbfgs',
                       max_iter=1000,
                       random_state=42),
         ]

In [9]:
X = fea_df
y = label_le

for model in models:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # class_weights = {0: 3, 1: 2, 2: 1} # O:High More important; 1: Medium Less important; 2: Low last important
    classifier = model

    # Train the classifier on the sampled training data
    classifier.fit(X_train, y_train)

    # Predictions on the original test set
    y_pred = classifier.predict(X_test)

    # Evaluate the model
    print(f'*********************Trained Model: {model}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-------------------------------------------------------------')

*********************Trained Model: RidgeClassifier()**********************
Confusion Matrix:
 [[ 2  0  1]
 [ 0  4  6]
 [ 0  3 16]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.57      0.40      0.47        10
           2       0.70      0.84      0.76        19

    accuracy                           0.69        32
   macro avg       0.76      0.64      0.68        32
weighted avg       0.69      0.69      0.67        32

-------------------------------------------------------------
*********************Trained Model: SGDClassifier()**********************
Confusion Matrix:
 [[ 0  0  3]
 [ 0  1  9]
 [ 0  0 19]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       1.00      0.10      0.18        10
           2       0.61      1.00      0.76        19

    accuracy               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Trained Model: MLPClassifier(activation='logistic', hidden_layer_sizes=100, max_iter=1000,
              random_state=42, solver='lbfgs')**********************
Confusion Matrix:
 [[ 1  0  2]
 [ 0  6  4]
 [ 0  5 14]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.55      0.60      0.57        10
           2       0.70      0.74      0.72        19

    accuracy                           0.66        32
   macro avg       0.75      0.56      0.60        32
weighted avg       0.68      0.66      0.65        32

-------------------------------------------------------------


Try Xgboost

In [32]:
import xgboost as xgb

fea_df = pd.concat([unimol_repr,sol_repr], axis=1)
fea_df.columns = range(fea_df.shape[1])

X = fea_df

for i in range(40, 50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    # scale_pos_weight = len(y_train[y_train == 2]) / len(y_train[y_train == 0])
    xgb_classifier = xgb.XGBClassifier(objective='multi:softprob', num_class=3, random_state=i)

    xgb_classifier.fit(X_train, y_train)
    y_pred = xgb_classifier.predict(X_test)

    # Evaluate the model
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 4  0  0]
 [ 1  6  5]
 [ 2  2 15]]

Classification Report:
               precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       0.75      0.50      0.60        12
           2       0.75      0.79      0.77        19

    accuracy                           0.71        35
   macro avg       0.69      0.76      0.70        35
weighted avg       0.73      0.71      0.71        35

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 2  0  3]
 [ 1  6  2]
 [ 1  7 13]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.40      0.44         5
           1       0.46      0.67      0.55         9
           2       0.72      0.62      0.67        21

    accuracy                           0.60        35
   