### Test aldol reaction data yields with classification models

Because the yield classes are imbalanced, we use a random forest model.
1. Try one-hot encoding for peptide and solvent
2. Try UniMol representation for peptide and one-hot encoding for solvent
3. Try UniMol representation for both peptide and solvent
4. Try class weights

In [1]:
import os
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd

# Import relevant scikit-learn modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Locations of the labelled dataset and of the precomputed representation tables.
Data_path = '../Data/data_160_ori.csv'
REPR_DIR = '../Reprs'

# os.listdir() returns entries in arbitrary, platform-dependent order, while
# other cells pick files out of `repr_path` by position. Sorting makes those
# positions reproducible; the case-insensitive (upper-case) key reproduces the
# order recorded in this cell's saved output.
# The extension is matched with endswith() rather than a substring test so
# that names such as 'x.csv.bak' are not collected.
repr_path = [
    f'{REPR_DIR}/{name}'
    for name in sorted(os.listdir(REPR_DIR), key=str.upper)
    if name.endswith('.csv')
]

repr_path

['../Reprs/Solvent_Repr.csv',
 '../Reprs/Solvent_Repr_160.csv',
 '../Reprs/Solvent_Repr_180.csv',
 '../Reprs/sol_oh.csv',
 '../Reprs/sol_oh_180.csv',
 '../Reprs/UniMolRepr.csv',
 '../Reprs/UniMolRepr_160.csv',
 '../Reprs/UniMolRepr_180.csv']

In [3]:
# Dataset that carries the yield labels.
df = pd.read_csv(Data_path, index_col=0)

# Load every representation table by file name instead of by its position in
# `repr_path`: positions depend on the directory listing order and shift as
# soon as a file is added to REPR_DIR. These are the files that the original
# indices (-2, 1 and 4) pointed to in the recorded listing.
unimol_repr = pd.read_csv(f'{REPR_DIR}/UniMolRepr_160.csv', index_col=0)
sol_repr = pd.read_csv(f'{REPR_DIR}/Solvent_Repr_160.csv', index_col=0)
sol_oh = pd.read_csv(f'{REPR_DIR}/sol_oh_180.csv', index_col=0)

In [4]:
# Keep only the first 160 rows (all columns) of the one-hot solvent table.
# NOTE(review): presumably this trims the 180-row table to the 160 labelled
# reactions in `df` -- confirm that the row order of both tables matches.
sol_oh = sol_oh.iloc[0:160, :]

In [5]:
# Feature matrix: the UniMol table and the solvent table side by side.
# NOTE(review): concat aligns rows on the index, so both tables must share the
# same index for the rows to line up -- confirm.
fea_df = pd.concat((unimol_repr, sol_repr), axis='columns')

# Target: the categorical yield label column.
label_df = df.loc[:, 'yields_label']

In [6]:
# Number of samples per yield class (shows how imbalanced the labels are).
label_df.value_counts()

low       111
medium     32
high       17
Name: yields_label, dtype: int64

In [7]:
# Integer-encode the labels. LabelEncoder numbers the classes in sorted order,
# so for the labels in this dataset: 0 = 'high', 1 = 'low', 2 = 'medium'.
le = LabelEncoder().fit(label_df)
label_le = le.transform(label_df)

# One-hot version of the same labels (one indicator column per class).
label_oh = pd.get_dummies(data=label_df, dtype=int)

In [8]:
# Show the encoded labels (0 = 'high', 1 = 'low', 2 = 'medium').
label_le

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 2, 2, 2,
       2, 1, 1, 1, 1, 1, 1, 2, 2, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 0, 1, 2, 0, 0, 2, 1,
       2, 0, 2, 1, 1, 1, 1, 1, 0, 1, 2, 0, 0, 1, 1, 2, 0, 0, 1, 1, 2, 1,
       2, 0, 1, 2, 0, 0])

## Try UniMolRepr features for peptide and solvent

In [18]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are
# imbalanced; consider stratify=y, bearing in mind that it changes every
# reported number and must be applied to all splits that are compared.
X, y = fea_df, label_le
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [19]:
# Random forest with default hyper-parameters, fitted on the training split as
# is (no resampling is applied in this cell); fit() returns the estimator itself.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test split.
y_pred = rf_classifier.predict(X_test)

# Report the confusion matrix (rows = true class, columns = predicted class)
# and the per-class precision / recall / F1.
# NOTE(review): the saved output of this cell shows class supports that differ
# from those saved for the same seed in the model-comparison cell, so the
# stored outputs appear to come from different runs -- re-run all cells in
# order before comparing numbers.
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix:
 [[ 3  0  0]
 [ 0 27  0]
 [ 0  0  2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00        27
           2       1.00      1.00      1.00         2

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32



## Try one-hot peptide and one-hot solvent features

In [20]:
# One-hot encode the peptide identity, then append the one-hot solvent columns.
peptide_oh = pd.get_dummies(df['Peptide'], dtype=int)
fea_oh = pd.concat([peptide_oh, sol_oh], axis=1)

# Preview the first rows of the combined one-hot feature table.
fea_oh.head()


Unnamed: 0,AGVL,APGG,APGP,GAPV,GGGP,GGLG,GLAA,GLGG,GLGP,GLLL,...,PPVL,VAPL,VGLG,VPAL,VPGL,DCE,DCM,MeCN,MeOH,iPrOH
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:
# Features: one-hot peptide + one-hot solvent; target: integer-encoded yield class.
X = fea_oh
y = label_le

# Same 80/20 split settings (seed 42) as the other feature-set experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)

# Fit on the training split as is (no resampling is applied in this cell).
rf_classifier.fit(X_train, y_train)

# Predict the held-out test split.
y_pred = rf_classifier.predict(X_test)

# Evaluate the model.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0.00 for a class that is never predicted, exactly as
# the default does, but without emitting an UndefinedMetricWarning per metric.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Confusion Matrix:
 [[ 2  1  0]
 [ 1 26  0]
 [ 1  1  0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.67      0.57         3
           1       0.93      0.96      0.95        27
           2       0.00      0.00      0.00         2

    accuracy                           0.88        32
   macro avg       0.48      0.54      0.51        32
weighted avg       0.83      0.88      0.85        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Try UniMolRepr peptide features and one-hot solvent features

In [22]:
# concat feature
fea_Uni_Oh = pd.concat([unimol_repr,sol_oh],axis=1)

X = fea_Uni_Oh
y = label_le
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the sampled training data
rf_classifier.fit(X_train, y_train)

# Predictions on the original test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[ 3  0  0]
 [ 2 25  0]
 [ 1  1  0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         3
           1       0.96      0.93      0.94        27
           2       0.00      0.00      0.00         2

    accuracy                           0.88        32
   macro avg       0.49      0.64      0.54        32
weighted avg       0.86      0.88      0.86        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Try adjusting class weights on UniMolRepr features

In [20]:
X = fea_df
y = label_le

# Per-class weights keyed by the encoded label. LabelEncoder sorts the class
# names, so 0 = 'high', 1 = 'low', 2 = 'medium': the rare 'high' class gets the
# largest weight and the dominant 'low' class the smallest.
# Defined once, outside the loop, because it does not depend on the seed.
class_weights = {0: 3, 1: 1, 2: 2}

# Repeat the experiment over ten seeds; each seed drives both the train/test
# split and the forest, to show how much the scores vary between splits.
for i in range(40, 50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    rf_classifier = RandomForestClassifier(random_state=i, class_weight=class_weights)

    # Fit on the training split as is (no resampling is applied in this cell).
    rf_classifier.fit(X_train, y_train)

    # Predict the held-out test split.
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model. zero_division=0 keeps the default 0.00 for classes
    # that are never predicted but silences the UndefinedMetricWarning.
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 4  1  0]
 [ 0 21  0]
 [ 0  3  3]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.84      1.00      0.91        21
           2       1.00      0.50      0.67         6

    accuracy                           0.88        32
   macro avg       0.95      0.77      0.82        32
weighted avg       0.90      0.88      0.86        32

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 2  1  1]
 [ 0 24  0]
 [ 2  2  0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         4
           1       0.89      1.00      0.94        24
           2       0.00      0.00      0.00         4

    accuracy                           0.81        32
   

In [21]:
# without weighted
X = fea_df
y = label_le

for i in range(40, 50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # class_weights = {0: 9, 1: 1, 2: 3} # O:High More important; 1: Medium Less important; 2: Low last important
    rf_classifier = RandomForestClassifier(random_state=i) # class_weight=class_weights

    # Train the classifier on the sampled training data
    rf_classifier.fit(X_train, y_train)

    # Predictions on the original test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the model
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 3  1  1]
 [ 0 21  0]
 [ 0  2  4]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.88      1.00      0.93        21
           2       0.80      0.67      0.73         6

    accuracy                           0.88        32
   macro avg       0.89      0.76      0.80        32
weighted avg       0.88      0.88      0.87        32

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 3  1  0]
 [ 0 24  0]
 [ 2  1  1]]

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.92      1.00      0.96        24
           2       1.00      0.25      0.40         4

    accuracy                           0.88        32
   

## Try different models

In [9]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


# Candidate classifiers to compare against the random forest.
# NOTE(review): the features are passed to these models unscaled; distance- and
# gradient-based models are usually sensitive to feature scale -- consider
# adding a scaler before drawing conclusions from this comparison.
models = [RidgeClassifier(),
          # Seeded: SGD shuffles the training data, so without a seed the
          # scores change from run to run.
          SGDClassifier(random_state=42),
          KNeighborsClassifier(n_neighbors=7), # use k = 7 as in papers
          SVC(),
          # One hidden layer of 100 units, written as a proper one-element
          # tuple ("(100)" is just the integer 100). The initial release of
          # the paper used 5 neurons.
          MLPClassifier(hidden_layer_sizes=(100,),
                       activation='logistic',
                       solver='lbfgs',
                       max_iter=1000,
                       random_state=42),
         ]

In [10]:
X = fea_df
y = label_le

# The split uses a fixed seed, so it is identical for every model: compute it
# once instead of once per loop iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for model in models:

    # Fit on the training split; fit() returns the estimator itself, so
    # `classifier` and `model` refer to the same fitted object.
    classifier = model.fit(X_train, y_train)

    # Predict the held-out test split.
    y_pred = classifier.predict(X_test)

    # Evaluate the model. zero_division=0 keeps the default 0.00 for classes
    # that are never predicted but silences the UndefinedMetricWarning.
    print(f'*********************Trained Model: {model}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print('-------------------------------------------------------------')

*********************Trained Model: RidgeClassifier()**********************
Confusion Matrix:
 [[ 3  0  0]
 [ 1 23  1]
 [ 1  3  0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.60      1.00      0.75         3
           1       0.88      0.92      0.90        25
           2       0.00      0.00      0.00         4

    accuracy                           0.81        32
   macro avg       0.49      0.64      0.55        32
weighted avg       0.75      0.81      0.77        32

-------------------------------------------------------------
*********************Trained Model: SGDClassifier()**********************
Confusion Matrix:
 [[ 3  0  0]
 [10  8  7]
 [ 0  1  3]]

Classification Report:
               precision    recall  f1-score   support

           0       0.23      1.00      0.38         3
           1       0.89      0.32      0.47        25
           2       0.30      0.75      0.43         4

    accuracy               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Trained Model: MLPClassifier(activation='logistic', hidden_layer_sizes=100, max_iter=1000,
              random_state=42, solver='lbfgs')**********************
Confusion Matrix:
 [[ 1  2  0]
 [ 0 23  2]
 [ 0  2  2]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.85      0.92      0.88        25
           2       0.50      0.50      0.50         4

    accuracy                           0.81        32
   macro avg       0.78      0.58      0.63        32
weighted avg       0.82      0.81      0.80        32

-------------------------------------------------------------


In [16]:
import xgboost as xgb

# Rebuild the combined feature table with columns relabelled 0..n-1 in a single
# step (ignore_index applies to the concatenation axis, i.e. the columns here)
# instead of building the frame and then overwriting its column labels.
# NOTE(review): presumably the relabelling is needed because the two tables
# share column names and XGBoost requires unique feature names -- confirm.
fea_df = pd.concat([unimol_repr,sol_repr], axis=1, ignore_index=True)

X = fea_df
y = label_le

# Repeat over ten seeds; each seed drives both the train/test split and the model.
for i in range(40, 50):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    # Multi-class gradient boosting over the three yield classes.
    xgb_classifier = xgb.XGBClassifier(objective='multi:softprob', num_class=3, random_state=i)

    xgb_classifier.fit(X_train, y_train)
    y_pred = xgb_classifier.predict(X_test)

    # Evaluate the model. zero_division=0 keeps the default 0.00 for classes
    # that are never predicted but silences the UndefinedMetricWarning.
    print(f'*********************Random State: {i}**********************')
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print('-------------------------------------------------------------')

*********************Random State: 40**********************
Confusion Matrix:
 [[ 1  1  1]
 [ 1 18  2]
 [ 1  1  2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       0.90      0.86      0.88        21
           2       0.40      0.50      0.44         4

    accuracy                           0.75        28
   macro avg       0.54      0.56      0.55        28
weighted avg       0.77      0.75      0.76        28

-------------------------------------------------------------
*********************Random State: 41**********************
Confusion Matrix:
 [[ 3  1  0]
 [ 0 20  2]
 [ 0  2  0]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.87      0.91      0.89        22
           2       0.00      0.00      0.00         2

    accuracy                           0.82        28
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


*********************Random State: 45**********************
Confusion Matrix:
 [[ 1  0  0]
 [ 1 22  0]
 [ 0  2  2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.92      0.96      0.94        23
           2       1.00      0.50      0.67         4

    accuracy                           0.89        28
   macro avg       0.81      0.82      0.76        28
weighted avg       0.91      0.89      0.89        28

-------------------------------------------------------------
*********************Random State: 46**********************
Confusion Matrix:
 [[ 1  0  1]
 [ 0 19  2]
 [ 0  2  3]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.90      0.90      0.90        21
           2       0.50      0.60      0.55         5

    accuracy                           0.82        28
   