In [1]:
import pandas as pd
import numpy as np 
from rdkit import Chem 
from rdkit import DataStructs 
from rdkit.Chem import AllChem 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

In [3]:
# Load the source CSV into a pandas DataFrame (df).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after CSV_PATH is pointed at a local copy of the file.
CSV_PATH = "/Users/schwegler2/Downloads/es9b04833_si_004.csv"
df = pd.read_csv(CSV_PATH)

In [5]:
# Create the RDKit fingerprint generator with its default settings.
fpgen = AllChem.GetRDKitFPGenerator()

# SMILES (simplified molecular-input line-entry system) strings, one per row of df.
smiles_list = df["smiles"].values.tolist()

# Parse every SMILES string into an rdkit.Chem.rdchem.Mol object.
# Chem.MolFromSmiles returns None instead of raising when a string cannot be
# parsed, and a missing cell (NaN) is not a string at all. Either case would
# otherwise surface later as an opaque error inside a fingerprint call, so
# collect the offending rows and fail here with a readable message.
mol_list = []
unparsed_rows = []
for row_number, smiles in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        unparsed_rows.append((row_number, smiles))
    mol_list.append(mol)

if unparsed_rows:
    preview = ", ".join(f"row {n}: {s!r}" for n, s in unparsed_rows[:5])
    raise ValueError(
        f"{len(unparsed_rows)} SMILES value(s) could not be parsed into molecules "
        f"(showing up to 5): {preview}"
    )

# RDKit fingerprint bit vector for each parsed molecule.
fps = [fpgen.GetFingerprint(mol) for mol in mol_list]

In [7]:
# Build one NumPy array of Morgan fingerprint bits (radius 3, 2048 bits) per molecule.
# The generator is created once and reused for every molecule. It replaces
# AllChem.GetMorganFingerprintAsBitVect, which is deprecated in recent RDKit
# releases and logs a deprecation warning on every call.
morgan_generator = AllChem.GetMorganGenerator(radius=3, fpSize=2048)
fp = [np.array(morgan_generator.GetFingerprint(mol)) for mol in mol_list]

In [9]:
# Add the per-molecule fingerprint arrays as the "fingerprint" column, then
# replace every missing value left in the table with the sentinel -1.
df = df.assign(fingerprint=fp).fillna(-1)

In [11]:
# Input features: turn the "fingerprint" column (one array per row) into a
# single 2-D NumPy array with one row per DataFrame row.
fp_list = [np.array(bits) for bits in df["fingerprint"]]
X = np.vstack(fp_list)

# Output targets: every column except the SMILES text and the fingerprint itself.
column_names = [col for col in df.columns if col not in ("smiles", "fingerprint")]
Y = df[column_names].copy()

In [13]:
# Split into training (70%) and test (remaining 30%) sets.
# The fixed seed makes the shuffle, and therefore the split, repeatable.
TRAIN_FRACTION = .7
SPLIT_SEED = 25
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED
)

In [15]:
# Logistic regression estimates the probability of each class and is a common
# baseline for classification. The 'liblinear' solver is backed by LIBLINEAR,
# a library for large-scale linear classification.
logistic_regression = LogisticRegression(solver='liblinear')

# MultiOutputClassifier fits one copy of the estimator per target column,
# which extends logistic regression to multi-target classification.
multi_output_classifier_LR = MultiOutputClassifier(estimator=logistic_regression)
multi_output_classifier_LR.fit(X_train, Y_train)

In [17]:
# View the held-out targets as a plain NumPy array (rows = samples, columns = targets)
# so they can be sliced by column position during evaluation.
Y_test_np = np.asarray(Y_test)

In [19]:
# Predict every target for the held-out samples.
Y_pred_test = multi_output_classifier_LR.predict(X_test)

# Accuracy per target: walk the columns of truth and prediction in lockstep.
accuracies = [
    accuracy_score(true_column, predicted_column)
    for true_column, predicted_column in zip(Y_test_np.T, Y_pred_test.T)
]

# Unweighted average of the per-target accuracies.
mean_accuracy = sum(accuracies) / len(accuracies)
print("Mean Accuracy:", mean_accuracy)

# Pool all targets into one long label vector each
# ([[0,1,0],[0,1,0]] -> [0,1,0,0,1,0]) so a single report covers every target.
Y_test_flattened = Y_test_np.flatten()
Y_pred_flattened = Y_pred_test.flatten()

# NOTE(review): missing labels were filled with -1 earlier in the notebook, so
# the report treats -1 as a class of its own alongside the real labels.
print(classification_report(Y_test_flattened, Y_pred_flattened))

Mean Accuracy: 0.7540485829959516
              precision    recall  f1-score   support

        -1.0       0.76      0.73      0.75      3569
         0.0       0.76      0.82      0.79      4033
         1.0       0.38      0.15      0.22       302

    accuracy                           0.75      7904
   macro avg       0.63      0.57      0.58      7904
weighted avg       0.75      0.75      0.75      7904



In [21]:
#Combines the output of multiple decision trees to reach a single result 
#Used for both classification and regression problems 
RandomForestClassifier = RandomForestClassifier(n_estimators=250)
RandomForestClassifier.fit(X_train, Y_train)

In [23]:
Y_pred_test = RandomForestClassifier.predict(X_test)


accuracies = []
for i in range(Y_test_np.shape[1]):
    accuracy = accuracy_score(Y_test_np[:, i], Y_pred_test[:, i])
    #print(i, ": ", accuracy)
    accuracies.append(accuracy)

mean_accuracy = sum(accuracies) / len(accuracies)
print("Mean Accuracy:", mean_accuracy)

Y_test_flattened = Y_test_np.flatten()
Y_pred_flattened = Y_pred_test.flatten()

print(classification_report(Y_test_flattened, Y_pred_flattened))

Mean Accuracy: 0.7701163967611337
              precision    recall  f1-score   support

        -1.0       0.84      0.67      0.75      3569
         0.0       0.73      0.91      0.81      4033
         1.0       0.41      0.12      0.19       302

    accuracy                           0.77      7904
   macro avg       0.66      0.57      0.58      7904
weighted avg       0.77      0.77      0.76      7904



In [25]:
#Multi-layer perceptron 
#neural network that can be used to perform classification 
MLPClassifier = MLPClassifier()
multi_output_classifier_MLP = MultiOutputClassifier(MLPClassifier)
multi_output_classifier_MLP.fit(X_train, Y_train)

In [26]:
Y_pred_test = multi_output_classifier_MLP.predict(X_test)


accuracies = []
for i in range(Y_test_np.shape[1]):
    accuracy = accuracy_score(Y_test_np[:, i], Y_pred_test[:, i])
    #print(i, ": ", accuracy)
    accuracies.append(accuracy)

mean_accuracy = sum(accuracies) / len(accuracies)
print("Mean Accuracy:", mean_accuracy)

Y_test_flattened = Y_test_np.flatten()
Y_pred_flattened = Y_pred_test.flatten()

print(classification_report(Y_test_flattened, Y_pred_flattened))

Mean Accuracy: 0.7453188259109312
              precision    recall  f1-score   support

        -1.0       0.75      0.73      0.74      3569
         0.0       0.76      0.81      0.78      4033
         1.0       0.34      0.16      0.22       302

    accuracy                           0.75      7904
   macro avg       0.61      0.56      0.58      7904
weighted avg       0.74      0.75      0.74      7904

