# Import libraries

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read dataset

In [80]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
path = '/content/drive/MyDrive/Colab Notebooks/AntimicrobialML_Varun/cyrille_data/polymer_data.xlsx'

data = pd.read_excel(path)
data.head()

Unnamed: 0,polymer,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,dpn,mol_weight,cLogP,MIC
0,0,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,100,9000.0,0.6763,32-64
1,1,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,40,6300.0,0.6763,64
2,2,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,20,3700.0,0.6763,64
3,3,Boc-AEAm,PEAm,,,0.7,0.3,0.0,100,17900.0,1.0527,128
4,4,Boc-AEAm,PEAm,,,0.7,0.3,0.0,40,6100.0,1.0527,64


# Data preprocessing

### Modify the MIC column

In [82]:
data = data.replace({'>128':128,'>256':256, '32-64':64, '64-128':128,'128-256':256})

In [83]:
data.head()

Unnamed: 0,polymer,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,dpn,mol_weight,cLogP,MIC
0,0,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,100,9000.0,0.6763,64
1,1,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,40,6300.0,0.6763,64
2,2,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,20,3700.0,0.6763,64
3,3,Boc-AEAm,PEAm,,,0.7,0.3,0.0,100,17900.0,1.0527,128
4,4,Boc-AEAm,PEAm,,,0.7,0.3,0.0,40,6100.0,1.0527,64


### Assign classes based on MIC values and turn this into a classification problem

In [84]:
# Any MIC value equal to or below 64 is assigned class 1 (good antimicrobial polymer) and beyond 64 is assigned 0 (bad antimicrobial polymer)

cutoff = 64
data['Category'] = data['MIC'].apply(lambda x: 1 if x <= cutoff else 0)

In [85]:
data.head()

Unnamed: 0,polymer,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,dpn,mol_weight,cLogP,MIC,Category
0,0,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,100,9000.0,0.6763,64,1
1,1,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,40,6300.0,0.6763,64,1
2,2,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,20,3700.0,0.6763,64,1
3,3,Boc-AEAm,PEAm,,,0.7,0.3,0.0,100,17900.0,1.0527,128,0
4,4,Boc-AEAm,PEAm,,,0.7,0.3,0.0,40,6100.0,1.0527,64,1


In [86]:
# remove MIC and polymer columns

data = data.drop(['polymer', 'MIC'], axis=1)

In [87]:
data.head()

Unnamed: 0,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,dpn,mol_weight,cLogP,Category
0,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,100,9000.0,0.6763,1
1,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,40,6300.0,0.6763,1
2,Boc-AEAm,PEAm,,HEAm,0.5,0.3,0.0,20,3700.0,0.6763,1
3,Boc-AEAm,PEAm,,,0.7,0.3,0.0,100,17900.0,1.0527,0
4,Boc-AEAm,PEAm,,,0.7,0.3,0.0,40,6100.0,1.0527,1


### Create dummy variables

In [88]:
data_with_dummies = pd.get_dummies(data, drop_first=True)

In [89]:
data_with_dummies.head()

Unnamed: 0,composition_A,composition_B1,composition_B2,dpn,mol_weight,cLogP,Category,type_A_Boc-AEAm,type_A_DMAEA,type_B1_PEAm,type_B2_None,type_C_HEAm,type_C_None,type_C_PEGA
0,0.5,0.3,0.0,100,9000.0,0.6763,1,1,0,1,1,1,0,0
1,0.5,0.3,0.0,40,6300.0,0.6763,1,1,0,1,1,1,0,0
2,0.5,0.3,0.0,20,3700.0,0.6763,1,1,0,1,1,1,0,0
3,0.7,0.3,0.0,100,17900.0,1.0527,0,1,0,1,1,0,1,0
4,0.7,0.3,0.0,40,6100.0,1.0527,1,1,0,1,1,0,1,0


### Extract features and target

In [90]:
inputs = data_with_dummies.drop(columns=['Category'])
target = data_with_dummies['Category']
target.value_counts()

0    130
1     27
Name: Category, dtype: int64

### Feature scaling

In [91]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(inputs)

In [92]:
inputs_scaled = scaler.transform(inputs)

### Train-test split (with stratify)

In [93]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, target, test_size=0.2, random_state=12, stratify=target)

In [94]:
y_train.value_counts()

0    104
1     21
Name: Category, dtype: int64

In [95]:
y_test.value_counts()

0    26
1     6
Name: Category, dtype: int64

In [96]:
27/130, 21/104, 6/26

(0.2076923076923077, 0.20192307692307693, 0.23076923076923078)

# Logistic regression model

### Get the optimal parameter set

In [166]:
lr_grid = {'C': [0.001,0.01,0.1,1,10, 100, 1000],
          'max_iter': [100, 500, 1000],
          'class_weight': ['balanced']}

lr_grid

{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
 'max_iter': [100, 500, 1000],
 'class_weight': ['balanced']}

In [167]:
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=10)

In [168]:
# Create the model to be tuned
from sklearn.linear_model import LogisticRegression

lr_base = LogisticRegression()

In [169]:
# Create the random search Random Forest
from sklearn.model_selection import GridSearchCV

lr_grid = GridSearchCV(estimator = lr_base, param_grid = lr_grid, cv = cv, scoring = 'accuracy', verbose = 1, n_jobs = -1)

In [170]:
# Fit the model
lr_grid.fit(x_train,y_train)

Fitting 20 folds for each of 21 candidates, totalling 420 fits


In [171]:
# View the best parameters from the grid search
lr_grid.best_params_

{'C': 0.1, 'class_weight': 'balanced', 'max_iter': 100}

### Optimal logistic regression model

In [172]:
lr_optimal = LogisticRegression(C=0.1, max_iter = 100, class_weight = 'balanced', random_state=10)

In [173]:
lr_optimal.fit(x_train, y_train)

In [174]:
y_pred = lr_optimal.predict(x_test)

In [175]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[20,  6],
       [ 0,  6]])

In [176]:
features_table = pd.DataFrame()
features_table['Features'] = inputs.columns
weights = lr_optimal.coef_[0,:]
features_table['Weights'] = np.round(weights, 2)

features_table

Unnamed: 0,Features,Weights
0,composition_A,-0.22
1,composition_B1,0.38
2,composition_B2,0.16
3,dpn,-0.11
4,mol_weight,0.06
5,cLogP,0.41
6,type_A_Boc-AEAm,0.7
7,type_A_DMAEA,-0.38
8,type_B1_PEAm,0.0
9,type_B2_None,0.59


# Repeat the above using a Decision Tree classifier

### Get the optimal parameter set

In [148]:
dt_grid = {'min_samples_leaf': [1, 2, 3, 4],
          'max_depth': [1, 2, 3, 4],
          'class_weight': ['balanced']}

dt_grid

{'min_samples_leaf': [1, 2, 3, 4],
 'max_depth': [1, 2, 3, 4],
 'class_weight': ['balanced']}

In [149]:
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=10)

In [150]:
# Create the model to be tuned
from sklearn.tree import DecisionTreeClassifier

dt_base = DecisionTreeClassifier()

In [151]:
# Create the random search Random Forest
from sklearn.model_selection import GridSearchCV

dt_grid = GridSearchCV(estimator = dt_base, param_grid = dt_grid, cv = cv, scoring = 'accuracy', verbose = 1, n_jobs = -1)

In [152]:
# Fit the model
dt_grid.fit(x_train,y_train)

Fitting 20 folds for each of 16 candidates, totalling 320 fits


In [153]:
# View the best parameters from the grid search
dt_grid.best_params_

{'class_weight': 'balanced', 'max_depth': 4, 'min_samples_leaf': 1}

### Optimal logistic regression model

In [156]:
dt_optimal = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 1, class_weight = 'balanced', random_state=10)

In [157]:
dt_optimal.fit(x_train, y_train)

In [158]:
y_pred = dt_optimal.predict(x_test)

In [159]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[23,  3],
       [ 0,  6]])