## Imports

In [148]:
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, chi2, RFE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, MinMaxScaler
from collections import Counter
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import random


## Data Ingestion

In [149]:
df_train = pd.read_csv("af2_dataset_training_labeled.csv.gz", index_col=0)
df_train

Unnamed: 0,annotation_sequence,feat_A,feat_C,feat_D,feat_E,feat_F,feat_G,feat_H,feat_I,feat_K,...,feat_DSSP_10,feat_DSSP_11,feat_DSSP_12,feat_DSSP_13,coord_X,coord_Y,coord_Z,entry,entry_index,y_Ligand
0,M,False,False,False,False,False,False,False,False,False,...,0,0.0,47,-0.0,-26.499001,-4.742000,-35.189999,GEMI5_HUMAN,0,False
1,G,False,False,False,False,False,True,False,False,False,...,0,0.0,0,0.0,-25.158001,-1.342000,-34.104000,GEMI5_HUMAN,1,False
2,Q,False,False,False,False,False,False,False,False,False,...,1,-0.0,-1,-0.0,-21.926001,-1.641000,-32.175999,GEMI5_HUMAN,2,False
3,E,False,False,False,True,False,False,False,False,False,...,706,-0.1,705,-0.0,-22.073999,0.654000,-29.171000,GEMI5_HUMAN,3,False
4,P,False,False,False,False,False,False,False,False,False,...,0,0.0,705,-0.2,-19.783001,2.670000,-26.858999,GEMI5_HUMAN,4,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,S,False,False,False,False,False,False,False,False,False,...,-3,-0.1,2,-0.4,-19.742001,20.796000,-12.319000,AOC3_HUMAN,755,False
756,H,False,False,False,False,False,False,True,False,False,...,-358,-0.1,-330,-0.1,-16.299000,19.153999,-12.640000,AOC3_HUMAN,756,False
757,G,False,False,False,False,False,True,False,False,False,...,-360,-0.2,-1,-0.1,-13.404000,19.502001,-10.121000,AOC3_HUMAN,757,False
758,G,False,False,False,False,False,True,False,False,False,...,0,0.0,0,0.0,-10.986000,20.320000,-13.016000,AOC3_HUMAN,758,False


### Preprocess and analyze data

In [150]:
print(df_train.dtypes)

annotation_sequence     object
feat_A                    bool
feat_C                    bool
feat_D                    bool
feat_E                    bool
feat_F                    bool
feat_G                    bool
feat_H                    bool
feat_I                    bool
feat_K                    bool
feat_L                    bool
feat_M                    bool
feat_N                    bool
feat_P                    bool
feat_Q                    bool
feat_R                    bool
feat_S                    bool
feat_T                    bool
feat_V                    bool
feat_W                    bool
feat_Y                    bool
annotation_atomrec      object
feat_PHI               float64
feat_PSI               float64
feat_TAU               float64
feat_THETA             float64
feat_BBSASA            float64
feat_SCSASA            float64
feat_pLDDT             float64
feat_DSSP_H               bool
feat_DSSP_B               bool
feat_DSSP_E               bool
feat_DSS

In [151]:
df_train['y_Ligand'].value_counts()

False    479912
True      17254
Name: y_Ligand, dtype: int64

In [152]:
df_train['feat_PHI']

0      0.000000
1     -1.100680
2     -1.295398
3     -2.352796
4     -1.134474
         ...   
755   -2.378927
756   -2.122860
757   -1.124856
758    1.651085
759   -1.935096
Name: feat_PHI, Length: 497166, dtype: float64

In [153]:
df_train['entry']

0      GEMI5_HUMAN
1      GEMI5_HUMAN
2      GEMI5_HUMAN
3      GEMI5_HUMAN
4      GEMI5_HUMAN
          ...     
755     AOC3_HUMAN
756     AOC3_HUMAN
757     AOC3_HUMAN
758     AOC3_HUMAN
759     AOC3_HUMAN
Name: entry, Length: 497166, dtype: object

In [154]:
df_train['entry'].value_counts()

MACF1_HUMAN    7385
HUWE1_HUMAN    4371
DMD_HUMAN      3682
NF1_HUMAN      2836
SETD2_HUMAN    2561
               ... 
TVB9_HUMAN      111
CISD1_HUMAN     105
THIO_HUMAN      102
S1A7A_HUMAN      98
ACBD7_HUMAN      85
Name: entry, Length: 723, dtype: int64

## Transformation of Data

#### Amino Acids Converted:

In [155]:
df_aminoAcids = df_train[['feat_A', 'feat_C', 'feat_D', 'feat_E', 'feat_F',
       'feat_G', 'feat_H', 'feat_I', 'feat_K', 'feat_L', 'feat_M', 'feat_N',
       'feat_P', 'feat_Q', 'feat_R', 'feat_S', 'feat_T', 'feat_V', 'feat_W',
       'feat_Y', 'y_Ligand']]


df_aminoAcids = df_aminoAcids * 1
df_aminoAcids

Unnamed: 0,feat_A,feat_C,feat_D,feat_E,feat_F,feat_G,feat_H,feat_I,feat_K,feat_L,...,feat_N,feat_P,feat_Q,feat_R,feat_S,feat_T,feat_V,feat_W,feat_Y,y_Ligand
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
756,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
757,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
758,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [156]:
# Add extra columns
df_cleaned = df_aminoAcids
df_cleaned

Unnamed: 0,feat_A,feat_C,feat_D,feat_E,feat_F,feat_G,feat_H,feat_I,feat_K,feat_L,...,feat_N,feat_P,feat_Q,feat_R,feat_S,feat_T,feat_V,feat_W,feat_Y,y_Ligand
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
756,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
757,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
758,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Make Undersampled Dataset

In [157]:
# Split Data
X = df_cleaned.iloc[:,0:len(df_cleaned.columns)-1]
Y = df_cleaned.iloc[:,-1]
X

Unnamed: 0,feat_A,feat_C,feat_D,feat_E,feat_F,feat_G,feat_H,feat_I,feat_K,feat_L,feat_M,feat_N,feat_P,feat_Q,feat_R,feat_S,feat_T,feat_V,feat_W,feat_Y
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
756,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
757,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
758,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [158]:
Y

0      0
1      0
2      0
3      0
4      0
      ..
755    0
756    0
757    0
758    0
759    0
Name: y_Ligand, Length: 497166, dtype: int64

In [159]:
# Undersample, apply random sampler
print(Counter(Y))

undersample = RandomUnderSampler(sampling_strategy='majority')
X_US, Y_US = undersample.fit_resample(X, Y)
print(Counter(Y_US))

Counter({0: 479912, 1: 17254})
Counter({0: 17254, 1: 17254})


## What Factors predict drug binding or non-drug binding?

#### Hypothesis Testing (Univariate Selection with ChiSquared Test)

In [160]:
# apply the SelectKBest class to extract the top features 
def SelectKBestFunc(X, Y):
    """
    SelectKBestFunc takes in feature data, and target data, returns a fitted chi2 model between features and target
    """
    bestFeatures = SelectKBest(score_func=chi2, k='all')
    fit = bestFeatures.fit(X, Y)
    return fit


def printSummary(type, fit):
    """
    printSummary takes in a type of dataset, and a fitted model, and prints the 10 best features
    """
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    dfpvalues = pd.DataFrame(fit.pvalues_)

    featureScores = pd.concat([dfcolumns, dfscores, dfpvalues], axis=1)
    featureScores.columns = ['Specs', 'Score', 'p-value']

    print(type)
    print(featureScores.nlargest(20, 'Score'))

In [161]:
featureScoreUndersampled = SelectKBestFunc(X_US, Y_US)
printSummary("Undersample Dataset", featureScoreUndersampled)

Undersample Dataset
     Specs       Score       p-value
12  feat_P  206.687770  7.253161e-47
5   feat_G  144.865762  2.297782e-33
19  feat_Y  136.225759  1.780866e-31
4   feat_F  103.632266  2.435779e-24
13  feat_Q   89.696319  2.776712e-21
3   feat_E   56.244261  6.400439e-14
18  feat_W   41.231465  1.352281e-10
15  feat_S   26.397709  2.778699e-07
10  feat_M   18.029252  2.175365e-05
0   feat_A   15.292705  9.207137e-05
6   feat_H   11.261491  7.913161e-04
9   feat_L   10.515556  1.183738e-03
8   feat_K    2.313503  1.282547e-01
14  feat_R    2.089130  1.483505e-01
2   feat_D    2.082261  1.490193e-01
7   feat_I    1.486326  2.227874e-01
17  feat_V    1.273070  2.591916e-01
11  feat_N    0.715084  3.977613e-01
16  feat_T    0.194190  6.594516e-01
1   feat_C    0.001350  9.706956e-01


#### Recursive Feature Elimination using RandomForestClassifier, DecisionTreeClassifier

In [162]:
def RFEFunc(X_one, Y_one, model):
    rfe = RFE(estimator=model, n_features_to_select=10)
    fit = rfe.fit(X_one, Y_one)
    arrFinal = []

    print("Num Features:", fit.n_features_)
    print("Best Selected Features:")
    arrBool = fit.support_
    for i in range(0, len(arrBool)):
        if(arrBool[i] == True):
            arrFinal.append(X.columns[i])

    print(*arrFinal, sep='\n')
    return arrFinal

In [163]:
arrUSRFC = RFEFunc(X_US, Y_US, RandomForestClassifier())

Num Features: 10
Best Selected Features:
feat_E
feat_F
feat_G
feat_H
feat_M
feat_P
feat_Q
feat_S
feat_W
feat_Y


#### Run Random Forest Model

In [164]:
x_train_us, x_test_us, y_train_us, y_test_us = train_test_split(X_US, Y_US, test_size = 0.25, random_state=42)

In [165]:
randomForestUS = RandomForestClassifier()
randomForestUS.fit(x_train_us, y_train_us)
predictionsTwo = randomForestUS.predict(x_test_us)
randomForestUS.score(x_test_us, y_test_us)

classificationReportTwo = metrics.classification_report(y_test_us, predictionsTwo)
confusionMatrixTwo = metrics.confusion_matrix(y_test_us, predictionsTwo)
print("Classification Report on Undersampled Dataset")
print(classificationReportTwo)
print(confusionMatrixTwo)
print('F1 Score: %.3f' % metrics.f1_score(y_test_us, predictionsTwo))

Classification Report on Undersampled Dataset
              precision    recall  f1-score   support

           0       0.55      0.60      0.58      4307
           1       0.56      0.51      0.54      4320

    accuracy                           0.56      8627
   macro avg       0.56      0.56      0.56      8627
weighted avg       0.56      0.56      0.56      8627

[[2587 1720]
 [2098 2222]]
F1 Score: 0.538


In [166]:
fpr, tpr, thresholds = metrics.roc_curve(y_test_us, predictionsTwo, pos_label=1)
auc_roc = metrics.auc(fpr, tpr)

precision, recall, _ = metrics.precision_recall_curve(y_test_us, predictionsTwo)
auc_pr = metrics.auc(recall, precision)

print(f"ROC-AUC: {auc_roc} \n PR-AUC {auc_pr}")

ROC-AUC: 0.5575009781664645 
 PR-AUC 0.6606075495431408
