# Model training

## Import libraries

In [1]:
import pandas as pd 

## Read the ML-ready dataset

In [2]:
# Load the pre-encoded feature table. index_col=False tells pandas not to use
# the first CSV column as the row index, so it stays available as data.
df = pd.read_csv(filepath_or_buffer='data/ml_ready_genomics.csv', index_col=False)
# Preview the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile,IS_SNP,IS_INDEL
0,1,1,1,12,3e-06,0,1
1,0,1,1,12,6e-06,1,0
2,1,1,1,12,9e-06,1,0
3,1,1,1,12,1.2e-05,1,0
4,1,1,1,12,1.5e-05,1,0


## Split dataset into dependent and independent features

In [3]:
# Target (dependent variable): the encoded clinical-significance label.
y = df['Clinical_Significance_Encoded']
# Features (independent variables): every remaining column.
X = df.drop(columns='Clinical_Significance_Encoded')
# Display both objects as the cell's output.
(X, y)

(         Chromosome_Encoded  Clinical_Review_Status_Encoded  \
 0                         1                               1   
 1                         1                               1   
 2                         1                               1   
 3                         1                               1   
 4                         1                               1   
 ...                     ...                             ...   
 3682181                  27                               1   
 3682182                  27                               1   
 3682183                  27                               1   
 3682184                  27                               1   
 3682185                  30                               1   
 
          Gene_Symbol_Encoded  POS_Percentile  IS_SNP  IS_INDEL  
 0                         12        0.000003       0         1  
 1                         12        0.000006       1         0  
 2                         12   

## Split the dataset into train and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y splits in a stratified
# fashion using the labels, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Display the two feature partitions as the cell's output.
(X_train, X_test)

(         Chromosome_Encoded  Clinical_Review_Status_Encoded  \
 2252313                  12                               1   
 1105919                   5                               1   
 540507                    2                               2   
 2724236                  16                               2   
 3154590                  19                               1   
 ...                     ...                             ...   
 2392564                  13                               1   
 200829                    1                               1   
 75500                     1                               1   
 2342151                  13                               1   
 738676                    3                               2   
 
          Gene_Symbol_Encoded  POS_Percentile  IS_SNP  IS_INDEL  
 2252313                   85        0.521106       1         0  
 1105919                  388        0.620232       1         0  
 540507                 36437   

In [5]:
# Display the training and test label Series together (as a tuple).
(y_train, y_test)

(2252313    0
 1105919    1
 540507     2
 2724236    1
 3154590    1
           ..
 2392564    1
 200829     1
 75500      1
 2342151    0
 738676     2
 Name: Clinical_Significance_Encoded, Length: 2945748, dtype: int64,
 1597416    0
 2262377    0
 840549     0
 873582     1
 451629     1
           ..
 3472256    1
 1296667    1
 1229269    1
 1782198    1
 3565806    0
 Name: Clinical_Significance_Encoded, Length: 736438, dtype: int64)

## Train the RandomForestClassifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees capped at depth 10. class_weight='balanced' weights classes
# inversely to their frequency in y_train; n_jobs=-1 uses all processors.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
# fit() returns the estimator itself, so leaving the call as the last
# expression displays the fitted model's parameters.
rf_classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Evaluate model performance

In [9]:
# NOTE(review): roc_auc_score is not used in this cell; the import is kept in
# case a later cell relies on it.
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils.multiclass import unique_labels

# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

# The encoded class codes are not guaranteed to be contiguous (in the recorded
# run the report has no class 3), so positions in a bare ndarray do not line up
# with class codes. Label both axes explicitly so each count can be read
# against its class. unique_labels gives the sorted union of labels seen in
# y_test and y_pred, which is the same ordering confusion_matrix uses by
# default, so the counts themselves are unchanged.
class_labels = unique_labels(y_test, y_pred)
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=pd.Index(class_labels, name='actual'),        # rows: true class
    columns=pd.Index(class_labels, name='predicted'),   # columns: predicted class
)
# Last expression: shown with the notebook's rich table display.
confusion_df

              precision    recall  f1-score   support

           0       0.46      0.78      0.58    253225
           1       0.82      0.44      0.57    387994
           2       0.69      0.63      0.66     92775
           4       0.01      0.54      0.01       103
           5       0.42      0.92      0.58       372
           6       0.01      0.77      0.03        92
           7       0.92      0.99      0.95      1877

    accuracy                           0.58    736438
   macro avg       0.48      0.73      0.48    736438
weighted avg       0.68      0.58      0.59    736438

[[196609  35607  14384   4143    156   2272     54]
 [199416 171057  11826   3917    183   1553     42]
 [ 27111   2895  58751   2681    139   1141     57]
 [     5      1      8     56      0     33      0]
 [     3      0      1     13    344     11      0]
 [     0      4      2     14      1     71      0]
 [     1      0      2     14      0      5   1855]]
