##### feature set 5 chromosomes 2, 3, 4 

## In this notebook:

* single input dataframe 
* using feature set 5 (2+mutation counts)
* random forest

#### Highlights:
* RVIS_ExAC seems to be a better metric than SORVA
* LINSIGHT seems to have too few data points to be very impactful
* Accuracy looks reasonable: roughly 0.78–0.86 on the held-out chromosomes (see the per-chromosome results below)

In [67]:
import torch
import pandas as pd
import numpy as np

# using sklearn goodies
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, LeaveOneGroupOut

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score


## Load the pickled feature table (feature set 5, chromosomes 2-4).
## NOTE(review): unpickling can execute arbitrary code -- only load pickles
## that were produced by this project.
df = pd.read_pickle("./pickled/chr2-4_featureset5.pkl")
print("shape before", df.shape)

## '.' placeholders are replaced by 0, then any row that still holds a NaN is
## dropped.
## FIXME (carried over): the stated plan was to turn '.' into NaN and drop (or
## later impute) those rows. As written, '.' rows are kept with a 0, so only
## rows with pre-existing NaN are removed -- despite the wording of the message
## printed below.
## Debug aid: print("SUM", pd.isnull(df).sum()) shows the NaN count per column.
df = df.replace({'.': 0}).dropna()
print("shape after dropping all data with score = '.'", df.shape)

## ClinVar significance -> binary label (1 for the pathogenic variants of the
## label, 0 for the benign ones). Only the disabled steps below use this
## mapping; the dataframe is expected to already carry a 'Problematic' column
## (the cross-validation cell pops it).
clinvar_labels = {
    **dict.fromkeys(['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic'], 1),
    **dict.fromkeys(['Benign/Likely_benign', 'Likely_benign', 'Benign'], 0),
}

## Disabled labelling steps, kept for reference. Rows with mixed labels can be
## inspected with: df[~df['clinvar_clnsig'].isin(list(clinvar_labels.keys()))]
#df = df[df['clinvar_clnsig'].isin(list(clinvar_labels.keys()))]  # drop everything that isn't in the dict
#df['Problematic'] = df['clinvar_clnsig'].map(clinvar_labels)
#df = df.drop(columns=['clinvar_clnsig'])#, 'clinvar_id'])  # consider adding to index instead

## '-' placeholders are likewise replaced by 0. No rows are removed here (the
## filtering above is disabled), so the shape printed next is unchanged.
df = df.replace({'-': 0})
print("shape after dropping Unk", df.shape)
df.head()

### pickle to reuse dataframe:
#df.to_pickle("./pickled_df/all_scores_chr18,20.pkl")


shape before (13820, 14)
shape after dropping all data with score = '.' (13820, 14)
shape after dropping Unk (13820, 14)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,aapos,codonpos,Polyphen2_HVAR_pred,GenoCanyon_score,LINSIGHT,RVIS_ExAC,ExAC_cnv.score,SORVA_LOF_MAF0.005_HetOrHom,Problematic,Mutation<1kb,Mutation<5kb,Mutation<10kb,Mutation<30kb,Mutation<100kb
chr,pos,ref,alt,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2,277003,A,G,106,2,1,0.9777236555582,0,-0.080196141,-1.38628707843331,0.000399,0,1,1,1,1,5
2,1456232,G,T,257,1,1,1.18891219212902e-05,0,0.064587428,-2.47162789907313,0.004792,0,1,1,1,6,19
2,1477383,G,T,373,1,1,0.0023544075713734,0,0.064587428,-2.47162789907313,0.004792,0,2,2,5,18,19
2,1477459,G,C,398,2,-2,0.149918937837505,0,0.064587428,-2.47162789907313,0.004792,0,2,2,5,18,19
2,1484596,A,T,447,1,-2,0.0853173782519132,0,0.064587428,-2.47162789907313,0.004792,1,3,5,11,18,19


In [70]:

## Leave-one-chromosome-out cross-validation of a random forest.
##
## Inputs : `df` (from the loading cell), indexed by (chr, pos, ref, alt) and
##          containing the binary target column 'Problematic'.
## Outputs: `metric_by_test_chr` -- {held-out chromosome: dict of metrics};
##          `forest` -- the classifier as fitted in the LAST fold only.

## Work on a copy so `df` stays untouched; reset_index() turns the
## (chr, pos, ref, alt) index levels into ordinary columns.
dfc = df.copy().reset_index()

## Extract the group-by-chromosome column (one group label per row), then
## separate the target from the remaining columns.
group_by_chr = dfc['chr'].to_numpy()
y = dfc.pop('Problematic')
X = dfc

logo = LeaveOneGroupOut()
print(logo.get_n_splits(X, y, group_by_chr))
print(logo.get_n_splits(groups = group_by_chr))

print("To split by chromosomes into %s groups" % logo.get_n_splits(groups = group_by_chr))


## Setup forest:
forest = RandomForestClassifier(n_estimators=1000, random_state=42)
metric_by_test_chr = {}

## The variant identifiers are not features: move them back into the index.
## This does not depend on the fold, so it is done once here rather than twice
## per iteration inside the loop.
X_features = X.set_index(['chr', 'pos', 'ref', 'alt'])

#### fit a model for each chromosome:
for train_index, test_index in logo.split(X, y, group_by_chr):
    # Positional row selection; X_features has the same row order as X.
    X_train, X_test = X_features.iloc[train_index], X_features.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Every test row belongs to the same chromosome, so the first one names the fold.
    test_chrom = X.iloc[test_index[0]]['chr']
    print("Completed split for testing chromosome %s" % test_chrom)

    # The same estimator object is refit on every fold.
    forest.fit(X_train, y_train)
    y_true = y_test.to_numpy()
    y_pred = forest.predict(X_test)

    # Compute accuracy once and reuse it for both the log line and the record.
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)

    # average=None yields one recall/precision value per class
    # (F1 could be derived as 2 * (precision * recall) / (precision + recall)).
    metric_by_test_chr[test_chrom] = {
        "Confusion matrix:": confusion_matrix(y_true, y_pred),
        "Accuracy:": accuracy,
        "Recall TP/(TP+FN):": recall_score(y_true, y_pred, average=None),
        "Precision TP/(TP+FP):": precision_score(y_true, y_pred, average=None)
    }
    

3
3
To split by chromosomes into 3 groups
Completed split for testing chromosome 2
Accuracy: 0.8112185686653772
Completed split for testing chromosome 3
Accuracy: 0.8602391629297459
Completed split for testing chromosome 4
Accuracy: 0.7815699658703071


In [72]:
## Pretty-print the per-chromosome metrics collected by the cross-validation loop.
import pprint as pp
print(pp.pformat(metric_by_test_chr))

{2: {'Accuracy:': 0.8112185686653772,
     'Confusion matrix:': array([[1291,  982],
       [ 482, 5000]]),
     'Precision TP/(TP+FP):': array([0.72814439, 0.83584086]),
     'Recall TP/(TP+FN):': array([0.56797184, 0.91207588])},
 3: {'Accuracy:': 0.8602391629297459,
     'Confusion matrix:': array([[ 516,  421],
       [ 140, 2937]]),
     'Precision TP/(TP+FP):': array([0.78658537, 0.87462775]),
     'Recall TP/(TP+FN):': array([0.5506937 , 0.95450114])},
 4: {'Accuracy:': 0.7815699658703071,
     'Confusion matrix:': array([[ 450,  297],
       [ 151, 1153]]),
     'Precision TP/(TP+FP):': array([0.74875208, 0.79517241]),
     'Recall TP/(TP+FN):': array([0.60240964, 0.88420245])}}


In [73]:
## Save model for future use:
## NOTE: `forest` is refit inside the cross-validation loop, so at this point it
## holds only the fit from the final fold (per the recorded output, the fold
## that held out chromosome 4). Refit on the full dataset first if a model
## trained on all chromosomes is wanted.
## NOTE: when re-enabling, prefer `with open(...) as fh:` so the file handle is closed.

#import pickle 
#pickle.dump(forest, open('./pickled_models/18.5.2_4.RF.pkl', 'wb'))
#print("saved model")