## Antagonism and Synergy prediction in Salmonella

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import sys
import os
from sklearn.preprocessing import label_binarize

sys.path.append('..')
import base.chemgen_utils as utl
import MLmod.predictor as prd

## Cross Validation

In [None]:
# Input locations: the drug legend table and the list of identifiers used to
# subset the chemical-genetics columns.
drugleg_fname = "../data/chemicals/legend_gramnegpos.txt"
gene_subset_fname = '../data/interaction-genes-Salmonella'
# The list file is read without a header row; only its first column is kept.
gene_subset = pd.read_csv(gene_subset_fname, header=None).iloc[:, 0].values

In [None]:
# Chemical-genetics matrix, restricted to the columns listed in `gene_subset`.
X_chemgen = pd.read_csv('../data/chemgenetics/salmonella_binarized.csv', index_col=0)
keep_cols = np.flatnonzero(np.isin(X_chemgen.columns, gene_subset))
X_chemgen = X_chemgen.iloc[:, keep_cols]

# Combination identifiers and their interaction-type labels.
targets = pd.read_csv("../data/chemgenetics/salmonella_y.csv")
combs = targets['comb'].values
y = targets['type'].values

# One feature row per combination, produced by the project helper.
feature_rows = [utl.get_comb_feat(X_chemgen, comb) for comb in combs]
X_df = pd.DataFrame(feature_rows)

# Drug -> class legend.
drugclasses = pd.read_csv(drugleg_fname, sep='\t')
druglegend = drugclasses[['Drug', 'Class']]

# Split each combination into its two members and annotate them via the
# project helper (it is expected to provide `class1` / `class2` columns,
# which are read right below).
drug_pairs = np.array([utl.split_vec(comb) for comb in combs])
comb_drugs = pd.DataFrame(drug_pairs, columns=['d1', 'd2'])
comb_drugs = utl.add_class(strain=comb_drugs, druglegend=druglegend)
# an array with all drug class labels (sorted, without duplicates)
class_arr = np.union1d(pd.unique(comb_drugs.class1),
                       pd.unique(comb_drugs.class2))

In [None]:
# Preview the first ten rows of the annotated combination table.
comb_drugs[:10]

In [None]:
# one vs rest classification
# Translate the interaction labels to integer class ids on a fresh Series
# rather than assigning into `y` in place: `y` comes from
# `targets['type'].values`, so in-place writes can leak into `targets`, and
# they fail outright when that array is read-only (pandas Copy-on-Write) or
# string-typed.
label_codes = {'none': 0, 'antagonism': 1, 'synergy': 2}
y = pd.Series(y, dtype=object).map(label_codes)
if y.isna().any():
    # Labels outside the mapping would otherwise be silently turned into NaN.
    raise ValueError("unexpected interaction type in y; expected one of "
                     "'none', 'antagonism', 'synergy'")
y = y.astype('int').to_numpy()

# One indicator column per class, in the order none / antagonism / synergy.
y = label_binarize(y, classes=[0, 1, 2])

In [None]:
# Keyword arguments unpacked into the predictor / random forest constructors
# further down.
param_dict = dict(
    n_estimators=200,
    min_samples_split=7,
    min_samples_leaf=3,
    max_depth=None,
    class_weight=None,
)

In [None]:
# Project predictor wrapping a random forest; it receives the feature matrix
# as a plain array together with the binarized labels and combination ids.
feature_matrix = X_df.to_numpy()
pr = prd.MultiClassPredictions(
    X=feature_matrix,
    y=y,
    combs=combs,
    clf='randomforest',
    **param_dict,
)

In [None]:
# Run the predictor's drug-class cross-validation. The results are read back
# later from `pr.auc`, `pr.avprec`, `pr.topfeat` and `pr.predicted`.
# NOTE(review): presumably one fold per entry of `class_arr` -- confirm in MLmod.predictor.
pr.crossval_drugclass(class_arr=class_arr, leg_class=comb_drugs)

In [None]:
# Hand the feature (column) names to the predictor so it can write its top
# features under ../data/ (exact output format is defined in MLmod.predictor).
pr.save_topfeat(outdir='../data/', fname="topfeat-multiclass-Salmonella",
                    featname=X_df.columns.values)

In [None]:
def _per_fold_table(per_fold, column_names):
    """Stack per-fold score mappings into one row per fold.

    `per_fold` maps a fold key to a mapping whose values become the columns
    named by `column_names`; the fold key ends up in a `cvfold` column.
    """
    frames = {}
    for fold, scores in per_fold.items():
        frames[fold] = pd.DataFrame(scores.values(), index=column_names).T
    table = pd.concat(frames).reset_index()
    table = table.rename(columns={"level_0": "cvfold"})
    return table.drop(columns=["level_1"])


# ROC AUC and average precision per fold, one column per class.
auc_df = _per_fold_table(pr.auc, ['AUCROC_none',
                                  'AUCROC_antag',
                                  'AUCROC_syn'])
ap_df = _per_fold_table(pr.avprec, ['AP_none',
                                    'AP_antag',
                                    'AP_syn'])

# Join both metric tables on the fold key.
metrics = pd.merge(auc_df, ap_df, on='cvfold', how='inner')

In [None]:
# Folds ranked by average precision for the antagonism class, best first.
ranked_by_antag = metrics.sort_values(by='AP_antag', ascending=False)
ranked_by_antag.reset_index(drop=True)

In [None]:
# Folds ranked by average precision for the synergy class, best first.
ranked_by_syn = metrics.sort_values(by='AP_syn', ascending=False)
ranked_by_syn.reset_index(drop=True)

In [None]:
# Stack the per-fold top-feature tables; the outer index level (the fold key)
# becomes the `cvfold` column and the inner level is discarded.
topvars = pd.concat(pr.topfeat).reset_index()
topvars = topvars.rename(columns={"level_0": "cvfold"})
topvars = topvars.drop(columns=['level_1'])

In [None]:
# Column labels of the feature matrix, used below to look up feature names by position.
featname=X_df.columns.values

In [None]:
# Replace the positional `feat` column with the corresponding feature name.
feature_labels = featname[topvars.feat]
topvars = topvars.assign(feature=feature_labels)
topvars = topvars.drop(columns=['feat'])

**Top genes for antagonism prediction**

In [None]:
# Count, per feature, the rows recorded for antagonism; keep features seen in
# more than one fold and show the 20 most frequent.
antag_counts = topvars[topvars['type'] == 'antagonism'].groupby('feature').count()
antag_counts = antag_counts.query('cvfold > 1')
antag_counts.sort_values('cvfold', ascending=False).iloc[:20, 0]

**Top genes for synergy prediction**

In [None]:
# Count, per feature, the rows recorded for synergy; keep features seen in
# more than one fold and show the 20 most frequent.
syn_counts = topvars[topvars['type'] == 'synergy'].groupby('feature').count()
syn_counts = syn_counts.query('cvfold > 1')
syn_counts.sort_values('cvfold', ascending=False).iloc[:20, 0]

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score

In [None]:
# Accumulator for the per-fold true-positive tables.
TP_df = []

In [None]:
def _true_positives_at_q3_recall(pred_df, label_col, prob_col, fold):
    """Return the true positives of one class at a fixed operating point.

    The operating point is taken from the precision-recall curve of
    `pred_df[prob_col]` against `pred_df[label_col]`: the recall closest to
    the upper quartile of the curve's recall values, and the best precision
    available at that recall.

    Parameters
    ----------
    pred_df : DataFrame holding the 0/1 indicator `label_col` and the
        predicted probability `prob_col`.
    label_col, prob_col : column names of the class being evaluated.
    fold : value stored in the `cvfold` column of the result.

    Returns
    -------
    DataFrame with the rows of `pred_df` that are labelled positive and score
    at or above the chosen threshold, plus `thresh`, `precision`, `recall`
    and `cvfold` columns; or None when the curve contains NaNs or has fewer
    than two thresholds.
    """
    precision, recall, thresh = precision_recall_curve(pred_df[label_col].values,
                                                       pred_df[prob_col].values)
    if np.any(np.isnan(precision)) or np.any(np.isnan(recall)) or len(thresh) <= 1:
        return None

    q3 = np.quantile(recall, 0.75)
    if np.any(recall == q3):
        pmax = np.max(precision[recall == q3])
    else:
        # The interpolated quantile is not an actual point of the curve:
        # fall back to the point three quarters of the way along it.
        ind = np.floor(np.quantile(range(len(recall)), 0.75)).astype(int)
        q3 = recall[ind]
        pmax = precision[ind]

    # precision[i] / recall[i] describe the predictions with score >= thresh[i],
    # so the threshold shares the index of the chosen curve point (no offset)
    # and the comparison below has to be inclusive.
    point = np.flatnonzero((precision == pmax) & (recall == q3))[0]
    cutoff = thresh[point]

    is_tp = (pred_df[prob_col] >= cutoff) & (pred_df[label_col] == 1)
    # `assign` works on a copy, so the filtered frame is never modified in place.
    return pred_df[is_tp].assign(thresh=cutoff, precision=pmax,
                                 recall=q3, cvfold=fold)


for cl in pr.predicted.keys():
    fold_pred = pr.predicted[cl]

    # Ground-truth indicator columns for the combinations predicted in this fold.
    in_fold = np.isin(combs, fold_pred['comb'].values)
    gt = pd.DataFrame(y[in_fold], columns=['none', 'antagonism', 'synergy'])
    gt['comb'] = combs[in_fold]
    pred_df = pd.merge(left=fold_pred, right=gt, how='inner', on='comb')

    # Same procedure for both interaction classes, antagonism first.
    for label_col, prob_col in (('antagonism', 'prob_ant'),
                                ('synergy', 'prob_syn')):
        class_tp = _true_positives_at_q3_recall(pred_df, label_col, prob_col, cl)
        if class_tp is not None:
            TP_df.append(class_tp)

In [None]:
# Merge the collected per-fold tables into a single frame with a fresh 0..n-1 index.
TP_df = pd.concat(TP_df, ignore_index=True)

In [None]:
# Write the true-positive table to the current working directory (index included).
TP_df.to_csv('Salmonella-true-positives-CVfold.csv')

## Train the Classifier with Cross-Validated Hyperparameters

In [None]:
# The full labelled data set: feature frame and binarized labels.
Xtrain, ytrain = X_df, y

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One random forest per class column, all built from the same settings and seed.
base_forest = RandomForestClassifier(
    n_jobs=-1,
    random_state=2305,
    bootstrap=True,
    max_features='sqrt',
    **param_dict,
)
clf = OneVsRestClassifier(base_forest)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled combinations as a validation split; the seed
# is fixed so the split is reproducible. No `stratify` argument is passed.
X_train, X_val, y_train, y_val = train_test_split(Xtrain, ytrain, test_size=0.3,
                                                    random_state=2305)

In [None]:
# Fit the one-vs-rest forests on the training split only.
clf = clf.fit(X_train, y_train)

In [None]:
# Predicted probabilities on the held-out validation split; columns are indexed per class below.
probas_ = clf.predict_proba(X_val)

Plot precision-recall curves for the held-out validation split:

In [None]:
# Per-class containers, keyed by class index, filled by the metrics loop below.
fpr, tpr, roc_auc = {}, {}, {}
precision, recall, thresh = {}, {}, {}
average_precision = {}

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score

In [None]:
n_classes = 3
# ROC and precision-recall statistics of every class on the validation split.
for i in range(n_classes):
    truth, scores = y_val[:, i], probas_[:, i]
    fpr[i], tpr[i], _ = roc_curve(truth, scores)
    roc_auc[i] = auc(fpr[i], tpr[i])
    precision[i], recall[i], thresh[i] = precision_recall_curve(truth, scores)
    average_precision[i] = average_precision_score(truth, scores)


In [None]:
from itertools import cycle
class_names = ['none', 'antagonism', 'synergy']
colors = cycle(['#808080','#FFCC33', '#009999'])


plt.figure(figsize=(10,10))
f_scores = np.linspace(0.2, 0.8, num=4)

# Background iso-F1 curves: for a fixed F1 value, precision as a function of recall.
for k, f_score in enumerate(f_scores):
    x = np.linspace(0.01, 1)
    y_ = f_score * x / (2 * x - f_score)
    # Only the first iso-F1 line carries the legend label; labels starting
    # with an underscore are skipped by the legend, which avoids four
    # identical 'iso-F1 curves' entries.
    plt.plot(x[y_ >= 0], y_[y_ >= 0], color='gray', alpha=0.2,
             label='iso-F1 curves' if k == 0 else '_nolegend_')
    # Index 45 of the 50 default linspace points places the text near recall 0.9.
    plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y_[45] + 0.02))

# One precision-recall curve per class, labelled with its average precision.
for i, color in zip(range(n_classes), colors):
    plt.plot(recall[i], precision[i], color=color, lw=2,
             label='Precision-recall of class {0} (area = {1:0.2f})'
             ''.format(class_names[i], average_precision[i]))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc="lower right")

In [None]:
from sklearn.calibration import calibration_curve
fraction_of_positives = dict()
mean_predicted_value = dict()
# The validation probabilities do not depend on the class index, so they are
# computed once instead of once per class.
val_proba = clf.predict_proba(X_val)
# Reliability curve (4 bins) of every class on the validation split.
for i in range(n_classes):
    proba_val = val_proba[:, i]
    fraction_of_positives[i], mean_predicted_value[i] = calibration_curve(y_val[:,i],
                                                                proba_val,
                                                                n_bins=4)

In [None]:
# Reliability diagram of the raw forest probabilities; the dashed diagonal
# marks perfect calibration.
fig, ax = plt.subplots(1, figsize=(12, 6))
ax.plot(mean_predicted_value[0], fraction_of_positives[0], 's-', label='none')
ax.plot([0, 1], [0, 1], '--', color='gray')
ax.plot(mean_predicted_value[1], fraction_of_positives[1], 's-', label='antagonism')
ax.plot(mean_predicted_value[2], fraction_of_positives[2], 's-', label='synergy')
ax.set_xlabel('Mean predicted probability')
ax.set_ylabel('Fraction of positives')
ax.set_title('Uncalibrated probabilities')
ax.legend()

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
fraction_of_positives, mean_predicted_value = {}, {}
# Sigmoid calibration of each per-class forest with 5-fold CV on the training
# split, followed by a 5-bin reliability curve on the validation split.
for i in range(n_classes):
    clf_calib = CalibratedClassifierCV(clf.estimators_[i], cv=5, method='sigmoid')
    clf_calib.fit(X_train, y_train[:, i])
    # Column 1 is the probability of the positive class.
    proba_val = clf_calib.predict_proba(X_val)[:, 1]
    curve = calibration_curve(y_val[:, i], proba_val, n_bins=5)
    fraction_of_positives[i], mean_predicted_value[i] = curve

In [None]:
# Reliability diagram of the calibrated probabilities; the dashed diagonal
# marks perfect calibration.
fig, ax = plt.subplots(1, figsize=(12, 6))
plt.plot(mean_predicted_value[0], fraction_of_positives[0], 's-', label='none')
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.plot(mean_predicted_value[1], fraction_of_positives[1], 's-', label='antagonism')
plt.plot(mean_predicted_value[2], fraction_of_positives[2], 's-', label='synergy')
# Axis labels match the uncalibrated diagram so the two figures read the same way.
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.title('Calibrated probabilities')
plt.legend()

Somehow calibrated probabilities are worse than the "uncalibrated" ones.

## Generate Predictions on the Test Set

In [None]:
# Chemical-genetics matrix of the test-set drugs; the first CSV column becomes the index.
X_drugs = pd.read_csv('../data/chemgenetics/salmonella_testset_binarized.csv', index_col=0)

In [None]:
# Keep only the columns listed in `gene_subset`, in their existing order.
gene_cols = np.flatnonzero(np.isin(X_drugs.columns, gene_subset))
X_drugs = X_drugs.iloc[:, gene_cols]

In [None]:
# (rows, retained columns) of the test matrix after subsetting.
X_drugs.shape

In [None]:
# Index labels of the test matrix; these are paired up into combinations below.
test_drugs = X_drugs.index.values

In [None]:
import itertools
# Every unordered pair of test drugs, written as "<first>_<second>".
combs_test = np.array(["_".join(pair)
                       for pair in itertools.combinations(test_drugs, 2)])

In [None]:
# Number of pairwise test combinations.
len(combs_test)

In [None]:
# One feature row per test combination, built by the same project helper as
# the training features.
test_rows = [utl.get_comb_feat(X_drugs, comb) for comb in combs_test]
X_test = pd.DataFrame(test_rows)

In [None]:
# (combinations, features) of the test feature matrix.
X_test.shape

In [None]:
# Sanity check: test and training features must have the same columns in the
# same order. `Index.equals` also reports False when the number of columns
# differs, where an elementwise `==` would raise instead.
X_test.columns.equals(Xtrain.columns)

In [None]:
# without probability calibration
# Refit on the complete labelled data, then score every test combination.
clf.fit(Xtrain, ytrain)
y_test_proba = clf.predict_proba(X_test)


In [None]:
# Probability cut-offs for calling an interaction from the uncalibrated scores.
antag_cutoff, syn_cutoff = 0.24, 0.2
antag = combs_test[y_test_proba[:, 1] > antag_cutoff]
syn = combs_test[y_test_proba[:, 2] > syn_cutoff]

In [None]:
# Uncalibrated class probabilities, one row per test combination.
prob_uncalibr = pd.DataFrame(y_test_proba, index=combs_test,
             columns=['none', 'antag', 'synergy'])

In [None]:
# Write the uncalibrated predictions to the current working directory.
prob_uncalibr.to_csv('salmonella_test_pred.csv')

In [None]:
# The 30 test combinations with the highest predicted antagonism probability.
prob_uncalibr.sort_values(by='antag', ascending=False).head(30)

In [None]:
# The 30 test combinations with the highest predicted synergy probability.
prob_uncalibr.sort_values(by='synergy', ascending=False).head(30)

In [None]:
from sklearn.model_selection import StratifiedKFold
# 6-fold stratified splitter, passed as `cv` to the calibration step below.
skf = StratifiedKFold(n_splits=6)

In [None]:
# with calibration
# Isotonic calibration of each per-class forest on the full labelled data;
# probs[i] holds the two-column (negative, positive) probabilities of class i
# for every test combination.
probs = {}
for i in range(n_classes):
    clf_calib = CalibratedClassifierCV(clf.estimators_[i], cv=skf, method='isotonic')
    clf_calib.fit(Xtrain, ytrain[:, i])
    probs[i] = clf_calib.predict_proba(X_test)

In [None]:
# Positive-class probability (column 1) of each calibrated one-vs-rest model.
none_p, antag_p, syn_p = (probs[k][:, 1] for k in (0, 1, 2))
# Combinations called antagonistic / synergistic, and those unlikely to be 'none'.
antag = combs_test[antag_p > 0.2]
syn = combs_test[syn_p > 0.2]
not_none = combs_test[none_p < 0.6]

In [None]:
# Combinations called antagonistic but not synergistic.
np.setdiff1d(antag, syn)

In [None]:
# Combinations called synergistic but not antagonistic.
np.setdiff1d(syn, antag)

In [None]:
# Combinations that pass both cut-offs, i.e. ambiguous calls.
np.intersect1d(antag, syn)