In [1]:
from skempi_utils import *

df = skempi_df
df_multi = df[~np.asarray([len(s)>8 for s in df.Protein])]
s_multi = set([s[:4] for s in df_multi.Protein])
s_groups = set([s[:4] for s in G1 + G2 + G3 + G4 + G5])
len(s_multi & s_groups), len(s_multi), len(s_groups)
df_multi.head()

Unnamed: 0,Protein,Mutation(s)_PDB,Mutation(s)_cleaned,Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_wt (M),DDG,Reference,...,Temperature,kon_mut (M^(-1)s^(-1)),kon_wt (M^(-1)s^(-1)),koff_mut (s^(-1)),koff_wt (s^(-1)),dH_mut (kcal mol^(-1)),dH_wt (kcal mol^(-1)),dS_mut (cal mol^(-1) K^(-1)),dS_wt (cal mol^(-1) K^(-1)),Notes
0,1CSE_E_I,LI45G,LI38G,COR,PI,PI,5.26e-11,1.12e-12,2.280577,9048543,...,294,,,,,,,,,
1,1CSE_E_I,LI45S,LI38S,COR,PI,PI,8.33e-12,1.12e-12,1.188776,9048543,...,294,,,,,,,,,
2,1CSE_E_I,LI45P,LI38P,COR,PI,PI,1.02e-07,1.12e-12,6.765446,9048543,...,294,,,,,,,,,
3,1CSE_E_I,LI45I,LI38I,COR,PI,PI,1.72e-10,1.12e-12,2.982502,9048543,...,294,,,,,,,,,
4,1CSE_E_I,LI45D,LI38D,COR,PI,PI,1.92e-09,1.12e-12,4.411843,9048543,...,294,,,,,,,,,


In [2]:
from sklearn.preprocessing import StandardScaler
from itertools import combinations as comb
from sklearn.externals import joblib
import numpy as np

def evaluate(group_str, y_true, y_pred, names, ix):
    y_pred_pos = y_pred[ix == 0]
    y_pred_neg = y_pred[ix == 1]
    y_true_pos = y_true[ix == 0]
    y_true_neg = y_true[ix == 1]
    names_pos = names[ix == 0]
    names_neg = names[ix == 1]
    cor_pos, _ = pearsonr(y_true_pos, y_pred_pos)
    cor_neg, _ = pearsonr(y_true_neg, y_pred_neg)
    print("[%s:%d] cor_pos:%.3f, cor_neg:%.3f" % (group_str, len(y_true), cor_pos, cor_neg))
    return cor_pos, cor_neg, zip(names_pos, names_neg, y_true_pos, y_true_neg, y_pred_pos, y_pred_neg)
                                 
def run_cv_test(skempi_data, get_regressor, modelname, normalize=1):
    X, y, ix, mnames, _ = skempi_data
    groups_data = []
    preds_data = []
    for i, pair in enumerate(comb(range(NUM_GROUPS), 2)):
        g1, g2 = np.asarray(pair) + 1
        group = list((g1, g2))
        rest = list(set([1, 2, 3, 4, 5]) - set((g1, g2)))
        indx_tst = np.isin(ix[:, 0], group)
        indx_trn = np.isin(ix[:, 0], rest)
        y_trn = y[indx_trn]
        y_true = y[indx_tst]
        X_trn = X[indx_trn]
        X_tst = X[indx_tst]
        if normalize == 1:
            scaler = StandardScaler()
            scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_tst = scaler.transform(X_tst)
        regressor = get_regressor()
        regressor.fit(X_trn, y_trn)
        joblib.dump(regressor, 'models/%s%s.pkl' % (modelname, i))
        regressor = joblib.load('models/%s%s.pkl' % (modelname, i))
        y_pred = regressor.predict(X_tst)
        pos, neg, data = evaluate("G%d,G%d" % (g1, g2), y_true, y_pred, mnames[indx_tst], ix[indx_tst, 1])
        groups_data.append([g1, g2, pos, neg])
        preds_data.extend(data)
    df1 = pd.DataFrame(preds_data, columns=["MNAME_POS", "MNAME_NEG", "DDG_POS", "DDG_NEG", "PRED_POS", "PRED_NEG"])
    df2 = pd.DataFrame(groups_data, columns=["GROUP1", "GROUP2", "PCC_POS", "PCC_NEG"])
    return df1, df2

def run_cv_test_ensemble(skempi_data, alpha=0.5, normalize=1):
    X, y, ix, mnames, _ = skempi_data
    groups_data = []
    preds_data = []
    for i, pair in enumerate(comb(range(NUM_GROUPS), 2)):
        g1, g2 = np.asarray(pair) + 1
        group = list((g1, g2))
        rest = list(set([1, 2, 3, 4, 5]) - set((g1, g2)))
        indx_tst = np.isin(ix[:, 0], group)
        indx_trn = np.isin(ix[:, 0], rest)
        y_true = y[indx_tst]
        X_trn = X[indx_trn]
        X_tst = X[indx_tst]
        svr = joblib.load('models/svr%d.pkl' % i)
        rfr = joblib.load('models/rfr%d.pkl' % i)
        if normalize == 1:
            scaler = StandardScaler()
            scaler.fit(X_trn)
            X_tst = scaler.transform(X_tst)
        y_pred_svr = svr.predict(X_tst)
        y_pred_rfr = rfr.predict(X_tst)
        y_pred = alpha * y_pred_svr + (1-alpha) * y_pred_rfr
        pos, neg, data = evaluate("G%d,G%d" % (g1, g2), y_true, y_pred, mnames[indx_tst], ix[indx_tst, 1])
        groups_data.append([g1, g2, pos, neg])
        preds_data.extend(data)
    df1 = pd.DataFrame(preds_data, columns=["MNAME_POS", "MNAME_NEG", "DDG_POS", "DDG_NEG", "PRED_POS", "PRED_NEG"])
    df2 = pd.DataFrame(groups_data, columns=["GROUP1", "GROUP2", "PCC_POS", "PCC_NEG"])
    return df1, df2

In [3]:
skempi_records = load_skempi_records(load_skempi_structs("../data/pdbs", compute_dist_mat=False))
data_pos = records_to_xy(skempi_records, load_negative=False)
data_neg = records_to_xy(skempi_records, load_negative=True)

skempi structures processed: 100%|██████████| 158/158 [01:10<00:00,  2.25it/s]
skempi records processed: 100%|██████████| 3047/3047 [00:00<00:00, 3340.65it/s]
records processed: 100%|██████████| 3047/3047 [13:18<00:00,  3.82it/s]
records processed: 100%|██████████| 3047/3047 [37:46<00:00,  1.34it/s]


In [4]:
import random
random.seed(0)
indx = range(len(data_pos))
random.shuffle(indx)
data_pos = [data_pos[i] for i in indx]
data_neg = [data_neg[i] for i in indx]

In [5]:
data = [np.asarray(d) for d in zip(*data_pos)]

print("----->SVR")
from sklearn.svm import SVR
def get_regressor(): return SVR(kernel='rbf')
run_cv_test(data, get_regressor, 'svr', normalize=1)

print("----->RFR")
from sklearn.ensemble import RandomForestRegressor
def get_regressor(): return RandomForestRegressor(n_estimators=50, random_state=0)
run_cv_test(data, get_regressor, 'rfr', normalize=1)

# alpha = cor1/(cor1+cor2)
alpha = 0.5
print("----->%.2f*SVR + %.2f*RFR" % (alpha, 1-alpha))
df1, df2 = run_cv_test_ensemble(data, normalize=1)
np.mean(df2.PCC_POS)

----->SVR


  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den


[G1,G2:734] cor_pos:0.510, cor_neg:nan
[G1,G3:790] cor_pos:0.530, cor_neg:nan
[G1,G4:815] cor_pos:0.454, cor_neg:nan
[G1,G5:910] cor_pos:0.572, cor_neg:nan
[G2,G3:734] cor_pos:0.522, cor_neg:nan
[G2,G4:759] cor_pos:0.494, cor_neg:nan
[G2,G5:854] cor_pos:0.533, cor_neg:nan
[G3,G4:815] cor_pos:0.464, cor_neg:nan
[G3,G5:910] cor_pos:0.495, cor_neg:nan
[G4,G5:935] cor_pos:0.551, cor_neg:nan
----->RFR
[G1,G2:734] cor_pos:0.432, cor_neg:nan
[G1,G3:790] cor_pos:0.614, cor_neg:nan
[G1,G4:815] cor_pos:0.402, cor_neg:nan
[G1,G5:910] cor_pos:0.567, cor_neg:nan
[G2,G3:734] cor_pos:0.525, cor_neg:nan
[G2,G4:759] cor_pos:0.351, cor_neg:nan
[G2,G5:854] cor_pos:0.531, cor_neg:nan
[G3,G4:815] cor_pos:0.428, cor_neg:nan
[G3,G5:910] cor_pos:0.531, cor_neg:nan
[G4,G5:935] cor_pos:0.518, cor_neg:nan
----->0.50*SVR + 0.50*RFR
[G1,G2:734] cor_pos:0.495, cor_neg:nan
[G1,G3:790] cor_pos:0.622, cor_neg:nan
[G1,G4:815] cor_pos:0.452, cor_neg:nan
[G1,G5:910] cor_pos:0.603, cor_neg:nan
[G2,G3:734] cor_pos:0.557, c

0.53505321575572284

In [6]:
data = [np.asarray(d) for d in zip(*data_neg)]

print("----->SVR")
from sklearn.svm import SVR
def get_regressor(): return SVR(kernel='rbf')
run_cv_test(data, get_regressor, 'svr', normalize=1)

print("----->RFR")
from sklearn.ensemble import RandomForestRegressor
def get_regressor(): return RandomForestRegressor(n_estimators=50, random_state=0)
run_cv_test(data, get_regressor, 'rfr', normalize=1)

# alpha = cor1/(cor1+cor2)
alpha = 0.5
print("----->%.2f*SVR + %.2f*RFR" % (alpha, 1-alpha))
df1, df2 = run_cv_test_ensemble(data, normalize=1)
np.mean(df2.PCC_NEG)

----->SVR
[G1,G2:734] cor_pos:nan, cor_neg:0.497
[G1,G3:790] cor_pos:nan, cor_neg:0.520
[G1,G4:815] cor_pos:nan, cor_neg:0.470
[G1,G5:910] cor_pos:nan, cor_neg:0.574
[G2,G3:734] cor_pos:nan, cor_neg:0.517
[G2,G4:759] cor_pos:nan, cor_neg:0.460
[G2,G5:854] cor_pos:nan, cor_neg:0.449
[G3,G4:815] cor_pos:nan, cor_neg:0.414
[G3,G5:910] cor_pos:nan, cor_neg:0.500
[G4,G5:935] cor_pos:nan, cor_neg:0.531
----->RFR
[G1,G2:734] cor_pos:nan, cor_neg:0.374
[G1,G3:790] cor_pos:nan, cor_neg:0.602
[G1,G4:815] cor_pos:nan, cor_neg:0.412
[G1,G5:910] cor_pos:nan, cor_neg:0.574
[G2,G3:734] cor_pos:nan, cor_neg:0.531
[G2,G4:759] cor_pos:nan, cor_neg:0.371
[G2,G5:854] cor_pos:nan, cor_neg:0.516
[G3,G4:815] cor_pos:nan, cor_neg:0.436
[G3,G5:910] cor_pos:nan, cor_neg:0.560
[G4,G5:935] cor_pos:nan, cor_neg:0.538
----->0.50*SVR + 0.50*RFR
[G1,G2:734] cor_pos:nan, cor_neg:0.459
[G1,G3:790] cor_pos:nan, cor_neg:0.629
[G1,G4:815] cor_pos:nan, cor_neg:0.469
[G1,G5:910] cor_pos:nan, cor_neg:0.610
[G2,G3:734] cor_po

0.53093435191132188

In [7]:
data = [np.asarray(d) for d in zip(*data_pos + data_neg)]

print("----->SVR")
from sklearn.svm import SVR
def get_regressor(): return SVR(kernel='rbf')
df1, df2 = run_cv_test(data, get_regressor, 'svr', normalize=1)
print(np.mean(df2.PCC_POS), np.mean(df2.PCC_NEG))

print("----->RFR")
from sklearn.ensemble import RandomForestRegressor
def get_regressor(): return RandomForestRegressor(n_estimators=50, random_state=0)
df1, df2 = run_cv_test(data, get_regressor, 'rfr', normalize=1)
print(np.mean(df2.PCC_POS), np.mean(df2.PCC_NEG))

# alpha = cor1/(cor1+cor2)
alpha = 0.5
print("----->%.2f*SVR + %.2f*RFR" % (alpha, 1-alpha))
df1, df2 = run_cv_test_ensemble(data, normalize=1)
print(np.mean(df2.PCC_POS), np.mean(df2.PCC_NEG))

----->SVR
[G1,G2:1468] cor_pos:0.256, cor_neg:0.338
[G1,G3:1580] cor_pos:0.311, cor_neg:0.319
[G1,G4:1630] cor_pos:0.161, cor_neg:0.218
[G1,G5:1820] cor_pos:0.400, cor_neg:0.438
[G2,G3:1468] cor_pos:0.315, cor_neg:0.427
[G2,G4:1518] cor_pos:0.246, cor_neg:0.331
[G2,G5:1708] cor_pos:0.374, cor_neg:0.378
[G3,G4:1630] cor_pos:0.325, cor_neg:0.241
[G3,G5:1820] cor_pos:0.494, cor_neg:0.511
[G4,G5:1870] cor_pos:0.442, cor_neg:0.497
(0.33241475881941046, 0.36981930047226708)
----->RFR
[G1,G2:1468] cor_pos:0.254, cor_neg:0.309
[G1,G3:1580] cor_pos:0.288, cor_neg:0.256
[G1,G4:1630] cor_pos:0.174, cor_neg:0.209
[G1,G5:1820] cor_pos:0.447, cor_neg:0.464
[G2,G3:1468] cor_pos:0.368, cor_neg:0.401
[G2,G4:1518] cor_pos:0.230, cor_neg:0.308
[G2,G5:1708] cor_pos:0.434, cor_neg:0.523
[G3,G4:1630] cor_pos:0.346, cor_neg:0.379
[G3,G5:1820] cor_pos:0.490, cor_neg:0.530
[G4,G5:1870] cor_pos:0.450, cor_neg:0.482
(0.34811050808224958, 0.38607303251632008)
----->0.50*SVR + 0.50*RFR
[G1,G2:1468] cor_pos:0.268, 

In [8]:
df1.to_csv("csvs/predictions.csv", index = False)
df1.head()

Unnamed: 0,MNAME_POS,MNAME_NEG,DDG_POS,DDG_NEG,PRED_POS,PRED_NEG
0,1JTG,1JTG_KB74A_FB142A_YB143A,3.895401,-3.895401,1.491831,-0.377763
1,2G2U,2G2U_WB150A,1.785222,-1.785222,-0.084771,0.153243
2,2SGQ,2SGQ_QI13L,-2.603541,2.603541,0.504453,-0.475882
3,3SGB,3SGB_LI12R,3.366955,-3.366955,-1.547921,1.675196
4,1A4Y,1A4Y_KB40G_YA434A_DA435A,6.213886,-6.213886,4.877817,-2.415524


In [9]:
df2.to_csv("csvs/groups.csv", index = False)
df2.head()

Unnamed: 0,GROUP1,GROUP2,PCC_POS,PCC_NEG
0,1,2,0.267921,0.338689
1,1,3,0.320797,0.30265
2,1,4,0.176402,0.223584
3,1,5,0.447472,0.474201
4,2,3,0.364,0.438744


In [10]:
skempi_structs = load_skempi_structs(BPX_PDB_PATH, compute_dist_mat=False)
skempi_records = load_skempi_records(skempi_structs)

skempi structures processed: 100%|██████████| 158/158 [01:22<00:00,  1.91it/s]
skempi records processed: 100%|██████████| 3047/3047 [00:00<00:00, 4355.58it/s]


In [11]:
df_bpx_pos_ddg = bindprofx_predict(skempi_records, evlaute_negative=False)
df_bpx_pos_ddg.head()

records processed: 100%|██████████| 3047/3047 [23:28<00:00,  2.16it/s]


Unnamed: 0,MODEL_NAME,MUTATIONS,GROUP,DDG,BINDPROFX
0,1CSE,LI38G,3,2.280577,2.529
1,1CSE,LI38S,3,1.188776,2.471
2,1CSE,LI38P,3,6.765446,2.557
3,1CSE,LI38I,3,2.982502,2.269
4,1CSE,LI38D,3,4.411843,2.499


In [12]:
df_bpx_pos_groups = bindprofx_evaluate(df_bpx_pos_ddg)
df_bpx_pos_groups.head(10)

Unnamed: 0,Group1,Group2,PCC
0,G1,G2,0.525638
1,G1,G3,0.496424
2,G1,G4,0.597905
3,G1,G5,0.553932
4,G2,G3,0.431352
5,G2,G4,0.601494
6,G2,G5,0.558547
7,G3,G4,0.586227
8,G3,G5,0.542303
9,G4,G5,0.671628


In [13]:
df_bpx_neg_ddg = bindprofx_predict(skempi_records, evlaute_negative=True)
df_bpx_neg_ddg.head()

records processed: 100%|██████████| 3047/3047 [40:25<00:00,  1.26it/s]


Unnamed: 0,MODEL_NAME,MUTATIONS,GROUP,DDG,BINDPROFX
0,1CSE_LI38G,GI38L,3,-2.280577,-2.529
1,1CSE_LI38S,SI38L,3,-1.188776,-2.471
2,1CSE_LI38P,PI38L,3,-6.765446,-2.557
3,1CSE_LI38I,II38L,3,-2.982502,-2.269
4,1CSE_LI38D,DI38L,3,-4.411843,-2.499


In [14]:
df_bpx_neg_groups = bindprofx_evaluate(df_bpx_neg_ddg)
df_bpx_neg_groups.head(10)

Unnamed: 0,Group1,Group2,PCC
0,G1,G2,0.525638
1,G1,G3,0.496424
2,G1,G4,0.597905
3,G1,G5,0.553932
4,G2,G3,0.431352
5,G2,G4,0.601494
6,G2,G5,0.558547
7,G3,G4,0.586227
8,G3,G5,0.542303
9,G4,G5,0.671628


In [15]:
# df_bpx_pos_ddg.to_csv("csvs/bpx_pos_ddg.csv", index = False)
# df_bpx_neg_ddg.to_csv("csvs/bpx_neg_ddg.csv", index = False)
# df_bpx_pos_groups.to_csv("csvs/bpx_pos_groups.csv", index = False)
# df_bpx_neg_groups.to_csv("csvs/bpx_neg_groups.csv", index = False)
# pearsonr(df_bpx_pos_ddg.DDG, df_bpx_pos_ddg.BINDPROFX), pearsonr(df_bpx_neg_ddg.DDG, df_bpx_neg_ddg.BINDPROFX)

In [16]:
bpx_paper_df = pd.read_excel(osp.join(BINDPROFX_DATA, 'all.xlsx'))
bpx_paper_df.head(100)

Unnamed: 0,pdbID,chainA,chainB,mut,region,EXP_ddG,mutNum,xprof,foldx,0.9*xprof+0.4*foldx,FoldX3,FoldX4,BindProf,Dcomplex,beatmusic
0,1A22,A,B,CA171A,COR,1.010,1.0,2.880479,-0.4270,2.421631,-0.4168,-0.0005,1.074270,0.122854,0.85
1,1A22,A,B,CB67A,SUP,0.000,1.0,1.498721,0.0288,1.360369,0.0288,0.1781,0.764937,0.001054,1.62
2,1A22,A,B,CB81A,COR,0.000,1.0,1.514268,-1.9451,0.584801,-0.2304,0.2366,0.626583,0.204846,2.21
3,1A22,A,B,DA160A,COR,0.791,1.0,1.333332,1.5186,1.807439,1.9083,1.7535,0.074800,0.191700,1.10
4,1A22,A,B,DB119A,SUP,1.543,1.0,2.703353,3.2822,3.745897,2.4683,2.2667,0.821316,0.183706,0.28
5,1A22,A,B,DB85A,SUP,0.992,1.0,1.357758,-0.0781,1.190742,-0.0240,-0.1383,0.909657,0.439026,0.93
6,1A22,A,B,EA163A,COR,-0.925,1.0,1.330211,-0.5755,0.966990,-0.0568,-0.6834,0.648559,0.558217,0.30
7,1A22,A,B,EA56A,RIM,0.411,1.0,0.297532,1.4889,0.863339,1.0840,1.8703,0.487930,0.570821,1.15
8,1A22,A,B,EA65A,RIM,-0.473,1.0,0.791480,-0.6217,0.463652,-0.0950,-0.1396,0.784316,0.000000,0.40
9,1A22,A,B,EB12A,RIM,1.381,1.0,2.322097,1.0953,2.528007,0.5317,-0.0607,1.332662,0.542085,1.06


In [17]:
pearsonr(bpx_paper_df["0.9*xprof+0.4*foldx"], bpx_paper_df.EXP_ddG)
pearsonr(bpx_paper_df.foldx, bpx_paper_df.EXP_ddG)
pearsonr(bpx_paper_df.xprof, bpx_paper_df.EXP_ddG)

(0.65080778351508151, 8.4702226405834587e-170)

In [18]:
df_bpx_pos_ddg = pd.read_csv("csvs/bpx_pos_ddg.csv")
df_bpx_neg_ddg = pd.read_csv("csvs/bpx_neg_ddg.csv")

df_bpx_ddg = pd.DataFrame({
    "MNAME_POS": df_bpx_pos_ddg.MODEL_NAME, 
    "MNAME_NEG": df_bpx_neg_ddg.MODEL_NAME, 
    "MUTATIONS_POS": df_bpx_pos_ddg.MUTATIONS,
    "MUTATIONS_NEG": df_bpx_neg_ddg.MUTATIONS,
    "GROUP": df_bpx_pos_ddg.GROUP,
    "BPX_POS": df_bpx_pos_ddg.BINDPROFX,
    "BPX_NEG": df_bpx_neg_ddg.BINDPROFX,
    "DDG_POS": df_bpx_pos_ddg.DDG,
    "DDG_NEG": df_bpx_neg_ddg.DDG,
})

df_bpx_ddg.to_csv("csvs/bpx_ddg.csv", index=False)
df_bpx_ddg.head(100)

Unnamed: 0,BPX_NEG,BPX_POS,DDG_NEG,DDG_POS,GROUP,MNAME_NEG,MNAME_POS,MUTATIONS_NEG,MUTATIONS_POS
0,-2.529,2.529,-2.280577,2.280577,3,1CSE_LI38G,1CSE,GI38L,LI38G
1,-2.471,2.471,-1.188776,1.188776,3,1CSE_LI38S,1CSE,SI38L,LI38S
2,-2.557,2.557,-6.765446,6.765446,3,1CSE_LI38P,1CSE,PI38L,LI38P
3,-2.269,2.269,-2.982502,2.982502,3,1CSE_LI38I,1CSE,II38L,LI38I
4,-2.499,2.499,-4.411843,4.411843,3,1CSE_LI38D,1CSE,DI38L,LI38D
5,-2.300,2.300,-2.382746,2.382746,3,1CSE_LI38E,1CSE,EI38L,LI38E
6,-2.974,2.974,-6.118069,6.118069,1,1ACB_LI38G,1ACB,GI38L,LI38G
7,-2.899,2.899,-5.020848,5.020848,1,1ACB_LI38S,1ACB,SI38L,LI38S
8,-2.995,2.995,-6.949068,6.949068,1,1ACB_LI38P,1ACB,PI38L,LI38P
9,-2.757,2.757,-4.290150,4.290150,1,1ACB_LI38I,1ACB,II38L,LI38I


In [19]:
indx1 = df_bpx_ddg.MUTATIONS_POS.isin(bpx_paper_df.mut) 
indx2 = df_bpx_ddg.MNAME_POS.isin(bpx_paper_df.pdbID)
indx = indx1 & indx2
pearsonr(df_bpx_ddg[indx].BPX_POS, df_bpx_ddg[indx].DDG_POS), sum(indx)

((0.64842531013903137, 8.9982805477801137e-185), 1543)