In [1]:
from skempi_utils import *
from scipy.stats import pearsonr

  from .murmurhash import murmurhash3_32
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


In [2]:
df = skempi_df
df_multi = df[~np.asarray([len(s)>8 for s in df.Protein])]
s_multi = set([s[:4] for s in df_multi.Protein])
s_groups = set([s[:4] for s in G1 + G2 + G3 + G4 + G5])
len(s_multi & s_groups), len(s_multi), len(s_groups)
df_multi.head()

Unnamed: 0,Protein,Mutation(s)_PDB,Mutation(s)_cleaned,Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_wt (M),DDG,Reference,...,Temperature,kon_mut (M^(-1)s^(-1)),kon_wt (M^(-1)s^(-1)),koff_mut (s^(-1)),koff_wt (s^(-1)),dH_mut (kcal mol^(-1)),dH_wt (kcal mol^(-1)),dS_mut (cal mol^(-1) K^(-1)),dS_wt (cal mol^(-1) K^(-1)),Notes
0,1CSE_E_I,LI45G,LI38G,COR,PI,PI,5.26e-11,1.12e-12,2.280577,9048543,...,294,,,,,,,,,
1,1CSE_E_I,LI45S,LI38S,COR,PI,PI,8.33e-12,1.12e-12,1.188776,9048543,...,294,,,,,,,,,
2,1CSE_E_I,LI45P,LI38P,COR,PI,PI,1.02e-07,1.12e-12,6.765446,9048543,...,294,,,,,,,,,
3,1CSE_E_I,LI45I,LI38I,COR,PI,PI,1.72e-10,1.12e-12,2.982502,9048543,...,294,,,,,,,,,
4,1CSE_E_I,LI45D,LI38D,COR,PI,PI,1.92e-09,1.12e-12,4.411843,9048543,...,294,,,,,,,,,


In [3]:
from sklearn.preprocessing import StandardScaler
from itertools import combinations as comb
from sklearn.externals import joblib
import numpy as np

def evaluate(group_str, y_true, y_pred, ix):
    y_pred_pos = y_pred[ix == 0]
    y_pred_neg = y_pred[ix == 1]
    y_true_pos = y_true[ix == 0]
    y_true_neg = y_true[ix == 1]
    cor_all, _ = pearsonr(y_true, y_pred)
    cor_pos, _ = pearsonr(y_true_pos, y_pred_pos)
    cor_neg, _ = pearsonr(y_true_neg, y_pred_neg)
    print("[%s:%d] cor_all:%.3f, cor_pos:%.3f, cor_neg:%.3f" % (group_str, len(y_true), cor_all, cor_pos, cor_neg))
    return cor_all, cor_pos, cor_neg

def run_cv_test(X, y, ix, get_regressor, modelname, normalize=1):
    gt, preds, indx, cors = [], [], [], []
    groups = [G1, G2, G3, G4, G5]
    prots = G1 + G2 + G3 + G4 + G5
    for i, pair in enumerate(comb(range(NUM_GROUPS), 2)):
        group = groups[pair[0]] + groups[pair[1]]
        g1, g2 = np.asarray(pair) + 1
        indx_tst = (ix[:, 0] == g1) | (ix[:, 0]  == g2)
        indx_trn = np.logical_not(indx_tst)
        y_trn = y[indx_trn]
        y_true = y[indx_tst]
        X_trn = X[indx_trn]
        X_tst = X[indx_tst]
        if normalize == 1:
            scaler = StandardScaler()
            scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_tst = scaler.transform(X_tst)
        regressor = get_regressor()
        regressor.fit(X_trn, y_trn)
        joblib.dump(regressor, 'models/%s%s.pkl' % (modelname, i))
        regressor = joblib.load('models/%s%s.pkl' % (modelname, i))
        y_pred = regressor.predict(X_tst)
        cor, pos, neg = evaluate("G%d,G%d" % (g1, g2), y_true, y_pred, ix[indx_tst, 1])
        cors.append([cor, pos, neg])
        indx.extend(ix[indx_tst, 1])
        preds.extend(y_pred)
        gt.extend(y_true)
    return [np.asarray(a) for a in [gt, preds, indx, cors]]


def run_cv_test_ensemble(X, y, ix, alpha=0.5, normalize=1):
    gt, preds, indx, cors = [], [], [], []
    groups = [G1, G2, G3, G4, G5]
    prots = G1 + G2 + G3 + G4 + G5
    for i, pair in enumerate(comb(range(NUM_GROUPS), 2)):
        group = groups[pair[0]] + groups[pair[1]]
        g1, g2 = np.asarray(pair) + 1
        indx_tst = (ix[:, 0] == g1) | (ix[:, 0]  == g2)
        indx_trn = (ix[:, 0] != 0) & ((ix[:, 0] == g1) | (ix[:, 0]  == g2))
        y_trn = y[indx_trn]
        y_true = y[indx_tst]
        X_trn = X[indx_trn]
        X_tst = X[indx_tst]
        svr = joblib.load('models/svr%d.pkl' % i)
        rfr = joblib.load('models/rfr%d.pkl' % i)
        if normalize == 1:
            scaler = StandardScaler()
            scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_tst = scaler.transform(X_tst)
        y_pred_svr = svr.predict(X_tst)
        y_pred_rfr = rfr.predict(X_tst)
        y_pred = alpha * y_pred_svr + (1-alpha) * y_pred_rfr
        cor, pos, neg = evaluate("G%d,G%d" % (g1, g2), y_true, y_pred, ix[indx_tst, 1])
        cors.append([cor, pos, neg])
        indx.extend(ix[indx_tst, 1])
        preds.extend(y_pred)
        gt.extend(y_true)
    return [np.asarray(a) for a in [gt, preds, indx, cors]]


def records_to_xy(skempi_records, load_neg=True):
    data = []
    for record in tqdm(skempi_records, desc="records processed"):
        r = record
        assert r.struct is not None
        data.append([r.features(True), [r.ddg], [r.group, r.is_minus]])
        if not load_neg: continue 
        rr = reversed(record)
        assert rr.struct is not None
        data.append([rr.features(True), [rr.ddg], [rr.group, rr.is_minus]])
    X, y, ix = [np.asarray(d) for d in zip(*data)]
    return X, y, ix

In [4]:
def get_temperature_array(records, agg=np.min):
    arr = []
    pbar = tqdm(range(len(skempi_df)), desc="row processed")
    for i, row in skempi_df.iterrows():
        arr_obs_mut = []
        for mutation in row["Mutation(s)_cleaned"].split(','):
            mut = Mutation(mutation)
            res_i, chain_id = mut.i, mut.chain_id
            t = tuple(row.Protein.split('_'))
            skempi_record = records[t]
            res = skempi_record[chain_id][res_i]
            temps = [a.temp for a in res.atoms]
            arr_obs_mut.append(np.mean(temps))
        arr.append(agg(arr_obs_mut))
        pbar.update(1)
    pbar.close()
    return arr

skempi_records = load_skempi_structs(pdb_path="../data/pdbs_n", compute_dist_mat=False)
temp_arr = get_temperature_array(skempi_records, agg=np.min)

skempi structures processed: 100%|██████████| 158/158 [00:08<00:00, 17.60it/s]
row processed: 100%|██████████| 3047/3047 [00:00<00:00, 5533.66it/s]


In [5]:
skempi_structs = load_skempi_structs("../data/pdbs", compute_dist_mat=False)
skempi_records = load_skempi_records(skempi_structs)

skempi structures processed: 100%|██████████| 158/158 [00:06<00:00, 25.06it/s]
skempi records processed: 100%|██████████| 3047/3047 [00:00<00:00, 5530.34it/s]


In [6]:
# X_pos, y_pos, ix_pos = records_to_xy(skempi_records)
# X_pos.shape, y_pos.shape, ix_pos.shape

In [7]:
X_, y_, ix_ = records_to_xy(skempi_records)

records processed: 100%|██████████| 3047/3047 [2:27:48<00:00,  2.91s/it]  


In [8]:
X = X_[:, :]
# X = np.concatenate([X.T, [temp_arr]], axis=0).T
y = y_[:, 0]
ix = ix_
X.shape, y.shape, ix.shape

((6094, 11), (6094,), (6094, 2))

In [9]:
print("----->SVR")
from sklearn.svm import SVR
def get_regressor(): return SVR(kernel='rbf')
gt, preds, indx, cors = run_cv_test(X, y, ix, get_regressor, 'svr', normalize=1)
cor1, _, _ = evaluate("CAT", gt, preds, indx)
print(np.mean(cors, axis=0))

print("----->RFR")
from sklearn.ensemble import RandomForestRegressor
def get_regressor(): return RandomForestRegressor(n_estimators=50, random_state=0)
gt, preds, indx, cors = run_cv_test(X, y, ix, get_regressor, 'rfr', normalize=1)
cor2, _, _ = evaluate("CAT", gt, preds, indx)
print(np.mean(cors, axis=0))

# alpha = cor1/(cor1+cor2)
alpha = 0.5
print("----->%.2f*SVR + %.2f*RFR" % (alpha, 1-alpha))
gt, preds, indx, cors = run_cv_test_ensemble(X, y, ix, normalize=1)
cor, _, _ = evaluate("CAT", gt, preds, indx)
print(np.mean(cors, axis=0))

  from . import libsvm, liblinear
  from . import libsvm_sparse
  from ..utils.seq_dataset import ArrayDataset, CSRDataset
  from ..utils import arrayfuncs, as_float_array, check_X_y, deprecated
  from ._random import sample_without_replacement
  from . import cd_fast
  from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
  from .sag_fast import sag
  from .ball_tree import BallTree
  from .kd_tree import KDTree


----->SVR
[G1,G2:1468] cor_all:0.604, cor_pos:0.401, cor_neg:0.426
[G1,G3:1580] cor_all:0.501, cor_pos:0.357, cor_neg:0.373
[G1,G4:1630] cor_all:0.447, cor_pos:0.255, cor_neg:0.326
[G1,G5:1820] cor_all:0.587, cor_pos:0.453, cor_neg:0.464
[G2,G3:1468] cor_all:0.621, cor_pos:0.398, cor_neg:0.508
[G2,G4:1518] cor_all:0.564, cor_pos:0.341, cor_neg:0.414
[G2,G5:1708] cor_all:0.675, cor_pos:0.525, cor_neg:0.584
[G3,G4:1630] cor_all:0.482, cor_pos:0.293, cor_neg:0.366
[G3,G5:1820] cor_all:0.619, cor_pos:0.477, cor_neg:0.520
[G4,G5:1870] cor_all:0.599, cor_pos:0.451, cor_neg:0.503
[CAT:16512] cor_all:0.563, cor_pos:0.388, cor_neg:0.436
[0.56983956 0.39499973 0.44830233]
----->RFR


  from ._criterion import Criterion
  from numpy.core.umath_tests import inner1d
  from ._gradient_boosting import predict_stages


[G1,G2:1468] cor_all:0.662, cor_pos:0.523, cor_neg:0.426
[G1,G3:1580] cor_all:0.634, cor_pos:0.502, cor_neg:0.466
[G1,G4:1630] cor_all:0.578, cor_pos:0.447, cor_neg:0.400
[G1,G5:1820] cor_all:0.670, cor_pos:0.603, cor_neg:0.522
[G2,G3:1468] cor_all:0.693, cor_pos:0.537, cor_neg:0.488
[G2,G4:1518] cor_all:0.618, cor_pos:0.438, cor_neg:0.410
[G2,G5:1708] cor_all:0.683, cor_pos:0.551, cor_neg:0.568
[G3,G4:1630] cor_all:0.619, cor_pos:0.425, cor_neg:0.504
[G3,G5:1820] cor_all:0.693, cor_pos:0.605, cor_neg:0.578
[G4,G5:1870] cor_all:0.660, cor_pos:0.533, cor_neg:0.577
[CAT:16512] cor_all:0.633, cor_pos:0.498, cor_neg:0.477
[0.65100622 0.51639772 0.49397302]
----->0.50*SVR + 0.50*RFR
[G1,G2:1468] cor_all:0.616, cor_pos:0.417, cor_neg:0.395
[G1,G3:1580] cor_all:0.605, cor_pos:0.490, cor_neg:0.448
[G1,G4:1630] cor_all:0.494, cor_pos:0.352, cor_neg:0.343
[G1,G5:1820] cor_all:0.655, cor_pos:0.566, cor_neg:0.518
[G2,G3:1468] cor_all:0.690, cor_pos:0.503, cor_neg:0.528
[G2,G4:1518] cor_all:0.593, 

In [17]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
def run_holdout_test_ensemble(X, y, ix, alpha=0.5, normalize=1):
    indx_tst = ix[:, 0] == 0
    indx_trn = np.logical_not(indx_tst)
    y_trn = y[indx_trn]
    y_true = y[indx_tst]
    X_trn = X[indx_trn]
    X_tst = X[indx_tst]
    svr = SVR(kernel='rbf')
    rfr = RandomForestRegressor(n_estimators=50, random_state=0)
    if normalize == 1:
        scaler = StandardScaler()
        scaler.fit(X_trn)
        X_trn = scaler.transform(X_trn)
        X_tst = scaler.transform(X_tst)
    svr.fit(X_trn, y_trn)
    rfr.fit(X_trn, y_trn)
    y_pred_svr = svr.predict(X_tst)
    y_pred_rfr = rfr.predict(X_tst)
    y_pred = alpha * y_pred_svr + (1-alpha) * y_pred_rfr
    cor, pos, neg = evaluate("holdout", y_true, y_pred, ix[indx_tst, 1])
    return cor, pos, neg

In [18]:
alpha = 0.5
run_holdout_test_ensemble(X, y, ix, alpha=0.5, normalize=1)

[holdout:1966] cor_all:0.669, cor_pos:0.512, cor_neg:0.475


(0.6692042710734145, 0.5115657470680267, 0.47510955646195174)