In [None]:
import os
import sys
import subprocess

import numpy as np
import pandas as pd

from io import StringIO

os.getcwd()

'/home/yotamfr/development/skempi/src'

In [2]:
skempi_df = pd.read_excel(os.path.join('../data', 'SKEMPI_1.1.xlsx'))
skempi_df.columns

Index([u'Protein', u'Mutation(s)_PDB', u'Mutation(s)_cleaned', u'Location(s)',
       u'Hold_out_type', u'Hold_out_proteins', u'Affinity_mut (M)',
       u'Affinity_wt (M)', u'DDG', u'Reference', u'Protein 1', u'Protein 2',
       u'Temperature 298.15', u'Temperature Num', u'Temperature',
       u'kon_mut (M^(-1)s^(-1))', u'kon_wt (M^(-1)s^(-1))',
       u'koff_mut (s^(-1))', u'koff_wt (s^(-1))', u'dH_mut (kcal mol^(-1))',
       u'dH_wt (kcal mol^(-1))', u'dS_mut (cal mol^(-1) K^(-1))',
       u'dS_wt (cal mol^(-1) K^(-1))', u'Notes'],
      dtype='object')

In [3]:
skempi_df[skempi_df.Protein=="2VLJ_ABC_DE"]

Unnamed: 0,Protein,Mutation(s)_PDB,Mutation(s)_cleaned,Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_wt (M),DDG,Reference,...,Temperature,kon_mut (M^(-1)s^(-1)),kon_wt (M^(-1)s^(-1)),koff_mut (s^(-1)),koff_wt (s^(-1)),dH_mut (kcal mol^(-1)),dH_wt (kcal mol^(-1)),dS_mut (cal mol^(-1) K^(-1)),dS_wt (cal mol^(-1) K^(-1)),Notes
2703,2VLJ_ABC_DE,DE32A,DE28A,COR,AB,AB,7.4e-05,5e-06,1.573201,18275829,...,298,,,,,,,,,
2704,2VLJ_ABC_DE,IE53V,IE49V,COR,AB,AB,7e-06,5e-06,0.200969,18275829,...,298,17100.0,31000.0,0.13,0.16,-22.0,-23.0,-50.3,-50.3,
2705,2VLJ_ABC_DE,IE53L,IE49L,COR,AB,AB,5.7e-05,5e-06,1.418563,18275829,...,298,7000.0,31000.0,0.4,0.16,,,,,
2706,2VLJ_ABC_DE,NE55A,NE51A,RIM,AB,AB,3.5e-05,5e-06,1.129622,18275829,...,298,18000.0,31000.0,0.63,0.16,-18.0,-23.0,-40.2,-50.3,
2707,2VLJ_ABC_DE,NE55D,NE51D,RIM,AB,AB,1.2e-05,5e-06,0.495437,18275829,...,298,32300.0,31000.0,0.38,0.16,,,,,
2708,2VLJ_ABC_DE,DE56A,DE52A,COR,AB,AB,6e-06,5e-06,0.132202,18275829,...,298,38000.0,31000.0,0.25,0.16,-24.0,-23.0,-57.0,-50.3,
2709,2VLJ_ABC_DE,QE58A,QE54A,COR,AB,AB,1.2e-05,5e-06,0.495437,18275829,...,298,28000.0,31000.0,0.34,0.16,-18.0,-23.0,-36.9,-50.3,
2710,2VLJ_ABC_DE,QE58E,QE54E,COR,AB,AB,5e-06,5e-06,0.0,18275829,...,298,50000.0,31000.0,0.26,0.16,-20.0,-23.0,-43.6,-50.3,
2711,2VLJ_ABC_DE,SE99A,SE95A,SUP,AB,AB,5e-06,5e-06,-0.035206,18275829,...,298,59000.0,31000.0,0.29,0.16,-18.0,-23.0,-33.5,-50.3,
2712,2VLJ_ABC_DE,YE101A,YE97A,COR,AB,AB,8e-06,5e-06,0.232574,18275829,...,298,,,,,-29.0,-23.0,-73.8,-50.3,


In [4]:
pdb_and_chains = set([tuple(pdb_str.split('_')) for pdb_str in skempi_df.Protein.values])

In [5]:
from skempi import *
import skempi as skempi
skempi.PDB_PATH = "../data/pdbs"

In [535]:
class Stride(object):

    def __init__(self, pdb):
        df = pd.read_csv('../data/stride/%s.out' % pdb)
        self._dict = {}
        for i, row in df.iterrows():
            self._dict[(row.Chain, row.Res-1)] = row.to_dict()

    def __getitem__(self, t):
        chain, res = t
        return self._dict[(chain, res)]
    
class SkempiRecord(object):
    def __init__(self, pdb, chains_a, chains_b):
        fd = open(osp.join(PDB_PATH, "%s.pdb" % pdb), 'r')
        self.struct = parse_pdb(pdb, fd)
        self.pdb = pdb
        self.chains_a = {c: self.struct[c] for c in chains_a}
        self.chains_b = {c: self.struct[c] for c in chains_b}
        self.res_chain_to_atom_indices = {}
        self.atom_indices_to_chain_res = {}
        self.atoms = []
        self.init_dictionaries()
        self.dist_mat = None
        self._profiles = {}
        self.init_profiles()
        self._stride = Stride(self.pdb)

    @property
    def chains(self):
        return self.struct.chains

    def init_profiles(self):
        self._profiles = {c: Profile(self.pdb, c) for c in self.chains}

    def get_profile(self, chain_id):
        return self._profiles[chain_id]

    @property
    def stride(self):
        return self._stride

    def init_dictionaries(self):
        for chain in self.struct:
            for res_i, res in enumerate(chain):
                for atom in res:
                    chain_id = chain.chain_id
                    if (chain_id, res_i) in self.res_chain_to_atom_indices:
                        self.res_chain_to_atom_indices[(chain_id, res_i)].append(len(self.atoms))
                    else:
                        self.res_chain_to_atom_indices[(chain_id, res_i)] = [len(self.atoms)]
                    self.atom_indices_to_chain_res[len(self.atoms)] = (chain_id, res_i)
                    self.atoms.append(atom)

    def compute_dist_mat(self):
        atoms = self.atoms
        self.dist_mat = get_distance_matrix(atoms1=atoms, atoms2=atoms)

    def __getitem__(self, chain):
        return self.chains[chain]

    def get_sphere_indices(self, chain, res_i, threshold):
        mat = self.dist_mat
        row_indices = self.res_chain_to_atom_indices[(chain, res_i)]
        col_indices = []
        for row_i in row_indices:
            col_indices.extend([ix for ix in np.where(mat[row_i] <= threshold)[0]])
        return set([self.atom_indices_to_chain_res[col_i] for col_i in col_indices])

    def get_stride(self):
        pass

    def __iter__(self):
        for chain in self.chains.values():
            yield chain

    def to_fasta(self):
        pass

In [536]:
record = SkempiRecord("2VLJ", "ABC", "DE")
# record = SkempiRecord("1CSE", "E", "I")

In [537]:
record.chains_a['A'].seq

'GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQRTDAPKTHMTHHAVSDHEATLRCWALSFYPAEITLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGQEQRYTCHVQHEGLPKPLTLRWEP'

In [538]:
record.compute_dist_mat()
sphere_ix = record.get_sphere_indices('A', 2, 6)
print(sphere_ix)
print(record.get_profile('A')[(2, 'A')])

set([('A', 1), ('A', 27), ('A', 0), ('A', 179), ('A', 100), ('A', 178), ('A', 103), ('A', 106), ('A', 102), ('A', 171), ('A', 101), ('A', 167), ('A', 4), ('A', 3), ('A', 104), ('A', 2), ('A', 28)])
0.00423728813559


In [540]:
record.stride[('A', 0)]

{'AA': 'GLY',
 'ASA': 52.1,
 'ASA_Chain': 52.1,
 'Chain': 'A',
 'Phi': 360.0,
 'Psi': -173.62,
 'Res': 1,
 'SS': 'C'}

In [541]:
record.dist_mat.shape

(6618, 6618)

In [542]:
from aaindex import *
B = BLOSUM62
C = SKOJ970101

def EI(m, w, P, i, B):
    return sum([P[(i, a)] * (B[(a, m)] - B[(a, w)]) for a in amino_acids])

def CP(mut, skempi, C, radius=6):
    
    i, chain_a = mut.i, mut.chain_id
    m, w = mut.m, mut.w
    
    def helper(P, j):
        return sum([P[(j, a)] * (C[(a, m)] - C[(a, w)]) for a in amino_acids])
    
    ret = 0
    for chain_b, j in skempi.get_sphere_indices(chain_a, i,radius):
        
        if j == i and chain_b == chain_a:
            a = skempi[chain_b][j].name
            assert a == w
            continue
        
        P = skempi.get_profile(chain_b) 
        
        ret += helper(P, j)
    
    return ret


# def CP(mut, skempi, C, radius=6):
    
#     i, chain_a = mut.i, mut.chain_id
#     m, w = mut.m, mut.w
    
#     ret = 0
#     for chain_b, j in skempi.get_sphere_indices(chain_a, i,radius):
        
#         a = skempi[chain_b][j].name
        
#         assert a == w
                
#         ret += C[(a, m)] - C[(a, w)]
    
#     return ret


class Mutation(object):
    
    def __init__(self, mutation):
        try:
            self.w = mutation[0]
            self.chain_id = mutation[1]
            self.i = int(mutation[2:-1]) - 1
            self.m = mutation[-1]
            self.ins_code = None

        except ValueError:
            self.w = mutation[0]
            self.chain_id = mutation[1]
            self.i = int(mutation[2:-2]) - 1
            self.m = mutation[-1]
            self.ins_code = mutation[-2]
            
    def __str__(self):
        return str(vars(self))


In [543]:
skempi_df = pd.read_excel(osp.join('../data', 'SKEMPI_1.1.xlsx'))

prots = skempi_df.Protein.values
skempi_records = {}

for t in tqdm(set([tuple(pdb_str.split('_')) for pdb_str in prots]),
              desc="skempi entries processed"):
    skempi_records[t] = SkempiRecord(*t)







skempi entries processed:   0%|          | 0/158 [00:00<?, ?it/s][A[A[A[A[A[A





skempi entries processed:   1%|          | 1/158 [00:00<00:17,  8.82it/s][A[A[A[A[A[A





skempi entries processed:   1%|▏         | 2/158 [00:00<00:17,  8.79it/s][A[A[A[A[A[A





skempi entries processed:   2%|▏         | 3/158 [00:00<00:18,  8.48it/s][A[A[A[A[A[A





skempi entries processed:   3%|▎         | 4/158 [00:00<00:18,  8.43it/s][A[A[A[A[A[A





skempi entries processed:   3%|▎         | 5/158 [00:00<00:18,  8.28it/s][A[A[A[A[A[A





skempi entries processed:   4%|▍         | 7/158 [00:00<00:16,  8.95it/s][A[A[A[A[A[A





skempi entries processed:   6%|▌         | 9/158 [00:00<00:16,  9.01it/s][A[A[A[A[A[A





skempi entries processed:   7%|▋         | 11/158 [00:01<00:15,  9.44it/s][A[A[A[A[A[A





skempi entries processed:   8%|▊         | 12/158 [00:01<00:15,  9.31it/s][A[A[A[A[A[A





skempi entries processed:   8%

In [12]:
def comp_ei(mut, skempi_record, B, radius):
    P = skempi_record.get_profile(mut.chain_id)
    return EI(mut.m, mut.w, P, mut.i, B)

def comp_cp(mut, skempi_record, C, radius):
    return CP(mut, skempi_record, C, radius)


def get_ddg_ei_cp_arrays(M, func, radius=None):
    arr_ddg = []
    arr_obs = []
    pbar = tqdm(range(len(skempi_df)), desc="row processed")
    for i, row in skempi_df.iterrows():
        ddg = row.DDG
        arr_ddg.append(ddg)
        arr_obs_mut = []
        for mutation in row["Mutation(s)_cleaned"].split(','):
            mut = Mutation(mutation)
            t = tuple(row.Protein.split('_'))
            skempi_record = skempi_records[t]
            skempi_record.compute_dist_mat()
            obs = func(mut, skempi_record, M, radius)
            arr_obs_mut.append(obs)
        arr_obs.append(np.sum(arr_obs_mut))
        pbar.update(1)
    pbar.close()
    return arr_ddg, arr_obs

In [13]:
from scipy.stats import pearsonr

In [14]:
from itertools import product

def grid_search_cp(matrices=[SKOJ970101, BASU010101], radiuses=[4, 5, 6, 7, 8]):
    res_dict = {}
    for C, angs in product(matrices, radiuses):
        key = (str(C), angs)
        arr_ddg, arr_cp = get_ddg_ei_cp_arrays(C, comp_cp, angs)
        res_dict[key] = (arr_ddg, arr_cp)
        cor_cp = pearsonr(arr_ddg, arr_cp)
        print("%s: CP: %s" % (key, cor_cp,))
    return res_dict

def grid_search_ei(matrices=[BLOSUM62, SKOJ970101, BASU010101]):
    res_dict = {}
    for B in matrices:
        key = str(B)
        arr_ddg, arr_ei = get_ddg_ei_cp_arrays(B, comp_ei)
        res_dict[key] = (arr_ddg, arr_ei)
        cor_ei = pearsonr(arr_ddg, arr_ei)
        print("%s: EI: %s" % (key, cor_ei,))
    return res_dict


In [15]:
eis = grid_search_ei()

row processed: 100%|██████████| 3047/3047 [08:42<00:00,  5.83it/s]
row processed:   0%|          | 1/3047 [00:00<05:30,  9.22it/s]

BLOSUM62: EI: (-0.18348266887605677, 1.7735594893088174e-24)


row processed: 100%|██████████| 3047/3047 [08:43<00:00,  5.82it/s]
row processed:   0%|          | 1/3047 [00:00<05:06,  9.95it/s]

SKOJ970101: EI: (0.19216942446949448, 9.8286921678709035e-27)


row processed: 100%|██████████| 3047/3047 [08:52<00:00,  5.72it/s]

BASU010101: EI: (0.17535505843986804, 1.8228134582699703e-22)





In [16]:
# cps = grid_search_cp()

In [17]:
def comp_cp_a_b(mut, skempi_record, C, radius):
    return CP_A_B(mut, skempi_record, C, radius)


def get_ddg_cp_a_b_arrays(M, func, radius=None):
    arr_ddg = []
    arr_obs_a = []
    arr_obs_b = []
    pbar = tqdm(range(len(skempi_df)), desc="row processed")
    for i, row in skempi_df.iterrows():
        ddg = row.DDG
        arr_ddg.append(ddg)
        arr_obs_mut_a = []
        arr_obs_mut_b = []
        for mutation in row["Mutation(s)_cleaned"].split(','):
            mut = Mutation(mutation)
            t = tuple(row.Protein.split('_'))
            skempi_record = skempi_records[t]
            skempi_record.compute_dist_mat()
            obs_a, obs_b = func(mut, skempi_record, M, radius)
            arr_obs_mut_a.append(obs_a)
            arr_obs_mut_b.append(obs_b)
        arr_obs_a.append(np.sum(arr_obs_mut_a))
        arr_obs_b.append(np.sum(arr_obs_mut_b))
        pbar.update(1)
    pbar.close()
    return arr_ddg, arr_obs_a, arr_obs_b


def grid_search_cp_a_b(matrices=[SKOJ970101, BASU010101], radiuses=[4, 5, 6, 7, 8, 9, 10]):
    res_dict = {}
    for C, angs in product(matrices, radiuses):
        key = (str(C), angs)
        arr_ddg, arr_cp_a, arr_cp_b  = get_ddg_cp_a_b_arrays(C, comp_cp_a_b, angs)
        arr_cp = np.asarray(arr_cp_a) + np.asarray(arr_cp_b)
        res_dict[key] = (arr_ddg, arr_cp_a, arr_cp_b)
        cor_cp_a = pearsonr(arr_ddg, arr_cp_a)
        cor_cp_b = pearsonr(arr_ddg, arr_cp_b)
        cor_cp = pearsonr(arr_ddg, arr_cp)
        print("%s: CP_A: %s, CP_B: %s, CP %s" % (key, cor_cp_a, cor_cp_b, cor_cp))
    return res_dict

In [18]:
def CP_A_B(mut, skempi, C, radius=6):
    
    i, chain_a = mut.i, mut.chain_id
    m, w = mut.m, mut.w
    
#     def helper(P, j):
#         return sum([P[(j, a)] * (C[(a, m)] - C[(a, w)]) for a in amino_acids])
    
    def helper(a, j):
        return C[(a, m)] - C[(a, w)]
    
    retA, retB = 0, 0
    for chain_b, j in skempi.get_sphere_indices(chain_a, i,radius):

        a = skempi[chain_b][j].name
        if j == i and chain_b == chain_a:
            assert a == w
            continue
                
        P = skempi.get_profile(chain_b) 

        if chain_b == chain_a:  
            retA += helper(a, j)
        
        else:
            retB += helper(a, j)
    
    return retA, retB


In [19]:
cp_a_b_s_no_profile = grid_search_cp_a_b(matrices=[SKOJ970101, BASU010101], radiuses=[6, 7])

row processed: 100%|██████████| 3047/3047 [08:52<00:00,  5.72it/s]
row processed:   0%|          | 2/3047 [00:00<04:40, 10.85it/s]

('SKOJ970101', 6): CP_A: (0.12899241387456756, 8.8507553354571917e-13), CP_B: (0.31510033967058948, 3.2909472514044773e-71), CP (0.23025093858330695, 5.9899593202034257e-38)


row processed: 100%|██████████| 3047/3047 [08:52<00:00,  5.72it/s]
row processed:   0%|          | 2/3047 [00:00<04:19, 11.75it/s]

('SKOJ970101', 7): CP_A: (0.12813871212570935, 1.2512498088379469e-12), CP_B: (0.30767202856302794, 8.2220457876862489e-68), CP (0.22746400267848199, 4.7068139278480544e-37)


row processed: 100%|██████████| 3047/3047 [08:47<00:00,  5.78it/s]
row processed:   0%|          | 2/3047 [00:00<04:37, 10.98it/s]

('BASU010101', 6): CP_A: (0.1126933207499565, 4.4332068307731485e-10), CP_B: (0.35501818492812953, 3.3990017934275672e-91), CP (0.24516152877860625, 6.0580342404484527e-43)


row processed: 100%|██████████| 3047/3047 [08:44<00:00,  5.81it/s]

('BASU010101', 7): CP_A: (0.13643805479717855, 3.9195895664075284e-14), CP_B: (0.33861062950737597, 1.2780416763987716e-82), CP (0.25483942919026048, 2.2527136127542865e-46)





In [20]:
def CP_A_B(mut, skempi, C, radius=6):
    
    i, chain_a = mut.i, mut.chain_id
    m, w = mut.m, mut.w
    
    def helper(P, j):
        return sum([P[(j, a)] * (C[(a, m)] - C[(a, w)]) for a in amino_acids])
    
#     def helper(a, j):
#         return C[(a, m)] - C[(a, w)]
    
    retA, retB = 0, 0
    for chain_b, j in skempi.get_sphere_indices(chain_a, i,radius):

        a = skempi[chain_b][j].name
        if j == i and chain_b == chain_a:
            assert a == w
            continue
                
        P = skempi.get_profile(chain_b) 

        if chain_b == chain_a:  
            retA += helper(P, j)
        
        else:
            retB += helper(P, j)
    
    return retA, retB

In [21]:
cp_a_b_s_orig = grid_search_cp_a_b(matrices=[SKOJ970101, BASU010101], radiuses=[6, 7])

row processed: 100%|██████████| 3047/3047 [08:50<00:00,  5.74it/s]
row processed:   0%|          | 2/3047 [00:00<04:33, 11.12it/s]

('SKOJ970101', 6): CP_A: (0.15922361393701534, 9.4069290943909113e-19), CP_B: (0.28193363413720945, 8.7942190111787501e-57), CP (0.23091446921072051, 3.6517239867096613e-38)


row processed: 100%|██████████| 3047/3047 [08:49<00:00,  5.75it/s]
row processed:   0%|          | 2/3047 [00:00<04:33, 11.12it/s]

('SKOJ970101', 7): CP_A: (0.15680392577872013, 3.1514065258845439e-18), CP_B: (0.2977055920083635, 2.0945759996836172e-63), CP (0.23752569390737782, 2.4195946509145309e-40)


row processed: 100%|██████████| 3047/3047 [08:47<00:00,  5.78it/s]
row processed:   0%|          | 2/3047 [00:00<04:19, 11.73it/s]

('BASU010101', 6): CP_A: (0.16604593209712934, 2.8085223686026527e-20), CP_B: (0.3424606427389556, 1.381754790590439e-84), CP (0.26530083607511301, 2.9946474562890449e-50)


row processed: 100%|██████████| 3047/3047 [08:50<00:00,  5.75it/s]

('BASU010101', 7): CP_A: (0.17621764015649943, 1.1265247125320121e-22), CP_B: (0.34222839700718038, 1.8189732609036253e-84), CP (0.27527019137816483, 4.1318263822884498e-54)





In [22]:
def CP_A_B(mut, skempi, C, radius=6):
    
    i, chain_a = mut.i, mut.chain_id
    m, w = mut.m, mut.w
    
    def helper(P, j):
        return sum([0.05 * (C[(a, m)] - C[(a, w)]) for a in amino_acids])
    
#     def helper(a, j):
#         return C[(a, m)] - C[(a, w)]
    
    retA, retB = 0, 0
    for chain_b, j in skempi.get_sphere_indices(chain_a, i,radius):

        a = skempi[chain_b][j].name
        if j == i and chain_b == chain_a:
            assert a == w
            continue
                
        P = skempi.get_profile(chain_b) 

        if chain_b == chain_a:  
            retA += helper(P, j)
        
        else:
            retB += helper(P, j)
    
    return retA, retB

In [23]:
cp_a_b_s_uniform = grid_search_cp_a_b(matrices=[SKOJ970101, BASU010101], radiuses=[6, 7])

row processed: 100%|██████████| 3047/3047 [08:44<00:00,  5.81it/s]
row processed:   0%|          | 2/3047 [00:00<04:19, 11.73it/s]

('SKOJ970101', 6): CP_A: (0.16123595170241231, 3.3922690400735878e-19), CP_B: (0.26672526849365225, 8.6058008697708072e-51), CP (0.2251320913909437, 2.586235766630754e-36)


row processed: 100%|██████████| 3047/3047 [08:52<00:00,  5.73it/s]
row processed:   0%|          | 1/3047 [00:00<05:42,  8.88it/s]

('SKOJ970101', 7): CP_A: (0.1620582028142715, 2.2276542254997738e-19), CP_B: (0.29191128666928023, 6.3620596148568711e-61), CP (0.23883741861785063, 8.7773810986036118e-41)


row processed: 100%|██████████| 3047/3047 [08:51<00:00,  5.73it/s]
row processed:   0%|          | 2/3047 [00:00<04:11, 12.10it/s]

('BASU010101', 6): CP_A: (0.21300246807984247, 1.3416778766703381e-32), CP_B: (0.34737111879464089, 3.9086171731054629e-87), CP (0.29624906619169722, 8.9240722429331053e-63)


row processed: 100%|██████████| 3047/3047 [08:52<00:00,  5.72it/s]

('BASU010101', 7): CP_A: (0.21351384288462372, 9.4522018304968719e-33), CP_B: (0.36314745255523262, 1.2296267842765852e-95), CP (0.30740335992410711, 1.0865183409662947e-67)





In [24]:
cp_a_b_s_uniform.keys(), cp_a_b_s_orig.keys(), cp_a_b_s_no_profile.keys(), eis.keys()

([('SKOJ970101', 7), ('BASU010101', 7), ('BASU010101', 6), ('SKOJ970101', 6)],
 [('SKOJ970101', 7), ('BASU010101', 7), ('BASU010101', 6), ('SKOJ970101', 6)],
 [('SKOJ970101', 7), ('BASU010101', 7), ('BASU010101', 6), ('SKOJ970101', 6)],
 ['BLOSUM62', 'SKOJ970101', 'BASU010101'])

In [544]:
len(cp_a_b_s_uniform[('SKOJ970101', 7)])

all_features = {}

def register_cp_a_b(cp_a_b, prefix):
    for key, val in cp_a_b.iteritems():
        _, cp_a, cp_b = val
        mat, rad = key
        all_features[(prefix, "CP_A", mat, rad)] = cp_a
        all_features[(prefix, "CP_B", mat, rad)] = cp_b
        
def register_eis(eis):
    for key, val in eis.iteritems():
        _, ei = val
        all_features[("EI", key)] = ei

In [545]:
register_cp_a_b(cp_a_b_s_uniform, "uniform")
register_cp_a_b(cp_a_b_s_orig, "original")
register_cp_a_b(cp_a_b_s_no_profile, "no_profile")
register_eis(eis)

In [546]:
num_muts = np.asarray([len(mut.split(",")) for mut in skempi_df["Mutation(s)_cleaned"]])
pearsonr(skempi_df.DDG, np.log(num_muts)), pearsonr(skempi_df.DDG, num_muts)

((0.21924597848778335, 1.7513323606150017e-34),
 (0.13427861416945261, 9.8559457179327132e-14))

In [547]:
all_features["#mutations"] = np.log(num_muts)

In [717]:
def get_stride_array(func, agg=np.sum):
    arr_stride = []
    pbar = tqdm(range(len(skempi_df)), desc="row processed")
    for i, row in skempi_df.iterrows():
        arr_obs_mut = []
        for mutation in row["Mutation(s)_cleaned"].split(','):
            mut = Mutation(mutation)
            res_i, chain_id = mut.i, mut.chain_id
            t = tuple(row.Protein.split('_'))
            skempi_record = skempi_records[t]
            stride = skempi_record.stride[(chain_id, res_i)]
            skempi_record.compute_dist_mat()
            obs = func(stride)
            arr_obs_mut.append(obs)
        arr_stride.append(agg(arr_obs_mut))
        pbar.update(1)
    pbar.close()
    return arr_stride

In [710]:
def asa_diff(stride):
    return abs(stride["ASA"] - stride["ASA_Chain"])

stride_arr = get_stride_array(asa_diff)











row processed:   0%|          | 0/3047 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A









row processed:   0%|          | 1/3047 [00:00<07:08,  7.11it/s][A[A[A[A[A[A[A[A[A[A









row processed:   0%|          | 2/3047 [00:00<06:41,  7.59it/s][A[A[A[A[A[A[A[A[A[A

(1,)
(2,)












row processed:   0%|          | 4/3047 [00:00<05:59,  8.46it/s][A[A[A[A[A[A[A[A[A[A









row processed:   0%|          | 5/3047 [00:00<05:48,  8.73it/s][A[A[A[A[A[A[A[A[A[A

(3,)
(4,)
(5,)












row processed:   0%|          | 6/3047 [00:00<05:38,  8.98it/s][A[A[A[A[A[A[A[A[A[A









row processed:   0%|          | 8/3047 [00:00<05:07,  9.88it/s][A[A[A[A[A[A[A[A[A[A

(6,)
(7,)
(8,)












row processed:   0%|          | 10/3047 [00:00<04:47, 10.56it/s][A[A[A[A[A[A[A[A[A[A








row processed:   3%|▎         | 94/3047 [00:25<04:40, 10.55it/s][A[A[A[A[A[A[A[A[A

(9,)
(10,)
(11,)












row processed:   0%|          | 12/3047 [00:01<04:39, 10.86it/s][A[A[A[A[A[A[A[A[A[A









row processed:   0%|          | 14/3047 [00:01<04:41, 10.77it/s][A[A[A[A[A[A[A[A[A[A

(12,)
(13,)
(14,)












row processed:   1%|          | 16/3047 [00:01<04:52, 10.35it/s][A[A[A[A[A[A[A[A[A[A

(15,)
(16,)












row processed:   1%|          | 18/3047 [00:01<05:02, 10.00it/s][A[A[A[A[A[A[A[A[A[A

(17,)
(18,)
(19,)












row processed:   1%|          | 20/3047 [00:02<06:08,  8.22it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 21/3047 [00:02<05:51,  8.62it/s][A[A[A[A[A[A[A[A[A[A

(20,)
(21,)












row processed:   1%|          | 22/3047 [00:02<05:57,  8.46it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 23/3047 [00:02<05:52,  8.58it/s][A[A[A[A[A[A[A[A[A[A

(22,)
(23,)












row processed:   1%|          | 24/3047 [00:02<07:23,  6.82it/s][A[A[A[A[A[A[A[A[A[A

(24,)












row processed:   1%|          | 25/3047 [00:02<08:24,  5.99it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 26/3047 [00:02<07:36,  6.62it/s][A[A[A[A[A[A[A[A[A[A

(25,)
(26,)












row processed:   1%|          | 27/3047 [00:03<07:16,  6.92it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 28/3047 [00:03<06:50,  7.36it/s][A[A[A[A[A[A[A[A[A[A

(27,)
(28,)












row processed:   1%|          | 29/3047 [00:03<06:27,  7.78it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 30/3047 [00:03<06:08,  8.19it/s][A[A[A[A[A[A[A[A[A[A

(29,)
(30,)












row processed:   1%|          | 31/3047 [00:03<07:00,  7.18it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 32/3047 [00:03<06:25,  7.82it/s][A[A[A[A[A[A[A[A[A[A

(31,)
(32,)












row processed:   1%|          | 33/3047 [00:03<06:07,  8.20it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 34/3047 [00:03<05:57,  8.42it/s][A[A[A[A[A[A[A[A[A[A

(33,)
(34,)












row processed:   1%|          | 35/3047 [00:04<06:29,  7.74it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 36/3047 [00:04<06:09,  8.15it/s][A[A[A[A[A[A[A[A[A[A

(35,)
(36,)












row processed:   1%|          | 37/3047 [00:04<05:59,  8.37it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|          | 38/3047 [00:04<06:10,  8.12it/s][A[A[A[A[A[A[A[A[A[A

(37,)
(38,)












row processed:   1%|▏         | 39/3047 [00:04<07:37,  6.57it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|▏         | 40/3047 [00:04<07:32,  6.64it/s][A[A[A[A[A[A[A[A[A[A

(39,)
(40,)












row processed:   1%|▏         | 41/3047 [00:04<07:17,  6.87it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|▏         | 42/3047 [00:05<06:47,  7.37it/s][A[A[A[A[A[A[A[A[A[A

(41,)
(42,)












row processed:   1%|▏         | 43/3047 [00:05<06:36,  7.58it/s][A[A[A[A[A[A[A[A[A[A









row processed:   1%|▏         | 44/3047 [00:05<06:37,  7.55it/s][A[A[A[A[A[A[A[A[A[A

(43,)
(44,)












row processed:   1%|▏         | 45/3047 [00:05<06:22,  7.85it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 46/3047 [00:05<06:07,  8.17it/s][A[A[A[A[A[A[A[A[A[A

(45,)
(46,)












row processed:   2%|▏         | 47/3047 [00:05<06:05,  8.20it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 48/3047 [00:05<05:56,  8.42it/s][A[A[A[A[A[A[A[A[A[A

(47,)
(48,)












row processed:   2%|▏         | 49/3047 [00:05<06:34,  7.59it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 50/3047 [00:06<06:29,  7.70it/s][A[A[A[A[A[A[A[A[A[A

(49,)
(50,)












row processed:   2%|▏         | 51/3047 [00:06<06:45,  7.39it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 52/3047 [00:06<06:25,  7.77it/s][A[A[A[A[A[A[A[A[A[A

(51,)
(52,)












row processed:   2%|▏         | 53/3047 [00:06<06:20,  7.87it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 54/3047 [00:06<06:18,  7.90it/s][A[A[A[A[A[A[A[A[A[A

(53,)
(54,)












row processed:   2%|▏         | 55/3047 [00:06<06:21,  7.85it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 56/3047 [00:06<06:16,  7.94it/s][A[A[A[A[A[A[A[A[A[A

(55,)
(56,)












row processed:   2%|▏         | 57/3047 [00:06<06:28,  7.71it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 58/3047 [00:07<06:09,  8.10it/s][A[A[A[A[A[A[A[A[A[A

(57,)
(58,)












row processed:   2%|▏         | 59/3047 [00:07<06:11,  8.04it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 60/3047 [00:07<06:12,  8.02it/s][A[A[A[A[A[A[A[A[A[A

(59,)
(60,)












row processed:   2%|▏         | 61/3047 [00:07<06:14,  7.98it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 62/3047 [00:07<06:06,  8.15it/s][A[A[A[A[A[A[A[A[A[A

(61,)
(62,)












row processed:   2%|▏         | 63/3047 [00:07<06:21,  7.83it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 64/3047 [00:07<06:22,  7.79it/s][A[A[A[A[A[A[A[A[A[A

(63,)
(64,)












row processed:   2%|▏         | 65/3047 [00:07<06:02,  8.23it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 66/3047 [00:08<06:01,  8.24it/s][A[A[A[A[A[A[A[A[A[A

(65,)
(66,)












row processed:   2%|▏         | 67/3047 [00:08<05:59,  8.30it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 68/3047 [00:08<05:47,  8.57it/s][A[A[A[A[A[A[A[A[A[A

(67,)
(68,)












row processed:   2%|▏         | 69/3047 [00:08<05:55,  8.37it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 70/3047 [00:08<05:56,  8.36it/s][A[A[A[A[A[A[A[A[A[A

(69,)
(70,)












row processed:   2%|▏         | 71/3047 [00:08<06:10,  8.02it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 72/3047 [00:08<05:50,  8.50it/s][A[A[A[A[A[A[A[A[A[A

(71,)
(72,)












row processed:   2%|▏         | 73/3047 [00:08<05:45,  8.62it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 74/3047 [00:08<05:39,  8.76it/s][A[A[A[A[A[A[A[A[A[A

(73,)
(74,)












row processed:   2%|▏         | 75/3047 [00:09<05:53,  8.41it/s][A[A[A[A[A[A[A[A[A[A









row processed:   2%|▏         | 76/3047 [00:09<06:01,  8.21it/s][A[A[A[A[A[A[A[A[A[A

(75,)
(76,)












row processed:   3%|▎         | 77/3047 [00:09<05:52,  8.43it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 78/3047 [00:09<05:39,  8.73it/s][A[A[A[A[A[A[A[A[A[A

(77,)
(78,)












row processed:   3%|▎         | 79/3047 [00:09<05:44,  8.61it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 80/3047 [00:09<05:35,  8.84it/s][A[A[A[A[A[A[A[A[A[A

(79,)
(80,)












row processed:   3%|▎         | 81/3047 [00:09<05:40,  8.71it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 82/3047 [00:09<05:34,  8.87it/s][A[A[A[A[A[A[A[A[A[A

(81,)
(82,)












row processed:   3%|▎         | 83/3047 [00:10<05:50,  8.46it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 84/3047 [00:10<05:47,  8.53it/s][A[A[A[A[A[A[A[A[A[A

(83,)
(84,)












row processed:   3%|▎         | 85/3047 [00:10<05:42,  8.64it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 86/3047 [00:10<05:29,  8.98it/s][A[A[A[A[A[A[A[A[A[A

(85,)
(86,)












row processed:   3%|▎         | 87/3047 [00:10<05:30,  8.96it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 88/3047 [00:10<05:25,  9.08it/s][A[A[A[A[A[A[A[A[A[A

(87,)
(88,)












row processed:   3%|▎         | 89/3047 [00:10<05:34,  8.85it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 91/3047 [00:10<05:20,  9.23it/s][A[A[A[A[A[A[A[A[A[A

(89,)
(90,)
(91,)












row processed:   3%|▎         | 92/3047 [00:11<05:27,  9.03it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 93/3047 [00:11<05:27,  9.01it/s][A[A[A[A[A[A[A[A[A[A

(92,)
(93,)












row processed:   3%|▎         | 94/3047 [00:11<05:30,  8.92it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 95/3047 [00:11<05:35,  8.81it/s][A[A[A[A[A[A[A[A[A[A

(94,)
(95,)












row processed:   3%|▎         | 97/3047 [00:11<05:22,  9.16it/s][A[A[A[A[A[A[A[A[A[A

(96,)
(97,)












row processed:   3%|▎         | 98/3047 [00:11<05:22,  9.14it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 99/3047 [00:11<05:37,  8.72it/s][A[A[A[A[A[A[A[A[A[A

(98,)
(99,)












row processed:   3%|▎         | 100/3047 [00:11<05:39,  8.68it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 102/3047 [00:12<04:43, 10.37it/s][A[A[A[A[A[A[A[A[A[A

(100,)
(101,)
(102,)
(103,)












row processed:   3%|▎         | 104/3047 [00:12<04:10, 11.73it/s][A[A[A[A[A[A[A[A[A[A









row processed:   3%|▎         | 106/3047 [00:12<03:53, 12.59it/s][A[A[A[A[A[A[A[A[A[A

(104,)
(105,)
(106,)












row processed:   4%|▎         | 108/3047 [00:12<03:41, 13.26it/s][A[A[A[A[A[A[A[A[A[A









row processed:   4%|▎         | 111/3047 [00:12<03:10, 15.40it/s][A[A[A[A[A[A[A[A[A[A

(107,)
(108,)
(109,)
(110,)
(111,)












row processed:   4%|▎         | 114/3047 [00:12<02:49, 17.35it/s][A[A[A[A[A[A[A[A[A[A

(112,)
(113,)
(114,)
(115,)












row processed:   4%|▍         | 116/3047 [00:12<03:11, 15.30it/s][A[A[A[A[A[A[A[A[A[A









row processed:   4%|▍         | 118/3047 [00:12<03:21, 14.52it/s][A[A[A[A[A[A[A[A[A[A

(116,)
(117,)
(118,)












row processed:   4%|▍         | 120/3047 [00:13<03:26, 14.16it/s][A[A[A[A[A[A[A[A[A[A

(119,)
(120,)
(121,)












row processed:   4%|▍         | 122/3047 [00:13<03:30, 13.92it/s][A[A[A[A[A[A[A[A[A[A









row processed:   4%|▍         | 124/3047 [00:13<03:36, 13.53it/s][A[A[A[A[A[A[A[A[A[A

(122,)
(123,)
(124,)












row processed:   4%|▍         | 126/3047 [00:13<03:47, 12.86it/s][A[A[A[A[A[A[A[A[A[A

(125,)
(126,)
(127,)












row processed:   4%|▍         | 128/3047 [00:13<03:50, 12.64it/s][A[A[A[A[A[A[A[A[A[A









row processed:   4%|▍         | 130/3047 [00:13<03:40, 13.22it/s][A[A[A[A[A[A[A[A[A[A

(128,)
(129,)
(130,)












row processed:   4%|▍         | 132/3047 [00:14<03:33, 13.62it/s][A[A[A[A[A[A[A[A[A[A

(131,)
(132,)
(133,)












row processed:   4%|▍         | 134/3047 [00:14<03:39, 13.30it/s][A[A[A[A[A[A[A[A[A[A









row processed:   4%|▍         | 136/3047 [00:14<03:46, 12.85it/s][A[A[A[A[A[A[A[A[A[A

(134,)
(135,)
(136,)












row processed:   5%|▍         | 138/3047 [00:14<03:57, 12.23it/s][A[A[A[A[A[A[A[A[A[A

(137,)
(138,)
(139,)












row processed:   5%|▍         | 140/3047 [00:14<03:57, 12.23it/s][A[A[A[A[A[A[A[A[A[A









row processed:   5%|▍         | 142/3047 [00:14<03:45, 12.89it/s][A[A[A[A[A[A[A[A[A[A

(140,)
(141,)
(142,)












row processed:   5%|▍         | 144/3047 [00:14<03:44, 12.92it/s][A[A[A[A[A[A[A[A[A[A

(143,)
(144,)
(145,)












row processed:   5%|▍         | 146/3047 [00:15<03:42, 13.06it/s][A[A[A[A[A[A[A[A[A[A









row processed:   5%|▍         | 148/3047 [00:15<03:41, 13.08it/s][A[A[A[A[A[A[A[A[A[A

(146,)
(147,)
(148,)












row processed:   5%|▍         | 150/3047 [00:15<03:35, 13.45it/s][A[A[A[A[A[A[A[A[A[A

(149,)
(150,)
(151,)
(152,)












row processed:   5%|▌         | 153/3047 [00:15<03:24, 14.12it/s][A[A[A[A[A[A[A[A[A[A









row processed:   5%|▌         | 155/3047 [00:15<03:26, 14.02it/s][A[A[A[A[A[A[A[A[A[A

(153,)
(154,)
(155,)
(156,)












row processed:   5%|▌         | 158/3047 [00:15<03:06, 15.45it/s][A[A[A[A[A[A[A[A[A[A

(157,)
(158,)
(159,)












row processed:   5%|▌         | 160/3047 [00:16<03:13, 14.89it/s][A[A[A[A[A[A[A[A[A[A









row processed:   5%|▌         | 162/3047 [00:16<03:07, 15.36it/s][A[A[A[A[A[A[A[A[A[A

(160,)
(161,)
(162,)












row processed:   5%|▌         | 164/3047 [00:16<03:24, 14.09it/s][A[A[A[A[A[A[A[A[A[A











(163,)
(164,)
(165,)
(166,)
(167,)


row processed:   5%|▌         | 167/3047 [00:16<02:59, 16.04it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▌         | 170/3047 [00:16<02:42, 17.75it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▌         | 172/3047 [00:16<02:41, 17.83it/s][A[A[A[A[A[A[A[A[A[A

(168,)
(169,)
(170,)
(171,)
(172,)












row processed:   6%|▌         | 174/3047 [00:16<02:49, 16.96it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▌         | 176/3047 [00:16<02:48, 17.02it/s][A[A[A[A[A[A[A[A[A[A

(173,)
(174,)
(175,)
(176,)












row processed:   6%|▌         | 178/3047 [00:17<03:11, 15.01it/s][A[A[A[A[A[A[A[A[A[A

(177,)
(178,)
(179,)












row processed:   6%|▌         | 180/3047 [00:17<03:25, 13.93it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▌         | 182/3047 [00:17<03:45, 12.72it/s][A[A[A[A[A[A[A[A[A[A

(180,)
(181,)
(182,)












row processed:   6%|▌         | 184/3047 [00:17<03:21, 14.19it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▌         | 187/3047 [00:17<02:55, 16.26it/s][A[A[A[A[A[A[A[A[A[A

(183,)
(184,)
(185,)
(186,)
(187,)












row processed:   6%|▌         | 189/3047 [00:17<02:46, 17.18it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▋         | 192/3047 [00:17<02:35, 18.40it/s][A[A[A[A[A[A[A[A[A[A

(188,)
(189,)
(190,)
(191,)
(192,)












row processed:   6%|▋         | 195/3047 [00:18<02:27, 19.32it/s][A[A[A[A[A[A[A[A[A[A









row processed:   6%|▋         | 198/3047 [00:18<02:17, 20.76it/s]

(193,)
(194,)
(195,)
(196,)
(197,)
(198,)


[A[A[A[A[A[A[A[A[A[A









row processed:   7%|▋         | 201/3047 [00:18<02:07, 22.36it/s][A[A[A[A[A[A[A[A[A[A









row processed:   7%|▋         | 204/3047 [00:18<02:01, 23.34it/s][A[A[A[A[A[A[A[A[A[A

(199,)
(200,)
(201,)
(202,)
(203,)
(204,)
(205,)












row processed:   7%|▋         | 207/3047 [00:18<03:35, 13.19it/s][A[A[A[A[A[A[A[A[A[A

(206,)
(207,)












row processed:   7%|▋         | 209/3047 [00:19<05:14,  9.02it/s][A[A[A[A[A[A[A[A[A[A

(208,)
(209,)












row processed:   7%|▋         | 211/3047 [00:19<06:22,  7.41it/s][A[A[A[A[A[A[A[A[A[A

(210,)
(211,)












row processed:   7%|▋         | 213/3047 [00:20<07:08,  6.61it/s][A[A[A[A[A[A[A[A[A[A

(212,)
(213,)












row processed:   7%|▋         | 214/3047 [00:20<07:51,  6.01it/s][A[A[A[A[A[A[A[A[A[A

(214,)












row processed:   7%|▋         | 215/3047 [00:20<08:22,  5.64it/s][A[A[A[A[A[A[A[A[A[A









row processed:   7%|▋         | 218/3047 [00:20<06:21,  7.42it/s][A[A[A[A[A[A[A[A[A[A

(215,)
(216,)
(217,)
(218,)
(219,)
(220,)












row processed:   7%|▋         | 221/3047 [00:20<04:56,  9.53it/s][A[A[A[A[A[A[A[A[A[A









row processed:   7%|▋         | 223/3047 [00:20<04:50,  9.71it/s][A[A[A[A[A[A[A[A[A[A

(221,)
(222,)
(223,)












row processed:   7%|▋         | 225/3047 [00:21<05:17,  8.90it/s][A[A[A[A[A[A[A[A[A[A

(224,)
(225,)












row processed:   7%|▋         | 227/3047 [00:21<05:32,  8.49it/s][A[A[A[A[A[A[A[A[A[A

(226,)
(227,)












row processed:   8%|▊         | 229/3047 [00:21<05:44,  8.19it/s][A[A[A[A[A[A[A[A[A[A

(228,)
(229,)












row processed:   8%|▊         | 231/3047 [00:21<05:54,  7.94it/s][A[A[A[A[A[A[A[A[A[A

(230,)
(231,)












row processed:   8%|▊         | 232/3047 [00:22<06:09,  7.62it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 233/3047 [00:22<06:08,  7.64it/s][A[A[A[A[A[A[A[A[A[A

(232,)
(233,)












row processed:   8%|▊         | 234/3047 [00:22<06:17,  7.46it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 235/3047 [00:22<06:13,  7.53it/s][A[A[A[A[A[A[A[A[A[A

(234,)
(235,)












row processed:   8%|▊         | 236/3047 [00:22<06:23,  7.33it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 237/3047 [00:22<06:26,  7.27it/s][A[A[A[A[A[A[A[A[A[A

(236,)
(237,)












row processed:   8%|▊         | 238/3047 [00:22<07:20,  6.38it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 239/3047 [00:23<07:04,  6.62it/s][A[A[A[A[A[A[A[A[A[A

(238,)
(239,)












row processed:   8%|▊         | 240/3047 [00:23<06:39,  7.02it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 241/3047 [00:23<06:33,  7.13it/s][A[A[A[A[A[A[A[A[A[A

(240,)
(241,)












row processed:   8%|▊         | 242/3047 [00:23<06:35,  7.09it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 243/3047 [00:23<06:34,  7.10it/s][A[A[A[A[A[A[A[A[A[A

(242,)
(243,)
(244,)












row processed:   8%|▊         | 245/3047 [00:23<06:57,  6.71it/s][A[A[A[A[A[A[A[A[A[A

(245,)












row processed:   8%|▊         | 246/3047 [00:24<08:07,  5.75it/s][A[A[A[A[A[A[A[A[A[A

(246,)












row processed:   8%|▊         | 247/3047 [00:24<08:45,  5.33it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 249/3047 [00:24<07:01,  6.64it/s][A[A[A[A[A[A[A[A[A[A

(247,)
(248,)
(249,)












row processed:   8%|▊         | 250/3047 [00:24<06:36,  7.05it/s][A[A[A[A[A[A[A[A[A[A

(250,)
(251,)












row processed:   8%|▊         | 252/3047 [00:24<06:09,  7.57it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 253/3047 [00:25<06:04,  7.67it/s][A[A[A[A[A[A[A[A[A[A

(252,)
(253,)












row processed:   8%|▊         | 254/3047 [00:25<05:58,  7.78it/s][A[A[A[A[A[A[A[A[A[A









row processed:   8%|▊         | 255/3047 [00:25<05:54,  7.88it/s][A[A[A[A[A[A[A[A[A[A

(254,)
(255,)












row processed:   8%|▊         | 256/3047 [00:25<06:03,  7.68it/s][A[A[A[A[A[A[A[A[A[A

(256,)












row processed:   8%|▊         | 257/3047 [00:26<13:15,  3.51it/s][A[A[A[A[A[A[A[A[A[A

(257,)












row processed:   8%|▊         | 258/3047 [00:26<18:50,  2.47it/s][A[A[A[A[A[A[A[A[A[A

(258,)












row processed:   9%|▊         | 259/3047 [00:27<17:26,  2.66it/s][A[A[A[A[A[A[A[A[A[A

(259,)












row processed:   9%|▊         | 260/3047 [00:27<20:26,  2.27it/s][A[A[A[A[A[A[A[A[A[A

(260,)












row processed:   9%|▊         | 261/3047 [00:27<18:39,  2.49it/s][A[A[A[A[A[A[A[A[A[A

(261,)












row processed:   9%|▊         | 262/3047 [00:28<17:31,  2.65it/s][A[A[A[A[A[A[A[A[A[A

(262,)












row processed:   9%|▊         | 263/3047 [00:28<16:41,  2.78it/s][A[A[A[A[A[A[A[A[A[A

(263,)












row processed:   9%|▊         | 264/3047 [00:28<15:58,  2.90it/s][A[A[A[A[A[A[A[A[A[A

(264,)












row processed:   9%|▊         | 265/3047 [00:29<15:31,  2.99it/s][A[A[A[A[A[A[A[A[A[A

(265,)












row processed:   9%|▊         | 266/3047 [00:29<15:08,  3.06it/s][A[A[A[A[A[A[A[A[A[A

(266,)












row processed:   9%|▉         | 267/3047 [00:29<15:11,  3.05it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 269/3047 [00:29<11:23,  4.07it/s][A[A[A[A[A[A[A[A[A[A

(267,)
(268,)
(269,)
(270,)












row processed:   9%|▉         | 271/3047 [00:30<08:51,  5.23it/s][A[A[A[A[A[A[A[A[A[A

(271,)
(272,)












row processed:   9%|▉         | 273/3047 [00:30<08:06,  5.70it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 274/3047 [00:30<08:04,  5.72it/s][A[A[A[A[A[A[A[A[A[A

(273,)
(274,)












row processed:   9%|▉         | 275/3047 [00:30<08:43,  5.29it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 276/3047 [00:30<08:27,  5.47it/s][A[A[A[A[A[A[A[A[A[A

(275,)
(276,)












row processed:   9%|▉         | 277/3047 [00:31<08:15,  5.59it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 278/3047 [00:31<08:08,  5.66it/s][A[A[A[A[A[A[A[A[A[A

(277,)
(278,)












row processed:   9%|▉         | 279/3047 [00:31<08:07,  5.67it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 280/3047 [00:31<08:00,  5.76it/s][A[A[A[A[A[A[A[A[A[A

(279,)
(280,)












row processed:   9%|▉         | 281/3047 [00:31<08:01,  5.75it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 282/3047 [00:31<07:59,  5.77it/s][A[A[A[A[A[A[A[A[A[A

(281,)
(282,)












row processed:   9%|▉         | 283/3047 [00:32<07:56,  5.81it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 284/3047 [00:32<07:55,  5.81it/s][A[A[A[A[A[A[A[A[A[A

(283,)
(284,)












row processed:   9%|▉         | 285/3047 [00:32<08:05,  5.69it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 286/3047 [00:32<08:00,  5.75it/s][A[A[A[A[A[A[A[A[A[A

(285,)
(286,)












row processed:   9%|▉         | 287/3047 [00:32<07:59,  5.76it/s][A[A[A[A[A[A[A[A[A[A









row processed:   9%|▉         | 288/3047 [00:32<07:56,  5.79it/s][A[A[A[A[A[A[A[A[A[A

(287,)
(288,)












row processed:   9%|▉         | 289/3047 [00:33<08:03,  5.71it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 290/3047 [00:33<07:57,  5.77it/s][A[A[A[A[A[A[A[A[A[A

(289,)
(290,)












row processed:  10%|▉         | 291/3047 [00:33<08:01,  5.72it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 292/3047 [00:33<07:55,  5.79it/s][A[A[A[A[A[A[A[A[A[A

(291,)
(292,)












row processed:  10%|▉         | 293/3047 [00:33<07:54,  5.81it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 294/3047 [00:34<07:51,  5.83it/s][A[A[A[A[A[A[A[A[A[A

(293,)
(294,)












row processed:  10%|▉         | 295/3047 [00:34<07:56,  5.77it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 296/3047 [00:34<07:52,  5.83it/s][A[A[A[A[A[A[A[A[A[A

(295,)
(296,)












row processed:  10%|▉         | 297/3047 [00:34<07:54,  5.79it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 298/3047 [00:34<07:50,  5.85it/s][A[A[A[A[A[A[A[A[A[A

(297,)
(298,)












row processed:  10%|▉         | 299/3047 [00:34<08:16,  5.53it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 300/3047 [00:35<08:07,  5.63it/s][A[A[A[A[A[A[A[A[A[A

(299,)
(300,)












row processed:  10%|▉         | 301/3047 [00:35<08:12,  5.58it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 302/3047 [00:35<08:03,  5.68it/s][A[A[A[A[A[A[A[A[A[A

(301,)
(302,)












row processed:  10%|▉         | 303/3047 [00:35<08:09,  5.61it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|▉         | 304/3047 [00:35<08:03,  5.67it/s][A[A[A[A[A[A[A[A[A[A

(303,)
(304,)












row processed:  10%|█         | 305/3047 [00:35<08:06,  5.64it/s][A[A[A[A[A[A[A[A[A[A

(305,)












row processed:  10%|█         | 306/3047 [00:36<10:27,  4.37it/s][A[A[A[A[A[A[A[A[A[A

(306,)












row processed:  10%|█         | 307/3047 [00:36<12:05,  3.78it/s][A[A[A[A[A[A[A[A[A[A

(307,)












row processed:  10%|█         | 308/3047 [00:37<13:19,  3.43it/s][A[A[A[A[A[A[A[A[A[A

(308,)












row processed:  10%|█         | 309/3047 [00:37<14:05,  3.24it/s][A[A[A[A[A[A[A[A[A[A

(309,)












row processed:  10%|█         | 310/3047 [00:37<14:34,  3.13it/s][A[A[A[A[A[A[A[A[A[A

(310,)












row processed:  10%|█         | 311/3047 [00:38<15:02,  3.03it/s][A[A[A[A[A[A[A[A[A[A

(311,)












row processed:  10%|█         | 312/3047 [00:38<15:18,  2.98it/s][A[A[A[A[A[A[A[A[A[A

(312,)












row processed:  10%|█         | 313/3047 [00:38<15:27,  2.95it/s][A[A[A[A[A[A[A[A[A[A

(313,)












row processed:  10%|█         | 314/3047 [00:39<15:48,  2.88it/s][A[A[A[A[A[A[A[A[A[A

(314,)












row processed:  10%|█         | 315/3047 [00:39<15:51,  2.87it/s][A[A[A[A[A[A[A[A[A[A

(315,)












row processed:  10%|█         | 316/3047 [00:39<15:49,  2.88it/s][A[A[A[A[A[A[A[A[A[A

(316,)












row processed:  10%|█         | 317/3047 [00:40<15:41,  2.90it/s][A[A[A[A[A[A[A[A[A[A

(317,)












row processed:  10%|█         | 318/3047 [00:40<15:46,  2.88it/s][A[A[A[A[A[A[A[A[A[A









row processed:  10%|█         | 319/3047 [00:40<13:17,  3.42it/s][A[A[A[A[A[A[A[A[A[A

(318,)
(319,)












row processed:  11%|█         | 320/3047 [00:40<11:32,  3.94it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 321/3047 [00:41<10:21,  4.39it/s][A[A[A[A[A[A[A[A[A[A

(320,)
(321,)












row processed:  11%|█         | 322/3047 [00:41<09:29,  4.79it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 323/3047 [00:41<08:49,  5.14it/s][A[A[A[A[A[A[A[A[A[A

(322,)
(323,)












row processed:  11%|█         | 324/3047 [00:41<08:34,  5.29it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 325/3047 [00:41<08:10,  5.55it/s][A[A[A[A[A[A[A[A[A[A

(324,)
(325,)












row processed:  11%|█         | 326/3047 [00:41<08:07,  5.59it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 327/3047 [00:42<07:46,  5.83it/s][A[A[A[A[A[A[A[A[A[A

(326,)
(327,)












row processed:  11%|█         | 328/3047 [00:42<07:39,  5.92it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 329/3047 [00:42<07:28,  6.06it/s][A[A[A[A[A[A[A[A[A[A

(328,)
(329,)












row processed:  11%|█         | 330/3047 [00:42<07:31,  6.02it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 331/3047 [00:42<07:38,  5.92it/s][A[A[A[A[A[A[A[A[A[A

(330,)
(331,)












row processed:  11%|█         | 332/3047 [00:42<07:41,  5.88it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 333/3047 [00:43<07:34,  5.97it/s][A[A[A[A[A[A[A[A[A[A

(332,)
(333,)












row processed:  11%|█         | 334/3047 [00:43<07:47,  5.81it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█         | 337/3047 [00:43<06:02,  7.47it/s][A[A[A[A[A[A[A[A[A[A

(334,)
(335,)
(336,)
(337,)












row processed:  11%|█         | 339/3047 [00:43<05:38,  7.99it/s][A[A[A[A[A[A[A[A[A[A

(338,)
(339,)












row processed:  11%|█         | 341/3047 [00:43<05:26,  8.28it/s][A[A[A[A[A[A[A[A[A[A

(340,)
(341,)












row processed:  11%|█▏        | 343/3047 [00:44<05:38,  7.99it/s][A[A[A[A[A[A[A[A[A[A

(342,)
(343,)












row processed:  11%|█▏        | 344/3047 [00:44<05:29,  8.20it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█▏        | 345/3047 [00:44<05:24,  8.33it/s][A[A[A[A[A[A[A[A[A[A

(344,)
(345,)












row processed:  11%|█▏        | 346/3047 [00:44<06:02,  7.45it/s][A[A[A[A[A[A[A[A[A[A









row processed:  11%|█▏        | 347/3047 [00:44<06:41,  6.73it/s][A[A[A[A[A[A[A[A[A[A

(346,)
(347,)












row processed:  11%|█▏        | 348/3047 [00:44<07:00,  6.41it/s][A[A[A[A[A[A[A[A[A[A

(348,)












row processed:  11%|█▏        | 349/3047 [00:45<09:25,  4.77it/s][A[A[A[A[A[A[A[A[A[A

(349,)












row processed:  11%|█▏        | 350/3047 [00:45<11:29,  3.91it/s][A[A[A[A[A[A[A[A[A[A

(350,)












row processed:  12%|█▏        | 351/3047 [00:45<13:13,  3.40it/s][A[A[A[A[A[A[A[A[A[A

(351,)












row processed:  12%|█▏        | 352/3047 [00:46<14:04,  3.19it/s][A[A[A[A[A[A[A[A[A[A









row processed:  12%|█▏        | 354/3047 [00:46<10:45,  4.17it/s][A[A[A[A[A[A[A[A[A[A

(352,)
(353,)
(354,)












row processed:  12%|█▏        | 356/3047 [00:46<08:22,  5.35it/s][A[A[A[A[A[A[A[A[A[A









row processed:  12%|█▏        | 359/3047 [00:46<06:24,  6.98it/s][A[A[A[A[A[A[A[A[A[A

(355,)
(356,)
(357,)
(358,)
(359,)












row processed:  12%|█▏        | 361/3047 [00:46<05:48,  7.70it/s][A[A[A[A[A[A[A[A[A[A

(360,)
(361,)












row processed:  12%|█▏        | 363/3047 [00:47<05:33,  8.04it/s][A[A[A[A[A[A[A[A[A[A

(362,)
(363,)












row processed:  12%|█▏        | 365/3047 [00:47<05:07,  8.72it/s][A[A[A[A[A[A[A[A[A[A









row processed:  12%|█▏        | 368/3047 [00:47<04:07, 10.81it/s][A[A[A[A[A[A[A[A[A[A

(364,)
(365,)
(366,)
(367,)
(368,)












row processed:  12%|█▏        | 371/3047 [00:47<03:26, 12.95it/s][A[A[A[A[A[A[A[A[A[A

(369,)
(370,)
(371,)
(372,)
(373,)












row processed:  12%|█▏        | 374/3047 [00:47<03:03, 14.58it/s][A[A[A[A[A[A[A[A[A[A









row processed:  12%|█▏        | 376/3047 [00:47<03:07, 14.24it/s][A[A[A[A[A[A[A[A[A[A

(374,)
(375,)
(376,)
(377,)












row processed:  12%|█▏        | 379/3047 [00:47<02:43, 16.36it/s][A[A[A[A[A[A[A[A[A[A









row processed:  13%|█▎        | 382/3047 [00:47<02:25, 18.26it/s][A[A[A[A[A[A[A[A[A[A

(378,)
(379,)
(380,)
(381,)
(382,)












row processed:  13%|█▎        | 385/3047 [00:48<02:12, 20.11it/s][A[A[A[A[A[A[A[A[A[A









row processed:  13%|█▎        | 388/3047 [00:48<02:00, 22.09it/s][A[A[A[A[A[A[A[A[A[A

(383,)
(384,)
(385,)
(386,)
(387,)
(388,)












row processed:  13%|█▎        | 391/3047 [00:48<01:56, 22.85it/s][A[A[A[A[A[A[A[A[A[A

(389,)
(390,)
(391,)
(392,)
(393,)












row processed:  13%|█▎        | 394/3047 [00:48<02:00, 22.03it/s][A[A[A[A[A[A[A[A[A[A









row processed:  13%|█▎        | 397/3047 [00:48<02:03, 21.43it/s][A[A[A[A[A[A[A[A[A[A

(394,)
(395,)
(396,)
(397,)












row processed:  13%|█▎        | 400/3047 [00:48<02:03, 21.47it/s][A[A[A[A[A[A[A[A[A[A

(398,)
(399,)
(400,)
(401,)
(402,)












row processed:  13%|█▎        | 403/3047 [00:48<02:05, 21.15it/s][A[A[A[A[A[A[A[A[A[A

(403,)
(404,)
(405,)












row processed:  13%|█▎        | 406/3047 [00:50<10:05,  4.36it/s][A[A[A[A[A[A[A[A[A[A

(406,)
(407,)












row processed:  13%|█▎        | 408/3047 [00:51<13:52,  3.17it/s][A[A[A[A[A[A[A[A[A[A

(408,)
(409,)












row processed:  13%|█▎        | 410/3047 [00:52<15:57,  2.76it/s][A[A[A[A[A[A[A[A[A[A

(410,)












row processed:  13%|█▎        | 411/3047 [00:53<21:00,  2.09it/s][A[A[A[A[A[A[A[A[A[A

(411,)












row processed:  14%|█▎        | 412/3047 [00:53<18:02,  2.43it/s][A[A[A[A[A[A[A[A[A[A

(412,)












row processed:  14%|█▎        | 413/3047 [00:54<20:28,  2.14it/s][A[A[A[A[A[A[A[A[A[A

(413,)












row processed:  14%|█▎        | 414/3047 [00:54<19:31,  2.25it/s][A[A[A[A[A[A[A[A[A[A

(414,)












row processed:  14%|█▎        | 415/3047 [00:55<20:20,  2.16it/s][A[A[A[A[A[A[A[A[A[A

(415,)












row processed:  14%|█▎        | 416/3047 [00:55<20:35,  2.13it/s][A[A[A[A[A[A[A[A[A[A

(416,)












row processed:  14%|█▎        | 417/3047 [00:56<20:02,  2.19it/s][A[A[A[A[A[A[A[A[A[A

(417,)












row processed:  14%|█▎        | 418/3047 [00:56<21:06,  2.08it/s][A[A[A[A[A[A[A[A[A[A

(418,)












row processed:  14%|█▍        | 419/3047 [00:57<17:32,  2.50it/s][A[A[A[A[A[A[A[A[A[A

(419,)












row processed:  14%|█▍        | 420/3047 [00:57<22:01,  1.99it/s][A[A[A[A[A[A[A[A[A[A









row processed:  14%|█▍        | 423/3047 [00:57<15:58,  2.74it/s][A[A[A[A[A[A[A[A[A[A

(420,)
(421,)
(422,)
(423,)
(424,)












row processed:  14%|█▍        | 425/3047 [00:57<11:54,  3.67it/s][A[A[A[A[A[A[A[A[A[A

(425,)


KeyboardInterrupt: 

In [714]:
all_features["abs(ASA-ASA_Chain)"] = stride_arr
pearsonr(skempi_df.DDG, stride_arr)

(0.44807091830983697, 2.1281783251917486e-150)

In [718]:
DSSP = ["G", "H", "I", "T", "E", "B", "S", "C"]

from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

lb.fit(DSSP) 

def get_bin_ss(stride):
    return lb.transform([stride["SS"]])[0]

In [731]:
ss_arr = get_stride_array(get_bin_ss, agg=lambda a: np.sum(a, axis=0))














row processed:   0%|          | 0/3047 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 2/3047 [00:00<04:31, 11.20it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 3/3047 [00:00<04:45, 10.66it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 5/3047 [00:00<04:42, 10.78it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 7/3047 [00:00<04:32, 11.16it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 9/3047 [00:00<04:26, 11.39it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 11/3047 [00:00<04:28, 11.31it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 13/3047 [00:01<04:23, 11.51it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   0%|          | 15/3047 [00:01<04:34, 11.04it/s][A[A[A

row processed:   9%|▊         | 263/3047 [00:25<15:38,  2.97it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▊         | 264/3047 [00:25<15:14,  3.04it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▊         | 265/3047 [00:25<14:41,  3.16it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▊         | 266/3047 [00:26<14:17,  3.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▉         | 267/3047 [00:26<14:09,  3.27it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▉         | 269/3047 [00:26<10:43,  4.32it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▉         | 271/3047 [00:26<08:13,  5.63it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▉         | 273/3047 [00:26<07:20,  6.29it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:   9%|▉         | 275/3047 [00:27<07:20,  6.29it/s

row processed:  16%|█▌        | 492/3047 [00:58<10:24,  4.09it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▌        | 493/3047 [00:58<09:07,  4.66it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▌        | 494/3047 [00:58<08:09,  5.21it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▌        | 495/3047 [00:58<07:30,  5.67it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▋        | 496/3047 [00:58<07:06,  5.99it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▋        | 497/3047 [00:58<06:48,  6.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▋        | 498/3047 [00:59<06:37,  6.41it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▋        | 499/3047 [00:59<07:17,  5.82it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  16%|█▋        | 500/3047 [00:59<07:43,  5.50it/s

row processed:  22%|██▏       | 684/3047 [01:29<02:15, 17.48it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 687/3047 [01:29<02:01, 19.37it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 691/3047 [01:29<01:46, 22.05it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 695/3047 [01:29<01:35, 24.65it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 699/3047 [01:29<01:27, 26.79it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 703/3047 [01:30<02:49, 13.83it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 706/3047 [01:30<03:59,  9.77it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 708/3047 [01:30<04:19,  9.02it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  23%|██▎       | 710/3047 [01:31<05:56,  6.55it/s

row processed:  28%|██▊       | 859/3047 [02:17<05:40,  6.42it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 860/3047 [02:18<05:37,  6.48it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 861/3047 [02:18<05:36,  6.49it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 862/3047 [02:18<05:33,  6.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 863/3047 [02:18<05:34,  6.54it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 864/3047 [02:18<05:33,  6.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 865/3047 [02:18<05:32,  6.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 866/3047 [02:19<05:28,  6.64it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  28%|██▊       | 867/3047 [02:19<05:24,  6.72it/s

row processed:  34%|███▍      | 1041/3047 [03:03<11:15,  2.97it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1042/3047 [03:04<11:09,  2.99it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1043/3047 [03:04<11:05,  3.01it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1044/3047 [03:04<11:02,  3.02it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1045/3047 [03:04<10:57,  3.04it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1046/3047 [03:05<11:08,  2.99it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1047/3047 [03:05<11:07,  2.99it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1048/3047 [03:05<11:05,  3.00it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  34%|███▍      | 1049/3047 [03:06<11:00, 

row processed:  40%|███▉      | 1207/3047 [03:40<08:13,  3.73it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1208/3047 [03:40<08:12,  3.73it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1209/3047 [03:40<08:01,  3.82it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1210/3047 [03:41<07:52,  3.89it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1211/3047 [03:41<07:48,  3.92it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1212/3047 [03:41<07:45,  3.94it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1213/3047 [03:41<07:41,  3.97it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1214/3047 [03:42<07:37,  4.01it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  40%|███▉      | 1215/3047 [03:42<07:32, 

row processed:  44%|████▍     | 1345/3047 [04:09<04:03,  6.98it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1346/3047 [04:09<04:09,  6.82it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1347/3047 [04:10<04:02,  7.02it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1348/3047 [04:10<03:57,  7.15it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1349/3047 [04:10<04:01,  7.05it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1350/3047 [04:10<03:58,  7.13it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1351/3047 [04:10<03:57,  7.14it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1352/3047 [04:10<03:56,  7.16it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  44%|████▍     | 1353/3047 [04:10<03:56, 

row processed:  49%|████▉     | 1502/3047 [04:31<04:42,  5.47it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  49%|████▉     | 1503/3047 [04:31<04:43,  5.45it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  49%|████▉     | 1504/3047 [04:31<04:42,  5.47it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  49%|████▉     | 1505/3047 [04:32<04:40,  5.50it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  49%|████▉     | 1506/3047 [04:32<04:39,  5.51it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  49%|████▉     | 1507/3047 [04:32<04:38,  5.53it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  49%|████▉     | 1508/3047 [04:32<04:36,  5.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  50%|████▉     | 1509/3047 [04:32<04:35,  5.59it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  50%|████▉     | 1510/3047 [04:33<04:34, 

row processed:  54%|█████▍    | 1640/3047 [05:06<06:29,  3.61it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1641/3047 [05:07<06:24,  3.65it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1642/3047 [05:07<06:22,  3.67it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1643/3047 [05:07<06:20,  3.69it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1644/3047 [05:07<06:20,  3.69it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1645/3047 [05:08<06:20,  3.68it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1646/3047 [05:08<06:17,  3.71it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1647/3047 [05:08<06:17,  3.71it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  54%|█████▍    | 1648/3047 [05:08<06:15, 

row processed:  58%|█████▊    | 1778/3047 [05:45<05:07,  4.13it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  58%|█████▊    | 1779/3047 [05:45<05:07,  4.12it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  58%|█████▊    | 1780/3047 [05:45<05:03,  4.17it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  58%|█████▊    | 1781/3047 [05:45<05:01,  4.19it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  58%|█████▊    | 1782/3047 [05:46<05:00,  4.21it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  59%|█████▊    | 1783/3047 [05:46<04:58,  4.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  59%|█████▊    | 1784/3047 [05:46<04:58,  4.23it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  59%|█████▊    | 1785/3047 [05:46<04:56,  4.26it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  59%|█████▊    | 1786/3047 [05:47<04:58, 

row processed:  67%|██████▋   | 2053/3047 [06:06<01:01, 16.29it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  67%|██████▋   | 2055/3047 [06:06<01:01, 16.12it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2057/3047 [06:06<01:01, 16.20it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2059/3047 [06:07<00:59, 16.47it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2061/3047 [06:07<00:59, 16.63it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2063/3047 [06:07<00:58, 16.79it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2065/3047 [06:07<00:58, 16.88it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2067/3047 [06:07<00:58, 16.89it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  68%|██████▊   | 2069/3047 [06:07<00:57, 

row processed:  77%|███████▋  | 2359/3047 [06:26<00:47, 14.39it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  77%|███████▋  | 2361/3047 [06:26<00:50, 13.62it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2363/3047 [06:26<00:47, 14.46it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2365/3047 [06:26<00:45, 15.08it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2367/3047 [06:26<00:42, 16.16it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2369/3047 [06:27<00:40, 16.66it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2371/3047 [06:27<00:40, 16.82it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2373/3047 [06:27<00:41, 16.43it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  78%|███████▊  | 2375/3047 [06:27<00:44, 

row processed:  86%|████████▌ | 2618/3047 [06:52<02:03,  3.46it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2619/3047 [06:53<03:07,  2.29it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2620/3047 [06:54<02:44,  2.60it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2621/3047 [06:54<02:27,  2.88it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2622/3047 [06:54<02:51,  2.48it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2623/3047 [06:55<02:33,  2.76it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2624/3047 [06:55<02:23,  2.95it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2625/3047 [06:55<02:13,  3.15it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  86%|████████▌ | 2626/3047 [06:56<02:38, 

row processed:  91%|█████████▏| 2785/3047 [07:38<02:00,  2.18it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  91%|█████████▏| 2786/3047 [07:38<01:33,  2.79it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  91%|█████████▏| 2787/3047 [07:38<01:18,  3.30it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  91%|█████████▏| 2788/3047 [07:38<01:05,  3.97it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  92%|█████████▏| 2789/3047 [07:39<01:07,  3.82it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  92%|█████████▏| 2791/3047 [07:39<00:51,  4.94it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  92%|█████████▏| 2793/3047 [07:39<00:46,  5.48it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  92%|█████████▏| 2795/3047 [07:39<00:36,  6.82it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  92%|█████████▏| 2797/3047 [07:39<00:32, 

row processed:  97%|█████████▋| 2951/3047 [08:24<00:17,  5.43it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2952/3047 [08:24<00:18,  5.17it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2953/3047 [08:25<00:22,  4.25it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2954/3047 [08:25<00:24,  3.83it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2955/3047 [08:25<00:21,  4.24it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2956/3047 [08:25<00:21,  4.27it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2957/3047 [08:26<00:22,  4.05it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2958/3047 [08:26<00:25,  3.56it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












row processed:  97%|█████████▋| 2959/3047 [08:26<00:27, 

In [732]:
[pearsonr(skempi_df.DDG, np.asarray(ss_arr)[:, j]) for j in range(8)]

[(0.046420830918097108, 0.010384883923705844),
 (0.19428201978462795, 2.6734087103648955e-27),
 (0.13355078837182441, 1.3403716975243515e-13),
 (0.020648461720111762, 0.25451909563207237),
 (-0.041496984072534521, 0.021982781352113706),
 (nan, 1.0),
 (nan, 1.0),
 (0.041696500814343597, 0.021352756381487238)]

In [733]:
all_features.keys(), len(all_features.keys())

([('uniform', 'CP_B', 'SKOJ970101', 6),
  '#mutations',
  ('original', 'CP_B', 'SKOJ970101', 7),
  ('no_profile', 'CP_A', 'SKOJ970101', 6),
  ('original', 'CP_A', 'SKOJ970101', 7),
  ('no_profile', 'CP_A', 'BASU010101', 7),
  ('uniform', 'CP_A', 'SKOJ970101', 7),
  ('no_profile', 'CP_B', 'SKOJ970101', 6),
  ('original', 'CP_A', 'BASU010101', 6),
  ('uniform', 'CP_B', 'BASU010101', 6),
  ('original', 'CP_B', 'BASU010101', 6),
  ('no_profile', 'CP_B', 'SKOJ970101', 7),
  ('original', 'CP_B', 'SKOJ970101', 6),
  ('no_profile', 'CP_A', 'SKOJ970101', 7),
  ('EI', 'SKOJ970101'),
  ('no_profile', 'CP_A', 'BASU010101', 6),
  ('uniform', 'CP_A', 'BASU010101', 7),
  ('no_profile', 'CP_B', 'BASU010101', 6),
  ('EI', 'BLOSUM62'),
  ('original', 'CP_B', 'BASU010101', 7),
  ('uniform', 'CP_B', 'BASU010101', 7),
  ('uniform', 'CP_B', 'SKOJ970101', 7),
  'abs(ASA-ASA_Chain)',
  ('EI', 'BASU010101'),
  ('original', 'CP_A', 'SKOJ970101', 6),
  ('uniform', 'CP_A', 'BASU010101', 6),
  ('uniform', 'CP_A', 

In [734]:
import itertools

In [735]:
xcor_mat = np.corrcoef(np.asarray(all_features.values()))

In [736]:
xcor_mat.shape

(29, 29)

In [737]:
class XCor(object):
    
    def __init__(self, all_features):
        self.feat_name_to_indx = {key:i for i, key in enumerate(all_features.keys())}
        self.xcor_mat = np.corrcoef(np.asarray(all_features.values()))
        
    def __getitem__(self, t):
        feat1, feat2 = t
        i = self.feat_name_to_indx[feat1]
        j = self.feat_name_to_indx[feat2]
        return self.xcor_mat[(i, j)]

In [738]:
xcor = XCor(all_features)

In [739]:
def search_min_xcor(all_features, th=0.05):
    acc = set()
    for comb in itertools.combinations(all_features.keys(), 2):
        feat1, feat2 = comb
        rho = xcor[(feat1, feat2)]
        if abs(rho) < th:
            print(feat1, feat2, rho)
            acc.add(feat1)
            acc.add(feat2)
    return acc

In [783]:
acc_feats = search_min_xcor(all_features)

(('uniform', 'CP_B', 'SKOJ970101', 6), '#mutations', 0.039810904106046208)
('#mutations', ('no_profile', 'CP_A', 'SKOJ970101', 6), -0.033714399623451741)
('#mutations', ('original', 'CP_A', 'BASU010101', 6), 0.017056502896153274)
('#mutations', ('no_profile', 'CP_B', 'SKOJ970101', 7), 0.042463711564533047)
('#mutations', ('no_profile', 'CP_A', 'SKOJ970101', 7), -0.038619918216982803)
('#mutations', ('EI', 'SKOJ970101'), -0.0011196272438495438)
('#mutations', ('uniform', 'CP_B', 'SKOJ970101', 7), 0.042943490507457799)
('#mutations', ('no_profile', 'CP_B', 'BASU010101', 7), 0.031059433777381178)
('#mutations', ('original', 'CP_A', 'BASU010101', 7), 0.046772400285923214)
(('no_profile', 'CP_A', 'SKOJ970101', 6), 'abs(ASA-ASA_Chain)', 0.049260535153685864)
(('uniform', 'CP_A', 'SKOJ970101', 7), 'abs(ASA-ASA_Chain)', 0.047348865688833862)
(('uniform', 'CP_B', 'BASU010101', 6), ('EI', 'BLOSUM62'), -0.017812503616760917)
(('EI', 'BLOSUM62'), ('uniform', 'CP_B', 'BASU010101', 7), -0.0275336087

In [784]:
len(acc_feats), acc_feats

(16,
 {'#mutations',
  'abs(ASA-ASA_Chain)',
  ('EI', 'BLOSUM62'),
  ('EI', 'SKOJ970101'),
  ('no_profile', 'CP_A', 'SKOJ970101', 6),
  ('no_profile', 'CP_A', 'SKOJ970101', 7),
  ('no_profile', 'CP_B', 'BASU010101', 7),
  ('no_profile', 'CP_B', 'SKOJ970101', 7),
  ('original', 'CP_A', 'BASU010101', 6),
  ('original', 'CP_A', 'BASU010101', 7),
  ('uniform', 'CP_A', 'SKOJ970101', 6),
  ('uniform', 'CP_A', 'SKOJ970101', 7),
  ('uniform', 'CP_B', 'BASU010101', 6),
  ('uniform', 'CP_B', 'BASU010101', 7),
  ('uniform', 'CP_B', 'SKOJ970101', 6),
  ('uniform', 'CP_B', 'SKOJ970101', 7)})

In [797]:
acc_feats = {'#mutations',
  'abs(ASA-ASA_Chain)',
  ('EI', 'BLOSUM62'),
  ('EI', 'SKOJ970101'),
  ('EI', 'BASU010101'),
  ('no_profile', 'CP_A', 'SKOJ970101', 6),
  ('no_profile', 'CP_A', 'SKOJ970101', 7),
  ('no_profile', 'CP_B', 'BASU010101', 7),
  ('no_profile', 'CP_B', 'SKOJ970101', 7),
  ('uniform', 'CP_A', 'SKOJ970101', 6),
  ('uniform', 'CP_A', 'SKOJ970101', 7),
  ('uniform', 'CP_B', 'BASU010101', 6),
  ('uniform', 'CP_B', 'BASU010101', 7),
  ('uniform', 'CP_B', 'SKOJ970101', 6),
  ('uniform', 'CP_B', 'SKOJ970101', 7)}

In [798]:
G1 = [
    "%s_%s_%s" % (s[:4], s[4], s[5].strip())  for s in  
    """
    1CSOEI 1CT0EI 1CT2EI 1CT4EI 1SGDEI 1SGEEI 1SGNEI 1SGPEI 1SGQEI 1SGYEI 2NU0EI
    2NU1EI 2NU2EI 2NU4EI 2SGPEI 2SGQEI 3SGBEI 1IARAB 1XD3AB 1F47AB 1ACBEI
    1H9DAB 2HRKAB 3BP8AC 2OOBAB
    """.split(' ') if s.strip()]

G2 = [
    "%s_%s_%s" % (s[:4], s[4], s[5].strip()) for s in  
    """
    1JTGAB 1S0WAC 2G2UAB 2G2WAB 1A4YAB 1Z7XWX 2GOXAB 2NOJAB 3D5RAC
    3D5SAC 1KACAB 1P69AB 1P6AAB 3BK3AC 1JCKAB 1SBBAB 4CPAAI 1S1QAB 2B42AB
    1E96AB 2I26NL
    """.split(' ') if s.strip()]


G3 = [
    "%s_%s_%s" % (s[:4], s[4], s[5].strip())  for s in  
    """
    1PPFEI 1CSEEI 1SBNEI 1TM1EI 1TM3EI 1TM4EI 1TM5EI 1TM7EI 1TMGEI 1TO1EI 1Y1KEI
    1Y33EI 1Y34EI 1Y3BEI 1Y4AEI 1GC1GC 2SICEI 2O3BAB 1FC2CD 2BTFAP 1EFNAB
    2A9KAB
    """.split(' ') if s.strip()]

G4 = [
    "%s_%s_%s" % (s[:4], s[4], s[5].strip())  for s in  
    """
    1R0REI 1EAWAB 2FTLEI 3BTDEI 3BTEEI 3BTFEI 3BTGEI 3BTHEI 3BTMEI 3BTQEI
    3BTTEI 3BTWEI 1AK4AD 1M9EAD 2J0TAD 1FFWAB 1MAHAF 1UUZAD 1SMFEI 2AJFAE
    2J1KCT
    """.split(' ') if s.strip()]

G5 = [
    "%s_%s_%s" % (s[:4], s[4], s[5].strip())  for s in  
    """
    1B2SAD 1B2UAD 1B3SAD 1BRSAD 1X1XAD 1EMVAB 2VLNAB 2VLOAB 2VLQAB
    2WPTAB 1A22AB 2B0ZAB 2B10AB 2B11AB 2B12AB 2PCBAB 2PCCAB 1KTZAB 1LFDAB
    1FCCAC 1GL0EI 1GL1AI 1HE8AB 2HLEAB 2I9BAE
    """.split(' ') if s.strip()]

G6 = [prot for prot in skempi_df.Protein if prot not in set(G1 + G2 + G3 + G4 + G5)]

In [799]:
sum(skempi_df.Protein.isin(G6))

983

In [800]:
df = skempi_df
from sklearn.preprocessing import StandardScaler

def run_cv_test(X, get_regressor, normalize=0):
    gt, preds = [], []
    for group in [G1, G2, G3, G4, G5]:
        indx_tst = df.Protein.isin(group)
        indx_trn = np.logical_not(indx_tst)
        y_trn = df.DDG[indx_trn]
        y_true = df.DDG[indx_tst]
        X_trn = X[indx_trn]
        X_tst = X[indx_tst]
        regressor = get_regressor()
        if normalize == 1:
            scaler = StandardScaler()
            scaler.fit(X_trn)
            X_trn, X_tst = scaler.transform(X_trn), scaler.transform(X_tst)
        regressor.fit(X_trn, y_trn)
        y_pred = regressor.predict(X_tst)
        print(pearsonr(y_true, y_pred))
        preds.extend(y_pred)
        gt.extend(y_true)
    return gt, preds

In [801]:
X = np.transpose([all_features[feat] for feat in acc_feats])
X = np.concatenate([X, np.asarray(ss_arr)], axis=1)
X.shape

(3047, 23)

In [802]:
from sklearn.ensemble import RandomForestRegressor
def get_regressor(): return RandomForestRegressor(n_estimators=100, random_state=101)
gt, preds = run_cv_test(X, get_regressor, normalize=1)
print(pearsonr(gt, preds))
len(gt)

(0.69888841990325246, 3.599609198720432e-59)
(0.53874098691958006, 6.537995574037864e-27)
(0.60198739783958222, 2.6194166429084624e-40)
(0.26846690437762311, 2.2909689268459242e-08)
(0.62223008385708112, 1.599771739340398e-56)
(0.49653652205471754, 6.1966484463430904e-129)


2064

In [803]:
from sklearn.ensemble import RandomForestRegressor
def get_regressor(): return SVR(kernel='rbf')
gt, preds = run_cv_test(X, get_regressor, normalize=1)
print(pearsonr(gt, preds))
len(gt)

(0.58022726828700433, 6.4478675035687233e-37)
(0.4895111455539598, 7.8999468588461493e-22)
(0.60088575024732327, 3.9460649653023566e-40)
(0.23440846061454071, 1.1889598147058405e-06)
(0.55356029414812391, 1.1001176820293134e-42)
(0.44084021868761042, 6.9509966485260807e-99)


2064

In [804]:
def run_cv_test(X, alpha=0.2, normalize=1):
    gt, preds = [], []
    for group in [G1, G2, G3, G4, G5]:
        indx_tst = df.Protein.isin(group)
        indx_trn = np.logical_not(indx_tst)
        y_trn = df.DDG[indx_trn]
        y_true = df.DDG[indx_tst]
        X_trn = X[indx_trn]
        X_tst = X[indx_tst]
        rf = RandomForestRegressor(n_estimators=50, random_state=101)
        svr = SVR(kernel='rbf')
        if normalize == 1:
            scaler = StandardScaler()
            scaler.fit(X_trn)
            X_trn, X_tst = scaler.transform(X_trn), scaler.transform(X_tst)
        svr.fit(X_trn, y_trn)
        rf.fit(X_trn, y_trn)
        y_pred_svr = svr.predict(X_tst)
        y_pred_rf = rf.predict(X_tst)
        y_pred = alpha * y_pred_svr + (1-alpha) * y_pred_rf
        print(pearsonr(y_true, y_pred))
        preds.extend(y_pred)
        gt.extend(y_true)
    return gt, preds

In [805]:
gt, preds = run_cv_test(X, normalize=1)
print(pearsonr(gt, preds))
len(gt)

(0.69352295924838425, 6.2643124887526144e-58)
(0.54398827848582498, 1.6718124183066529e-27)
(0.61376396774888775, 2.9634728427112909e-42)
(0.26420506962114376, 3.8757192779436134e-08)
(0.63419542726906764, 2.6858246707796961e-59)
(0.49769134075499566, 1.2832537954501601e-129)


2064

In [806]:
cp_b = np.asarray(all_features[('uniform', 'CP_B', 'BASU010101', 7)])
cp_a = np.asarray(all_features[('uniform', 'CP_A', 'BASU010101', 7)])
ei = np.asarray(all_features[('EI', 'SKOJ970101')])
ddg = skempi_df.DDG

In [807]:
c1 = pearsonr(ei, ddg)[0]
c2 = pearsonr(cp_a, ddg)[0]
c3 = pearsonr(cp_b, ddg)[0]
s = c1 + c2 + c3
a1 = c1/s
a2 = c2/s
a3 = c3/s
c1, c2, c3

(0.19216942446949448, 0.21351384288462372, 0.36314745255523262)

In [1]:
ddg_hat =  np.multiply(a1, ei) + np.multiply(a2, cp_a) + np.multiply(a3, cp_b)
pearsonr(ddg_hat, ddg)

NameError: name 'np' is not defined