In [13]:
import re
import json
import pandas as pd
import numpy as np
from Bio.PDB import *
from tqdm.notebook import tqdm
from itertools import product
import warnings
warnings.filterwarnings('ignore')

In [14]:
import logging
import os

# Dedicated file logger for the feature-generation run.
LOG_PATH = './logs/gen_features_for_datasets.log'

feat_logger = logging.getLogger('gen_features_for_datasets')
feat_logger.setLevel(logging.INFO)

# logging.getLogger() returns the same logger object on every call, so re-running
# this cell would otherwise attach one more FileHandler each time and every
# message would be written to the log several times. Drop our own stale handler first.
for stale_handler in list(feat_logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(LOG_PATH)):
        feat_logger.removeHandler(stale_handler)
        stale_handler.close()

# FileHandler does not create missing parent directories.
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

# The log messages contain Cyrillic text and emoji, so the encoding is fixed to
# UTF-8 instead of depending on the platform's locale encoding.
handler2 = logging.FileHandler(LOG_PATH, mode='w', encoding='utf-8')
# Deliberately minimal format: level and message only (no timestamp, no logger name).
formatter2 = logging.Formatter("%(levelname)s %(message)s")
handler2.setFormatter(formatter2)
feat_logger.addHandler(handler2)

In [15]:
from res_biochem import residueFeature
from CSM.gen_csm import Generate_CSM

In [16]:
# Load the dictionary holding the pharmacophore vector of every amino acid.
# Each list value is turned into an np.array right away, so that two vectors
# can later be subtracted from one another element-wise.
with open('./CSM/AA_vector_pharm_dict.json', 'r') as file:
    AA_vector_pharm_dict = {
        residue: np.array(pharm_vector)
        for residue, pharm_vector in json.load(file).items()
    }

In [18]:
def GenerateFeatures(path2mutations, paths2pdbs, res_env_radius, Dmin, Dmax, Dstep):
    """Build the feature table (CSM block, residue-difference features, ddG) for a mutation list.

    NOTE: two parts of the per-mutation loop are elided in this listing (the
    `..................` placeholders), including the code that defines `CSM_row`.
    Everything below describes only the visible code.

    Parameters
    ----------
    path2mutations
        Path of a CSV read with `pd.read_csv`. The columns 'PDB', 'mutation',
        'chain', 'Exp. DDG' and 'ID' are used.
    paths2pdbs
        Iterable that is looped over for every mutation (loop body elided;
        presumably folders with PDB structures - TODO confirm).
    res_env_radius
        Not referenced in the visible code; presumably consumed by the elided
        CSM step - TODO confirm.
    Dmin, Dmax, Dstep
        Integers passed to `range(Dmin, Dmax, Dstep)` to label the CSM columns.

    Returns
    -------
    pandas.DataFrame
        One row per appended feature row, indexed by `MutationSet['ID']`, with
        columns `<cls1>_<cls2>_<dist>` (64 class pairs per distance), then 8
        pharmacophore-difference columns, 9 residue-property-difference columns
        and 'ddG'.

    Raises
    ------
    AttributeError
        If a 'mutation' value does not match the expected pattern
        (`re.match` returns None and `.groups()` is called on it).
    """
    Features = []
    MutationSet = pd.read_csv(path2mutations)
    # Not used in the visible code; presumably a per-PDB cache filled in the elided part - TODO confirm.
    pdb_graph_dict = {}

    with tqdm(total=len(MutationSet), desc="Mutations") as progressbar:
        # `i` and `index` are not used in the visible part of the loop.
        for i, (index, Mutation) in enumerate(MutationSet.iterrows()):

            pdb_id = Mutation['PDB']
            mutation = Mutation['mutation']
            chain = Mutation['chain']
            ddG = Mutation['Exp. DDG']
            # Split the mutation code into wild-type letter, residue number (digits plus
            # optional trailing letters) and mutant letter, e.g. F149A -> F + 149 + A.
            aa, num_aa, mut_aa = re.match(r"([A-Za-z])(\d+[A-Za-z]*)([A-Za-z])", mutation).groups()
            feat_logger.info(f'–í—ã–¥–µ–ª—è–µ–º —Ñ–∏—á–∏ –¥–ª—è {pdb_id},{mutation},{chain}... üèÉ')


            # --- elided in this listing: body of the loop over paths2pdbs ---
            for path in paths2pdbs:
            ##################
            ..................
            ##################
  
            # Both difference vectors are mutant minus wild type.
            pharmacophore_count = AA_vector_pharm_dict[mut_aa] - AA_vector_pharm_dict[aa]
            residue_feature = residueFeature(mut_aa) - residueFeature(aa)
                
            # `CSM_row` is defined in the elided code above. Its length presumably equals
            # 64 * len(range(Dmin, Dmax, Dstep)) so that it matches column_names - TODO confirm.
            FeaturesRow = [
                *CSM_row, 
                *pharmacophore_count,
                *residue_feature,
                ddG
                ]
            Features.append(FeaturesRow)
            feat_logger.info(f'–£—Å–ø–µ—Ö! üòÄ')

            # --- elided in this listing ---
            ##################
            ..................
            ##################

            progressbar.update()


    # Column labels: for every distance, all 8 x 8 ordered pairs of pharmacophore classes.
    combinations = list(product(('Hyd','Pos','Neg','Acc','Don','Aro','Sul','Neu'), repeat=2))
    column_names = [f"{cls1}_{cls2}_{dist}" for dist in range(Dmin, Dmax, Dstep) for cls1, cls2 in combinations]
    column_names.extend(['‚àÜHyd', '‚àÜPos', '‚àÜNeg', '‚àÜAcc', '‚àÜDon', '‚àÜAro', '‚àÜSul', '‚àÜNeu'])
    column_names.extend(['‚àÜAAvolume', '‚àÜAAhydropathy', '‚àÜAAarea', '‚àÜAAweight', '‚àÜAAcharge', '‚àÜAAflexibily', '‚àÜAAchemical', '‚àÜAAsize', '‚àÜAAhbonds'])
    column_names.append('ddG')
    # NOTE(review): index=MutationSet['ID'] requires exactly one appended row per mutation;
    # verify that the elided code never skips a mutation without appending a row.
    Features = pd.DataFrame(Features, index=MutationSet['ID'], columns=column_names)

    return Features

In [19]:
def GenerateReverseFeatures(features, path2reverse='../datasets/mCSM-AB2_reverse.csv', negate_ddG=False):
    """Derive the feature table of the reverse mutations from the forward one.

    The input is copied and the sign of every residue-difference column is
    flipped. All other columns are copied unchanged. The result is re-indexed
    with the 'ID' column of the reverse-mutation table.

    Parameters
    ----------
    features : pandas.DataFrame
        Forward-mutation features. Must contain all columns listed in
        `columns_to_invert` (and 'ddG' when `negate_ddG` is True). Not modified.
    path2reverse : str, optional
        CSV with an 'ID' column holding one ID per row of `features`, in the same
        order. Defaults to the mCSM-AB2 reverse table, which was previously
        hard-coded; pass the matching file when the features come from another
        dataset.
    negate_ddG : bool, optional
        If True, the 'ddG' column is sign-flipped as well. Default False keeps
        the original behaviour (ddG copied unchanged).
        NOTE(review): with the default, reverse rows carry the forward ddG value -
        confirm that this is intended.

    Returns
    -------
    pandas.DataFrame
        Copy of `features` with inverted difference columns, indexed by reverse IDs.

    Raises
    ------
    KeyError
        If an expected column is missing from `features`.
    ValueError
        If the number of reverse IDs differs from the number of feature rows.
    """
    # For these features it is enough to simply flip the sign.
    columns_to_invert = ['‚àÜHyd', '‚àÜPos', '‚àÜNeg', '‚àÜAcc', '‚àÜDon', '‚àÜAro', '‚àÜSul', '‚àÜNeu', '‚àÜAAvolume', '‚àÜAAhydropathy', '‚àÜAAarea', '‚àÜAAweight', '‚àÜAAcharge', '‚àÜAAflexibily', '‚àÜAAchemical', '‚àÜAAsize', '‚àÜAAhbonds']

    rev_features = features.copy()
    for col in columns_to_invert:
        # Column-wise negation; no per-element Python lambda needed.
        rev_features[col] = -rev_features[col]
    if negate_ddG:
        rev_features['ddG'] = -rev_features['ddG']

    reverse_ids = pd.read_csv(path2reverse)['ID']
    # Fail with an actionable message rather than pandas' generic "Length mismatch".
    if len(reverse_ids) != len(rev_features):
        raise ValueError(
            f"{path2reverse!r} has {len(reverse_ids)} IDs but the feature table has "
            f"{len(rev_features)} rows; pass the reverse-mutation table that matches these features"
        )
    rev_features.index = reverse_ids

    return rev_features

### Фичи для mCSM-AB2 датасета — antibody-antigen interactions (AbAg)

In [20]:
# CSM settings.
# Contacts in the CSM are counted over spherical shells of thickness Dstep (4 Å)
# within a radius of res_env_radius (10 Å).
res_env_radius, Dstep = 10, 4
Dmin = Dstep                    # first distance cutoff equals one shell thickness
Dmax = 2 * res_env_radius + 1   # exclusive upper bound for range(Dmin, Dmax, Dstep)

In [None]:
# Path to the table of mutations (forward mutations only).
path2mutations = '../datasets/mCSM-AB2_straight.csv'

# Folders with PDB structures. The catch: residue numbering differs between them,
# and it is not known which numbering a given mutation string in the table follows -
# for the mCSM-AB2 dataset it varies from entry to entry.
paths2pdbs = [
    '../datasets/SKEMPI2_PDBs_from_rcsb',
    '../datasets/SKEMPI2_PDBs',
    '../datasets/AB_bind',
]

Features = GenerateFeatures(
    path2mutations=path2mutations,
    paths2pdbs=paths2pdbs,
    res_env_radius=res_env_radius,
    Dmin=Dmin,
    Dmax=Dmax,
    Dstep=Dstep,
)

In [None]:
# Reverse-mutation rows are derived from the forward ones, then both halves are
# stacked (forward first) and written to a single CSV.
rev_Features = GenerateReverseFeatures(Features)
abag_forward_and_reverse = pd.concat([Features, rev_Features])
abag_forward_and_reverse.to_csv('./features/AbAg_features.csv')

In [None]:
# Reload the saved table for inspection. index_col is not set, so 'ID' comes back
# as an ordinary column next to a fresh integer index.
abag_features_csv = './features/AbAg_features.csv'
AbAg = pd.read_csv(abag_features_csv)
AbAg

Unnamed: 0,ID,Hyd_Hyd_4,Hyd_Pos_4,Hyd_Neg_4,Hyd_Acc_4,Hyd_Don_4,Hyd_Aro_4,Hyd_Sul_4,Hyd_Neu_4,Pos_Hyd_4,...,‚àÜAAvolume,‚àÜAAhydropathy,‚àÜAAarea,‚àÜAAweight,‚àÜAAcharge,‚àÜAAflexibily,‚àÜAAchemical,‚àÜAAsize,‚àÜAAhbonds,ddG
0,1,91.0,6,6,45,52,85,1,64,6,...,-22.5,5.3,-35.0,-44.010,1.0,-17.0,-2.0,-1.0,-3.0,0.48
1,2,89.0,9,16,64,57,69,0,94,9,...,-78.1,-2.0,-55.0,-42.081,0.0,-8.0,0.0,-3.0,0.0,-0.99
2,3,57.0,12,7,63,60,13,0,101,12,...,-25.5,5.3,-45.0,-43.025,0.0,-35.0,-3.0,-1.0,-2.0,-1.08
3,4,201.0,16,9,97,103,246,0,116,16,...,-27.5,2.5,-25.0,-30.026,0.0,-2.0,-5.0,-1.0,-2.0,0.07
4,5,142.0,15,4,100,94,173,0,101,15,...,-27.5,2.5,-25.0,-30.026,0.0,-2.0,-5.0,-1.0,-2.0,-1.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,1806,161.0,0,6,66,76,224,0,96,0,...,27.5,-2.5,25.0,30.026,-0.0,2.0,5.0,1.0,2.0,-0.34
1796,1807,184.0,2,7,72,73,255,0,122,2,...,105.0,-3.1,115.0,92.097,-0.0,17.0,6.0,4.0,2.0,-3.21
1797,1808,164.0,0,5,60,72,221,1,97,0,...,105.0,-3.1,115.0,92.097,-0.0,17.0,6.0,4.0,2.0,-0.36
1798,1809,306.0,0,6,95,115,483,5,122,0,...,105.0,-3.1,115.0,92.097,-0.0,17.0,6.0,4.0,2.0,-1.72


### Фичи для всего SKEMPIV2 — protein-protein interactions (PPI)

In [None]:
# CSM settings (same values as for the AbAg run).
# Contacts in the CSM are counted over spherical shells of thickness Dstep (4 Å)
# within a radius of res_env_radius (10 Å).
res_env_radius, Dstep = 10, 4
Dmin = Dstep                    # first distance cutoff equals one shell thickness
Dmax = 2 * res_env_radius + 1   # exclusive upper bound for range(Dmin, Dmax, Dstep)

In [None]:
# Path to the table of mutations (forward mutations only).
path2mutations = '../datasets/skempi_v2_short.csv'

# Folders with PDB structures. The catch: residue numbering differs between them,
# and it is not known which numbering a given mutation string in the table follows
# (the original note says this varies for the mCSM-AB2 dataset).
paths2pdbs = [
    '../datasets/SKEMPI2_PDBs_from_rcsb',
    '../datasets/SKEMPI2_PDBs',
]

In [None]:
# Forward-mutation features for the SKEMPI v2 (PPI) table, using the settings above.
PPIFeatures = GenerateFeatures(
    path2mutations=path2mutations,
    paths2pdbs=paths2pdbs,
    res_env_radius=res_env_radius,
    Dmin=Dmin,
    Dmax=Dmax,
    Dstep=Dstep,
)

In [None]:
# NOTE(review): GenerateReverseFeatures, as defined earlier in this notebook, takes the
# index of its result from '../datasets/mCSM-AB2_reverse.csv' by default. For the
# SKEMPI/PPI table that looks like the wrong ID source (and fails if the row counts
# differ) - confirm which reverse-ID file belongs to this dataset.
rev_PPIFeatures = GenerateReverseFeatures(PPIFeatures)
ppi_forward_and_reverse = pd.concat([PPIFeatures, rev_PPIFeatures])
ppi_forward_and_reverse.to_csv('./features/PPI_features.csv')

In [None]:
# Reload the saved PPI feature table for inspection.
ppi_features_csv = './features/PPI_features.csv'
PPI = pd.read_csv(ppi_features_csv)
PPI