In [97]:
import re
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from Bio.PDB import *
from Bio.Data.IUPACData import protein_letters_1to3
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings('ignore')

In [98]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.atomic import add_atomic_edges
from graphein.protein.graphs import construct_graph, read_pdb_to_dataframe
from graphein.protein.subgraphs import extract_subgraph_from_point
from graphein.protein.utils import save_graph_to_pdb
from graphein.molecule.edges.distance import compute_distmat, get_interacting_atoms
from graphein.protein.visualisation import plotly_protein_structure_graph

In [99]:
from rdkit import Chem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

In [100]:
MutationSet = pd.read_csv('../datasets/mCSM-AB2_dataset_short.csv')

AtomClass = factory.GetFeatureFamilies()

residue_environment_radius = 10

Dmin = 0
Dstep = 10 
Dmax = residue_environment_radius * 2

graphs_dict = {}

In [101]:
def extractResidueEnvironment(Mutation, pdb_id, mutation, chain):

    aa, num_aa, mut_aa = re.match(r"([A-Za-z])(\d+)([A-Za-z])", mutation).groups() #F149A –> F+149+A
    aa = protein_letters_1to3[aa].upper() # F –> Phe –> PHE


    # Делаем граф из всего белка, если еще не делали на предыдущем шаге
    if pdb_id not in graphs_dict.keys():
        print('он опять строит граф')
        params_to_change = {"granularity": "atom", "edge_construction_functions": [add_atomic_edges]}
        config = ProteinGraphConfig(**params_to_change)
        graph = construct_graph(config=config, pdb_code=pdb_id)
        # Сохраняем граф в словаре
        graphs_dict[pdb_id] = graph
    else:
        # Если граф уже существует, используем его
        graph = graphs_dict[pdb_id]
    
    # Поиск координат СА мутирующего остатка
    for node, data in graph.nodes(data=True):
        if f'{chain}:{aa}:{num_aa}:CA' in node:
            mut_center = data['coords']
    

    # Выделение подграфа residue_environment_sg вокруг мутируемого остатка
    residue_environment_sg = extract_subgraph_from_point(graph, centre_point=(mut_center), radius=residue_environment_radius)


    # Сохранение подграфа residue_environment_sg в pdb формате
    save_graph_to_pdb(residue_environment_sg, f'./cutPDBs/{pdb_id}_{mutation}_cut.pdb')

    # Перевод pdb в датафрейм
    residue_environment = read_pdb_to_dataframe(f'./cutPDBs/{pdb_id}_{mutation}_cut.pdb')

    return residue_environment

In [102]:
def extractEnvironmentFeats(Mutation, pdb_id, mutation):
    
    # Создание меток фармакофоров для атомов окружения
    mol = Chem.MolFromPDBFile(f'./cutPDBs/{pdb_id}_{mutation}_cut.pdb')
    features = factory.GetFeaturesForMol(mol)

    return features

In [104]:
# Проверка, как выглядит residue_environment_sg
# plotly_protein_structure_graph(residue_environment_sg, node_size_min=4, node_size_multiplier=2)

In [105]:
def calculateAtomicPairwiseDist(df):
    coords = df.filter(like='_coord')
    coords

    distMatrix = compute_distmat(coords)
    distMatrix
    return distMatrix

In [107]:
def getFrequency(distMatrix, dist, cls, feats, residue_environment):
    # Выделить из feats координаты определенного cls
    coords_for_cls = []
    for feat in feats:
        if feat.GetFamily() == cls:
            coords_for_cls.append(list(feat.GetPos()))

    # Выделить индексы соответсвующих 
    indexes_for_cls = []
    for index, row in residue_environment.iterrows():
        df_coord = [row['x_coord'], row['y_coord'], row['z_coord']]
        if  df_coord in coords_for_cls:
            indexes_for_cls.append(index)
    print(indexes_for_cls)

    # Отфильтровать матрицу расстояний, оставив только строки и столбцы соответсвующие фармакофорам вида cls
    distMatrix_for_cls = distMatrix[indexes_for_cls][:, indexes_for_cls]
    
    # Считаю кол-во ячеек матрицы, значения в которых меньше dist, и делю на два (матрица симметричная)
    arr = np.where(distMatrix_for_cls <= dist, 1, 0)
    frequency = np.count_nonzero(arr) / 2 

    return frequency

In [112]:
def Generate_mCSM(MutationSet, AtomClass, Dmin, Dmax, Dstep):
    mCSM = []
    for index, Mutation in MutationSet.iterrows():

        pdb_id = Mutation['PDB']
        mutation = Mutation['mutation']
        chain = Mutation['chain']

        residue_environment = extractResidueEnvironment(Mutation, pdb_id, mutation, chain)
        feats = extractEnvironmentFeats(Mutation, pdb_id, mutation)
        j = 0
        mCSM_row = []
        distMatrix = calculateAtomicPairwiseDist(residue_environment)
        for dist in range(Dmin, Dmax, Dstep):
            for cls in AtomClass:
                mCSM_row.append(getFrequency(distMatrix, dist, cls, feats, residue_environment))
                j += 1
        mCSM.append(mCSM_row)
    return mCSM

In [None]:
mCSM = Generate_mCSM(MutationSet, AtomClass, Dmin, Dmax, Dstep)

In [114]:
graphs_dict.items()

dict_items([('1FC2', <networkx.classes.graph.Graph object at 0x12d9dc370>), ('1FCC', <networkx.classes.graph.Graph object at 0x12df4ded0>)])

In [115]:
df = pd.DataFrame(mCSM, columns=list(AtomClass) * 2)
df

Unnamed: 0,Donor,Acceptor,NegIonizable,PosIonizable,ZnBinder,Aromatic,Hydrophobe,LumpedHydrophobe,Donor.1,Acceptor.1,NegIonizable.1,PosIonizable.1,ZnBinder.1,Aromatic.1,Hydrophobe.1,LumpedHydrophobe.1
0,13.0,12.0,0.0,1.5,0.0,0.0,12.0,0.0,233.0,181.0,0.0,3.0,0.0,0.0,259.0,0.0
1,13.0,12.0,0.0,1.5,0.0,0.0,12.0,0.0,233.0,181.0,0.0,3.0,0.0,0.0,259.0,0.0
2,13.0,11.0,0.0,1.5,0.0,0.0,13.5,0.0,266.5,172.0,0.0,2.5,0.0,0.0,310.5,0.0
3,15.0,14.0,0.0,3.5,0.0,0.0,14.0,0.5,299.5,262.0,0.0,16.5,0.0,0.0,350.5,0.5
4,13.0,11.5,0.0,3.0,0.0,0.0,7.5,0.0,267.0,243.5,0.0,10.0,0.0,0.0,118.5,0.0
5,14.0,13.5,0.0,1.5,0.0,0.0,13.5,0.0,290.0,222.0,0.0,3.5,0.0,0.0,297.0,0.0
6,8.5,8.5,0.0,0.5,0.0,0.0,8.0,0.5,107.5,111.5,0.0,1.0,0.0,0.0,123.0,0.5
7,11.5,10.5,0.0,2.0,0.0,0.0,13.0,0.0,251.0,153.5,0.0,5.5,0.0,0.0,302.5,0.0
8,11.0,10.0,0.0,2.0,0.0,0.0,13.5,0.5,249.5,216.5,0.0,6.0,0.0,0.0,279.5,2.0
9,15.5,14.5,0.0,2.5,0.0,0.0,11.0,0.0,282.0,268.0,0.0,3.5,0.0,0.0,154.0,0.0
