# Preprocessing

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import os
from pathlib import Path
import pickle
import sys
import warnings

sys.path.append('../..')
from kinsim_structure.preprocessing import KlifsMetadataLoader, Mol2ToPdbConverter, KlifsMetadataFilter

In [4]:
# Caution: with no category given, this silences ALL Python warnings globally,
# not only the Biopython warnings it is meant to hide.
warnings.simplefilter('ignore')

In [5]:
# Directory holding the KLIFS download (overview.csv, KLIFS_export.csv, ...).
# Set the KLIFS_DOWNLOAD_PATH environment variable to run this notebook on another
# machine; when it is unset, the path the notebook was originally run with is used.
PATH_KLIFS_DOWNLOAD = Path(
    os.environ.get(
        'KLIFS_DOWNLOAD_PATH',
        '/home/dominique/Documents/data/kinsim/20191115_full/KLIFS_download'
    )
)

## KlifsMetadataLoader

In [6]:
# Load the KLIFS metadata from the two CSV files in the download directory.
klifs_overview_file = PATH_KLIFS_DOWNLOAD / 'overview.csv'
klifs_export_file = PATH_KLIFS_DOWNLOAD / 'KLIFS_export.csv'

klifs_metadata_loader = KlifsMetadataLoader()
klifs_metadata_loader.from_files(klifs_overview_file, klifs_export_file)

In [7]:
# Keep the loader's `data_essential` table as the working metadata and preview its first rows.
klifs_metadata = klifs_metadata_loader.data_essential
klifs_metadata.head()

Unnamed: 0,pdb_id,alternate_model,chain,kinase,kinase_all,family,groups,species,dfg,ac_helix,pocket,rmsd1,rmsd2,qualityscore,resolution,missing_residues,missing_atoms,filepath
0,3dko,A,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,out,out,RVIGA_EFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.944,2.254,8.1,2.0,3,7,HUMAN/EphA7/3dko_altA_chainA
1,2rei,B,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,in,in,RVIGAGEFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.784,2.094,7.6,1.6,1,0,HUMAN/EphA7/2rei_altB_chainA
2,3dko,B,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,out,out,RVIGA_EFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.944,2.254,8.1,2.0,3,7,HUMAN/EphA7/3dko_altB_chainA
3,2rei,A,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,in,in,RVIGAGEFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.784,2.093,7.6,1.6,1,0,HUMAN/EphA7/2rei_altA_chainA
4,3v8t,B,A,ITK,[ITK],Tec,TK,Human,in,out-like,QEIGSG___LVHLVAIKTIDFIEEAEVMMKLSPKLVQLYGVCLVFE...,0.842,2.047,7.8,2.0,4,6,HUMAN/ITK/3v8t_altB_chainA


In [8]:
# Save the metadata table, before any filtering is applied, into the download directory.
unfiltered_metadata_file = PATH_KLIFS_DOWNLOAD / 'klifs_metadata_unfiltered.csv'
klifs_metadata.to_csv(unfiltered_metadata_file)

## KlifsMetadataFilter

In [9]:
# Print the source of the metadata filtering script verbatim.
# The context manager guarantees the file handle is closed; a bare open() would
# leave it dangling for the lifetime of the kernel.
with open('../scripts/klifs_metadata_filter.py') as script_file:
    file_content = script_file.readlines()

# Lines already carry their own newlines, so join them without a separator and
# suppress print's trailing newline to reproduce the file exactly.
print(''.join(file_content), end="")

import logging

from pathlib import Path
import pickle
import sys

sys.path.append('../..')
from kinsim_structure.preprocessing import KlifsMetadataLoader, KlifsMetadataFilter

PATH_SCRIPT = Path(__name__).parent
PATH_KLIFS_DOWNLOAD = Path('/home/dominique/Documents/data/kinsim/20191115_full/KLIFS_download')

logger = logging.getLogger(__name__)
logging.basicConfig(
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    filename=PATH_SCRIPT / 'klifs_metadata_filter.log',
    filemode='w',
    level=logging.INFO
)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logging.getLogger('').addHandler(console)


def main():

    # Load metadata
    klifs_metadata_loader = KlifsMetadataLoader()
    klifs_metadata_loader.from_files(
        PATH_KLIFS_DOWNLOAD / 'overview.csv',
        PATH_KLIFS_DOWNLOAD / 'KLIFS_export.csv'
    )

    # Filter metadata
    klifs_metadata_filter = KlifsMetadataFilter()
    klifs_metadata_filter.from_klifs_metadata(klifs_me

In [10]:
# Restore a previously pickled filter object rather than re-running the filtering here.
# NOTE(review): presumably written by ../scripts/klifs_metadata_filter.py — confirm.
# pickle.load can execute arbitrary code, so only point this at a file you produced yourself.
filter_pickle_path = PATH_KLIFS_DOWNLOAD / 'klifs_metadata_filter.p'
with filter_pickle_path.open('rb') as pickle_file:
    klifs_metadata_filter = pickle.load(pickle_file)

In [11]:
# Per-step summary of the filter: how many entries each step removed and how many remained.
klifs_metadata_filter.filtering_statistics

Unnamed: 0,filtering_step,n_filtered,n_remained
0,Unfiltered,0,10469
1,Only Human,505,9964
2,Only DFG in,1298,8666
3,Only resolution <= 4,42,8624
4,Only quality score >= 4,18,8606
5,Only existing pocket/protein.mol2 files,0,8606
6,Only existing protein.pdb files,0,8606
7,Only parsable protein.pdb files,0,8606
8,Only clean residue IDs (no underscores in pock...,3,8603
9,Only without X residues at important KLIFS pos...,1,8602


In [12]:
# Indices recorded for the filter steps that track individual entries
# (presumably row indices of the unfiltered metadata table — confirm).
klifs_metadata_filter.filtered_indices

{'non_existing_mol2s': [],
 'non_existing_pdbs': [],
 'non_parsable_pdbs': [],
 'with_underscored_residues': [7033, 7920, 7921],
 'with_x_residue_at_important_klifs_position': [8098]}