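# Preprocessing

This notebook walks through the KLIFS preprocessing results: it loads the KLIFS metadata, summarizes the irregularities reported by the mol2 format screening, reads the mol2-to-pdb conversion log, and exports the filtered metadata.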

## Imports

In [1]:
# Load IPython's `autoreload` extension, which provides the `%autoreload` magic.
%load_ext autoreload

In [2]:
# Mode 2: re-import all modules before each cell runs, so edits to the
# project-local package are picked up without restarting the kernel.
%autoreload 2

In [3]:
import os
import pickle
import sys
import warnings
from pathlib import Path

import pandas as pd

# Extend the module search path with '../..' BEFORE importing the project-local
# `kinsim_structure` package; the order of these two lines matters.
# NOTE(review): presumably '../..' is the repository root relative to this
# notebook. A relative entry depends on the kernel's working directory, so
# confirm the notebook is launched from its own folder.
sys.path.append('../..')
from kinsim_structure.preprocessing import KlifsMetadataLoader, Mol2FormatScreener, Mol2KlifsToPymolConverter, Mol2ToPdbConverter, KlifsMetadataFilter

In [4]:
warnings.simplefilter('ignore')  # Caution: Suppresses Biopython warnings

## Dataset

In [5]:
# Location of the KLIFS download folder. The environment variable
# KINSIM_KLIFS_DOWNLOAD overrides it, so the notebook can run on other machines
# without editing this cell; when unset, the original local path is used.
PATH_KLIFS_DOWNLOAD = Path(
    os.environ.get(
        'KINSIM_KLIFS_DOWNLOAD',
        '/home/dominique/Documents/data/kinsim/20191115_full/KLIFS_download',
    )
)

## KlifsMetadataLoader

In [6]:
# Build the metadata loader from the two CSV files shipped in the KLIFS download folder.
path_overview_csv = PATH_KLIFS_DOWNLOAD / 'overview.csv'
path_klifs_export_csv = PATH_KLIFS_DOWNLOAD / 'KLIFS_export.csv'

klifs_metadata_loader = KlifsMetadataLoader()
klifs_metadata_loader.from_files(path_overview_csv, path_klifs_export_csv)

In [7]:
# Keep a handle on the loader's `data_essential` table and preview its first five rows.
klifs_metadata = klifs_metadata_loader.data_essential
klifs_metadata.head(n=5)

Unnamed: 0,pdb_id,alternate_model,chain,kinase,kinase_all,family,groups,species,dfg,ac_helix,pocket,rmsd1,rmsd2,qualityscore,resolution,missing_residues,missing_atoms,filepath
0,3dko,A,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,out,out,RVIGA_EFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.944,2.254,8.1,2.0,3,7,HUMAN/EphA7/3dko_altA_chainA
1,2rei,B,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,in,in,RVIGAGEFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.784,2.094,7.6,1.6,1,0,HUMAN/EphA7/2rei_altB_chainA
2,3dko,B,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,out,out,RVIGA_EFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.944,2.254,8.1,2.0,3,7,HUMAN/EphA7/3dko_altB_chainA
3,2rei,A,A,EphA7,"[EPHA7, EphA7]",Eph,TK,Human,in,in,RVIGAGEFGEVCSVAIKTLDFLCEASIMGQFDPNVVHLEGVMIVIE...,0.784,2.093,7.6,1.6,1,0,HUMAN/EphA7/2rei_altA_chainA
4,3v8t,B,A,ITK,[ITK],Tec,TK,Human,in,out-like,QEIGSG___LVHLVAIKTIDFIEEAEVMMKLSPKLVQLYGVCLVFE...,0.842,2.047,7.8,2.0,4,6,HUMAN/ITK/3v8t_altB_chainA


In [8]:
# Write the metadata table to the parent of the KLIFS download folder.
klifs_metadata_csv_path = PATH_KLIFS_DOWNLOAD.parent / 'klifs_metadata.csv'
klifs_metadata.to_csv(klifs_metadata_csv_path)

## Mol2FormatScreener

In [9]:
# Restore the previously pickled screener object from the parent of the download folder.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were produced by a trusted run of this project.
mol2_format_screener_path = PATH_KLIFS_DOWNLOAD.parent / 'mol2_format_screener.p'
with mol2_format_screener_path.open(mode='rb') as pickle_file:
    mol2_format_screener = pickle.load(pickle_file)

### Underscored residues

In [10]:
# Group the underscored-residue records per `molecule_code`; the number of
# groups is the number of affected structures.
residues_underscored = mol2_format_screener.structures_irregular['residues_underscored']
structures = residues_underscored.groupby(by='molecule_code')
print(f'Number of structures with underscored residues: {len(structures)}')

Number of structures with underscored residues: 208


In [11]:
# Preview the first five underscored-residue records.
mol2_format_screener.structures_irregular['residues_underscored'].head(n=5)

Unnamed: 0,molecule_code,res_id,res_name,subst_name
0,3v5q.B,-3,GLY,GLY_3
1,3v5q.B,-2,ILE,ILE_2
2,3v5q.B,-1,HIS,HIS_1
0,4ymj.A,-3,GLY,GLY_3
1,4ymj.A,-2,ILE,ILE_2


### Non-standard residues

In [12]:
# Group the non-standard-residue records per `molecule_code`; the number of
# groups is the number of affected structures.
residues_non_standard = mol2_format_screener.structures_irregular['residues_non_standard']
structures = residues_non_standard.groupby(by='molecule_code')
print(f'Number of structures with non-standard residues: {len(structures)}')

Number of structures with non-standard residues: 1754


In [13]:
# Group the non-standard-residue records per residue name and report how many
# distinct names occur, followed by the names themselves.
residues_non_standard = mol2_format_screener.structures_irregular['residues_non_standard']
non_standard_residues = residues_non_standard.groupby(by='res_name')
print(
    f'Number of unique non-standard residues: {len(non_standard_residues)}, i.e. {list(non_standard_residues.groups.keys())}'
)

Number of unique non-standard residues: 30, i.e. ['ACE', 'ACY', 'ALY', 'AME', 'CAF', 'CAS', 'CME', 'CSD', 'CSO', 'CSS', 'CSX', 'CXM', 'CY0', 'GLC', 'KCX', 'LGY', 'MAN', 'MHO', 'NEP', 'NMM', 'OCS', 'OCY', 'PHD', 'PO2', 'PTL', 'PTR', 'SCS', 'SEP', 'SIN', 'TPO']


In [14]:
# Preview the first five non-standard-residue records.
mol2_format_screener.structures_irregular['residues_non_standard'].head(n=5)

Unnamed: 0,molecule_code,res_id,res_name,subst_name
0,4ymj.A,706,SEP,SEP706
0,4ymj.B,706,SEP,SEP706
0,4ymj.B,706,SEP,SEP706
0,4ymj.A,706,SEP,SEP706
0,2vag.A,341,SEP,SEP341


### Residues with duplicated atom names

In [15]:
# Group the duplicated-atom-name records per `molecule_code`; the number of
# groups is the number of affected structures.
residues_duplicated_atom_names = mol2_format_screener.structures_irregular['residues_duplicated_atom_names']
structures = residues_duplicated_atom_names.groupby(by='molecule_code')
print(f'Number of structures with residues with duplicated atom names: {len(structures)}')

Number of structures with residues with duplicated atom names: 6689


In [16]:
# Collect the distinct atom names that occur more than once within a residue.
residues_duplicated_atom_names = mol2_format_screener.structures_irregular['residues_duplicated_atom_names']
atom_names = residues_duplicated_atom_names['atom_name'].unique()
print(f'Duplicated atom names: {list(atom_names)}')

Duplicated atom names: ['H', 'H1', 'H2', 'HG2', 'HG3', 'H3', 'H4']


In [17]:
# Preview the first five duplicated-atom-name records.
mol2_format_screener.structures_irregular['residues_duplicated_atom_names'].head(n=5)

Unnamed: 0,molecule_code,res_id,res_name,subst_name,atom_name
0,3dko.A,641,ALA,ALA641,H
1,3dko.A,641,ALA,ALA641,H
2,3dko.A,778,GLY,GLY778,H
3,3dko.A,778,GLY,GLY778,H
4,3dko.A,902,HIS,HIS902,H


## Mol2KlifsToPymolConverter

*TODO: no results are shown for this step yet.*

## Mol2ToPdbConverter

In [18]:
# Read the converter log (one list entry per line, line endings kept).
# NOTE(review): `lines` is not used by any cell visible in this notebook.
converter_log_path = PATH_KLIFS_DOWNLOAD.parent / 'mol2_to_pdb_converter.log'
with converter_log_path.open(mode='r') as log_file:
    lines = list(log_file)

## KlifsMetadataFilter

In [19]:
# Restore the previously pickled metadata filter object.
# NOTE(review): this file is read from PATH_KLIFS_DOWNLOAD itself, whereas the
# screener pickle, the converter log and the CSV exports all live in
# PATH_KLIFS_DOWNLOAD.parent; confirm the location is intended.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were produced by a trusted run of this project.
klifs_metadata_filter_path = PATH_KLIFS_DOWNLOAD / 'klifs_metadata_filter.p'
with klifs_metadata_filter_path.open(mode='rb') as pickle_file:
    klifs_metadata_filter = pickle.load(pickle_file)

In [20]:
# Display the filter's `filtering_statistics` table: one row per filtering step
# with the number of entries removed (`n_filtered`) and kept (`n_remained`).
klifs_metadata_filter.filtering_statistics

Unnamed: 0,filtering_step,n_filtered,n_remained
0,Unfiltered,0,10469
1,Only Human,505,9964
2,Only DFG in,1298,8666
3,Only resolution <= 4,42,8624
4,Only quality score >= 4,18,8606
5,Only existing pocket/protein mol2 files,0,8606
6,Only existing protein pdb files,0,8606
7,Only parsable protein pdb files,0,8606
8,Only clean residue IDs (no underscores in pock...,3,8603
9,Only without X residues at important KLIFS pos...,1,8602


In [21]:
# Export the filter's `filtered` table to the parent of the KLIFS download folder.
klifs_metadata_filtered_csv_path = PATH_KLIFS_DOWNLOAD.parent / 'klifs_metadata_filtered.csv'
klifs_metadata_filtered = klifs_metadata_filter.filtered
klifs_metadata_filtered.to_csv(klifs_metadata_filtered_csv_path)