# Kinase data preprocessing

This notebook performs the following preprocessing tasks:

1. Set path to KLIFS download files
2. Load, merge and filter KLIFS metadata
3. Remove KLIFS metadata entries with missing mol2 files
4. Download PDB files for KLIFS metadata
5. Remove KLIFS metadata entries with missing PDB files 
6. Remove KLIFS metadata entries with unparsable PDB files
7. Remove KLIFS metadata entries with underscored residue IDs in mol2 file
8. Remove structures with KLIFS residue X
9. Filter by resolution and quality score
10. Save final KLIFS dataset (metadata)

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from pathlib import Path
import sys

from Bio.PDB import PDBList
import numpy as np
import pandas as pd

sys.path.extend(['./..'])
from kinsim_structure.auxiliary import split_klifs_code, get_klifs_regions
from kinsim_structure.preprocessing import get_klifs_metadata_from_files, download_from_pdb
from kinsim_structure.preprocessing import get_species, get_dfg, get_unique_pdbid_per_kinase
from kinsim_structure.preprocessing import drop_missing_mol2s, drop_missing_pdbs, drop_unparsable_pdbs
from kinsim_structure.preprocessing import drop_underscored_residue_ids, drop_residue_x

## Globals

### 1. Set path to KLIFS download files

In [4]:
# Name of the dataset snapshot; it is the last path component of both the
# data directory and the results directory.
dataset_name = '20190724_full'

# Input data: absolute, machine-specific location.
path_to_data = Path('/').joinpath('home', 'dominique', 'Documents', 'data', 'kinsim', dataset_name)

# Results: relative to the parent of the current working directory; the
# folder (and any missing parents) is created if it does not exist yet.
path_to_results = Path('..').joinpath('results', dataset_name)
path_to_results.mkdir(parents=True, exist_ok=True)

### 2. Load, merge and filter KLIFS metadata

#### Load and merge KLIFS download metadata files

In [5]:
# Locations of the two KLIFS metadata files below the raw data folder
klifs_overview_file = path_to_data.joinpath('raw', 'KLIFS_download', 'overview.csv')
klifs_export_file = path_to_data.joinpath('raw', 'KLIFS_export.csv')

In [6]:
# Build the metadata table from the KLIFS overview and export files.
# NOTE(review): loading and merging happen inside
# kinsim_structure.preprocessing and are not visible in this notebook.
klifs_metadata = get_klifs_metadata_from_files(klifs_overview_file, klifs_export_file)

In [7]:
klifs_metadata.shape

(10136, 21)

In [9]:
# First write into the 'preprocessed' folder: create it if necessary, because
# `DataFrame.to_csv` does not create missing parent directories.
(path_to_data / 'preprocessed').mkdir(parents=True, exist_ok=True)

# Snapshot of the merged metadata before any filtering is applied
klifs_metadata.to_csv(path_to_data / 'preprocessed' / 'klifs_metadata_unfiltered.csv')

#### Filter metadata by species

Keep only human entries

In [None]:
klifs_metadata.groupby('species').size()

In [None]:
# Species filter. The result is bound to a new name, so `klifs_metadata` keeps
# referring to the unfiltered table (assuming get_species does not modify its
# input in place — confirm in kinsim_structure.preprocessing).
klifs_metadata_filtered = get_species(klifs_metadata, species='Human')

In [None]:
klifs_metadata_filtered.shape

#### Filter metadata by DFG loop position

Keep only structures with DFG-in loops.

In [None]:
klifs_metadata_filtered.groupby('dfg').size()

In [None]:
# DFG filter applied on top of the species filter. The name is rebound, so the
# previous table is only recoverable by re-running the cells above.
klifs_metadata_filtered = get_dfg(klifs_metadata_filtered, dfg='in')

In [None]:
klifs_metadata_filtered.shape

#### Filter metadata by unique kinase-PDB ID combinations
Keep only the KLIFS entry per kinase-PDB ID combination with the best quality score.

In [None]:
# Reduce the table to one entry per kinase-PDB ID combination.
# NOTE(review): the selection criterion (best quality score, per the notebook
# text) is implemented inside the helper and cannot be verified from here.
klifs_metadata_filtered = get_unique_pdbid_per_kinase(klifs_metadata_filtered)

In [None]:
klifs_metadata_filtered.shape

In [None]:
# Checkpoint after step 2 (species, DFG and uniqueness filters).
# The DataFrame index is written as the first CSV column (pandas default).
klifs_metadata_filtered.to_csv(path_to_data / 'preprocessed' / 'klifs_metadata_step2result.csv')

### 3. Remove KLIFS metadata entries with missing mol2 files

In [None]:
# Drop metadata entries without a mol2 file.
# presumably the helper looks the files up below `path_to_data` — confirm.
klifs_metadata_filtered = drop_missing_mol2s(klifs_metadata_filtered, path_to_data)

In [None]:
klifs_metadata_filtered.shape

In [None]:
print(f'Number of unique PDB IDs in dataset: {klifs_metadata_filtered.pdb_id.unique().size}')

In [None]:
# Show all entries whose PDB ID occurs in more than one row of the table.
# NOTE(review): the grouping key is `pdb_id` alone, so this reports PDB IDs
# that are repeated anywhere in the table, not repeats within a single kinase.
grouped = klifs_metadata_filtered.groupby('pdb_id')['kinase'].size()  # number of rows per PDB ID
multiple_pdb_ids = list(grouped[grouped > 1].index)  # PDB IDs with at least two rows
klifs_metadata_filtered.loc[klifs_metadata_filtered.pdb_id.isin(multiple_pdb_ids)].sort_values('pdb_id')

### 4. Download PDB files for KLIFS metadata

In [None]:
# Download cif files for the entries in the metadata table.
# NOTE(review): skipping files that already exist is expected to happen inside
# download_from_pdb; that logic is not visible here — confirm.
download_from_pdb(klifs_metadata_filtered, path_to_data)

### 5. Remove KLIFS metadata entries with missing PDB files 

Check whether a CIF file has been downloaded for every PDB ID in the KLIFS metadata. For PDB IDs without a corresponding CIF file, remove the corresponding entries from the KLIFS metadata.

In [None]:
# PDB IDs referenced by the KLIFS metadata
pdb_ids_metadata = klifs_metadata_filtered.pdb_id.unique()

# PDB IDs for which a CIF file is present in the download folder.
# Only `*.cif` entries are considered, so that other files or sub-directories
# in that folder cannot be mistaken for a downloaded structure.
# Assumes downloads are stored as `<pdb_id>.cif` (the naming Bio.PDB.PDBList
# uses for mmCIF files) — confirm against download_from_pdb.
pdb_ids_ciffiles = [i.stem for i in (path_to_data / 'raw' / 'PDB_download').glob('*.cif')]

In [None]:
# PDB IDs that are listed in the KLIFS metadata but have no downloaded file
# (e.g. deprecated PDB entries)
missing_cifs = set(pdb_ids_metadata).difference(pdb_ids_ciffiles)
print(f'Number of KLIFS metadata PDB IDs with missing CIF file: {len(missing_cifs)}')

In [None]:
# Retry the download for every PDB ID whose CIF file is still missing.
# The mmCIF format is requested explicitly: the following steps work on CIF
# files, and the default download format of `retrieve_pdb_file` has changed
# between Biopython releases.
pdbfile = PDBList()
for i in missing_cifs:
    pdbfile.retrieve_pdb_file(i, pdir=path_to_data / 'raw' / 'PDB_download', file_format='mmCif')

In [None]:
# Remove metadata entries whose PDB ID has no cif file.
# presumably the helper checks for the files below `path_to_data` — confirm.
klifs_metadata_filtered = drop_missing_pdbs(klifs_metadata_filtered, path_to_data)

In [None]:
klifs_metadata_filtered.shape

### 6. Remove KLIFS metadata entries with unparsable PDB files

Remove entries whose structure file cannot be parsed with `Bio.PDB.MMCIFParser`.

In [None]:
# Keep only entries whose structure file can be parsed; the parsing check runs
# inside the helper (files presumably located below `path_to_data` — confirm).
klifs_metadata_filtered = drop_unparsable_pdbs(klifs_metadata_filtered, path_to_data)

In [None]:
klifs_metadata_filtered.shape

### 7. Remove KLIFS metadata entries with underscored residue IDs in mol2 file

In [None]:
# Drop entries with underscored residue IDs.
# NOTE(review): unlike drop_missing_mol2s / drop_missing_pdbs /
# drop_unparsable_pdbs, no data path is passed here — confirm how the helper
# locates the mol2 files it inspects.
klifs_metadata_filtered = drop_underscored_residue_ids(klifs_metadata_filtered)

In [None]:
klifs_metadata_filtered.shape

In [None]:
# Checkpoint before the residue-X filter ('incl_resX' = structures with
# residue X are still included). The DataFrame index is written as the first
# CSV column (pandas default).
klifs_metadata_filtered.to_csv(path_to_data / 'preprocessed' / 'klifs_metadata_preprocessed_incl_resX.csv')

In [None]:
# Reload the 'incl_resX' checkpoint from disk.
# `index_col=0` restores the index that `to_csv` stored as the first column;
# without it pandas adds a spurious 'Unnamed: 0' column, which would then be
# carried into every later CSV export of this table.
klifs_metadata_filtered = pd.read_csv(
    path_to_data / 'preprocessed' / 'klifs_metadata_preprocessed_incl_resX.csv',
    index_col=0,
)

### 8. Remove structures with KLIFS residue X

Some structures contain mutations or modifications in their KLIFS binding site. KLIFS denotes these with an X in the pocket sequence.

We remove all structures containing such a residue in important regions in the binding site.

In [None]:
# Drop structures with a residue X in the KLIFS pocket.
# NOTE(review): which pocket regions count as "important" is decided inside
# drop_residue_x and is not visible in this notebook.
klifs_metadata_filtered = drop_residue_x(klifs_metadata_filtered)

In [None]:
klifs_metadata_filtered.shape

In [None]:
klifs_metadata_filtered[klifs_metadata_filtered.pdb_id=='4otp']

In [None]:
# Checkpoint written before the resolution / quality score filter of step 9,
# i.e. entries failing those thresholds are still included in this file.
klifs_metadata_filtered.to_csv(path_to_data / 'preprocessed' / 'klifs_metadata_preprocessed_incl_resolution_qualityscore.csv')

### 9. Filter by resolution and quality score

In [None]:
# Keep entries with a resolution of at most 4 and a quality score of at
# least 4; `.copy()` detaches the result from the unfiltered table.
klifs_metadata_filtered = klifs_metadata_filtered.loc[
    lambda df: (df.resolution <= 4) & (df.qualityscore >= 4)
].copy()

In [None]:
klifs_metadata_filtered.shape

### 10. Save final KLIFS dataset (metadata)

In [None]:
klifs_metadata_filtered.shape

In [None]:
# Rename the column 'index' to 'metadata_index', modifying the table in place.
# With pandas' default `errors='ignore'` this is a no-op if no such column exists.
# presumably 'index' holds the row index of an earlier metadata table — confirm.
klifs_metadata_filtered.rename(
    columns={'index': 'metadata_index'}, inplace=True
)

In [None]:
# One identifier per structure, in row order:
#   SPECIES/kinase/pdb_id[_chain<chain>][_alt<alternate_model>]
# A '-' in the chain or alternate_model column contributes no suffix.
codes = []

for species, kinase, pdb_id, chain, alternate_model in zip(
    klifs_metadata_filtered.species,
    klifs_metadata_filtered.kinase,
    klifs_metadata_filtered.pdb_id,
    klifs_metadata_filtered.chain,
    klifs_metadata_filtered.alternate_model,
):
    chain_suffix = '' if chain == '-' else f'_chain{chain}'
    alternate_model_suffix = '' if alternate_model == '-' else f'_alt{alternate_model}'
    codes.append(f'{species.upper()}/{kinase}/{pdb_id}{chain_suffix}{alternate_model_suffix}')

# Preview the first identifiers
codes[:10]

In [None]:
# Attach the identifiers as a new column; a plain list is assigned by
# position, so `codes` must have been built from this table in its current row order.
klifs_metadata_filtered['code'] = codes

In [None]:
klifs_metadata_filtered

In [None]:
# Final preprocessed metadata table (all filters applied, 'code' column added).
# The DataFrame index is written as the first CSV column (pandas default).
klifs_metadata_filtered.to_csv(path_to_data / 'preprocessed' / 'klifs_metadata_preprocessed.csv')