# Imports used in this Notebook

In [1]:
# Standard library
import csv

# Third-party packages
import pandas as pd
from opencadd.databases.klifs import setup_remote
from opencadd.structure.core import Structure

# Open the remote KLIFS session that the dataset-building cell queries
remote = setup_remote()

# Creating the Dataset with Kinase Structures from KLIFS

In [2]:
# all_kinases() does not return groups/families, so only the KLIFS IDs are taken from it
all_IDs = remote.kinases.all_kinases()['kinase.klifs_id'].tolist()
num_kin = len(all_IDs) # should be 1127 according to tutorial

# Query every kinase individually to obtain its group/family and concatenate the
# results once. DataFrame.append (used here before) was deprecated in pandas 1.4
# and removed in pandas 2.0, and it re-copied the growing frame on every iteration.
all_kinases = pd.concat([remote.kinases.by_kinase_klifs_id(ID) for ID in all_IDs])

# get all pdb_ids together with the structure annotations kept in the dataset
all_struc_IDs = remote.structures.all_structures()[["structure.klifs_id", 'structure.pdb_id',
                                                    'kinase.klifs_id', 'structure.pocket',
                                                    'ligand.expo_id', 'structure.dfg', 'structure.ac_helix',
                                                    'structure.resolution', 'structure.qualityscore',
                                                    'structure.missing_residues', 'structure.missing_atoms',
                                                    'structure.rmsd1', 'structure.rmsd2']]
num_all_struc = all_struc_IDs.shape[0]
# Drop all duplicates in pdb_ids, because we download them from the PDB itself.
# KLIFS differentiates the chains, so if a Kinase has two structures, the pdb_id is two times in this dataset.
all_pdbs = all_struc_IDs.drop_duplicates(subset=['structure.pdb_id'])
num_duplicates = num_all_struc - all_pdbs.shape[0]
# Count the NaNs in pdb_ids before dropping them (should be 0)
num_na = all_pdbs['structure.pdb_id'].isnull().sum()
# Drop all NaNs if there are any (should be 0)
all_pdbs = all_pdbs.dropna(subset=['structure.pdb_id'])
num_all_pdbs = all_pdbs.shape[0]

# merge the pdb_ids to the kinases DataFrame; the left join keeps kinases without
# any structure, which then show up with a NaN pdb_id
klifs = pd.merge(all_kinases, all_pdbs, how='left',  on='kinase.klifs_id')
kinase_wo_struc = klifs['structure.pdb_id'].isnull().sum()

# drop every entry where no pdb_id was found
klifs = klifs.dropna(subset=['structure.pdb_id'])
num_merged = klifs.shape[0]
num_kin_merged = klifs['kinase.klifs_id'].nunique()

# statistics:
print(f"Number of Kinases: {num_kin}") # 1127
print(f"Number of all structures: {num_all_struc}") # 12572
print(f"Number of duplicates in structures: {num_duplicates}") # 6763
print(f"Number of NAs in structures: {num_na}") # 0
print(f"Number of NAs in merged dataset: {kinase_wo_struc}") # 787
print(f"Number of structures in merged dataset: {num_merged}") # 5809
print(f"Number of kinases in merged dataset: {num_kin_merged}") # 340

# write the DataFrame to local file for further usage
#klifs.to_csv("../data/samples/<NAME_OF_FILE>", mode="w", header=True, index = False) # here the file was "20211102_klifs_dataset.csv"

Number of Kinases: 1127
Number of all structures: 12572
Number of duplicates in structures: 6763
Number of NAs in structures: 0
Number of NAs in merged dataset: 787
Number of structures in merged dataset: 5809
Number of kinases in merged dataset: 340


In [3]:
# Load the stored KLIFS dataset snapshot (the file name carries the stamp 20211102)
klifs_dataset_path = "../data/samples/20211102_klifs_dataset.csv"
klifs_df = pd.read_csv(klifs_dataset_path)

In [4]:
# Show the loaded dataset; the rendering below is abbreviated with "..." for rows and columns
klifs_df

Unnamed: 0,kinase.klifs_id,kinase.klifs_name,kinase.full_name,kinase.gene_name,kinase.family,kinase.group,kinase.subfamily,species.klifs,kinase.uniprot,kinase.iuphar,...,structure.pocket,ligand.expo_id,structure.dfg,structure.ac_helix,structure.resolution,structure.qualityscore,structure.missing_residues,structure.missing_atoms,structure.rmsd1,structure.rmsd2
0,531,AAK1,AP2 associated kinase 1,Aak1,NAK,Other,,Mouse,Q3UHJ0,0,...,EVLAEGGFALVFLCALKRMVCKREIQIMRDL_KNIVGYIDSLILMD...,YFS,in,in,2.65,7.2,2.0,58.0,0.780,2.112
1,531,AAK1,AP2 associated kinase 1,Aak1,NAK,Other,,Mouse,Q3UHJ0,0,...,EVLAEGGFALVFLCALKRMVCKREIQIMRDL_KNIVGYIDSLILMD...,YFY,in,in,2.20,7.6,1.0,42.0,0.780,2.121
2,277,AAK1,AP2 associated kinase 1,AAK1,NAK,Other,BIKE,Human,Q2M2I8,1921,...,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,KSA,in,in,1.95,8.6,0.0,14.0,0.777,2.125
3,277,AAK1,AP2 associated kinase 1,AAK1,NAK,Other,BIKE,Human,Q2M2I8,1921,...,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,LKB,in,in,1.97,9.3,0.0,7.0,0.778,2.130
4,277,AAK1,AP2 associated kinase 1,AAK1,NAK,Other,BIKE,Human,Q2M2I8,1921,...,EVLAEGGFAIVFLCALKRMVCKREIQIMRDLSKNIVGYIDSLILMD...,XIN,in,in,1.90,8.8,0.0,12.0,0.776,2.120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5804,338,Wnk3,WNK lysine deficient protein kinase 3,WNK3,WNK,Other,,Human,Q9BYP7,2282,...,IELGRGAFKTVYKVAWCELRFKEEAEMLKGLQPNIVRFYDSVLVTE...,KS1,in,out,2.04,8.0,0.0,25.0,0.800,2.213
5805,338,Wnk3,WNK lysine deficient protein kinase 3,WNK3,WNK,Other,,Human,Q9BYP7,2282,...,IELGRGAFKTVYKVAWCELRFKEEAEMLKGLQPNIVRFYDSVLVTE...,-,in,out,2.06,8.0,0.0,0.0,0.827,2.198
5806,471,ZAP70,zeta chain of T cell receptor associated prote...,ZAP70,Syk,TK,,Human,P43403,2285,...,IELGCGNFGSVRQVAIKVLEMMREAQIMHQLDPYIVRLIGVMLVME...,STU,in,in,2.30,8.0,0.0,0.0,0.784,2.090
5807,471,ZAP70,zeta chain of T cell receptor associated prote...,ZAP70,Syk,TK,,Human,P43403,2285,...,IELGCGNFGSVRQVAIKVLEMMREAQIMHQLDPYIVRLIGVMLVME...,ANP,in,out,3.00,8.0,0.0,0.0,0.831,2.159


# Get sample for the Benchmark

We want to benchmark OpenCADD on different Kinase groups.
First the alignments are done in the same groups.
Then alignments are done between groups.

For each kinase group, 50 structures were randomly selected for the benchmark.
They have to fulfill two requirements:

    1. structure.dfg has to be "in".
    2. structure.qualityscore has to be 8.0 or above.
    
Each structure will be aligned against each other structure.
So for 50 structures, there will be 1225 alignments performed.

Some structures that cause errors in OpenCADD were excluded. The list of them can be derived from the structures that are filtered out while acquiring the sample sets. For most of these structures, every alignment involving the structure failed for at least one method in OpenCADD.

### Get sample structures for Tyrosine Kinases (TK)

In some structures the first chain is not labelled "A" but "B".

For those structures, the appropriate chain has to be selected for the alignment.

In [5]:
# Tyrosine kinase structures in the DFG-in conformation with a quality score of at least 8
tk_mask = (
    klifs_df['kinase.group'].eq('TK')
    & klifs_df['structure.dfg'].eq('in')
    & klifs_df['structure.qualityscore'].ge(8)
)
TK_DFGin = klifs_df[tk_mask]

# Draw 50 structures; the fixed random_state makes the draw repeatable
TK_DFGin_sample_df = TK_DFGin.sample(n=50, random_state=2021)
sample_columns = ['structure.pdb_id', 'kinase.klifs_name', 'kinase.group', 'species.klifs']
TK_DFGin_samples = TK_DFGin_sample_df[sample_columns].values.tolist()

# Write one CSV row per sampled structure, extended by the chain used for the alignment
with open("../data/samples/TK_samples.txt", "w", newline='') as samples_file:
    sample_writer = csv.writer(samples_file)
    for sample in TK_DFGin_samples:
        structure = Structure.from_pdbid(sample[0])
        # Use chain A when it contains C-alpha atoms, otherwise fall back to chain B
        ca_atoms = structure.select_atoms("backbone and name CA and segid A")
        if len(ca_atoms) > 0:
            chain_id = "A"
        else:
            chain_id = "B"
            ca_atoms = structure.select_atoms("backbone and name CA and segid B")
        sample.append(chain_id)
        print(sample, len(ca_atoms))
        sample_writer.writerow(sample)

['6itj', 'FGFR1', 'TK', 'Human', 'A'] 289
['4g2f', 'EphA3', 'TK', 'Human', 'A'] 280
['2yn8', 'EphB4', 'TK', 'Human', 'A'] 253
['1ksw', 'SRC', 'TK', 'Human', 'A'] 449
['6e4f', 'BTK', 'TK', 'Human', 'A'] 274
['6db4', 'JAK3', 'TK', 'Human', 'A'] 277
['4zsa', 'FGFR1', 'TK', 'Human', 'A'] 289
['3qti', 'MET', 'TK', 'Human', 'A'] 292
['5tq8', 'JAK2', 'TK', 'Human', 'A'] 291
['5vgo', 'BTK', 'TK', 'Human', 'A'] 280
['5x27', 'EGFR', 'TK', 'Human', 'A'] 302
['6tfw', 'EGFR', 'TK', 'Human', 'A'] 266
['6z4b', 'EGFR', 'TK', 'Human', 'A'] 271
['5ttv', 'JAK3', 'TK', 'Human', 'A'] 281
['6ssb', 'SYK', 'TK', 'Human', 'A'] 268
['4zse', 'EGFR', 'TK', 'Human', 'A'] 298
['4rx8', 'SYK', 'TK', 'Human', 'A'] 274
['3f66', 'MET', 'TK', 'Human', 'A'] 295
['3q32', 'JAK2', 'TK', 'Human', 'A'] 291
['4cd0', 'ALK', 'TK', 'Human', 'A'] 295
['3cc6', 'PYK2', 'TK', 'Human', 'A'] 286
['3acj', 'LCK', 'TK', 'Human', 'A'] 270
['4fvr', 'JAK2-b', 'TK', 'Human', 'A'] 274
['6e6e', 'SRC', 'TK', 'Human', 'A'] 268
['6wtp', 'JAK2', 'TK

### Get sample structures for Tyrosine Kinases Like (TKL)

In some structures the first chain is not labelled "A" but "B" (e.g. 2Y4I https://www.rcsb.org/structure/2Y4I).
In very few structures it carries yet another label (in this case it is chain L for "5L6W").

For those structures, the correspondingly labelled chain (B or L) has to be selected for the alignment.

A very low number of structures cause errors in MMLigner. This is due to the way MDAnalysis transcribes the structures when writing them back to `*.pdb` files for MMLigner.
For the TKL structures, "6ti8" causes this type of problem. The structure was excluded and a new dataset for the benchmark was created.

In [6]:
# Tyrosine-kinase-like structures in the DFG-in conformation with a quality score
# of at least 8; "6ti8" is removed before sampling
tkl_mask = (
    klifs_df['kinase.group'].eq('TKL')
    & klifs_df['structure.dfg'].eq('in')
    & klifs_df['structure.qualityscore'].ge(8)
    & klifs_df['structure.pdb_id'].ne("6ti8")
)
TKL_DFGin = klifs_df[tkl_mask]

# Draw 50 structures; the fixed random_state makes the draw repeatable
TKL_DFGin_sample_df = TKL_DFGin.sample(n=50, random_state=2021)
sample_columns = ['structure.pdb_id', 'kinase.klifs_name', 'kinase.group', 'species.klifs']
TKL_DFGin_samples = TKL_DFGin_sample_df[sample_columns].values.tolist()

# Write one CSV row per sampled structure, extended by the chain used for the alignment
with open("../data/samples/TKL_samples.txt", "w", newline='') as samples_file:
    sample_writer = csv.writer(samples_file)
    for sample in TKL_DFGin_samples:
        pdb_id = sample[0]
        structure = Structure.from_pdbid(pdb_id)
        ca_atoms = structure.select_atoms("backbone and name CA and segid A")
        if pdb_id == "5l6w":
            # This entry is always read from chain L
            chain_id = "L"
            ca_atoms = structure.select_atoms("backbone and name CA and segid L")
        elif len(ca_atoms) == 0:
            # No C-alpha atoms in chain A: fall back to chain B
            chain_id = "B"
            ca_atoms = structure.select_atoms("backbone and name CA and segid B")
        else:
            chain_id = "A"
        sample.append(chain_id)
        print(sample, len(ca_atoms))
        sample_writer.writerow(sample)

['3h9r', 'ALK2', 'TKL', 'Human', 'A'] 312
['3mtf', 'ALK2', 'TKL', 'Human', 'A'] 297
['5ita', 'BRAF', 'TKL', 'Human', 'A'] 250
['1py5', 'TGFbR1', 'TKL', 'Human', 'A'] 301
['5j7b', 'RIPK2', 'TKL', 'Human', 'A'] 277
['4ehg', 'BRAF', 'TKL', 'Human', 'A'] 262
['5e8u', 'TGFbR1', 'TKL', 'Human', 'A'] 306
['5qtz', 'TGFbR1', 'TKL', 'Human', 'A'] 308
['5oy6', 'ALK2', 'TKL', 'Human', 'A'] 291
['3s95', 'LIMK1', 'TKL', 'Human', 'A'] 297
['3hmm', 'TGFbR1', 'TKL', 'Human', 'A'] 293
['5l6w', 'LIMK1', 'TKL', 'Human', 'L'] 290
['5e8t', 'TGFbR1', 'TKL', 'Human', 'A'] 304
['3ppj', 'BRAF', 'TKL', 'Human', 'A'] 262
['5jsm', 'BRAF', 'TKL', 'Human', 'A'] 267
['2y4i', 'KSR2', 'TKL', 'Human', 'B'] 265
['4x2g', 'TGFbR1', 'TKL', 'Human', 'A'] 308
['6szj', 'RIPK2', 'TKL', 'Human', 'A'] 279
['5uit', 'IRAK4', 'TKL', 'Human', 'A'] 284
['5s7j', 'ALK2', 'TKL', 'Human', 'A'] 300
['5uiq', 'IRAK4', 'TKL', 'Human', 'A'] 268
['7jus', 'KSR2', 'TKL', 'Human', 'B'] 276
['5e8w', 'TGFbR1', 'TKL', 'Human', 'A'] 304
['5s77', 'ALK2

In [7]:
# PDB entries that are removed from the CMGC pool before sampling
problematic_structures = ["5osu","4fv5","6dmg", "5ngu", "3v6r", "6g92", "4yll", "6qa1", "3zrm",
                          "1jvp", "6g9m", "5oti", "4dgl"]

# CMGC structures in the DFG-in conformation with a quality score of at least 8,
# without the entries listed above
cmgc_mask = (
    klifs_df['kinase.group'].eq('CMGC')
    & klifs_df['structure.dfg'].eq('in')
    & klifs_df['structure.qualityscore'].ge(8)
    & ~klifs_df['structure.pdb_id'].isin(problematic_structures)
)
CMGC_DFGin = klifs_df[cmgc_mask]

# Draw 50 structures; the fixed random_state makes the draw repeatable
CMGC_DFGin_sample_df = CMGC_DFGin.sample(n=50, random_state=2021)
sample_columns = ['structure.pdb_id', 'kinase.klifs_name', 'kinase.group', 'species.klifs']
CMGC_DFGin_samples = CMGC_DFGin_sample_df[sample_columns].values.tolist()

# Write one CSV row per sampled structure, extended by the chain used for the alignment
with open("../data/samples/CMGC_samples.txt", "w", newline='') as samples_file:
    sample_writer = csv.writer(samples_file)
    for sample in CMGC_DFGin_samples:
        pdb_id = sample[0]
        structure = Structure.from_pdbid(pdb_id)
        ca_atoms = structure.select_atoms("backbone and name CA and segid A")
        if pdb_id == "6xd3":
            # This entry is always read from chain J
            chain_id = "J"
            ca_atoms = structure.select_atoms("backbone and name CA and segid J")
        elif len(ca_atoms) == 0:
            # No C-alpha atoms in chain A: fall back to chain B
            chain_id = "B"
            ca_atoms = structure.select_atoms("backbone and name CA and segid B")
        else:
            chain_id = "A"
        sample.append(chain_id)
        print(sample, len(ca_atoms))
        sample_writer.writerow(sample)

['3rk9', 'CDK2', 'CMGC', 'Human', 'A'] 294
['2vx3', 'DYRK1A', 'CMGC', 'Human', 'A'] 346
['1w98', 'CDK2', 'CMGC', 'Human', 'A'] 297
['1pw2', 'CDK2', 'CMGC', 'Human', 'A'] 294
['3sd0', 'GSK3B', 'CMGC', 'Human', 'A'] 350
['3rni', 'CDK2', 'CMGC', 'Human', 'A'] 294
['6hk7', 'GSK3B', 'CMGC', 'Human', 'A'] 346
['4cxa', 'CRK7', 'CMGC', 'Human', 'A'] 329
['6ges', 'Erk1', 'CMGC', 'Human', 'A'] 350
['4qte', 'Erk2', 'CMGC', 'Human', 'A'] 364
['1oi9', 'CDK2', 'CMGC', 'Human', 'A'] 295
['3q04', 'CK2a1', 'CMGC', 'Human', 'A'] 331
['1ke6', 'CDK2', 'CMGC', 'Human', 'A'] 280
['6yl1', 'CDK2', 'CMGC', 'Human', 'A'] 291
['7aj5', 'DYRK1A', 'CMGC', 'Human', 'A'] 330
['3rpv', 'CDK2', 'CMGC', 'Human', 'A'] 294
['1r3c', 'p38a', 'CMGC', 'Human', 'A'] 350
['6xd3', 'CDK7', 'CMGC', 'Human', 'J'] 297
['4h36', 'JNK3', 'CMGC', 'Human', 'A'] 356
['4h3p', 'Erk2', 'CMGC', 'Human', 'A'] 342
['2c4g', 'CDK2', 'CMGC', 'Human', 'A'] 301
['3qwj', 'CDK2', 'CMGC', 'Human', 'A'] 294
['4fki', 'CDK2', 'CMGC', 'Human', 'A'] 309
['2r

In [8]:
# CAMK structures in the DFG-in conformation with a quality score of at least 8;
# "1yi4" is removed before sampling
camk_mask = (
    klifs_df['kinase.group'].eq('CAMK')
    & klifs_df['structure.dfg'].eq('in')
    & klifs_df['structure.qualityscore'].ge(8)
    & klifs_df['structure.pdb_id'].ne("1yi4")
)
CAMK_DFGin = klifs_df[camk_mask]

# Draw 50 structures; the fixed random_state makes the draw repeatable
CAMK_DFGin_sample_df = CAMK_DFGin.sample(n=50, random_state=2021)
sample_columns = ['structure.pdb_id', 'kinase.klifs_name', 'kinase.group', 'species.klifs']
CAMK_DFGin_samples = CAMK_DFGin_sample_df[sample_columns].values.tolist()

# Write one CSV row per sampled structure, extended by the chain used for the alignment
with open("../data/samples/CAMK_samples.txt", "w", newline='') as samples_file:
    sample_writer = csv.writer(samples_file)
    for sample in CAMK_DFGin_samples:
        pdb_id = sample[0]
        structure = Structure.from_pdbid(pdb_id)
        ca_atoms = structure.select_atoms("backbone and name CA and segid A")
        if pdb_id == "3fyk":
            # This entry is always read from chain X
            chain_id = "X"
            ca_atoms = structure.select_atoms("backbone and name CA and segid X")
        elif len(ca_atoms) == 0:
            # No C-alpha atoms in chain A: fall back to chain B
            chain_id = "B"
            ca_atoms = structure.select_atoms("backbone and name CA and segid B")
        else:
            chain_id = "A"
        sample.append(chain_id)
        print(sample, len(ca_atoms))
        sample_writer.writerow(sample)

['5oor', 'CHK1', 'CAMK', 'Human', 'A'] 273
['3mft', 'CASK', 'CAMK', 'Human', 'A'] 303
['2yex', 'CHK1', 'CAMK', 'Human', 'A'] 276
['4ft0', 'CHK1', 'CAMK', 'Human', 'A'] 267
['3lm5', 'DRAK2', 'CAMK', 'Human', 'A'] 275
['4nif', 'RSK3-b', 'CAMK', 'Human', 'A'] 316
['5av2', 'DAPK1', 'CAMK', 'Human', 'A'] 275
['5a6o', 'DAPK3', 'CAMK', 'Human', 'A'] 268
['5n50', 'PIM1', 'CAMK', 'Human', 'A'] 277
['6c9g', 'AMPKa1', 'CAMK', 'Human', 'A'] 385
['1zlt', 'CHK1', 'CAMK', 'Human', 'A'] 272
['2cke', 'DAPK2', 'CAMK', 'Human', 'A'] 301
['6cjw', 'MNK2', 'CAMK', 'Human', 'A'] 270
['5auu', 'DAPK1', 'CAMK', 'Human', 'A'] 274
['2vn9', 'CaMK2d', 'CAMK', 'Human', 'A'] 301
['5op4', 'CHK1', 'CAMK', 'Human', 'A'] 259
['3zxt', 'DAPK1', 'CAMK', 'Human', 'A'] 249
['2hxl', 'CHK1', 'CAMK', 'Human', 'A'] 272
['1zws', 'DAPK2', 'CAMK', 'Human', 'A'] 278
['5n4z', 'PIM1', 'CAMK', 'Human', 'A'] 275
['4bzn', 'PIM1', 'CAMK', 'Human', 'A'] 272
['6vru', 'PIM1', 'CAMK', 'Human', 'A'] 279
['4wsy', 'PIM1', 'CAMK', 'Human', 'A'] 27



['4bky', 'MELK', 'CAMK', 'Human', 'A'] 317
['1jkk', 'DAPK1', 'CAMK', 'Human', 'A'] 277
['2xj2', 'PIM1', 'CAMK', 'Human', 'A'] 276
['4d2p', 'MELK', 'CAMK', 'Human', 'A'] 314
['2ycq', 'CHK2', 'CAMK', 'Human', 'A'] 286
['5eak', 'MARK2', 'CAMK', 'Human', 'A'] 305
['3kc3', 'MAPKAPK2', 'CAMK', 'Human', 'A'] 288
['4k18', 'PIM1', 'CAMK', 'Human', 'A'] 277
['3fhr', 'MAPKAPK3', 'CAMK', 'Human', 'A'] 275
['3dls', 'PASK', 'CAMK', 'Human', 'A'] 285
['4k0y', 'PIM1', 'CAMK', 'Human', 'A'] 275
['5auv', 'DAPK1', 'CAMK', 'Human', 'A'] 275
['2e9u', 'CHK1', 'CAMK', 'Human', 'A'] 269
['3kga', 'MAPKAPK2', 'CAMK', 'Human', 'A'] 295
['4uv0', 'DAPK1', 'CAMK', 'Human', 'A'] 300
['7oal', 'CASK', 'CAMK', 'Human', 'A'] 298
['3fyk', 'MAPKAPK2', 'CAMK', 'Human', 'X'] 282
['6xf0', 'CaMK2a', 'CAMK', 'Human', 'A'] 268
['4bzo', 'PIM1', 'CAMK', 'Human', 'A'] 272
['2yer', 'CHK1', 'CAMK', 'Human', 'A'] 268
['2pzy', 'MAPKAPK2', 'CAMK', 'Human', 'A'] 291
['1yxu', 'PIM1', 'CAMK', 'Human', 'A'] 273
['3m42', 'MAPKAPK2', 'CAMK',