This notebook is used to:
- find the kinase structures (listed in `kinase_pdbids.txt`) that are available in `PDBBind`
- create a kinase `graphclan` from the available structures

In [1]:
import pickle
import pandas as pd
# Input: pickled clan table, loaded below with pickle.load.
graphclan_file = 'ClanGraph_90_df.pkl'
# Input: text file of kinase PDB IDs, parsed below with readEntryList.
kinase_file = 'kinase_pdbids.txt'
# Output: pickle that receives the kinase-only clan table.
kinase_clanfile = 'ClanGraph_kinase_90_df.pkl'

In [2]:
import re
def readEntryList(filename, sep=r'[\s,; ]+'):
    """Read a delimited list of entries (e.g. PDB IDs) from a text file.

    Parameters
    ----------
    filename : str or path-like
        Text file to read.
    sep : str, optional
        Regular expression handed to ``re.split``. By default entries are
        separated by any run of whitespace, commas or semicolons.

    Returns
    -------
    list of str
        The entries in file order, stripped and lower-cased. Empty tokens
        (produced when the file starts or ends with a separator, such as a
        trailing newline) are dropped so they are not counted as entries.
    """
    # The default pattern is a raw string so that `\s` reaches `re` unchanged
    # instead of being treated as an (invalid) string escape.
    with open(filename, 'r') as handle:
        content = handle.read()
    tokens = (token.strip().lower() for token in re.split(sep, content))
    return [token for token in tokens if token]

## Load GraphClan and Kinase PDB IDs

In [3]:
# Parse the kinase PDB IDs from the text file (readEntryList lower-cases them).
kinase_pids = readEntryList(kinase_file)
print(f"Number of kinase structures found in PDB: {len(kinase_pids)}")
# Load the pickled clan table.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only open files from a trusted source.
with open(graphclan_file, 'rb') as file:
    df = pickle.load(file)
# Bare last expression so the notebook renders the loaded object.
df

Number of kinase structures found in PDB: 9727


Unnamed: 0,Structure_Clan_ID,PDBIDList
0,0,"[1a30, 1a94, 1a9m, 1aaq, 1aid, 1ajv, 1ajx, 1b6..."
1,1,"[1bcd, 1bn1, 1bn3, 1bn4, 1bnn, 1bnq, 1bnt, 1bn..."
2,2,"[1yet, 2qg0, 2qg2, 2uwd, 2xab, 2xdk, 2xdl, 2xd..."
3,3,"[1bju, 1bjv, 1bty, 1c5p, 1c5q, 1c5s, 1c5t, 1eb..."
4,4,"[1a4w, 1bcu, 1bhx, 1c5n, 1c5o, 1d3d, 1d3p, 1d9..."
...,...,...
1321,1321,[3hl5]
1322,1322,[2y7i]
1323,1323,[2oi2]
1324,1324,[1xt8]


## Extract kinase structures from the DataFrame

In [4]:
def extract(row, kinase_pids):
    """Return the sorted, de-duplicated IDs of ``row['PDBIDList']`` that also occur in ``kinase_pids``.

    ``row`` is anything indexable by the key ``'PDBIDList'``; ``kinase_pids``
    is an iterable of IDs to keep.
    """
    return sorted(set(row['PDBIDList']).intersection(kinase_pids))
# Kinase members of every clan, kept in a separate Series rather than written
# into `df` as a scratch column, so the loaded frame is not mutated by this cell.
kinase_members = df.apply(lambda row: extract(row, kinase_pids), axis=1)
# True for clans that contain at least one kinase structure.
mask = kinase_members.map(len) > 0
# Keep only those clans and replace their ID lists with the kinase-only lists
# (`assign` returns a new frame; values are aligned on the index).
modified_df = df.loc[mask].assign(PDBIDList=kinase_members.loc[mask])
modified_df

Unnamed: 0,Structure_Clan_ID,PDBIDList
16,16,"[1kv1, 1yqj, 2baj, 2bak, 2bal, 2yix, 2zb1, 3d7..."
18,18,"[5i9x, 5i9y, 5ia0, 5ia1, 5ia2, 5ia3, 5ia4, 5ia..."
20,20,"[3bqc, 3h30, 3pe1, 3pe2, 5cqu, 5cs6, 5csp, 5cu..."
25,25,"[1q8t, 1q8u, 1stc, 1xh4, 1xh5, 1xh9, 1ydr, 1yd..."
30,30,"[1b38, 1e1x, 1jsv, 1pxo, 1pxp, 2fvd, 2xmy, 2xn..."
...,...,...
1224,1224,[6gzd]
1246,1246,[4z07]
1262,1262,[4da5]
1294,1294,[4y0a]


## Write to file

In [5]:
# Serialize modified_df to the path in kinase_clanfile using pickle.
with open(kinase_clanfile, 'wb') as file:
    pickle.dump(modified_df, file)

## Statistics about the kinase clan

In [6]:
import itertools
# Flatten the per-clan ID lists into one list and report how many IDs it holds.
kinase_in_PDBBind = list(itertools.chain.from_iterable(modified_df['PDBIDList']))
print("Number of kinase structures in PDBBind:", len(kinase_in_PDBBind))

Number of kinase structures in PDBBind: 386
