# SiteAlign postprocessing

- Paper
  - https://doi.org/10.1002/prot.21858
  - https://drugdesign.unistra.fr/labwebsite/publications/paper102.pdf
- SiteAlign setup
  - Based on the existing KLIFS alignment, reduce the number of steps to 1 (instead of 3) and the translational steps and adjusting the rotational and translational intensity
  - Exclude pocket residues with modifications to avoid segmentation fault
  - 3-4 weeks of calculations; 1/10 minute per pocket pair

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from opencadd.databases import klifs



## Load KLIFS structures (remote)

In [3]:
session = klifs.setup_remote()
structures = session.structures.all_structures()
structures["structure.code"] = structures.apply(
    lambda x: 
    f"{x['structure.pdb_id']}"
    f"{'' if x['structure.alternate_model'] == '-' else '_alt' + x['structure.alternate_model']}"
    f"{'' if x['structure.chain'] == '-' else '_chain' + x['structure.chain']}",
    axis=1
)
structures.head()

Unnamed: 0,structure.klifs_id,structure.pdb_id,structure.alternate_model,structure.chain,species.klifs,kinase.klifs_id,kinase.klifs_name,kinase.names,kinase.family,kinase.group,...,structure.bp_ii_b,structure.bp_iii,structure.bp_iv,structure.bp_v,structure.grich_distance,structure.grich_angle,structure.grich_rotation,structure.filepath,structure.curation_flag,structure.code
0,10970,6hhf,-,A,Human,1,AKT1,,,,...,False,True,True,False,20.2232,67.835503,55.009102,,False,6hhf_chainA
1,10439,6buu,B,B,Human,1,AKT1,,,,...,False,False,False,False,17.9011,59.594799,60.872501,,False,6buu_altB_chainB
2,2532,3mv5,B,A,Human,1,AKT1,,,,...,False,False,False,False,17.3969,57.0919,55.166599,,False,3mv5_altB_chainA
3,10437,6c0i,B,B,Human,1,AKT1,,,,...,False,False,False,False,17.9011,59.594799,60.872501,,False,6c0i_altB_chainB
4,2533,3ow4,-,B,Human,1,AKT1,,,,...,False,False,False,False,19.164,61.781799,50.624699,,False,3ow4_chainB


## Load SiteAlign structure pairs

In [4]:
sitealign_path = Path("SiteAlign_KiSSim.txt")

In [5]:
sitealign_df = pd.read_table(sitealign_path, delimiter="\t", index_col=0)
sitealign_df = sitealign_df.rename(columns={"ID": "structure1", "Reference Target": "structure2"})
sitealign_df.shape

(8266330, 11)

In [6]:
structure_codes = pd.concat([sitealign_df["structure1"], sitealign_df["structure2"]]).unique().tolist()
len(structure_codes)

4119

In theory, we should have the following number of pairwise distances (without self-comparison):

In [7]:
n_structures = len(structure_codes)
n_distances_theory = int((n_structures * n_structures - n_structures) / 2)
n_distances_theory

8481021

In practice, we have:

In [8]:
n_distances_practice = sitealign_df.shape[0]
n_distances_practice

8266330

In [9]:
n_distances_practice / n_distances_theory

0.9746857129583808

### Remove entries with invalid structure codes

In [10]:
regex = (
    r"(^[a-zA-Z0-9]{4}_chain[a-zA-Z0-9]$)"
    r"|(^[a-zA-Z0-9]{4}_alt[A-Z0-9]$)|"
    r"(^[a-zA-Z0-9]{4}_alt[A-Z0-9]_chain[a-zA-Z0-9]$)"
)
regex

'(^[a-zA-Z0-9]{4}_chain[a-zA-Z0-9]$)|(^[a-zA-Z0-9]{4}_alt[A-Z0-9]$)|(^[a-zA-Z0-9]{4}_alt[A-Z0-9]_chain[a-zA-Z0-9]$)'

In [11]:
# Show invalid structure codes
sitealign_df[
    ~sitealign_df["structure1"].str.match(regex, na=False) |
    ~sitealign_df["structure2"].str.match(regex, na=False)
]

Unnamed: 0,structure1,structure2,Residue_count,Distance1,Distance2,Distance3,Distance4,N,N2,N3,N4
1046,3mpm_altA_chainA,3orz_al,,,,,,,,,
16,1di9_chainA,3eqp_chain,,,,,,,,,
334,2e9v_chainA,,,,,,,,,,
477,2qod_altA_chainA,3eqc_altA_chainA.mol,,,,,,,,,


In [12]:
# Drop entries with invalid structure codes
sitealign_df = sitealign_df[sitealign_df["structure1"].str.match(regex, na=False)]
sitealign_df = sitealign_df[sitealign_df["structure2"].str.match(regex, na=False)]
sitealign_df.shape

(8266326, 11)

### Map structure code to KLIFS ID/kinase name

In [13]:
structure_code_to_kinase_names = {
    row["structure.code"]: row["kinase.klifs_name"] 
    for _, row 
    in structures.iterrows()
}

In [14]:
structure_code_to_klifs_id = {
    row["structure.code"]: row["structure.klifs_id"] 
    for _, row 
    in structures.iterrows()
}

In [15]:
%%time
# Add columns for kinase names
sitealign_df["kinase1"] = sitealign_df.apply(
    lambda x: structure_code_to_kinase_names[x["structure1"]], 
    axis=1
)
sitealign_df["kinase2"] = sitealign_df.apply(
    lambda x: structure_code_to_kinase_names[x["structure2"]], 
    axis=1
)

CPU times: user 1min 23s, sys: 2.66 s, total: 1min 25s
Wall time: 1min 25s


In [16]:
%%time
# Add columns for structure KLIFS ID
sitealign_df["structure1.klifs_id"] = sitealign_df.apply(
    lambda x: structure_code_to_klifs_id[x["structure1"]], 
    axis=1
)
sitealign_df["structure2.klifs_id"] = sitealign_df.apply(
    lambda x: structure_code_to_klifs_id[x["structure2"]], 
    axis=1
)

CPU times: user 1min 30s, sys: 2.93 s, total: 1min 33s
Wall time: 1min 33s


### [FYI] Show entries with missing values

In [17]:
# Show missing values
sitealign_df_missing_values = sitealign_df[sitealign_df.isna().any(axis=1)]
print(sitealign_df_missing_values.shape)
sitealign_df_missing_values.head()

(29, 15)


Unnamed: 0,structure1,structure2,Residue_count,Distance1,Distance2,Distance3,Distance4,N,N2,N3,N4,kinase1,kinase2,structure1.klifs_id,structure2.klifs_id
1087,3oaw_chainA,6fdy_chainU,81.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,p110g,ULK3,9240,10389
18,1e1v_chainA,3p1a_altA_chainA,,,,,,,,,,CDK2,MYT1,3886,1216
247,2a2a_altA_chainA,3fc1_chainX,78.0,0.9842,0.0833,0.125,,58.0,1.0,1.0,0.0,DAPK2,p38a,1145,5110
248,2a4l_chainA,2qod_altA_chainA,,,,,,,,,,CDK2,EphA3,4238,2869
2515,5enn_altA_chainA,6fdy_chainU,81.0,1.0,1.0,1.0,,0.0,0.0,0.0,0.0,PIK3C3,ULK3,9343,10389


### Remove structures that are not part of KiSSim-DFG-in dataset?

In [18]:
# KiSSim structure KLIFS IDs
kissim_structure_klifs_ids = pd.read_csv(
    PATH_DATA / "processed/structure_klifs_ids_dfg_in.txt"
).squeeze().to_list()
# SiteAlign structure KLIFS IDs
sitealign_structure_klifs_ids = pd.concat(
    [sitealign_df["structure1.klifs_id"], sitealign_df["structure2.klifs_id"]]
).unique().tolist()

In [19]:
unwanted_structure_klifs_ids = list(
    set(sitealign_structure_klifs_ids) - set(kissim_structure_klifs_ids)
)
unwanted_structure_klifs_ids

[4]

In [20]:
# Remove unwanted structure pairs
sitealign_df = sitealign_df[
    ~(
        sitealign_df["structure1.klifs_id"].isin(unwanted_structure_klifs_ids) |
        sitealign_df["structure2.klifs_id"].isin(unwanted_structure_klifs_ids) 
    )
]
sitealign_df.shape

(8262235, 15)

### Structure pair stats

In [21]:
sitealign_df = sitealign_df[sitealign_df["Distance1"].notna()]
print(f"Number of structure pairs: {sitealign_df.shape[0]}")

Number of structure pairs: 8262233


In [22]:
structure_codes = pd.concat(
    [sitealign_df["structure1"], sitealign_df["structure2"]]
).unique().tolist()
print(f"Number of structures: {len(structure_codes)}")

Number of structures: 4114


## Map SiteAlign structure to kinase distance pairs

In [23]:
sitealign_kinase_distance_df = sitealign_df[
    ["kinase1", "kinase2", "Distance1"]
].copy()
# Sort kinase pairs by name
sitealign_kinase_distance_df[["kinase1", "kinase2"]] = (
    np.sort(sitealign_kinase_distance_df[["kinase1", "kinase2"]], axis=1)
)
# Group structures by kinase pairs and take minimum distances
sitealign_kinase_distance_df = sitealign_kinase_distance_df.groupby(
    ["kinase1", "kinase2"]
).min()
sitealign_kinase_distance_df = sitealign_kinase_distance_df.reset_index()

## Generate SiteAlign distance matrix

In [24]:
def distance_pairs_to_matrix(
    pairs, item1_col_name, item2_col_name, distance_col_name
):
    """
    Transform list of pairwise distances into a
    distance matrix.
    
    Parameters
    ----------
    pairs : pandas.DataFrame
        Table of pairwise distances.
    item1_col_name : str
        Name of column in `pairs` that contains first items of pairs.
    item2_col_name : str
        Name of column in `pairs` that contains second items of pairs.
    distance_col_name : str
        Name of column in `pairs` that contains distances.
        
    Returns
    -------
    pandas.DataFrame
        Distance matrix.
    """
    # Get all item names to be set in the matrix
    index_column_names = pd.concat(
        [pairs[item1_col_name], pairs[item2_col_name]]
    ).unique().tolist()
    
    # Initialize matrix
    matrix = pd.DataFrame([], columns=index_column_names, index=index_column_names)
    
    # Fill matrix
    for index, row in pairs.iterrows():
        if index % 1000000 == 0:
            print(index)
        matrix.loc[row[item1_col_name], row[item2_col_name]] = row[distance_col_name]
        matrix.loc[row[item2_col_name], row[item1_col_name]] = row[distance_col_name]
        
    # Fill diagonal
    for name in index_column_names:
        matrix.loc[name, name] = 0
        
    return matrix

### Structure distance matrix

### Kinase distance matrix

In [None]:
%%time
sitealign_kinase_matix = distance_pairs_to_matrix(
    sitealign_kinase_distance_df,
    "kinase1",
    "kinase2",
    "Distance1"
)
sitealign_kinase_matix.to_csv(
    Path("sitealign_kinase_distance_matrix.csv"),
    index=True
)

In [28]:
sitealign_kinase_matix = pd.read_csv(
    Path("sitealign_kinase_distance_matrix.csv"),
    index_col=0
)

Unnamed: 0,AAK1,ABL1,ABL2,ACK,ACTR2,ACTR2B,ADCK3,AKT1,AKT2,ALK,...,ZAK,ZAP70,p110a,p110d,p110g,p38a,p38b,p38d,p38g,p70S6K
AAK1,0,0.2336,0.2777,0.2492,0.2655,0.2487,0.3581,0.1784,0.2281,0.2291,...,0.2763,0.2228,0.379,0.3742,0.3591,0.2562,0.2882,0.2494,0.339,0.1932
ABL1,0.2336,0,0.0341,0.1674,0.2834,0.263,0.3723,0.2484,0.2537,0.1255,...,0.2154,0.1768,0.3305,0.3204,0.3542,0.2203,0.2727,0.2477,0.2561,0.2454
ABL2,0.2777,0.0341,0,0.1654,0.3067,0.3222,0.4135,0.2292,0.2691,0.1516,...,0.2653,0.1917,0.3895,0.3799,0.3849,0.2522,0.2917,0.2852,0.3091,0.2806
ACK,0.2492,0.1674,0.1654,0,0.2993,0.305,0.3357,0.2121,0.2494,0.1388,...,0.2239,0.1728,0.3407,0.3385,0.3457,0.2334,0.259,0.2275,0.2831,0.2634
ACTR2,0.2655,0.2834,0.3067,0.2993,0,0.0989,0.4423,0.3199,0.2973,0.3067,...,0.2949,0.3146,0.4222,0.4282,0.3658,0.2682,0.3009,0.3159,0.3623,0.276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p38a,0.2562,0.2203,0.2522,0.2334,0.2682,0.3026,0.3436,0.2399,0.2377,0.2051,...,0.2628,0.2337,0.3202,0.3072,0.3232,0,0.1597,0.1026,0.1004,0.2025
p38b,0.2882,0.2727,0.2917,0.259,0.3009,0.3491,0.382,0.2883,0.3126,0.269,...,0.2737,0.3067,0.3501,0.334,0.3172,0.1597,0,0.2416,0.2298,0.2888
p38d,0.2494,0.2477,0.2852,0.2275,0.3159,0.3044,0.3435,0.2425,0.2127,0.2566,...,0.292,0.2507,0.3335,0.3383,0.2966,0.1026,0.2416,0,0.0729,0.2419
p38g,0.339,0.2561,0.3091,0.2831,0.3623,0.338,0.3588,0.233,0.2506,0.2648,...,0.3609,0.3048,0.3807,0.3654,0.367,0.1004,0.2298,0.0729,0,0.2841
