# SiteAlign postprocessing

- Paper
  - https://doi.org/10.1002/prot.21858
  - https://drugdesign.unistra.fr/labwebsite/publications/paper102.pdf
- SiteAlign setup
  - Based on the existing KLIFS alignment, reduce the number of steps to 1 (instead of 3) and the translational steps and adjusting the rotational and translational intensity
  - Exclude pocket residues with modifications to avoid segmentation fault
  - 3-4 weeks of calculations; 1/10 minute per pocket pair

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from opencadd.databases import klifs

from src.paths import PATH_DATA



## Load KLIFS structures (remote)

In [3]:
session = klifs.setup_remote()
structures = session.structures.all_structures()
structures["structure.code"] = structures.apply(
    lambda x: 
    f"{x['structure.pdb_id']}"
    f"{'' if x['structure.alternate_model'] == '-' else '_alt' + x['structure.alternate_model']}"
    f"{'' if x['structure.chain'] == '-' else '_chain' + x['structure.chain']}",
    axis=1
)
structures.head()

Unnamed: 0,structure.klifs_id,structure.pdb_id,structure.alternate_model,structure.chain,species.klifs,kinase.klifs_id,kinase.klifs_name,kinase.names,kinase.family,kinase.group,...,structure.bp_ii_b,structure.bp_iii,structure.bp_iv,structure.bp_v,structure.grich_distance,structure.grich_angle,structure.grich_rotation,structure.filepath,structure.curation_flag,structure.code
0,10970,6hhf,-,A,Human,1,AKT1,,,,...,False,True,True,False,20.2232,67.835503,55.009102,,False,6hhf_chainA
1,10439,6buu,B,B,Human,1,AKT1,,,,...,False,False,False,False,17.9011,59.594799,60.872501,,False,6buu_altB_chainB
2,2532,3mv5,B,A,Human,1,AKT1,,,,...,False,False,False,False,17.3969,57.0919,55.166599,,False,3mv5_altB_chainA
3,10437,6c0i,B,B,Human,1,AKT1,,,,...,False,False,False,False,17.9011,59.594799,60.872501,,False,6c0i_altB_chainB
4,2533,3ow4,-,B,Human,1,AKT1,,,,...,False,False,False,False,19.164,61.781799,50.624699,,False,3ow4_chainB


## Load SiteAlign structure pairs

In [4]:
#sitealign_path = Path("SiteAlign_KiSSim.txt.gz")
sitealign_path = Path("complete_SiteAlign.txt.gz")

In [5]:
sitealign_df = pd.read_table(sitealign_path, delimiter="\t", index_col=0)
sitealign_df = sitealign_df.rename(columns={"ID": "structure1", "Reference Target": "structure2"})
sitealign_df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(8464555, 11)

In [6]:
structure_codes = pd.concat([sitealign_df["structure1"], sitealign_df["structure2"]]).unique().tolist()
len(structure_codes)

4115

In theory, we should have the following number of pairwise distances (without self-comparison):

In [7]:
n_structures = len(structure_codes)
n_distances_theory = int((n_structures * n_structures - n_structures) / 2)
n_distances_theory

8464555

In practice, we have:

In [8]:
n_distances_practice = sitealign_df.shape[0]
n_distances_practice

8464555

In [9]:
n_distances_practice / n_distances_theory

1.0

### Remove entries with invalid structure codes

In [10]:
regex = (
    r"(^[a-zA-Z0-9]{4}_chain[a-zA-Z0-9]$)"
    r"|(^[a-zA-Z0-9]{4}_alt[A-Z0-9]$)|"
    r"(^[a-zA-Z0-9]{4}_alt[A-Z0-9]_chain[a-zA-Z0-9]$)"
)
regex

'(^[a-zA-Z0-9]{4}_chain[a-zA-Z0-9]$)|(^[a-zA-Z0-9]{4}_alt[A-Z0-9]$)|(^[a-zA-Z0-9]{4}_alt[A-Z0-9]_chain[a-zA-Z0-9]$)'

In [11]:
# Show invalid structure codes
sitealign_df[
    ~sitealign_df["structure1"].str.match(regex, na=False) |
    ~sitealign_df["structure2"].str.match(regex, na=False)
]

Unnamed: 0,structure1,structure2,Residue_count,Distance1,Distance2,Distance3,Distance4,N,N2,N3,N4


In [12]:
# Drop entries with invalid structure codes
sitealign_df = sitealign_df[sitealign_df["structure1"].str.match(regex, na=False)]
sitealign_df = sitealign_df[sitealign_df["structure2"].str.match(regex, na=False)]
sitealign_df.shape

(8464555, 11)

### Map structure code to KLIFS ID/kinase name

In [13]:
structure_code_to_kinase_names = {
    row["structure.code"]: row["kinase.klifs_name"] 
    for _, row 
    in structures.iterrows()
}

In [14]:
structure_code_to_klifs_id = {
    row["structure.code"]: row["structure.klifs_id"] 
    for _, row 
    in structures.iterrows()
}

In [15]:
%%time
# Add columns for kinase names
sitealign_df["kinase1"] = sitealign_df.apply(
    lambda x: structure_code_to_kinase_names[x["structure1"]], 
    axis=1
)
sitealign_df["kinase2"] = sitealign_df.apply(
    lambda x: structure_code_to_kinase_names[x["structure2"]], 
    axis=1
)

CPU times: user 6min 24s, sys: 11.1 s, total: 6min 35s
Wall time: 6min 42s


In [16]:
%%time
# Add columns for structure KLIFS ID
sitealign_df["structure1.klifs_id"] = sitealign_df.apply(
    lambda x: structure_code_to_klifs_id[x["structure1"]], 
    axis=1
)
sitealign_df["structure2.klifs_id"] = sitealign_df.apply(
    lambda x: structure_code_to_klifs_id[x["structure2"]], 
    axis=1
)

CPU times: user 7min 43s, sys: 14 s, total: 7min 57s
Wall time: 8min 5s


### [FYI] Show entries with missing values

In [17]:
# Show missing values
sitealign_df_missing_values = sitealign_df[sitealign_df.isna().any(axis=1)]
print(sitealign_df_missing_values.shape)
sitealign_df_missing_values.head()

(21, 15)


Unnamed: 0,structure1,structure2,Residue_count,Distance1,Distance2,Distance3,Distance4,N,N2,N3,N4,kinase1,kinase2,structure1.klifs_id,structure2.klifs_id
52,1h24_chainA,3fc1_chainX,78,0.9836,0.0792,0.125,,56,1,1,0,CDK2,p38a,4199,5110
53,1h25_chainA,3fc1_chainX,78,0.9834,0.0708,0.125,,56,1,1,0,CDK2,p38a,4273,5110
247,2a2a_altA_chainA,3fc1_chainX,78,0.9842,0.0833,0.125,,58,1,1,0,DAPK2,p38a,1145,5110
287,2c0t_altA_chainA,2yjr_altA_chainA,84,1.0,1.0,1.0,,0,0,0,0,HCK,ALK,2653,5399
570,2w4k_altA_chainA,3fc1_chainX,78,0.9831,0.0375,0.125,,57,1,1,0,DAPK1,p38a,5653,5110


### Remove structures that are not part of KiSSim-DFG-in dataset?

In [18]:
# KiSSim structure KLIFS IDs
kissim_structure_klifs_ids = pd.read_csv(
    PATH_DATA / "processed/structure_klifs_ids_dfg_in.txt"
).squeeze().to_list()
# SiteAlign structure KLIFS IDs
sitealign_structure_klifs_ids = pd.concat(
    [sitealign_df["structure1.klifs_id"], sitealign_df["structure2.klifs_id"]]
).unique().tolist()

In [19]:
unwanted_structure_klifs_ids = list(
    set(sitealign_structure_klifs_ids) - set(kissim_structure_klifs_ids)
)
unwanted_structure_klifs_ids

[4]

In [20]:
# Remove unwanted structure pairs
sitealign_df = sitealign_df[
    ~(
        sitealign_df["structure1.klifs_id"].isin(unwanted_structure_klifs_ids) |
        sitealign_df["structure2.klifs_id"].isin(unwanted_structure_klifs_ids) 
    )
]
sitealign_df.shape

(8460441, 15)

### Structure pair stats

In [21]:
sitealign_df = sitealign_df[sitealign_df["Distance1"].notna()]
print(f"Number of structure pairs: {sitealign_df.shape[0]}")

Number of structure pairs: 8460441


In [22]:
structure_codes = pd.concat(
    [sitealign_df["structure1"], sitealign_df["structure2"]]
).unique().tolist()
print(f"Number of structures: {len(structure_codes)}")

Number of structures: 4114


## Map SiteAlign structure to kinase distance pairs

In [23]:
sitealign_kinase_distance_df = sitealign_df[
    ["kinase1", "kinase2", "Distance1"]
].copy()
# Sort kinase pairs by name
sitealign_kinase_distance_df[["kinase1", "kinase2"]] = (
    np.sort(sitealign_kinase_distance_df[["kinase1", "kinase2"]], axis=1)
)
# Group structures by kinase pairs and take minimum distances
sitealign_kinase_distance_df = sitealign_kinase_distance_df.groupby(
    ["kinase1", "kinase2"]
).min()
sitealign_kinase_distance_df = sitealign_kinase_distance_df.reset_index()

## Generate SiteAlign distance matrix

In [24]:
def distance_pairs_to_matrix(
    pairs, item1_col_name, item2_col_name, distance_col_name
):
    """
    Transform list of pairwise distances into a
    distance matrix.
    
    Parameters
    ----------
    pairs : pandas.DataFrame
        Table of pairwise distances.
    item1_col_name : str
        Name of column in `pairs` that contains first items of pairs.
    item2_col_name : str
        Name of column in `pairs` that contains second items of pairs.
    distance_col_name : str
        Name of column in `pairs` that contains distances.
        
    Returns
    -------
    pandas.DataFrame
        Distance matrix.
    """
    # Get all item names to be set in the matrix
    index_column_names = pd.concat(
        [pairs[item1_col_name], pairs[item2_col_name]]
    ).unique().tolist()
    
    # Initialize matrix
    matrix = pd.DataFrame([], columns=index_column_names, index=index_column_names)
    
    # Fill matrix
    for index, row in pairs.iterrows():
        if index % 1000000 == 0:
            print(index)
        matrix.loc[row[item1_col_name], row[item2_col_name]] = row[distance_col_name]
        matrix.loc[row[item2_col_name], row[item1_col_name]] = row[distance_col_name]
        
    # Fill diagonal
    for name in index_column_names:
        matrix.loc[name, name] = 0
        
    return matrix

### Structure distance matrix

### Kinase distance matrix

In [25]:
%%time
sitealign_kinase_matix = distance_pairs_to_matrix(
    sitealign_kinase_distance_df,
    "kinase1",
    "kinase2",
    "Distance1"
)
sitealign_kinase_matix.to_csv(
    Path("sitealign_kinase_distance_matrix.csv"),
    index=True
)

0
CPU times: user 21 s, sys: 48.5 ms, total: 21.1 s
Wall time: 21.5 s


In [26]:
sitealign_kinase_matix = pd.read_csv(
    Path("sitealign_kinase_distance_matrix.csv"),
    index_col=0
)