# Fragment analysis

Note that this notebook contains the code that was used to calculate most of the statistics as well as to generate the respective plots shown in the manuscript.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

import util
from util import *

## Load fragment library

Let's load the fragment library (in different formats for different kinds of analyses later on):

- `fragment_library`: Dictionary of fragments (values as DataFrame) by subpockets (keys).
- `fragment_library_concat`: All fragments in one DataFrame.
- `fragment_library_concat_wo_x`: All fragments except fragments from pool X in one DataFrame.

In [3]:
# Folder holding the fragment library, given relative to the notebook's working directory
PATH_TO_LIB = Path('..', 'data', 'fragment_library')

In [None]:
# Read the fragment library: fragments (values) keyed by subpocket name (keys)
fragment_library = read_fragment_library(PATH_TO_LIB)

# Tag every fragment table with its subpocket name so the origin survives a later merge
for subpocket_name in fragment_library:
    fragment_library[subpocket_name]['subpocket'] = subpocket_name

In [None]:
# Names of the subpocket pools available in the library
print(fragment_library.keys())

# Caption followed by the actual example rows (the cell's last expression is displayed)
print('Example fragments for subpocket AP:')
fragment_library['AP'].head()

In [None]:
# Stack all subpocket pools into one DataFrame and replace the index with a fresh running one
fragment_library_concat = pd.concat(fragment_library).reset_index(drop=True)

In [None]:
# Keep every fragment that is not assigned to pool X; copy so the result is independent of the full table
is_not_pool_x = fragment_library_concat['subpocket'] != 'X'
fragment_library_concat_wo_x = fragment_library_concat[is_not_pool_x].copy()

## Fragment library overview

How many fragments (with and without pool X) do we have - and from how many original ligands do they originate?

In [None]:
# Row counts of the merged library: all subpocket pools, and all pools except pool X
n_fragments = len(fragment_library_concat)
n_fragments_wo_x = len(fragment_library_concat_wo_x)

print(f'Number of fragments in library: {n_fragments}')
print(f'Number of fragments in library without pool X: {n_fragments_wo_x}')

In [None]:
# Every distinct (kinase, complex_pdb, ligand_pdb) combination counts as one original ligand,
# so the number of groups is the number of ligands the fragments originate from
ligand_key_columns = ['kinase', 'complex_pdb', 'ligand_pdb']
n_original_ligands = len(fragment_library_concat.groupby(ligand_key_columns).size())

print(f'Number of original ligands (from which fragments originate): {n_original_ligands}')

## Number of subpockets occupied by a ligand

Ligands can occupy subpocket(s) multiple times. Let's check how often this happens for which subpockets.

In [None]:
# Count the fragments each ligand contributes to each subpocket (pool X excluded);
# one row per (ligand, subpocket) pair with the count in column 'n_fragments'
n_fragments_per_subpocket_per_ligand = (
    fragment_library_concat_wo_x
    .groupby(['kinase', 'complex_pdb', 'ligand_pdb', 'subpocket'])
    .size()
    .reset_index(name='n_fragments')
)

### How often does a ligand occupy a subpocket multiple times? And how many times?

In [None]:
# Number of (ligand, subpocket) pairs for each observed fragment count
n_fragments_per_subpocket_per_ligand.groupby('n_fragments').size()

### Which subpockets are occupied how often by multiple fragments per ligand?

In [None]:
# Restrict to (ligand, subpocket) pairs with more than one fragment, then count them per subpocket
occupied_multiple_times = n_fragments_per_subpocket_per_ligand['n_fragments'] > 1
n_fragments_per_subpocket_per_ligand[occupied_multiple_times].groupby('subpocket').size()

In order to calculate the number of subpockets a ligand is occupying, it is not of interest how often a subpocket is occupied by a ligand. Thus, we keep only one entry per ligand and subpocket.

In [None]:
# Step 1: collapse to a single row per ligand and subpocket (repeated occupation is ignored)
one_row_per_ligand_and_subpocket = (
    fragment_library_concat_wo_x
    .groupby(['kinase', 'complex_pdb', 'ligand_pdb', 'subpocket'])
    .first()
    .reset_index()
)

# Step 2: the remaining rows per ligand equal the number of subpockets it occupies
n_subpockets_per_ligand = one_row_per_ligand_and_subpocket.groupby(
    ['kinase', 'complex_pdb', 'ligand_pdb']
).size()

In [None]:
# Absolute number of ligands per number of occupied subpockets
ligand_count = n_subpockets_per_ligand.value_counts().sort_index().rename('ligand_count')

# Same distribution as a percentage of all ligands
ligand_frequency = (
    n_subpockets_per_ligand.value_counts(normalize=True).sort_index().rename('ligand_frequency') * 100
)

# Both distributions side by side, indexed by the number of occupied subpockets
n_subpockets_per_ligand_distribution = pd.concat([ligand_count, ligand_frequency], axis=1)

In [None]:
# Print a caption, then show the distribution table as the cell's output
print('Number of subpockets per ligand (distribution):')
n_subpockets_per_ligand_distribution

#### Creates Figure 3.A in manuscript

In [None]:
# Plot the subpockets-per-ligand distribution (Figure 3.A); plotting details live in util
plot_n_subpockets(n_subpockets_per_ligand_distribution)

## Examples for a few special cases of ligands/fragments discussed in the manuscript

### Examples for ligands occupying 6 subpockets

Could equally be adapted to get samples for ligands occupying fewer subpockets.

In [None]:
# Ligands whose fragments cover exactly six subpockets, with the ligand keys as regular columns
occupies_six_subpockets = n_subpockets_per_ligand == 6
ligand_of_interest_six = n_subpockets_per_ligand[occupies_six_subpockets].reset_index()
ligand_of_interest_six

#### Creates Figure S1, subfigure A in supplementary

In [None]:
# Use the first ligand in the six-subpocket table as the example
example_complex_pdb = ligand_of_interest_six['complex_pdb'].values[0]
example_ligand_pdb = ligand_of_interest_six['ligand_pdb'].values[0]

# Draw its fragments (three per row), store the image and show it
image_six_subpockets = draw_fragmented_ligand(
    fragment_library,
    example_complex_pdb,
    example_ligand_pdb,
    mols_per_row=3,
)
image_six_subpockets.save('figures/extreme_subpockets_six_fragments.png')
image_six_subpockets

In [None]:
# Draw the ligand(s) of the listed PDB entries, store the image and show it
pdb_ids = ['4fnz']
image_ligand_from_pdb_ids = draw_ligands_from_pdb_ids(
    pdb_ids,
    sub_img_size=(300, 300),
)
image_ligand_from_pdb_ids.save('figures/extreme_subpockets_six_ligand.png')
image_ligand_from_pdb_ids

### Unfragmented ligands
Collect all ligands that only (and thus fully) cover one subpocket

In [None]:
# Ligands that occupy exactly one subpocket, with the ligand keys as regular columns
ligand_of_interest_one = n_subpockets_per_ligand[n_subpockets_per_ligand == 1].reset_index()

In [None]:
# Look up the fragment information of every single-subpocket ligand and stack the results
unfragmented_ligands = pd.concat(
    [
        get_fragmented_ligand(fragment_library, complex_pdb_id, ligand_pdb_id)
        for complex_pdb_id, ligand_pdb_id in zip(
            ligand_of_interest_one['complex_pdb'],
            ligand_of_interest_one['ligand_pdb'],
        )
    ]
)

In [None]:
# Render all unfragmented ligands in a grid (ten per row), store the image and show it
image_unfragmented = draw_fragments(
    unfragmented_ligands,
    mols_per_row=10,
)
image_unfragmented.save('figures/extreme_subpockets_one.png')
image_unfragmented

### Examples for unfragmented ligands in paper
#### Creates supplement figure S1, subfigures B1-B8, C1-C3, D1-D3

In [None]:
# Hand-picked [complex PDB ID, ligand PDB ID] pairs shown in the supplement (Figure S1).

# Subfigures B1-B8
unfragmented_ligands_paper_b = [
    [complex_pdb_id, ligand_pdb_id]
    for complex_pdb_id, ligand_pdb_id in (
        ('6q3b', 'PYZ'),
        ('6q3c', 'BYZ'),
        ('6q48', 'HHQ'),
        ('6q4a', 'HGW'),
        ('6q4b', 'HHN'),
        ('6q4c', 'HH8'),
        ('6q4e', 'HH5'),
        ('6q4f', '26D'),
    )
]

# Subfigures C1-C3
unfragmented_ligands_paper_c = [
    [complex_pdb_id, ligand_pdb_id]
    for complex_pdb_id, ligand_pdb_id in (
        ('3q9y', 'TXQ'),
        ('5j1w', '6FB'),
        ('2o63', 'MYC'),
    )
]

# Subfigures D1-D3
unfragmented_ligands_paper_d = [
    [complex_pdb_id, ligand_pdb_id]
    for complex_pdb_id, ligand_pdb_id in (
        ('4fst', 'HK4'),
        ('3fyj', 'B97'),
        ('6cfm', 'EA7'),
    )
]

##### Fragment-like ligands B1-B8

In [None]:
# Draw the selected B1-B8 ligands from the unfragmented-ligand table, store the image and show it
image_unfragmented_paper_b = draw_selected_fragments(unfragmented_ligands_paper_b, unfragmented_ligands)
image_unfragmented_paper_b.save('figures/extreme_subpockets_one_paper_b.png')
image_unfragmented_paper_b

##### Rigid/many fused rings containing ligands C1-C3

In [None]:
# Draw the selected C1-C3 ligands from the unfragmented-ligand table, store the image and show it
image_unfragmented_paper_c = draw_selected_fragments(unfragmented_ligands_paper_c, unfragmented_ligands)
image_unfragmented_paper_c.save('figures/extreme_subpockets_one_paper_c.png')
image_unfragmented_paper_c

##### Other larger unfragmented ligands D1-D3

In [None]:
# Draw the selected D1-D3 ligands from the unfragmented-ligand table, store the image and show it
image_unfragmented_paper_d = draw_selected_fragments(unfragmented_ligands_paper_d, unfragmented_ligands)
image_unfragmented_paper_d.save('figures/extreme_subpockets_one_paper_d.png')
image_unfragmented_paper_d

## Fragments per subpocket

### Deduplicated fragments (per subpocket)

In [None]:
# Fragment counts per subpocket pool, in the iteration order of the library:
# all fragments, and fragments after dropping repeated SMILES within the pool
n_fragments_per_subpocket = [
    fragments.shape[0] for fragments in fragment_library.values()
]
n_fragments_per_subpocket_deduplicated = [
    fragments.drop_duplicates(subset='smiles').shape[0]
    for fragments in fragment_library.values()
]

In [None]:
# Percentage of fragments per subpocket that are removed by deduplication
freq_duplicates = [
    (n_all - n_unique) / n_all * 100
    for n_all, n_unique in zip(n_fragments_per_subpocket, n_fragments_per_subpocket_deduplicated)
]

# One row per subpocket: raw count, deduplicated count and duplicate percentage
stats_n_fragments = pd.DataFrame(
    {
        'n_fragments': n_fragments_per_subpocket,
        'n_fragments_deduplicated': n_fragments_per_subpocket_deduplicated,
        'freq_duplicates': freq_duplicates,
    },
    index=fragment_library.keys()
)
stats_n_fragments

In [None]:
# Unweighted mean of the per-subpocket duplicate percentages, taken over every row of
# stats_n_fragments (i.e. over all library pools, without excluding any)
print(f'Mean duplicates frequency across subpockets: {stats_n_fragments.freq_duplicates.mean()}')

In [None]:
# Exclude pool X by name instead of by position, so the table stays correct
# even if X is not the last pool in the library.
# The count lists were built in the library's iteration order, hence the zip with its keys.
subpockets_wo_x = [subpocket for subpocket in fragment_library if subpocket != 'X']
n_fragments_wo_x_per_subpocket = [
    n for subpocket, n in zip(fragment_library, n_fragments_per_subpocket) if subpocket != 'X'
]
n_fragments_deduplicated_wo_x_per_subpocket = [
    n for subpocket, n in zip(fragment_library, n_fragments_per_subpocket_deduplicated) if subpocket != 'X'
]

# Share (in percent) of each subpocket among all fragments outside pool X
pd.DataFrame(
    {
        'freq_fragments': [
            n / sum(n_fragments_wo_x_per_subpocket) * 100
            for n in n_fragments_wo_x_per_subpocket
        ],
        'freq_fragments_deduplicated': [
            n / sum(n_fragments_deduplicated_wo_x_per_subpocket) * 100
            for n in n_fragments_deduplicated_wo_x_per_subpocket
        ]
    },
    index=subpockets_wo_x
)

In [None]:
# Sum the per-subpocket counts for every pool except X. X is excluded by name (not by
# position), so the totals do not depend on X being the last pool in the library.
n_total_wo_x = sum(
    n for subpocket, n in zip(fragment_library, n_fragments_per_subpocket) if subpocket != 'X'
)
n_total_deduplicated_wo_x = sum(
    n for subpocket, n in zip(fragment_library, n_fragments_per_subpocket_deduplicated) if subpocket != 'X'
)

print(f'Number of fragments (wo X): {n_total_wo_x}')
print(f'Number of deduplicated fragments (wo X): {n_total_deduplicated_wo_x}')

#### Creates Figure 3.B in manuscript 

In [None]:
# Plot raw vs. deduplicated fragment counts per subpocket (Figure 3.B); plotting details live in util
plot_n_fragments_per_subpocket(n_fragments_per_subpocket, n_fragments_per_subpocket_deduplicated)

### Singleton fragments

Analyze which fragments appear only once or several times in each subpocket.

In [None]:
# Number of occurrences of each fragment SMILES within each subpocket
# (Series indexed by subpocket and smiles)
fragment_occurrence = fragment_library_concat.groupby(['subpocket', 'smiles']).size()
fragment_occurrence.head()

In [None]:
# Split the per-subpocket SMILES by how often they occur
occurs_once = fragment_occurrence == 1
occurs_repeatedly = fragment_occurrence > 1

# Count the distinct SMILES per subpocket in each category
n_singletons = fragment_occurrence[occurs_once].reset_index().groupby('subpocket').size()
n_duplicated = fragment_occurrence[occurs_repeatedly].reset_index().groupby('subpocket').size()
n_deduplicated = fragment_occurrence.reset_index().groupby('subpocket').size()

singletons = pd.DataFrame(
    {
        '# singletons': n_singletons,
        '# duplicated fragments': n_duplicated,
        '# deduplicated fragments': n_deduplicated,
    }
)

# Share of singletons among the deduplicated fragments of each subpocket, in percent
singletons['% singletons'] = singletons['# singletons'] / singletons['# deduplicated fragments'] * 100
singletons

In [None]:
# Unweighted mean of the singleton percentages after removing the row for pool X
print(f"Average ratio of singletons across subpockets: {singletons.drop('X')['% singletons'].mean()}")

In [None]:
# Column totals over all subpockets. Note that the '% singletons' entry is a plain sum of
# the per-subpocket percentages, not an overall percentage.
singletons.sum()

### Duplicate fragments
Analyze if fragments that appear several times (duplicate fragments) bind to the same or different kinases (or even across kinase groups).

Group fragments by subpocket and their SMILES. This deduplicates the dataset per subpocket based on the fragment SMILES.

Per deduplicated fragment save all kinases and kinase groups. 
This allows us to track the following:

1. How many deduplicated fragments are there (within each subpocket)?
2. How many fragments (within subpockets) are singletons?
3. How many fragments (within subpockets) are duplicates?
4. From the duplicates, how many bind to the same kinase?
5. From the duplicates, how many bind to different kinases?
6. From the duplicates binding to different kinases, how many bind to the same kinase group?
7. From the duplicates binding to different kinases, how many bind to different kinase groups?

In [None]:
# Deduplicate per subpocket and SMILES while keeping, for each deduplicated fragment,
# the list of kinases and the list of kinase groups of all its occurrences
grouped_by_subpocket_and_smiles = fragment_library_concat.groupby(['subpocket', 'smiles'])
kinases_per_fragment = grouped_by_subpocket_and_smiles['kinase'].apply(list)
groups_per_fragment = grouped_by_subpocket_and_smiles['group'].apply(list)

fragments_by_subpocket_and_smiles = pd.concat(
    [kinases_per_fragment, groups_per_fragment],
    axis=1
)
fragments_by_subpocket_and_smiles.head()

#### 1. How many deduplicated fragments are there (within each subpocket)?

In [None]:
# The grouped table already holds one row per (subpocket, smiles); this is an alias, not a copy
fragments_deduplicated = fragments_by_subpocket_and_smiles
fragments_deduplicated.shape[0]

#### 2. How many fragments (within subpockets) are singletons?

In [None]:
# A fragment is a singleton if its kinase list holds exactly one entry (one occurrence)
n_occurrences = fragments_by_subpocket_and_smiles['kinase'].map(len)
fragments_singletons = fragments_by_subpocket_and_smiles[n_occurrences == 1]
fragments_singletons.shape[0]

#### 3. How many fragments (within subpockets) are duplicates?

In [None]:
# A fragment is a duplicate if its kinase list does not hold exactly one entry
n_occurrences = fragments_by_subpocket_and_smiles['kinase'].map(len)
fragments_duplicates = fragments_by_subpocket_and_smiles[n_occurrences != 1]
fragments_duplicates.shape[0]

#### 4. From the duplicates, how many bind to the same kinase?

In [None]:
# Duplicates whose occurrences all belong to one and the same kinase
n_occurrences = fragments_by_subpocket_and_smiles['kinase'].map(len)
n_distinct_kinases = fragments_by_subpocket_and_smiles['kinase'].map(lambda kinases: len(set(kinases)))

is_duplicate_same_kinase = (n_occurrences != 1) & (n_distinct_kinases == 1)
fragments_duplicates_same_kinase = fragments_by_subpocket_and_smiles[is_duplicate_same_kinase]
fragments_duplicates_same_kinase.shape[0]

#### 5. From the duplicates, how many bind to different kinases?

In [None]:
# Duplicates whose occurrences are spread over more than one kinase
n_occurrences = fragments_by_subpocket_and_smiles['kinase'].map(len)
n_distinct_kinases = fragments_by_subpocket_and_smiles['kinase'].map(lambda kinases: len(set(kinases)))

is_duplicate_different_kinases = (n_occurrences != 1) & (n_distinct_kinases != 1)
fragments_duplicates_different_kinases = fragments_by_subpocket_and_smiles[is_duplicate_different_kinases]
fragments_duplicates_different_kinases.shape[0]

#### 6. From the duplicates binding to different kinases, how many bind to the same kinase group?

In [None]:
# Duplicates spread over more than one kinase, all of which share one kinase group
n_occurrences = fragments_by_subpocket_and_smiles['kinase'].map(len)
n_distinct_kinases = fragments_by_subpocket_and_smiles['kinase'].map(lambda kinases: len(set(kinases)))
n_distinct_groups = fragments_by_subpocket_and_smiles['group'].map(lambda groups: len(set(groups)))

is_different_kinases_same_group = (
    (n_occurrences != 1) & (n_distinct_kinases != 1) & (n_distinct_groups == 1)
)
fragments_duplicates_different_kinases_same_group = fragments_by_subpocket_and_smiles[
    is_different_kinases_same_group
]
fragments_duplicates_different_kinases_same_group.shape[0]

#### 7. From the duplicates binding to different kinases, how many bind to different kinase groups?

In [None]:
# Duplicates spread over more than one kinase and over more than one kinase group
n_occurrences = fragments_by_subpocket_and_smiles['kinase'].map(len)
n_distinct_kinases = fragments_by_subpocket_and_smiles['kinase'].map(lambda kinases: len(set(kinases)))
n_distinct_groups = fragments_by_subpocket_and_smiles['group'].map(lambda groups: len(set(groups)))

is_different_kinases_and_group = (
    (n_occurrences != 1) & (n_distinct_kinases != 1) & (n_distinct_groups != 1)
)
fragments_duplicates_different_kinases_and_group = fragments_by_subpocket_and_smiles[
    is_different_kinases_and_group
]
fragments_duplicates_different_kinases_and_group.shape[0]

#### Summarize numbers
Note that the dataframes below were created in steps 1-7 above.

In [None]:
# Row label -> fragment table from steps 1-7, in the order the rows should appear
fragment_tables = {
    'fragments_deduplicated': fragments_deduplicated,
    'fragments_singletons': fragments_singletons,
    'fragments_duplicates': fragments_duplicates,
    'fragments_duplicates_same_kinase': fragments_duplicates_same_kinase,
    'fragments_duplicates_different_kinases': fragments_duplicates_different_kinases,
    'fragments_duplicates_different_kinases_same_group': fragments_duplicates_different_kinases_same_group,
    'fragments_duplicates_different_kinases_and_group': fragments_duplicates_different_kinases_and_group,
}

# One row per category, one column per subpocket, values = number of deduplicated fragments
stats = pd.DataFrame(
    [
        fragment_table.reset_index().groupby('subpocket').size()
        for fragment_table in fragment_tables.values()
    ],
    index=list(fragment_tables)
)
stats

In [None]:
# Total per category, summed over all subpockets
stats.transpose().sum()

In [None]:
# Each ratio is computed right before it is reported; the first three use the number of
# deduplicated fragments as the denominator, the last one the duplicates binding to different kinases.

ratio_singletons = fragments_singletons.shape[0] / fragments_deduplicated.shape[0]
print(
    'Ratio of singletons \n(with respect to deduplicated fragments):\n'
    f'{ratio_singletons}\n'
)

ratio_same_kinase = fragments_duplicates_same_kinase.shape[0] / fragments_deduplicated.shape[0]
print(
    'Ratio of duplicate fragments that bind only to one kinase \n(with respect to deduplicated fragments):\n'
    f'{ratio_same_kinase}\n'
)

ratio_different_kinases_and_group = (
    fragments_duplicates_different_kinases_and_group.shape[0] / fragments_deduplicated.shape[0]
)
print(
    'Ratio of duplicate fragments that bind to different kinases and kinase groups \n(with respect to deduplicated fragments):\n'
    f'{ratio_different_kinases_and_group}\n'
)

ratio_different_group_among_different_kinases = (
    fragments_duplicates_different_kinases_and_group.shape[0] / fragments_duplicates_different_kinases.shape[0]
)
print(
    'Ratio of duplicate fragments that bind to different kinase groups \n(with respect to fragments that bind to different kinases):\n'
    f'{ratio_different_group_among_different_kinases}\n'
)

## Subpocket connections

What subpocket connections do we see how often (connection as in a connection between 2 subpockets)?

In [None]:
# Respective information is stored in 'connections' column.
# The helper is called through its module because the result is bound to the same name as
# the helper: an unqualified call would hit the result table (not the function) as soon as
# this cell is executed a second time.
connections_by_fragment = util.connections_by_fragment(fragment_library_concat_wo_x)
connections_by_fragment.head()

In [None]:
# Flag fragments whose 'connections' entry contains at least one repeated element
has_repeated_connection = connections_by_fragment['connections'].map(
    lambda fragment_connections: len(fragment_connections) != len(set(fragment_connections))
)

# Group the flagged fragments by ligand; the number of groups is the number of such ligands
n_ligands_multiple_fragments_in_subpocket = (
    connections_by_fragment[has_repeated_connection]
    .groupby(['kinase', 'complex_pdb', 'ligand_pdb'])
    .size()
)

print(f'Number of ligands that show multiple fragments in one or more subpockets: {len(n_ligands_multiple_fragments_in_subpocket)}')

##### Connection frequencies

In [None]:
# Subpocket connections per ligand (pool X excluded).
# The helper is called through its module because the result is bound to the same name as
# the helper: an unqualified call would hit the result table (not the function) as soon as
# this cell is executed a second time.
connections_by_ligand = util.connections_by_ligand(fragment_library_concat_wo_x)
connections_by_ligand.head()

In [None]:
# Aggregate the per-ligand connections into counts per connection type (computed in util)
connections = connections_count_by_ligand(connections_by_ligand)
connections

## Fragment physicochemical properties

Get descriptors for fragments per subpocket (deduplicated per subpocket):

In [None]:
# Descriptor table for the fragments of the library; later cells group it by its 'subpocket' column
descriptors = descriptors_by_fragments(fragment_library)

In [None]:
# Median of every descriptor column per subpocket
descriptors.groupby('subpocket').median()

#### Creates Figure 5.A in manuscript

In [None]:
# Plot the descriptor distributions (Figure 5.A); plotting details live in util
plot_fragment_descriptors(descriptors)

##### Some more details on the descriptor values per subpocket

In [None]:
# Summary statistics of the '# HBD' column per subpocket
descriptors.groupby('subpocket')['# HBD'].describe()

In [None]:
# Summary statistics of the '# HBA' column per subpocket
descriptors.groupby('subpocket')['# HBA'].describe()

In [None]:
# Summary statistics of the 'LogP' column per subpocket, ordered by the median ('50%' column)
descriptors.groupby('subpocket')['LogP'].describe().sort_values('50%')

In [None]:
# Summary statistics of the '# Heavy atoms' column per subpocket
descriptors.groupby('subpocket')['# Heavy atoms'].describe()

## Fragment similarity by subpockets

1. Group and deduplicate all fragments by subpocket.
2. Calculate similarities for all pairwise fragment combinations within subpockets.

In [None]:
# Pairwise fragment similarities within subpockets (computed in util).
# The full merged library is passed here, not the variant without pool X.
similarities_per_subpocket = fragment_similarity_per_subpocket(fragment_library_concat)
similarities_per_subpocket.head()

#### Creates Figure 5.B in manuscript

In [None]:
# Plot the similarity distributions per subpocket (Figure 5.B); the second argument is a
# label passed to the plotting helper — presumably the axis/grouping label, see util
plot_fragment_similarity(similarities_per_subpocket, 'Subpocket')

In [None]:
# Mean similarity per subpocket; sort=False keeps the subpockets in order of first appearance
similarities_per_subpocket.groupby('subpocket', sort=False).mean()

In [None]:
# Number of fragments in every subpocket pool of the library
{subpocket: fragments.shape[0] for subpocket, fragments in fragment_library.items()}

## Fragment similarity by kinase group

1. Group and deduplicate all fragments by kinase group and subpocket (excluding pool X).
2. Calculate similarities for all pairwise fragment combinations within kinase groups and subpockets (excluding X).
3. Calculate total distribution across all kinase groups (i.e. get all similarities by kinase groups and subpockets for the new kinase group 'Total').

In [None]:
# Pairwise fragment similarities within kinase groups and subpockets (computed in util),
# based on the library without pool X
similarities_per_kinase_group = fragment_similarity_per_kinase_group(fragment_library_concat_wo_x)
similarities_per_kinase_group.head()

In [None]:
# Number of similarity values available per kinase group
similarities_per_kinase_group.groupby('group').size()

#### Creates Figure 5.C in manuscript

In [None]:
# Plot the similarity distributions per kinase group (Figure 5.C); the second argument is a
# label passed to the plotting helper — presumably the axis/grouping label, see util
plot_fragment_similarity(similarities_per_kinase_group, 'Kinase group')

In [None]:
# Mean similarity per kinase group; sort=False keeps the groups in order of first appearance
similarities_per_kinase_group.groupby('group', sort=False).mean()

## Draw ligand and fragments for GitHub README figure

In [None]:
# Complex and ligand PDB IDs of the example structure used for the README figure
complex_pdb, ligand_pdb = '3w2s', 'W2R'

In [None]:
# Look up the fragment information for the example ligand and show it as the cell's output
get_fragmented_ligand(fragment_library, complex_pdb, ligand_pdb)

In [None]:
# Draw the ligand(s) of the example complex, store the image and show it.
# The file name is derived from complex_pdb so it always matches the structure that is drawn
# (for '3w2s' this yields 'figures/ligands_3w2s.png').
img = draw_ligands_from_pdb_ids([complex_pdb], sub_img_size=(400,400))
img.save(f'figures/ligands_{complex_pdb}.png')
img

In [None]:
# Draw the fragments of the example ligand (five per row), store the image and show it.
# The file name is derived from complex_pdb so it always matches the structure that is drawn
# (for '3w2s' this yields 'figures/fragments_3w2s_altA_chainA.png').
# NOTE(review): the 'altA_chainA' suffix is kept as a fixed string — confirm it applies to
# other structures before changing complex_pdb.
img = draw_fragmented_ligand(fragment_library, complex_pdb, ligand_pdb, mols_per_row=5)
img.save(f'figures/fragments_{complex_pdb}_altA_chainA.png')
img