In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from kinodata.data.data_split import RandomSplit, Split
from kinodata.data.cold_split import ColdSplit
from kinodata.data.utils.similarity import BLOSUMSubstitutionSimilarity, pairwise_tanimoto_similarity
from kinodata.data.utils.scaffolds import generate_scaffolds
from kinodata.data.utils.pocket_sequence_klifs import add_pocket_sequence
from kinodata.data.utils.cluster import AffinityPropagation
from kinodata.data.dataset import KinodataDocked

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset = KinodataDocked()

In [5]:
f"Number of data points: {len(dataset)}"

'Number of data points: 40525'

In [6]:
PATH = Path("..") / "data" / "test_splits"
SEEDS = tuple(range(5))
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1
data_source = dataset.df
data_source["ident"] = data_source["ident"].astype(int)

# data point identifiers
idents = [int(data.ident) for data in dataset]

Reading data frame..
Checking for missing pocket mol2 files...


100%|██████████| 2439/2439 [00:00<00:00, 35656.83it/s]


In [7]:
if not PATH.exists():
    PATH.mkdir()

In [8]:
# take into account that source data frame was not filtered
# need to apply same filter we used for the processed data for this notebook
data_source = data_source.set_index("ident").loc[idents]

In [9]:
def path_to_split(kind: str, seed: int) -> Path:
    path = PATH / kind / f"seed_{seed}.csv"
    if not path.parents[0].exists():
        path.parents[0].mkdir()
    return path

def pocket_split_path(seed: int) -> Path:
    return path_to_split("pocket", seed)
                         
def random_split_path(seed: int) -> Path:
    return path_to_split("random", seed)

def scaffold_split_path(seed: int) -> Path:
    return path_to_split("scaffold", seed)

## Generate random splits

In [10]:
index_to_ident = {
    i: data.ident for i, data in enumerate(dataset)
}

In [11]:
random_split = RandomSplit(TRAIN_SIZE, VAL_SIZE, TEST_SIZE)
for seed in SEEDS:
    split = random_split(dataset, seed)
    # idents are more persistent/robust than data object index
    split = split.remap_index(index_to_ident)
    print(split)
    split.to_data_frame().to_csv(random_split_path(seed))
    

Split[str](train=32421, val=4052, test=4052, source=None)
Split[str](train=32421, val=4052, test=4052, source=None)
Split[str](train=32421, val=4052, test=4052, source=None)
Split[str](train=32421, val=4052, test=4052, source=None)
Split[str](train=32421, val=4052, test=4052, source=None)


## Generate scaffold splits

In [12]:
if not (PATH / 'scaffold').exists():
    (PATH / "scaffold").mkdir()

In [13]:
scaffold_path = PATH / "scaffold" / "scaffolds.csv"
if scaffold_path.exists():
    df_scaffold = pd.read_csv(scaffold_path)
else:
    df_scaffold = generate_scaffolds(dataset)
    df_scaffold.to_csv(scaffold_path, index=False)

df_scaffold.head()

Unnamed: 0.1,Unnamed: 0,ident,scaffold
0,0,37861,C1CCC(CC2CCC(C3CCCCC3)C2)CC1
1,1,40692,C1CCC(CC2CCC(C3CC(C4CCCC4)C4CCCCC34)CC2)CC1
2,2,41985,C1CCC(CC2CCC(C3CCC4CCCCC43)CC2)CC1
3,3,42689,C1CCC(CC2CCC(C3CCCCC3)C2)CC1
4,4,44685,C1CCC(CC2CCCC3CCCCC32)CC1


In [14]:
scaffold_similarities = pairwise_tanimoto_similarity(df_scaffold["scaffold"].unique())

In [15]:
scaffold_split = ColdSplit(TRAIN_SIZE, VAL_SIZE, TEST_SIZE, attribute_key="scaffold", clustering=None)
for seed in SEEDS:
    split = scaffold_split(df_scaffold, seed, scaffold_similarities)
    print(split)
    split.to_data_frame().to_csv(scaffold_split_path(seed), index=False)
    

Split[int](train=31929, val=4657, test=4406, source=None)
Split[int](train=32444, val=4104, test=4444, source=None)
Split[int](train=32885, val=4251, test=3856, source=None)
Split[int](train=33427, val=4047, test=3518, source=None)
Split[int](train=31343, val=3953, test=5696, source=None)


## Pocket Splits

In [16]:
if not (PATH / "pocket").exists():
    (PATH / "pocket").mkdir()

In [17]:
pocket_path = PATH / "pocket" / "pocket_sequences.csv"
if pocket_path.exists():
    df_pocket = pd.read_csv(pocket_path)
else:
    df_pocket = add_pocket_sequence(data_source, pocket_sequence_key="structure.pocket_sequence")
    df_pocket[["ident", "structure.pocket_sequence"]].to_csv(pocket_path, index=False)

df_pocket = df_pocket.set_index("ident")
df_pocket = df_pocket.loc[idents]
df_pocket["ident"] = df_pocket.index

In [18]:
df_pocket.head()

Unnamed: 0_level_0,Unnamed: 0,structure.pocket_sequence,ident
ident,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
37861,0,KPLGR____QVIEVAVKMLALMSELKILIHIGLNVVNLLGAMVIVE...,37861
40692,1,VKLGQG__GEVWMVAIKTLAFLQEAQVMKKLREKLVQLYAVYIVTE...,40692
41985,2,VKLGQGCFGEVWMVAIKTLAFLQEAQVMKKLREKLVQLYAVYIVGE...,41985
42689,3,KPLGR____QVIEVAVKMLALMSELKILIHIGLNVVNLLGAMVIVE...,42689
44685,4,KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLITQ...,44685


In [19]:
pocket_similarities = BLOSUMSubstitutionSimilarity()(df_pocket["structure.pocket_sequence"].unique())

In [20]:
pocket_split = ColdSplit(TRAIN_SIZE, VAL_SIZE, TEST_SIZE, attribute_key="structure.pocket_sequence")
for seed in SEEDS:
    split = pocket_split(df_pocket, seed, pocket_similarities)
    print(split)
    split.to_data_frame().to_csv(pocket_split_path(seed), index=False)

Split[int](train=31018, val=5203, test=4304, source=None)
Split[int](train=31376, val=3169, test=5980, source=None)
Split[int](train=33972, val=4793, test=1760, source=None)
Split[int](train=34437, val=3785, test=2303, source=None)
Split[int](train=28889, val=5578, test=6058, source=None)
