In [2]:
import torch
import os
import pandas as pd
from scipy.io import mmread
import numpy as np

# Create new training, test and validation sets

In [1]:
# Select which dataset variant the splitting is performed on.
# The value decides which metadata files are loaded and is interpolated into the
# `{ds_type}` part of the output paths used further down.
ds_type = 'integrated'  # either 'integrated' or 'original'

In [6]:
# Load the metadata table that is going to be re-split.
# - 'original': the three existing datasplit1 files are stacked back into one table.
# - 'integrated': the table is read from a single CSV.
if ds_type == 'original':
    file_train = pd.read_csv('/home/icb/alessandro.palma/data/metadata/original/datasplit1-train.csv')
    file_test = pd.read_csv('/home/icb/alessandro.palma/data/metadata/original/datasplit1-test.csv')
    file_val = pd.read_csv('/home/icb/alessandro.palma/data/metadata/original/datasplit1-val.csv')
    # Re-assemble the whole dataset; ignore_index gives the stacked table a fresh 0..n-1 index
    dataset = pd.concat([file_train, file_test, file_val], ignore_index=True)
elif ds_type == 'integrated':
    dataset = pd.read_csv('/home/icb/alessandro.palma/data/metadata/integrated/splits/total_dataset.csv')
else:
    # Fail fast: a mistyped ds_type used to fall through to the integrated table while the
    # f-string output paths further down would still use the mistyped value.
    raise ValueError(f"Unknown ds_type {ds_type!r}; expected 'original' or 'integrated'")

In [41]:
# Load the held-out evaluation set from the Mol2Img paper.
# Its CPD_NAME column is used below to decide which compounds are treated as unseen.
unseen_examples = pd.read_csv('/home/icb/alessandro.palma/data/metadata/original/splits_mol2img/eval_heldout_set.csv')

In [44]:
# Show the column names of the loaded dataset (CPD_NAME and ROW_NR_LABEL_MAT are used below)
print(dataset.columns)

Index(['Unnamed: 0', 'SAMPLE_KEY', 'BROAD_ID', 'PLATE_ID', 'WELL_POSITION',
       'SITE', 'SAMPLE_ID', 'CPD_NAME', 'CPD_NAME_TYPE', 'SMILES', 'STATE',
       'ROW_NR_LABEL_MAT'],
      dtype='object')


In [45]:
# Distinct compound names present in the dataset, followed by how many there are.
# Part of these compounds will be kept as "seen" for training, the rest held out.
molecules = np.unique(dataset.CPD_NAME)
len(molecules)

10561

In [49]:
# Target number of rows per split, derived from the total number of rows.
# Despite the `_perc` suffix these are row counts (floats returned by np.round), not percentages.
# The fractions are 70/20/10 so that they sum to 1 and agree with the per-molecule fractions
# used in split_train_val_test (the training fraction used to be 0.80, i.e. 110% in total).
n = len(dataset)
train_perc, test_perc, valid_perc = np.round(n*0.70), np.round(n*0.20), np.round(n*0.10)

# NOTE(review): molecules were assumed to have either 48 or 24 entries each; the count check
# at the end of the notebook lists other values as well, so do not rely on this.

In [50]:
np.random.seed(42)
# Compounds of the held-out file are "unseen"; np.unique returns them de-duplicated and sorted.
unseen = list(np.unique(unseen_examples.CPD_NAME))
# Every other compound of the dataset is "seen".
# The set difference is sorted because the iteration order of a set of strings changes between
# Python sessions (hash randomisation); the split below shuffles molecule by molecule in the
# order of this list, so an unstable order would make the seeded split non-reproducible.
seen = sorted(set(dataset.CPD_NAME) - set(unseen))

In [54]:
# Sanity check: compounds that are both seen and unseen (an empty set is expected)
set(unseen) & set(seen)

set()

In [55]:
# Row positions of the dataset that belong to seen / not-seen compounds.
# A set gives constant-time membership tests (scanning the `seen` list for every row is
# O(rows x molecules)), and enumerating the column values makes the positions independent of
# both the DataFrame index labels and of `n` computed in an earlier cell.
seen_lookup = set(seen)
cpd_name_per_row = dataset.CPD_NAME.to_numpy()
seen_rows = [i for i, name in enumerate(cpd_name_per_row) if name in seen_lookup]
unseen_rows = [i for i, name in enumerate(cpd_name_per_row) if name not in seen_lookup]

In [57]:
# Persist the seen / unseen compound names next to the processed metadata of the chosen dataset.
# Anything other than 'original' is stored under the integrated directory.
seen_unseen_path = (
    '/home/icb/alessandro.palma/data/metadata_processed/original/drugs_metadata/seen_unseen_compounds.npz'
    if ds_type == 'original'
    else '/home/icb/alessandro.palma/data/metadata_processed/integrated/drugs_metadata/seen_unseen_compounds.npz'
)
np.savez(seen_unseen_path, seen=seen, unseen=unseen)

In [58]:
# Split the dataset into training, test, validation and out-of-distribution sets.
# NumPy's global RNG is seeded first, right before the split defined and run in this cell.
np.random.seed(42)
def split_train_val_test(df, seen, unseen):
    """
    Split a metadata table into train / test / validation sets plus an out-of-distribution set.

    The rows of every molecule in `seen` are shuffled and divided 70/20/10: the first
    round(0.70 * k) rows go to training, the next round(0.20 * k) to test and whatever is
    left to validation, so each seen molecule is represented in every split it has enough
    rows for. All rows of the molecules in `unseen` go, unshuffled, to the OOD set.

    Shuffling relies on `DataFrame.sample` without an explicit random state, i.e. on NumPy's
    global RNG: seed it before calling and pass `seen` in a stable order for reproducibility.

    Args:
        df: DataFrame with a `CPD_NAME` column holding the molecule name of each row.
        seen: iterable of molecule names to distribute over train / test / validation.
        unseen: iterable of molecule names whose rows form the OOD set.

    Returns:
        Tuple `(train_set, test_set, valid_set, ood_set)` of DataFrames. Each has the columns
        of `df` and keeps the original index labels; a set without rows is an empty frame
        that still carries the columns of `df`.
    """
    # Compare against the raw values once instead of re-extracting the column per molecule
    cpd_names = df.CPD_NAME.values

    def _stack(parts):
        # One concat at the end instead of growing a frame inside the loop (quadratic copying).
        # Empty pieces are dropped; with nothing left, return an empty frame with df's columns.
        parts = [part for part in parts if len(part) > 0]
        return pd.concat(parts) if parts else df.iloc[0:0]

    train_parts, test_parts, valid_parts = [], [], []
    for seen_mol in seen:
        # Slice of the table passed in (not of a notebook-level global) for this molecule
        mol_rows = df.iloc[cpd_names == seen_mol]
        n_rows = len(mol_rows)
        n_train = int(np.round(n_rows * 0.70))
        n_test = int(np.round(n_rows * 0.20))
        # Randomly order the rows, then cut the shuffled slice at the two boundaries.
        # iloc slices are half-open, so [0:n_train] already holds exactly n_train rows.
        shuffled = mol_rows.sample(frac=1)
        train_parts.append(shuffled.iloc[:n_train])
        test_parts.append(shuffled.iloc[n_train:n_train + n_test])
        valid_parts.append(shuffled.iloc[n_train + n_test:])

    # Out-of-distribution rows, grouped molecule by molecule in the order of `unseen`
    ood_parts = [df.iloc[cpd_names == unseen_mol] for unseen_mol in unseen]

    return _stack(train_parts), _stack(test_parts), _stack(valid_parts), _stack(ood_parts)

# Run the split on the loaded dataset with the seen / unseen compound lists built above
train_set, test_set, valid_set, ood_set = split_train_val_test(dataset, seen, unseen)

In [None]:
# Write the four splits as CSV files under metadata_processed/{ds_type}/splits/.
# to_csv is called without index=False, so the index of each frame is written as an extra
# leading column.
# NOTE(review): the cell that re-reads the splits further down uses
# metadata_processed/{ds_type}/50k/splits/ instead - confirm which location is the intended one.
train_set.to_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/splits/datasplit-train.csv')
test_set.to_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/splits/datasplit-test.csv')
valid_set.to_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/splits/datasplit-val.csv')
ood_set.to_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/splits//datasplit-ood.csv')

## Create training, test and validation npz files with labels

In [8]:
# Re-load the split files from disk so this section can run without the frames above in memory.
# NOTE(review): these paths point to metadata_processed/{ds_type}/50k/splits/, whereas the cell
# that writes the splits above targets metadata_processed/{ds_type}/splits/ - confirm the files
# are expected in the 50k directory.
file_train = pd.read_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k/splits/datasplit-train.csv')
file_test = pd.read_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k//splits/datasplit-test.csv')
file_val = pd.read_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k//splits/datasplit-val.csv')
file_ood = pd.read_csv(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k//splits/datasplit-ood.csv')

In [None]:
# SAMPLE_KEY values that occur in both the training and the test file.
# Presumably SAMPLE_KEY identifies one sample, so an empty result means no overlap - verify.
np.intersect1d(file_train.SAMPLE_KEY.values, file_test.SAMPLE_KEY.values)

### Assay labels

In [15]:
# Load the label matrix (Matrix Market file, converted to CSR so single rows can be indexed)
# together with the CSV files describing its columns (assays) and rows (compounds).
label_mat = mmread(f'/home/icb/alessandro.palma/data/metadata/{ds_type}/label_matrix/label-matrix.mtx').tocsr()
col_labels = pd.read_csv(f'/home/icb/alessandro.palma/data/metadata/{ds_type}/label_matrix/column-assay-index.csv')
row_labels = pd.read_csv(f'/home/icb/alessandro.palma/data/metadata/{ds_type}/label_matrix/row-compound-index.csv')

In [16]:
# Display the summary of the sparse label matrix (shape, dtype, number of stored elements)
label_mat

<10575x209 sparse matrix of type '<class 'numpy.float64'>'
	with 55751 stored elements in Compressed Sparse Row format>

In [20]:
def add_assay_label_to_samples(file,
                        label_mat):
    """
    Collect the assay-label vector of every observation in a split.

    Args:
        file: DataFrame with a `ROW_NR_LABEL_MAT` column giving, for each observation,
            the row of `label_mat` that holds its labels.
        label_mat: scipy sparse matrix supporting row indexing (e.g. CSR) with one
            column per assay.

    Returns:
        Dense 2-D numpy array of shape (number of observations, number of assay columns);
        row i is the label vector of observation i. The result stays 2-D for a single
        observation and for an empty split (0 rows).
    """
    rows = np.asarray(file.ROW_NR_LABEL_MAT)  # Row on the assay label matrix, per observation
    if rows.size == 0:
        # Nothing to look up: keep the column count so callers can still stack / save it
        return np.empty((0, label_mat.shape[1]), dtype=label_mat.dtype)
    # One sparse row-selection for all observations (repeated rows are allowed), densified
    # once, instead of densifying row by row in a Python loop.
    return label_mat[rows].toarray()

In [18]:
# Build the assay-label array of each split from the rows referenced in its metadata file
train_assay_labels = add_assay_label_to_samples(file_train, label_mat)
test_assay_labels = add_assay_label_to_samples(file_test, label_mat)
val_assay_labels = add_assay_label_to_samples(file_val, label_mat)
ood_assay_labels = add_assay_label_to_samples(file_ood, label_mat)

In [21]:
# Number of entries in the training label array
len(train_assay_labels)

36143

In [23]:
# Save the label array of each split as an .npz archive under the key `assay_labs`.
# Note that the validation archive is named labels_valid.npz, while the split CSV is datasplit-val.csv.
np.savez(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k/labels/labels_train.npz', assay_labs = train_assay_labels)
np.savez(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k//labels/labels_test.npz', assay_labs = test_assay_labels)
np.savez(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k//labels/labels_valid.npz', assay_labs = val_assay_labels)
np.savez(f'/home/icb/alessandro.palma/data/metadata_processed/{ds_type}/50k//labels/labels_ood.npz', assay_labs = ood_assay_labels)

## Data check

In [24]:
# Column names of the full dataset loaded at the top of the notebook
dataset.columns

Index(['Unnamed: 0', 'SAMPLE_KEY', 'BROAD_ID', 'PLATE_ID', 'WELL_POSITION',
       'SITE', 'SAMPLE_ID', 'CPD_NAME', 'CPD_NAME_TYPE', 'SMILES', 'STATE',
       'ROW_NR_LABEL_MAT'],
      dtype='object')

In [25]:
# How many rows each compound has, reduced to the distinct row counts that occur
rows_per_compound = np.unique(dataset.CPD_NAME, return_counts=True)[1]
np.unique(rows_per_compound)

array([   6,   12,   13,   14,   15,   16,   17,   18,   23,   24,   36,
         42,   48,   60,   66,   72,   90,   96,  144, 6168])