# Split Rare disease indications into multiple pieces

Tong Shu Li

For cross validation, the original training data needs to be split into multiple pieces in order to keep training and testing data separate.

In [1]:
import pandas as pd
import numpy as np

from itertools import product

In [2]:
np.__version__

'1.13.1'

In [3]:
np.random.seed(20161018)

---

## Load rare disease indications

In [4]:
goldstd = pd.read_csv("data/rare_dise_indications.tsv", sep = '\t')

In [5]:
goldstd.head(2)

Unnamed: 0,relationship_name,orphanet_id,dise_name,dise_type,drugbank_id,drug_name
0,indication,213711,Endometrial stromal sarcoma,not_genetic,DB00351,Megestrol acetate
1,indication,70573,Small cell lung cancer,not_genetic,DB01030,Topotecan


## Change column names for consistency

In [6]:
# Prefix every Orphanet id with "ORPHA:" and then switch the disease/drug
# columns over to the disease_* / chemical_* naming scheme.
column_renames = {
    "orphanet_id": "disease_id",
    "dise_name": "disease_name",
    "drugbank_id": "chemical_id",
    "drug_name": "chemical_name",
}

goldstd = goldstd.assign(
    # str.format bound to the template is called once per id value
    orphanet_id=goldstd["orphanet_id"].map("ORPHA:{}".format)
).rename(columns=column_renames)

In [7]:
goldstd.shape

(2085, 6)

In [8]:
goldstd.head()

Unnamed: 0,relationship_name,disease_id,disease_name,dise_type,chemical_id,chemical_name
0,indication,ORPHA:213711,Endometrial stromal sarcoma,not_genetic,DB00351,Megestrol acetate
1,indication,ORPHA:70573,Small cell lung cancer,not_genetic,DB01030,Topotecan
2,off-label use,ORPHA:319,Ewing sarcoma,not_genetic,DB01030,Topotecan
3,indication,ORPHA:29073,Multiple myeloma,not_genetic,DB01042,Melphalan
4,off-label use,ORPHA:213711,Endometrial stromal sarcoma,not_genetic,DB00675,Tamoxifen


In [9]:
goldstd["relationship_name"].value_counts()

indication       1305
off-label use     780
Name: relationship_name, dtype: int64

We will treat both relationship types (`indication` and `off-label use`) as proper indications.

In [10]:
# Without the relationship column, a pair listed as both "indication" and
# "off-label use" becomes two identical rows; keep only one of them.
without_relationship = goldstd.drop("relationship_name", axis=1)

# Every remaining row is tagged with the constant category label "DM".
goldstd = without_relationship.drop_duplicates().assign(category="DM")

In [11]:
goldstd.shape

(2055, 6)

---

## Number of unique chemicals and diseases

In [12]:
goldstd["disease_id"].nunique()

607

In [13]:
goldstd["chemical_id"].nunique()

325

In [14]:
# number of possible chemical/disease combinations
# (unique diseases x unique chemicals), not the number of observed pairs

goldstd["disease_id"].nunique() * goldstd["chemical_id"].nunique()

197275

## Split into multiple pieces

For K-fold validation, the entire workflow needs to be run K times. The value of K is chosen to be 5 to avoid excessive computational requirements.

We will split the data by randomly assigning each row a piece number from 0 to K-1, and grouping rows according to the piece number. This ensures that every row of data is used, and that the ratio of true/false examples is approximately the same in each group.

In [15]:
# Number of cross-validation pieces.
K = 5

# Draw the piece labels from a dedicated generator rather than the global
# NumPy state, so that re-running this cell on its own reproduces the same
# split instead of consuming fresh random numbers each time.
# The seed is the value passed to np.random.seed() at the top of the notebook;
# a freshly seeded RandomState produces the same stream as the freshly seeded
# global generator, so a clean top-to-bottom run yields the same labels.
piece_rng = np.random.RandomState(20161018)
goldstd["piece"] = piece_rng.randint(0, K, len(goldstd))

In [16]:
goldstd.head()

Unnamed: 0,disease_id,disease_name,dise_type,chemical_id,chemical_name,category,piece
0,ORPHA:213711,Endometrial stromal sarcoma,not_genetic,DB00351,Megestrol acetate,DM,0
1,ORPHA:70573,Small cell lung cancer,not_genetic,DB01030,Topotecan,DM,0
2,ORPHA:319,Ewing sarcoma,not_genetic,DB01030,Topotecan,DM,2
3,ORPHA:29073,Multiple myeloma,not_genetic,DB01042,Melphalan,DM,4
4,ORPHA:213711,Endometrial stromal sarcoma,not_genetic,DB00675,Tamoxifen,DM,1


In [17]:
goldstd["piece"].value_counts(normalize = True)

0    0.206326
1    0.201460
2    0.199513
3    0.197080
4    0.195620
Name: piece, dtype: float64

---

## Data separation

Ensure that none of the possible chemical-disease pairs in the holdout set appear among the assumed-negative examples of the training data. This is to ensure that the algorithm is never trained on an unlabeled pair which will be used to test the trained model.

In [18]:
def all_pairs(df):
    """Return every (chemical_id, disease_id) combination present in `df`.

    The result is the Cartesian product of the distinct values in the
    `chemical_id` column with the distinct values in the `disease_id`
    column, returned as a set of 2-tuples.
    """
    chemicals = df["chemical_id"].unique()
    diseases = df["disease_id"].unique()

    return {
        (chemical, disease)
        for chemical in chemicals
        for disease in diseases
    }

def pair_to_df(pairs):
    """Turn an iterable of (chemical_id, disease_id) tuples into a DataFrame.

    The frame has exactly two columns, `chemical_id` and `disease_id`, with
    one row per pair in the iteration order of `pairs`.
    """
    column_names = ["chemical_id", "disease_id"]
    rows = list(pairs)
    return pd.DataFrame(rows, columns=column_names)

def df_to_pairs(df):
    """Return the set of (chemical_id, disease_id) tuples found in the rows of `df`."""
    chemicals = df["chemical_id"]
    diseases = df["disease_id"]
    return {(chemical, disease) for chemical, disease in zip(chemicals, diseases)}

In [19]:
def split_data(withheld):
    """Build the holdout and training frames for one cross-validation fold.

    Rows of the module-level `goldstd` frame whose `piece` equals `withheld`
    form the holdout piece; all remaining rows form the training piece.

    Parameters
    ----------
    withheld : int
        Piece number to hold out.

    Returns
    -------
    (holdout_df, train_df) : tuple of pandas.DataFrame
        holdout_df : every chemical x disease combination of the holdout
            piece, left-joined onto the holdout rows, so combinations that
            are not rows of the holdout piece have missing values in the
            remaining columns.
        train_df : the training rows followed by the assumed-false training
            pairs; those extra rows only have `chemical_id` and `disease_id`
            filled in.

    Raises
    ------
    AssertionError
        If an assumed-false training pair is still a holdout candidate, or if
        a known pair occurs in both the training and the holdout piece.
    """
    holdout = goldstd.query("piece == @withheld")
    train = goldstd.query("piece != @withheld")

    # create all chemical-disease pairs
    holdout_p = all_pairs(holdout)
    train_p = all_pairs(train)

    # every training combination that is not a known pair is assumed false
    train_assumed_false = train_p - df_to_pairs(train)

    # assumed-false training pairs that are also holdout candidates
    overlap = holdout_p & train_assumed_false

    print("Number of overlaps for holdout set {}: {}".format(withheld, len(overlap)))

    train_assumed_false -= overlap

    # check that no negative training examples are in the holdout set
    assert train_assumed_false.isdisjoint(holdout_p)

    # check that no known pair appears in both the training and holdout pieces
    assert df_to_pairs(train).isdisjoint(df_to_pairs(holdout))

    # Known training pairs are NOT filtered against holdout_p, so a known
    # training pair whose chemical and disease both occur in the holdout
    # piece is also a candidate row of the holdout frame.

    # DataFrame.append was removed in pandas 2.0; pd.concat is the call it
    # delegated to and gives the same stacked frame.
    train_df = pd.concat([train, pair_to_df(train_assumed_false)])

    holdout_df = pair_to_df(holdout_p).merge(holdout, how = "left", on = ["chemical_id", "disease_id"])

    return (holdout_df, train_df)

In [20]:
# Per-fold results, keyed by the number of the withheld piece.
holdouts = {}
training = {}

# Both frames of every fold are ordered by the same pair of id columns.
pair_sort_key = ["chemical_id", "disease_id"]

for withheld in range(K):
    holdout_part, training_part = split_data(withheld)

    holdouts[withheld] = holdout_part.sort_values(pair_sort_key)
    training[withheld] = training_part.sort_values(pair_sort_key)

    # visual separator between the per-fold overlap messages
    print("-----")

Number of overlaps for holdout set 0: 25008
-----
Number of overlaps for holdout set 1: 23645
-----
Number of overlaps for holdout set 2: 26538
-----
Number of overlaps for holdout set 3: 25251
-----
Number of overlaps for holdout set 4: 26339
-----


## Sort the data so that the saved row order is stable

In [21]:
# Order rows by chemical id, then disease id, before the table is written out.
goldstd = goldstd.sort_values(["chemical_id", "disease_id"])

## Save split pieces to file

# Write the full table to a tab-separated file; index=False leaves out the row index.
goldstd.to_csv("data/split_indications/labeled_pharmacotherapydb.tsv", sep = '\t', index = False)

In [22]:
# For each fold, write the holdout frame and then the training frame to
# data/<kind>/<kind>_piece<idx>.tsv as tab-separated text without the index.
for idx in range(K):
    for fname, pieces in (("holdout", holdouts), ("training", training)):
        out_path = "data/{}/{}_piece{}.tsv".format(fname, fname, idx)
        pieces[idx].to_csv(out_path, sep = '\t', index = False)