# Split PharmacotherapyDB into multiple pieces

Tong Shu Li

For cross validation, the original training data needs to be split into multiple pieces in order to keep training and testing data separate.

In [1]:
import pandas as pd
import numpy as np

from itertools import product

In [2]:
np.__version__

'1.11.3'

In [3]:
# Seed NumPy's global random state so that the random piece assignment made
# with np.random.randint further down is reproducible when the notebook is
# run from top to bottom.
np.random.seed(20161018)

---

## Load rare disease indications

In [4]:
# Read the tab-separated table of rare disease indications.
goldstd = pd.read_csv("data/rare_dise_indications.tsv", sep = '\t')

In [5]:
# Turn each raw Orphanet identifier into the prefixed form "ORPHA:<id>".
goldstd = goldstd.assign(
    orphanet_id = goldstd["orphanet_id"].map("ORPHA:{}".format)
)

In [6]:
goldstd.shape

(1864, 6)

In [7]:
goldstd.head()

Unnamed: 0,relationship_name,orphanet_id,dise_name,dise_type,drugbank_id,drug_name
0,indication,ORPHA:209981,IRIDA syndrome,rare_genetic,DB01592,Iron
1,indication,ORPHA:83642,Microcytic anemia with liver iron overload,rare_genetic,DB01592,Iron
2,indication,ORPHA:209981,IRIDA syndrome,rare_genetic,DB00158,Folic Acid
3,indication,ORPHA:83642,Microcytic anemia with liver iron overload,rare_genetic,DB00158,Folic Acid
4,indication,ORPHA:79241,Biotinidase deficiency,rare_genetic,DB00158,Folic Acid


In [8]:
goldstd["relationship_name"].value_counts()

indication       1160
off-label use     704
Name: relationship_name, dtype: int64

We will treat all relationships, including off-label uses, as proper indications.

In [9]:
# Discard the relationship type so that "indication" and "off-label use" rows
# are treated alike, then collapse the rows that become identical as a result.
goldstd = goldstd.drop("relationship_name", axis=1).drop_duplicates()

# Tag every remaining row with the category label "DM".
goldstd = goldstd.assign(category = "DM")

In [10]:
goldstd.shape

(1850, 6)

---

## Number of unique chemicals and diseases

In [11]:
goldstd["orphanet_id"].nunique()

565

In [12]:
goldstd["drugbank_id"].nunique()

273

In [13]:
# Number of possible chemical/disease combinations: every unique disease
# paired with every unique drug, whether or not the pair is a known indication.

goldstd["orphanet_id"].nunique() * goldstd["drugbank_id"].nunique()

154245

## Split into multiple pieces

For K-fold validation, the entire workflow needs to be run K times. The value of K is chosen to be 3 to avoid excessive computational requirements.

We will split the data by assigning each piece of data a random number from 0 to K-1, and group data rows according to the piece number. This will ensure that each row of data is used, and that the ratios of true/false examples per group are approximately the same.

In [14]:
# Number of cross-validation pieces.
K = 3

# Draw one piece number in [0, K) per row from the seeded global NumPy RNG.
goldstd["piece"] = np.random.randint(low = 0, high = K, size = len(goldstd))

In [15]:
goldstd.head()

Unnamed: 0,orphanet_id,dise_name,dise_type,drugbank_id,drug_name,category,piece
0,ORPHA:209981,IRIDA syndrome,rare_genetic,DB01592,Iron,DM,0
1,ORPHA:83642,Microcytic anemia with liver iron overload,rare_genetic,DB01592,Iron,DM,0
2,ORPHA:209981,IRIDA syndrome,rare_genetic,DB00158,Folic Acid,DM,2
3,ORPHA:83642,Microcytic anemia with liver iron overload,rare_genetic,DB00158,Folic Acid,DM,0
4,ORPHA:79241,Biotinidase deficiency,rare_genetic,DB00158,Folic Acid,DM,1


In [16]:
goldstd["piece"].value_counts(normalize = True)

1    0.335676
0    0.335676
2    0.328649
Name: piece, dtype: float64

---

## Data separation

Ensure that all possible chemical-disease pairs in the holdout set are missing from the training data. This is to ensure that the algorithm never sees data which will be used to test the trained model.

In [17]:
def all_pairs(df):
    """Return every (drugbank_id, orphanet_id) combination available in df.

    The result is the full cross product of the unique drugs and the unique
    diseases found in df, as a set of 2-tuples. It therefore also contains
    pairs that never occur together in any row of df.
    """
    drugs = df["drugbank_id"].unique()
    diseases = df["orphanet_id"].unique()

    return {(drug, disease) for drug in drugs for disease in diseases}

def pair_to_df(pairs):
    """Convert an iterable of (drugbank_id, orphanet_id) tuples to a DataFrame.

    The returned frame has exactly two columns, "drugbank_id" and
    "orphanet_id", with one row per pair.
    """
    rows = list(pairs)
    return pd.DataFrame(rows, columns = ["drugbank_id", "orphanet_id"])

def df_to_pairs(df):
    """Return the set of (drugbank_id, orphanet_id) pairs that occur as rows of df."""
    drugs = df["drugbank_id"]
    diseases = df["orphanet_id"]

    return set(zip(drugs, diseases))

In [18]:
def split_data(withheld):
    """Build the holdout and training tables for one cross-validation piece.

    Reads the notebook-level ``goldstd`` frame and relies on its "piece"
    column.

    Parameters
    ----------
    withheld : int
        Piece number whose rows form the holdout set; all other rows are
        used for training.

    Returns
    -------
    (holdout_df, train_df) : tuple of DataFrame
        holdout_df has one row for every drug/disease combination that can
        be formed from the holdout rows, left-joined with the holdout rows
        themselves, so combinations that are not holdout rows have missing
        values in the remaining columns.
        train_df has the training rows followed by the assumed-false
        training pairs, which only carry "drugbank_id" and "orphanet_id".

    Raises
    ------
    AssertionError
        If an assumed-false training pair is also a holdout combination, or
        if a training row and a holdout row share the same drug/disease pair.
    """
    holdout = goldstd.query("piece == @withheld")
    train = goldstd.query("piece != @withheld")

    # create all chemical-disease pairs
    holdout_p = all_pairs(holdout)
    train_p = all_pairs(train)

    # every training combination that is not a known training row is
    # assumed to be a negative example
    train_assumed_false = train_p - df_to_pairs(train)

    # assumed negatives that are also holdout combinations must not be
    # shown to the model during training
    overlap = holdout_p & train_assumed_false

    print("Number of overlaps for holdout set {}: {}".format(withheld, len(overlap)))

    train_assumed_false -= overlap

    # check that no negative training examples are in the holdout set
    assert train_assumed_false.isdisjoint(holdout_p)

    # check that there is no pharmacotherapydb overlap
    assert df_to_pairs(train).isdisjoint(df_to_pairs(holdout))

    # there will be some training examples which are potential candidates of the holdout
    # NOTE(review): such pairs stay in train_df as positives and also appear
    # in holdout_df with missing annotation columns - confirm this is intended.

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the equivalent that works on old and new versions alike.
    train_df = pd.concat([train, pair_to_df(train_assumed_false)])

    holdout_df = pair_to_df(holdout_p).merge(holdout, how = "left", on = ["drugbank_id", "orphanet_id"])

    return (holdout_df, train_df)

In [19]:
# Per-piece results, keyed by the piece number that was withheld.
holdouts, training = {}, {}

for withheld in range(K):
    hold, train = split_data(withheld)

    # Order the rows by drug, then disease.
    holdouts[withheld] = hold.sort_values(by = ["drugbank_id", "orphanet_id"])
    training[withheld] = train.sort_values(by = ["drugbank_id", "orphanet_id"])

    print("-----")

Number of overlaps for holdout set 0: 31917
-----
Number of overlaps for holdout set 1: 31960
-----
Number of overlaps for holdout set 2: 29423
-----


## Sort the data so that the data is stable

In [20]:
# Order the labelled rows by disease, then drug. Note that the per-piece
# tables built earlier are sorted by drug first, then disease.
goldstd = goldstd.sort_values(["orphanet_id", "drugbank_id"])

## Save split pieces to file

goldstd.to_csv("data/split_indications/labeled_pharmacotherapydb.tsv", sep = '\t', index = False)

In [21]:
# Write each piece's holdout table, then its training table, to
# data/<kind>/<kind>_piece<idx>.tsv as tab-separated text without the index.
for idx in range(K):
    for fname, pieces in (("holdout", holdouts), ("training", training)):
        pieces[idx].to_csv("data/{}/{}_piece{}.tsv".format(fname, fname, idx), sep = '\t', index = False)