# Split the gold standard and build the network

In [1]:
import pandas as pd
import numpy as np

from itertools import product

In [2]:
# Seed NumPy's global RNG so the random fold assignment (np.random.randint)
# gives the same split on every Restart & Run All.
np.random.seed(20171113)

## Read cleaned gold standard

In [3]:
# Load the gold standard from a tab-separated file into a DataFrame.
gold = pd.read_csv("data/filtered_goldstd.tsv", sep='\t')

In [4]:
# Sanity check: (number of rows, number of columns) of the loaded table.
gold.shape

(6329, 5)

In [5]:
# Preview the first rows to confirm the expected columns are present.
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
1,C0520442,acetyldigitoxin,C0018802,Congestive heart failure,TREATS_CDtDO
2,C0771809,acexamic acid,C0037299,Skin ulcer,TREATS_CDtDO
3,C0050558,acipimox,C0020476,Hyperlipoproteinemia,TREATS_CDtDO
4,C0021735,interferon alfa-2b,C0023434,chronic lymphocytic leukemia,TREATS_CDtDO


---

## Split gold standard into K pieces

In [10]:
# Number of folds the gold standard is divided into.
K = 5
# Tag every gold row with a uniformly random fold label in [0, K).
gold["piece"] = np.random.randint(low=0, high=K, size=gold.shape[0])

In [11]:
def all_pairs(df):
    """Return every (chemical_id, disease_id) combination present in ``df``.

    The result is the Cartesian product of the distinct chemical ids and the
    distinct disease ids, as a set of 2-tuples.
    """
    chemicals = df["chemical_id"].unique()
    diseases = df["disease_id"].unique()

    return {(chemical, disease) for chemical in chemicals for disease in diseases}

def pair_to_df(pairs):
    """Turn an iterable of (chemical_id, disease_id) tuples into a DataFrame.

    The resulting frame has exactly two columns, ``chemical_id`` and
    ``disease_id``, with one row per pair.
    """
    column_names = ["chemical_id", "disease_id"]
    rows = list(pairs)
    return pd.DataFrame(rows, columns=column_names)

def df_to_pairs(df):
    """Collect the distinct (chemical_id, disease_id) tuples found in ``df``."""
    chemicals = df["chemical_id"]
    diseases = df["disease_id"]
    return {(chemical, disease) for chemical, disease in zip(chemicals, diseases)}

In [12]:
def split_data(withheld):
    """Build the holdout and training sets for one fold.

    Rows of the global ``gold`` frame whose ``piece`` equals ``withheld`` are
    the holdout positives; all remaining rows are the training positives.
    Each side is then extended with "assumed false" pairs: chemical x disease
    combinations of that side's own ids that are not known gold pairs.

    Args:
        withheld: Value of the ``piece`` column to hold out.

    Returns:
        A ``(holdout_final, train_final)`` tuple of DataFrames. The added
        assumed-false rows only carry ``chemical_id`` and ``disease_id``;
        their other columns are left missing by the concatenation.

    Raises:
        AssertionError: If any (chemical_id, disease_id) pair appears in both
            returned frames.
    """
    holdout = gold.query("piece == @withheld")
    train = gold.query("piece != @withheld")

    # Holdout negatives: combinations of holdout ids that are not a known
    # positive in either the holdout or the training rows.
    holdout_assumed_false = all_pairs(holdout) - df_to_pairs(holdout) - df_to_pairs(train)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    # (original index labels kept, columns not re-sorted).
    holdout_final = pd.concat([holdout, pair_to_df(holdout_assumed_false)])

    # Training negatives: combinations of training ids that are neither a
    # training positive nor any pair (positive or assumed false) already
    # placed in the holdout, so the two sets cannot share a pair.
    train_assumed_false = (
        all_pairs(train)
        - df_to_pairs(train)
        - df_to_pairs(holdout_final)
    )
    train_final = pd.concat([train, pair_to_df(train_assumed_false)])

    assert df_to_pairs(train_final).isdisjoint(df_to_pairs(holdout_final))
    return (holdout_final, train_final)

In [13]:
# Fold index -> finished holdout / training frame for that fold.
holdouts, training = {}, {}

for withheld in range(K):
    hold, train = split_data(withheld)

    # Order rows by chemical then disease before storing each frame.
    holdouts[withheld] = hold.sort_values(by=["chemical_id", "disease_id"])
    training[withheld] = train.sort_values(by=["chemical_id", "disease_id"])

## Save training and holdout data to file

In [14]:
# Write each fold's holdout frame, then its training frame, as a TSV without
# the index. `fname` is both the sub-directory and the file-name prefix.
for idx in range(K):
    for fname, pieces in (("holdout", holdouts), ("training", training)):
        out_path = "data/{}/{}_piece{}.tsv".format(fname, fname, idx)
        pieces[idx].to_csv(out_path, sep='\t', index=False)