pipeline

read cleaned edges
read cleaned gold std

generate list of nodes from edge list
check that gold std is not in edges

for K iterations
    subselect gold std as holdout
    put gold std in network
    
    make adjlist and nodemap
    run deepwalk
    build model
    get results and save to file
    
generate roc curve from results file

In [1]:
import pandas as pd
import numpy as np

from itertools import product
from itertools import chain

from tqdm import tqdm
from collections import defaultdict

In [2]:
# Seed NumPy's global random number generator with a fixed value.
# The DataFrame.sample calls further down pass no random_state of their own.
np.random.seed(871365713)

## Read data

In [3]:
# Load the cleaned, tab-separated input tables.
# nodes: its "node_uid" and "node_id" columns are used by add_uids.
nodes = pd.read_csv("data/clean/nodes.tsv", sep='\t')
# edges: its "source_uid" and "target_uid" columns are used by build_adjlist.
edges = pd.read_csv("data/clean/edges.tsv", sep='\t')
# gold: gold-standard pairs keyed by "chemical_id" and "disease_id".
gold = pd.read_csv("data/clean/gold.tsv", sep='\t')

## Helpers for converting between pair sets and dataframes (the number of cross-validation folds, K, is set in the main loop below)

In [4]:
def all_pairs(df):
    """Return every (chemical, disease) combination of the ids found in df.

    The distinct values of the "chemical_id" column are crossed with the
    distinct values of the "disease_id" column; the result is a set of
    2-tuples (chemical_id, disease_id).
    """
    chemicals = df["chemical_id"].unique()
    diseases = df["disease_id"].unique()

    return {(chemical, disease) for chemical in chemicals for disease in diseases}

def pair_to_df(pairs):
    """Turn an iterable of (chemical_id, disease_id) pairs into a dataframe.

    The result has one row per pair and exactly two columns,
    "chemical_id" and "disease_id".
    """
    rows = [*pairs]
    return pd.DataFrame(rows, columns=["chemical_id", "disease_id"])

def df_to_pairs(df):
    """Collect the (chemical_id, disease_id) pairs of df into a set.

    Rows are read in order from the "chemical_id" and "disease_id"
    columns; duplicate pairs collapse into a single set element.
    """
    chemicals = df["chemical_id"]
    diseases = df["disease_id"]
    return {(chemical, disease) for chemical, disease in zip(chemicals, diseases)}

In [5]:
# Set of every (chemical_id, disease_id) pair present in the gold standard.
all_gpairs = df_to_pairs(gold)
# Fraction of gold-standard rows that split_data draws as the holdout set.
holdout_ratio = 0.2

def split_data():
    """Split the gold standard into a holdout set and a training set.

    A random fraction (holdout_ratio) of the gold rows becomes the holdout
    set; the remaining gold pairs become the training set. Each set is then
    padded with "assumed false" pairs: every chemical x disease combination
    of the ids occurring in that set which is not already claimed by either
    set. The padded rows carry only "chemical_id" and "disease_id"; every
    other column is left empty (NaN) for them.

    Reads the module-level gold, all_gpairs and holdout_ratio.

    Returns:
        A tuple (holdout_final, train_final) of dataframes whose
        (chemical_id, disease_id) pairs are disjoint.

    Raises:
        AssertionError: if the two final sets share a pair.
    """

    # No random_state is passed, so the draw depends on pandas' default
    # random source.
    holdout = gold.sample(frac = holdout_ratio)

    tpairs = all_gpairs - df_to_pairs(holdout)

    # The left merge restores the remaining gold columns for the training pairs.
    train = (pair_to_df(tpairs)
        .merge(gold, how="left", on=["chemical_id", "disease_id"])
    )

    holdout_assumed_false = all_pairs(holdout) - df_to_pairs(holdout) - df_to_pairs(train)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    holdout_final = pd.concat([holdout, pair_to_df(holdout_assumed_false)])

    #---------------------

    # Training negatives must also avoid every pair already in the holdout set
    # (including the holdout's own assumed negatives).
    train_assumed_false = (all_pairs(train) - df_to_pairs(train)
                           - df_to_pairs(holdout_final)
    )
    train_final = pd.concat([train, pair_to_df(train_assumed_false)])

    assert df_to_pairs(train_final).isdisjoint(df_to_pairs(holdout_final))
    return (holdout_final, train_final)

In [6]:
def clean_df(df):
    """Reduce df to the pair columns plus a numeric edge-type label.

    Keeps only "chemical_id", "disease_id" and "etype", fills empty cells
    with 0, and replaces the value "TREATS_CDtDO" with 1.
    """
    wanted = ["chemical_id", "disease_id", "etype"]

    selected = df[wanted]
    filled = selected.fillna(0)
    return filled.replace({"TREATS_CDtDO": 1})

In [7]:
def subsample(df, M=4):
    """Subsample the training data to remove the vast majority of
    negative training examples.

    Every row with etype == 1 is kept. Rows with etype == 0 are randomly
    sampled so that there are M negatives per positive, or all of the
    negatives if fewer than that are available.

    Args:
        df: dataframe with an "etype" column holding 1 (positive) or
            0 (negative).
        M: number of negative rows to keep per positive row.

    Returns:
        A new dataframe with the positives first, followed by the sampled
        negatives, with a fresh 0..n-1 index.
    """

    positives = df.query("etype == 1")
    negatives = df.query("etype == 0")

    # sample() raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the available negatives.
    n_negatives = min(len(positives) * M, len(negatives))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return (pd
        .concat([positives, negatives.sample(n_negatives)])
        .reset_index(drop=True)
    )

In [8]:
def add_uids(df):
    """Attach the node uid of each chemical and disease to df.

    df is inner-joined against the module-level nodes table twice: once on
    "chemical_id" and once on "disease_id", matching nodes' "node_id" each
    time. The helper "node_id" columns are dropped again and the two uid
    columns are named "chemical_uid" and "disease_uid". Rows whose id has no
    match in nodes are lost because both joins are inner joins.
    """
    lookup = nodes[["node_uid", "node_id"]]

    with_chemical = df.merge(
        lookup, how="inner", left_on="chemical_id", right_on="node_id"
    )
    # The second merge makes pandas suffix the clashing lookup columns
    # with _x (chemical side) and _y (disease side).
    with_both = with_chemical.merge(
        lookup, how="inner", left_on="disease_id", right_on="node_id"
    )

    without_ids = with_both.drop(["node_id_x", "node_id_y"], axis=1)
    return without_ids.rename(columns={
        "node_uid_x": "chemical_uid",
        "node_uid_y": "disease_uid"
    })

In [9]:
def build_adjlist(train, edges, path="data/temp/adjlist.txt"):
    """Write the network plus positive training pairs as an adjacency list.

    Every positive training row (etype == 1) and every row of edges
    contributes one undirected link: each endpoint is recorded as a
    neighbour of the other. The result is written one node per line as
    "<node> <neighbour> <neighbour> ...", neighbours in sorted order.

    Args:
        train: dataframe with "etype", "chemical_uid" and "disease_uid"
            columns.
        edges: dataframe with "source_uid" and "target_uid" columns.
        path: file to write the adjacency list to.
    """

    pos = train.query("etype == 1")

    adjlist = defaultdict(set)

    # Positive training pairs come first, then the network edges.
    sources = chain(pos["chemical_uid"], edges["source_uid"])
    targets = chain(pos["disease_uid"], edges["target_uid"])

    for suid, tuid in tqdm(
        zip(sources, targets),
        desc="Building",
        total=len(pos) + len(edges)
    ):
        # Store both directions so the graph is undirected.
        adjlist[suid].add(tuid)
        adjlist[tuid].add(suid)

    # write to file
    # (the original called .format(i) on this path, which read a global
    # loop variable and had no placeholder to fill)
    with open(path, "w") as fout:
        for key, vals in tqdm(adjlist.items(), desc="Saving"):
            neighbours = " ".join(map(str, sorted(vals)))

            fout.write("{} {}\n".format(key, neighbours))

## Meat of the program

In [10]:
# Number of folds to run.
K = 1
for i in range(K):
    # Draw a fresh holdout/train split of the gold standard.
    holdout, train = split_data()

    # Holdout: normalise the labels, then attach node uids.
    holdout = add_uids(clean_df(holdout))

    # Train: normalise the labels, thin out the negatives, then attach node uids.
    train = add_uids(subsample(clean_df(train)))

    # Save both sets; the file names are fixed, so each iteration
    # overwrites the files of the previous one.
    holdout.to_csv("data/temp/holdout.tsv", sep='\t', index=False)
    train.to_csv("data/temp/train.tsv", sep='\t', index=False)

    # Add the positive training pairs to the network and write the adjacency list.
    build_adjlist(train, edges)

    # run deepwalk here
    # done by hand for now
    
    

Building: 100%|██████████| 9651843/9651843 [00:10<00:00, 925426.46it/s]
Saving: 100%|██████████| 210389/210389 [00:07<00:00, 26398.24it/s]


run deepwalk by hand and time how long it takes

time deepwalk --input ~/walkpred/semmed/data/temp/adjlist.txt --output ~/walkpred/semmed/data/temp/embedding.txt --representation-size 128 --number-walks 50 --window-size 10 --workers 4


The above command took 61 minutes of wall-clock time, so running 100-fold validation would take far too long.

The embedding file is ~250 MB, so the embeddings are also too large to store all at once.