# Assign positives and negatives 

In [1]:
import itertools
import random
import sys
import json

import py2neo
import pandas

## Startup neo4j and connections

In [2]:
with open('servers.json') as read_file:
    instances = json.load(read_file)

for instance in instances:
    uri = 'http://localhost:{}/db/data/'.format(instance['port'])
    instance['py2neo'] = py2neo.Graph(uri)
    if instance['name'] == 'rephetio-v2.0':
        neo_unperm = instance['py2neo']

hetnets = [x['name'] for x in instances]
hetnets

['rephetio-v2.0',
 'rephetio-v2.0_perm-1',
 'rephetio-v2.0_perm-2',
 'rephetio-v2.0_perm-3',
 'rephetio-v2.0_perm-4',
 'rephetio-v2.0_perm-5']

## Create partitions

In [3]:
def to_df(record_list):
    """Convert a py2neo RecordList to a dataframe"""
    return pandas.DataFrame(record_list.records, columns = record_list.columns)

In [4]:
compound_df = pandas.read_table('../summary/compounds.tsv')
disease_df = pandas.read_table('../summary/diseases.tsv')

## All potential pairs

In [5]:
'{} compounds × {} diseases = {} pairs'.format(
    len(disease_df), len(compound_df), len(disease_df) * len(compound_df))

'134 compounds × 1393 diseases = 186662 pairs'

## Nonzero prior pairs

In [6]:
compound_df.query("treats > 0")["compound_id"].nunique()

323

In [7]:
disease_df.query("treats > 0")["disease_id"].nunique()

73

In [8]:
nonzero_prior_pairs = set(itertools.product(
    compound_df.query("treats > 0").compound_id,
    disease_df.query("treats > 0").disease_id)
)

In [9]:
len(nonzero_prior_pairs)

23579

---

## Read training data for cross validation

In [1]:
with open("../../crossval_idx.txt", "r") as fin:
    train_idx = int(fin.read().strip())

In [2]:
train_idx

0

In [10]:
train = pandas.read_csv("~/crossval/data/training/training_piece{}.tsv".format(train_idx), sep = '\t')

In [11]:
train.shape

(38982, 8)

In [12]:
def df_to_pairs(df):
    return set(zip(df["drugbank_id"], df["doid_id"]))    

In [13]:
train_pairs = df_to_pairs(train)

In [14]:
nonzero_prior_pairs <= train_pairs

False

There are some potential pairs which are present in the holdout set, so we will remove these from the potential negative training examples.

In [15]:
nonzero_prior_pairs &= train_pairs

In [16]:
len(nonzero_prior_pairs)

18403

In [17]:
nonzero_prior_pairs <= train_pairs

True

The data has now been sanitized so that no relations which appear in the holdout set are ever seen prior to making predictions.

---

## Split the data

In [18]:
indication_query = '''
MATCH (compound:Compound)-[treatment:TREATS_CtD]->(disease:Disease)
RETURN
  compound.identifier AS compound_id,
  disease.identifier AS disease_id
ORDER BY
  compound_id, disease_id
'''

def partition(neo):
    """
    Extract negative and positive compound-disease pairs from a py2neo.Graph.
    """
    
    # query hetnet for all positive drug-treat-disease examples
    # query hetnet for all potential drug-disease pairs
    # randomly sample 4x the number of non-positive examples for use as negative examples

    
    # Use TREATS_CtD as positives
    # different from training data because edges were permuted
    indication_df = to_df(neo.cypher.execute(indication_query))
    
    positives = set(zip(indication_df.compound_id, indication_df.disease_id))

    # Use nonzero-prior pairs excluding non-negatives as negatives
    negatives = nonzero_prior_pairs - positives
    
    assert negatives <= train_pairs, "Bad training data"
    
    
    negatives = random.sample(negatives, k=len(positives) * 4)
    rows = list()
    for status, pairs in (0, negatives), (1, positives):
        for drug, disease in pairs:
            rows.append((drug, disease, status))
    df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'status'])
    df = df.sort_values(['disease_id', 'status', 'compound_id'])
    return df

In [19]:
part_dfs = list()
for seed, instance in enumerate(instances):
    random.seed(seed, version=2)
    part_df = partition(instance['py2neo'])
    part_df.insert(0, 'hetnet', instance['name'])
    part_dfs.append(part_df)
part_df = pandas.concat(part_dfs)

In [20]:
part_df.head()

Unnamed: 0,hetnet,compound_id,disease_id,status
19,rephetio-v2.0,DB00178,DOID:0050741,0
686,rephetio-v2.0,DB00205,DOID:0050741,0
421,rephetio-v2.0,DB00214,DOID:0050741,0
847,rephetio-v2.0,DB00328,DOID:0050741,0
1355,rephetio-v2.0,DB00332,DOID:0050741,0


In [21]:
part_df.groupby("hetnet")["status"].value_counts()

hetnet                status
rephetio-v2.0         0         2368
                      1          592
rephetio-v2.0_perm-1  0         2368
                      1          592
rephetio-v2.0_perm-2  0         2368
                      1          592
rephetio-v2.0_perm-3  0         2368
                      1          592
rephetio-v2.0_perm-4  0         2368
                      1          592
rephetio-v2.0_perm-5  0         2368
                      1          592
dtype: int64

In [22]:
## Create an even-matrix for unpermuted observations
template_df = pandas.DataFrame(list(itertools.product(hetnets, compound_df.compound_id)), columns=['hetnet', 'compound_id'])
unperm_pair_df = part_df.query("hetnet == 'rephetio-v2.0'")[['compound_id', 'disease_id']]
unperm_pair_df = template_df.merge(unperm_pair_df)
unperm_pair_df = unperm_pair_df.merge(part_df, how='left')
unperm_pair_df = unperm_pair_df[unperm_pair_df.status.isnull()]
unperm_pair_df.status = unperm_pair_df.status.fillna(0).astype(int)
unperm_pair_df.head(2)

Unnamed: 0,hetnet,compound_id,disease_id,status
6,rephetio-v2.0_perm-1,DB01048,DOID:0060073,0
7,rephetio-v2.0_perm-1,DB01048,DOID:219,0


In [23]:
part_df['primary'] = 1
unperm_pair_df['primary'] = 0
full_part_df = pandas.concat([part_df, unperm_pair_df])
full_part_df = full_part_df.sort_values(['compound_id', 'disease_id', 'hetnet'])
full_part_df.head(5)

Unnamed: 0,hetnet,compound_id,disease_id,status,primary
231,rephetio-v2.0_perm-2,DB00014,DOID:0050742,0,1
1335,rephetio-v2.0_perm-5,DB00014,DOID:0050742,0,1
1000,rephetio-v2.0_perm-5,DB00014,DOID:0060073,0,1
2696,rephetio-v2.0,DB00014,DOID:10283,1,1
7648,rephetio-v2.0_perm-1,DB00014,DOID:10283,0,0


In [24]:
full_part_df.to_csv('data/partitions.tsv', sep='\t', index=False)

In [25]:
# Number of hetnet-compound-disease pairs
len(full_part_df)

30137

In [26]:
# Number of positives and negatives per purpose
pandas.crosstab(full_part_df.primary, full_part_df.status)

status,0,1
primary,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12377,0
1,14208,3552


In [27]:
# Number of positives and negatives per hetnet
pandas.crosstab(full_part_df.hetnet, full_part_df.status)

status,0,1
hetnet,Unnamed: 1_level_1,Unnamed: 2_level_1
rephetio-v2.0,2368,592
rephetio-v2.0_perm-1,4849,592
rephetio-v2.0_perm-2,4848,592
rephetio-v2.0_perm-3,4840,592
rephetio-v2.0_perm-4,4836,592
rephetio-v2.0_perm-5,4844,592
