# Compute the prior probability of treatment using permutation 

In [1]:
import itertools
import statistics

import pandas

from hetio.permute import permute_pair_list

In [2]:
from tqdm import tqdm

## Read treatments

In [3]:
treatment_df = pandas.read_table('../summary/indications.tsv')
treatment_df = treatment_df.query("rel_type == 'TREATS_CtD'")
treatment_df.head(2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type
0,DB01048,Abacavir,DOID:635,acquired immunodeficiency syndrome,TREATS_CtD
1,DB05812,Abiraterone,DOID:10283,prostate cancer,TREATS_CtD


In [4]:
treatment_df.shape

(592, 5)

In [5]:
# Create node to degree dictionaries
compound_to_degree = dict(treatment_df.compound_id.value_counts())
disease_to_degree = dict(treatment_df.disease_id.value_counts())

In [6]:
# A degree (compound_degree, disease_degree) to all potential edges with that degree
degree_to_edges = dict()

rows = list()
for (c, c_deg), (d, d_deg) in itertools.product(compound_to_degree.items(), disease_to_degree.items()):
    rows.append((c, d, c_deg, d_deg))
    degree = c_deg, d_deg
    edge = c, d
    degree_to_edges.setdefault(degree, set()).add(edge)

pair_df = pandas.DataFrame(rows, columns=['compound_id', 'disease_id', 'compound_treats', 'disease_treats'])
pair_df = pair_df.sort_values(['compound_id', 'disease_id'])

In [7]:
pair_df.shape

(23579, 4)

In [8]:
pair_df.head()

Unnamed: 0,compound_id,disease_id,compound_treats,disease_treats
13401,DB00014,DOID:0050741,1,3
13376,DB00014,DOID:0050742,1,1
13367,DB00014,DOID:0060073,1,9
13430,DB00014,DOID:1024,1,2
13370,DB00014,DOID:10283,1,15


Not sure whether to filter this pair_df down to just the relation pairs which are found in the training set. Will leave it as is at the moment and return to determine if this contaminates the data at all.

In [9]:
treatments = list(zip(treatment_df.compound_id, treatment_df.disease_id))

In [10]:
# Burn In
pair_list, stats = permute_pair_list(treatments, multiplier=10)
pandas.DataFrame(stats)

Unnamed: 0,attempts,complete,cumulative_attempts,duplicate,excluded,same_edge,self_loop,unchanged,undirected_duplicate
0,593,0.100169,592,0.165261,0,0.001686,0,0.221284,0
1,592,0.200169,1184,0.172297,0,0.003378,0,0.103041,0
2,592,0.300169,1776,0.204392,0,0.001689,0,0.101351,0
3,592,0.400169,2368,0.180743,0,0.001689,0,0.103041,0
4,592,0.500169,2960,0.160473,0,0.001689,0,0.10473,0
5,592,0.600169,3552,0.168919,0,0.0,0,0.113176,0
6,592,0.700169,4144,0.172297,0,0.003378,0,0.118243,0
7,592,0.800169,4736,0.175676,0,0.001689,0,0.118243,0
8,592,0.900169,5328,0.165541,0,0.001689,0,0.106419,0
9,591,1.0,5919,0.194585,0,0.001692,0,0.114865,0


In [11]:
# Set the multiplier based on the burn in stats
multiplier = 3

In [12]:
# Calculate the number of perms
n_perm = treatment_df.compound_id.nunique() * treatment_df.disease_id.nunique()
n_perm = int(n_perm * 25)
n_perm

589475

In [13]:
%%time

# Initialize a dictionary of degree to empirical probability list
degree_to_probs = {x: list() for x in degree_to_edges}

# Perform n_perm permutations
for i in tqdm(range(n_perm)):
    # Permute
    pair_list, stats = permute_pair_list(pair_list, multiplier=multiplier, seed=i)
    
    # Update
    pair_set = set(pair_list)
    
    # modifies the original degree_to_probs dictionary
    
    for degree, probs in degree_to_probs.items():
        edges = degree_to_edges[degree]
        probs.append(len(edges & pair_set) / len(edges))

100%|██████████| 589475/589475 [1:43:07<00:00, 95.27it/s]

CPU times: user 1h 43min 10s, sys: 15.2 s, total: 1h 43min 25s
Wall time: 1h 43min 7s





In [14]:
%%time
rows = list()
for (c_deg, d_deg), probs in tqdm(degree_to_probs.items()):
    mean = statistics.mean(probs)
    std_error = statistics.stdev(probs) / len(probs) ** 0.5
    rows.append((c_deg, d_deg, mean, std_error))
perm_df = pandas.DataFrame(rows, columns=['compound_treats', 'disease_treats', 'prior_perm', 'prior_perm_stderr'])
perm_df = perm_df.sort_values(['compound_treats', 'disease_treats'])

100%|██████████| 273/273 [19:35<00:00,  4.29s/it]

CPU times: user 19min 29s, sys: 6.2 s, total: 19min 35s
Wall time: 19min 35s





In [15]:
# Add unpermuted treatment prevalence columns
rows = list()
treatment_set = set(treatments)
for (c_deg, d_deg), edges in degree_to_edges.items():
    n_treatments = len(edges & treatment_set)
    rows.append((c_deg, d_deg, n_treatments, len(edges)))
degree_prior_df = pandas.DataFrame(rows, columns=['compound_treats', 'disease_treats', 'n_treatments', 'n_possible'])
degree_prior_df = perm_df.merge(degree_prior_df)
degree_prior_df = degree_prior_df.sort_values(['compound_treats', 'disease_treats'])

In [16]:
degree_prior_df.tail(2)

Unnamed: 0,compound_treats,disease_treats,prior_perm,prior_perm_stderr,n_treatments,n_possible
271,17,39,0.688496,0.000603,1,1
272,17,59,0.784715,0.000535,1,1


In [17]:
degree_prior_df.to_csv('data/degree-prior.tsv', sep='\t', index=False, float_format='%.6g')

In [18]:
obs_prior_df = pair_df.merge(perm_df)

In [19]:
obs_prior_df.head(2)

Unnamed: 0,compound_id,disease_id,compound_treats,disease_treats,prior_perm,prior_perm_stderr
0,DB00014,DOID:0050741,1,3,0.004618,2e-06
1,DB00014,DOID:11934,1,3,0.004618,2e-06


In [20]:
len(obs_prior_df)

23579

In [21]:
obs_prior_df.to_csv('data/observation-prior.tsv', sep='\t', index=False, float_format='%.6g')