## Featurizing Conditions of the Studies:

How can we determine whether two studies are studying the same condition?
- condition names (Levenshtein distance)
- Bing search results for the condition (bag of words)
- MeSH terms (Jaccard distance)
- MeSH tree location (tree distance)
- adjective descriptors (such as "chronic")
- type, grade, stage, AJCC (type1, type2) etc.


In [48]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from tqdm import tqdm
import datetime as dt
import pickle
from collections import Counter

import nltk

import pdaactconn as pc
from trialexplorer.mesh_terms import MeSHCatalog
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Enable tqdm's pandas integration so that progress-bar variants of the
# pandas apply methods (e.g. `progress_apply`) become available.
tqdm.pandas()

In [3]:
# Select all interventional studies.
# Open a connection using the REMOTE source constant of AACTConnection.
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
# Build the study set on that connection; tqdm_notebook is handed over as the
# progress-bar handler.
ss = AACTStudySet.AACTStudySet(conn= conn, tqdm_handler=tqdm_notebook)
# Restrict the set to interventional studies before loading anything.
ss.add_constraint("study_type = 'Interventional'")
# Load the studies matching the constraint (the recorded run reported
# 255092 studies).
ss.load_studies()

255092 studies loaded!


In [4]:
# Register the two condition-related dimensions on the study set, then load
# their data. In the recorded run, `refresh_dim_data()` is the step that
# syncs the temp table and loads/sorts each dimension.
ss.add_dimensions('browse_conditions')
ss.add_dimensions('conditions')
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['browse_conditions']
Failed to add these 0 dimensions: []
Successfuly added these 1 dimensions: ['conditions']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))

Syncing the temp table temp_cur_studies in 511 chunks x 500 records each

Creating index on the temp table
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index


# 1. MeSH Terms

In [5]:
# Initializing the MeSH catalog object. In the recorded run, construction
# parses the MeSH descriptor XML (xml/desc2020.xml); `mc` is later passed to
# `mesh_tree_dist` for term-to-term distances.
mc = MeSHCatalog()  

Parsing MeSH xml: xml/desc2020.xml ...
Parse Complete! (parsed ElementTree root can be found in the .root attribute)


In [50]:
# Shorthand handles for the two loaded dimensions; each exposes its frame
# through `.data`.
# bc: MeSH-tagged conditions (has a `mesh_term` column), c: the `conditions`
# dimension.
bc = ss.dimensions['browse_conditions']
c = ss.dimensions['conditions']

In [7]:
# Preview the browse_conditions frame. In the recorded output it is indexed
# by nct_id, with one row per (study, MeSH term) pair.
bc.data.head()

Unnamed: 0_level_0,id,mesh_term,downcase_mesh_term
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,10563497,"Adrenal Hyperplasia, Congenital","adrenal hyperplasia, congenital"
NCT00000102,10563498,Adrenogenital Syndrome,adrenogenital syndrome
NCT00000102,10563499,Adrenocortical Hyperfunction,adrenocortical hyperfunction
NCT00000102,10563500,Hyperplasia,hyperplasia
NCT00000106,10562244,Rheumatic Diseases,rheumatic diseases


In [8]:
# Size of the MeSH vocabulary actually used by the loaded studies.
# dropna=False keeps a missing value (if any) counted as its own entry.
bc.data['mesh_term'].nunique(dropna=False)

3745

In [11]:
# Number of rows per MeSH term, ordered from the most to the least frequently
# tagged term. The resulting order is what the term <-> index mappings are
# built from.
s_mesh = bc.data.groupby('mesh_term').size().sort_values(ascending=False)

### Assign an index value to each mesh_term in the vocab

In [18]:
# Two-way lookup between a MeSH term and its position in s_mesh
# (position 0 is the first entry of s_mesh, i.e. the most frequent term).
mesh2idx = {term: position for position, term in enumerate(s_mesh.index)}
idx2mesh = dict(enumerate(s_mesh.index))

### Compute the Jaccard index between studies' MeSH term sets (a similarity, despite the `dist` naming: identical term sets score 1.0)

In [65]:
def get_mesh_terms(nctid1, nctid2, data):
    """Look up the distinct MeSH terms tagged on each of two studies.

    Parameters
    ----------
    nctid1, nctid2 : study identifiers, looked up in ``data.index``.
    data : DataFrame indexed by study id with a ``mesh_term`` column.

    Returns
    -------
    tuple of (list, list)
        Unique ``mesh_term`` values for ``nctid1`` and ``nctid2`` in that
        order. A study that is absent from the index yields an empty list.
    """
    def _terms_for(nct_id):
        # Studies without any row in `data` simply have no terms.
        if nct_id in data.index:
            # The list-style selector keeps the result a DataFrame even when
            # the study has only a single row.
            study_rows = data.loc[[nct_id]]
            return list(study_rows['mesh_term'].unique())
        return []

    return _terms_for(nctid1), _terms_for(nctid2)

In [63]:
def list_jaccard_dist(l1, l2):
    """Return the Jaccard index of two collections of terms.

    The value is ``len(intersection) / len(union)`` of the distinct terms in
    ``l1`` and ``l2``. Despite the ``dist`` in the name this is a similarity:
    1.0 means both collections hold exactly the same terms, 0.0 means they
    share none.

    Duplicate entries in either input are ignored, so the result is symmetric
    in its arguments. (The previous list-based version only de-duplicated
    ``l2``, which skewed the score whenever ``l1`` contained repeats.)

    Parameters
    ----------
    l1, l2 : iterable of hashable
        The terms to compare (e.g. lists of MeSH term strings).

    Returns
    -------
    float
        The Jaccard index in [0, 1], or ``-1.`` when both inputs are empty
        and the index is therefore undefined.
    """
    terms1, terms2 = set(l1), set(l2)
    union = terms1 | terms2

    # Nothing to compare: keep the established sentinel instead of dividing
    # by zero.
    if not union:
        return -1.

    return len(terms1 & terms2) / len(union)

def mesh_jaccard_dist(nctid1, nctid2, data):
    """Score two studies by the overlap of their MeSH terms.

    Fetches the MeSH term lists of both studies from ``data`` via
    ``get_mesh_terms`` and returns whatever ``list_jaccard_dist`` computes
    for that pair of lists.
    """
    return list_jaccard_dist(*get_mesh_terms(nctid1, nctid2, data))

In [64]:
# Spot check on a single pair of studies: the result is the share of their
# combined MeSH terms that both studies have in common.
mesh_jaccard_dist('NCT00000102', 'NCT03323658', bc.data)

0.09090909090909091

### Computing the Jaccard score for one study against all the others takes approximately 3.5 minutes

In [31]:
# Score the reference study NCT00000102 against every study id present in the
# browse_conditions index (the reference study itself included).
# Result: {nct_id: score}. The recorded run covered 211869 ids in ~3.5 min.
jaccard_dist = {}
for cur_nct in tqdm(list(bc.data.index.unique())):
    jaccard_dist[cur_nct] = mesh_jaccard_dist('NCT00000102', cur_nct, bc.data)

100%|██████████| 211869/211869 [03:32<00:00, 999.23it/s] 


In [38]:
# One row per study id, with the score in a single 'jdist' column.
dfjac = pd.Series(jaccard_dist, name='jdist').to_frame()

In [39]:
# Keep only studies with a strictly positive score (i.e. at least one shared
# MeSH term) and list the best matches first.
(
    dfjac
    .query('jdist > 0')
    .sort_values(by='jdist', ascending=False)
)

Unnamed: 0,jdist
NCT00000102,1.000000
NCT00519818,1.000000
NCT00621985,1.000000
NCT00151710,1.000000
NCT01771328,1.000000
...,...
NCT03868475,0.090909
NCT03323658,0.090909
NCT02928978,0.083333
NCT02109224,0.037037


## Computing the min, max, and mean tree distance between the tagged MeSH terms

In [45]:
def mesh_tree_dist(nctid1, nctid2, data, mc):
    """Summarise the MeSH tree distances between the terms of two studies.

    Every MeSH term of the first study is paired with every MeSH term of the
    second study, and ``mc.shortest_mesh_dist`` is evaluated for each pair.

    Parameters
    ----------
    nctid1, nctid2 : study identifiers, looked up in ``data.index``.
    data : DataFrame indexed by study id with a ``mesh_term`` column.
    mc : object exposing ``shortest_mesh_dist(term1, term2)``.

    Returns
    -------
    tuple
        ``(min, max, mean)`` over all pairwise distances. When either study
        has no MeSH terms there are no pairs to measure, and the tuple
        ``(-1., -1., -1.)`` is returned. This is the same "undefined"
        sentinel that ``list_jaccard_dist`` uses, and it replaces the
        ``ValueError`` that ``min()`` would raise on an empty list.
    """
    s1terms, s2terms = get_mesh_terms(nctid1, nctid2, data)

    # Distances for the full cross product of the two term lists.
    all_dist = [
        mc.shortest_mesh_dist(t1, t2)
        for t1 in s1terms
        for t2 in s2terms
    ]

    # A study missing from `data` yields an empty term list, hence no pairs.
    if not all_dist:
        return -1., -1., -1.

    return min(all_dist), max(all_dist), np.mean(all_dist)

In [46]:
# Spot check on the same pair of studies used for the Jaccard example;
# the result is a (min, max, mean) tuple of pairwise MeSH tree distances.
mesh_tree_dist('NCT00000102', 'NCT03323658', bc.data, mc)

(0, 9, 7.0625)