In [148]:
import datetime as dt
import pickle
from importlib import reload

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm import tqdm_notebook

import pdaactconn as pc
from trialexplorer import AACTStudySet
from trialexplorer import studysimilarity as ssim
from trialexplorer.mesh_terms import MeSHCatalog

%matplotlib inline

tqdm.pandas()

In [2]:
# Build a study set over the local AACT connection and restrict it to
# interventional studies before loading.
conn = pc.AACTConnection(source=pc.AACTConnection.LOCAL)
ss = AACTStudySet.AACTStudySet(
    conn=conn,
    tqdm_handler=tqdm_notebook,
)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

255092 studies loaded!


In [3]:
# Register the 'conditions' dimension on the study set and refresh its data
# (this is the only dimension added in this cell).
ss.add_dimensions('conditions')
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['conditions']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))

Syncing the temp table temp_cur_studies in 511 chunks x 500 records each

Creating index on the temp table
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index


In [4]:
# Keep a handle on the 'conditions' dimension and preview the first rows of its
# data (per the output below: indexed by nct_id, columns id / name / downcase_name).
c = ss.dimensions['conditions']
c.data.head()

Unnamed: 0_level_0,id,name,downcase_name
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,10058348,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia
NCT00000106,10057277,Rheumatic Diseases,rheumatic diseases
NCT00000108,10057274,Cardiovascular Diseases,cardiovascular diseases
NCT00000108,10057275,Coronary Disease,coronary disease
NCT00000110,10057273,Obesity,obesity


In [21]:
# Distinct lower-cased condition names found in the conditions dimension.
all_conds = c.data['downcase_name'].unique()

# Extraction results, each keyed by the lower-cased condition name.
extracted_adj_vb, extracted_stages = {}, {}

# Single pass so both extractions share one progress bar.
for cond_name in tqdm(all_conds):
    extracted_adj_vb[cond_name] = ssim.extract_adj_and_vb(cond_name)
    extracted_stages[cond_name] = ssim.full_extract(cond_name)

100%|██████████| 64114/64114 [00:20<00:00, 3107.12it/s]


# Stage Distances:

In [134]:
reload(ssim)  # re-import the similarity module so edits made during development are picked up

# Only a subset of the data matters here: conditions whose name contains one of
# the stage-like qualifiers below.
# NOTE: 'genotype' is already matched by the 'type' substring; it is listed only
# to keep the set of qualifiers explicit.
STAGE_KEYWORDS = ('stage', 'type', 'grade', 'genotype', 'ajcc', 'hepatitis')
typed_conds = [x for x in all_conds if any(kw in x for kw in STAGE_KEYWORDS)]

# Fixed seed so the sampled conditions (and therefore the printed distances)
# are reproducible under Restart & Run All. Sampling is with replacement, as
# np.random.choice did by default.
stage_rng = np.random.default_rng(seed=42)
test_conds = stage_rng.choice(typed_conds, 20)

# compare everything vs the first selected
ref_cond = test_conds[0]
ref_extracted = extracted_stages[ref_cond]  # loop-invariant, looked up once

for cur_cond in test_conds:
    cur_extracted = extracted_stages[cur_cond]
    print("Computing the distance between select 2 stage extractions")
    print(cur_cond)
    print("extracted:", cur_extracted)
    print(ref_cond)
    print("extracted:", ref_extracted)
    print(ssim.stage_sim_dist(cur_extracted, ref_extracted))
    print('')

Computing the distance between select 2 stage extractions
stage iib colorectal cancer
extracted: {'type': [], 'genotyp_': [], 'grade': [], 'stage': ['2b'], 'ajcc': [], 'hepatitis': []}
stage iib colorectal cancer
extracted: {'type': [], 'genotyp_': [], 'grade': [], 'stage': ['2b'], 'ajcc': [], 'hepatitis': []}
0.0

Computing the distance between select 2 stage extractions
locally advanced or metastatic non-clear cell type renal cell carcinoma
extracted: None
stage iib colorectal cancer
extracted: {'type': [], 'genotyp_': [], 'grade': [], 'stage': ['2b'], 'ajcc': [], 'hepatitis': []}
10.0

Computing the distance between select 2 stage extractions
stage iv uterine corpus cancer ajcc v8
extracted: {'type': [], 'genotyp_': [], 'grade': [], 'stage': ['4'], 'ajcc': ['v8'], 'hepatitis': []}
stage iib colorectal cancer
extracted: {'type': [], 'genotyp_': [], 'grade': [], 'stage': ['2b'], 'ajcc': [], 'hepatitis': []}
10.0

Computing the distance between select 2 stage extractions
hepatitis, aut

# Adjective and Verb Distances

In [145]:
reload(ssim)
# Sanity check: call the list Jaccard similarity helper on two empty lists
# (the recorded output for this call is 1.0).
ssim.list_jaccard_sim([], [])

1.0

In [147]:
reload(ssim)  # re-import the similarity module so edits made during development are picked up

# Fixed seed so the sampled conditions (and therefore the printed distances)
# are reproducible under Restart & Run All. Sampling is with replacement, as
# np.random.choice did by default.
adj_vb_rng = np.random.default_rng(seed=42)
test_conds = adj_vb_rng.choice(all_conds, 20)

# compare everything vs the first selected
ref_cond = test_conds[0]
ref_extracted = extracted_adj_vb[ref_cond]  # loop-invariant, looked up once

for cur_cond in test_conds:
    cur_extracted = extracted_adj_vb[cur_cond]
    print(cur_cond)
    print("extracted:", cur_extracted)
    print(ref_cond)
    print("extracted:", ref_extracted)
    # Recorded outputs show a 2-tuple result, e.g. (1.0, 0.0).
    print(ssim.adj_and_vb_dist(cur_extracted, ref_extracted))
    print()

verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
(0.0, 0.0)

lumbar radiculitis
extracted: ([], [])
verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
(1.0, 0.0)

locoregional analgesia in breast surgery
extracted: (['locoregional'], [])
verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
(1.0, 0.0)

biliary neoplasms
extracted: (['biliary'], [])
verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
(1.0, 0.0)

impact of different medications on biomarkers of idiopathic rhinitis
extracted: (['different', 'idiopathic'], [])
verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
(1.0, 0.0)

abdominal wall relaxation after rectus sheath block
extracted: (['abdominal', 'rectus'], [])
verrucous oral leukoplakia
extracted: (['verrucous', 'oral'], [])
(1.0, 0.0)

treatments of striae gravidarum
extracted: ([], [])
verrucous oral leukoplakia
extracted: (['verru