In [60]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from tqdm import tqdm
import datetime as dt
import pickle
from collections import Counter

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import pdaactconn as pc
from trialexplorer.mesh_terms import MeSHCatalog
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Register tqdm with pandas so that `progress_apply` (used further down) is available
tqdm.pandas()

In [4]:
# Initializing the MeSH catalog object (per the output below, this parses xml/desc2020.xml)
mc = MeSHCatalog()  

Parsing MeSH xml: xml/desc2020.xml ...
Parse Complete! (parsed ElementTree root can be found in the .root attribute)


In [5]:
# selecting all interventional studies
# Open an AACT connection using the REMOTE source setting
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
# Study set bound to that connection; tqdm_notebook is handed over as the progress-bar handler
ss = AACTStudySet.AACTStudySet(conn= conn, tqdm_handler=tqdm_notebook)
# Constrain to interventional trials before loading, so only those studies are pulled
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

254139 studies loaded!


In [6]:
# loading all dimensional data
# Register the two condition-related dimensions, then refresh so their data is
# loaded for the studies currently in the set (see the log output below).
ss.add_dimensions('browse_conditions')
ss.add_dimensions('conditions')
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['browse_conditions']
Failed to add these 0 dimensions: []
Successfuly added these 1 dimensions: ['conditions']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=509), HTML(value='')))

Syncing the temp table temp_cur_studies in 509 chunks x 500 records each

Creating index on the temp table
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index


In [7]:
# Short handles for the two loaded dimensions:
#   bc -> 'browse_conditions' (mesh_term / downcase_mesh_term columns)
#   c  -> 'conditions'        (name / downcase_name columns)
bc = ss.dimensions['browse_conditions']
c = ss.dimensions['conditions']

In [8]:
# Preview the browse_conditions rows (indexed by nct_id, several rows per study)
bc.data.head()

Unnamed: 0_level_0,id,mesh_term,downcase_mesh_term
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,10563497,"Adrenal Hyperplasia, Congenital","adrenal hyperplasia, congenital"
NCT00000102,10563498,Adrenogenital Syndrome,adrenogenital syndrome
NCT00000102,10563499,Adrenocortical Hyperfunction,adrenocortical hyperfunction
NCT00000102,10563500,Hyperplasia,hyperplasia
NCT00000106,10562244,Rheumatic Diseases,rheumatic diseases


In [9]:
# Preview the conditions rows (indexed by nct_id, several rows per study)
c.data.head()

Unnamed: 0_level_0,id,name,downcase_name
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,10058348,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia
NCT00000106,10057277,Rheumatic Diseases,rheumatic diseases
NCT00000108,10057274,Cardiovascular Diseases,cardiovascular diseases
NCT00000108,10057275,Coronary Disease,coronary disease
NCT00000110,10057273,Obesity,obesity


### Build a dataframe of mesh_terms and conditions

In [11]:
# method for combining all of the conditions for a study
def combine_all(df, nct_id, col):
    """Return the distinct values of ``col`` for one study, in first-seen order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the study ids; an id may label several rows.
    nct_id : hashable
        Index label of the study to look up.
    col : str
        Name of the column whose values are collected.

    Returns
    -------
    list
        De-duplicated values of ``df[col]`` for the rows labelled ``nct_id``,
        ordered by first appearance. Empty when ``nct_id`` is not in
        ``df.index``.
    """
    if nct_id not in df.index:
        return []

    # The list form ``[[label]]`` keeps the result a Series even when only a
    # single row matches, so iteration below always walks cell values.
    values = df.loc[[nct_id], col]

    try:
        # dicts preserve insertion order, giving O(n) de-duplication instead
        # of the O(n^2) "scan the result list for every value" approach.
        return list(dict.fromkeys(values))
    except TypeError:
        # Unhashable cell values (e.g. lists): fall back to a linear scan.
        distinct = []
        for value in values:
            if value not in distinct:
                distinct.append(value)
        return distinct

In [44]:
# For every study id, collect the distinct `downcase_name` values from the
# conditions dimension; the result is an array of lists in ss.studies row order.
s_conds = (
    ss.studies
    .reset_index()['nct_id']
    .progress_apply(lambda nct_id: combine_all(c.data, nct_id, 'downcase_name'))
    .values
)


100%|██████████| 254139/254139 [03:07<00:00, 1352.12it/s]


In [45]:
# For every study id, collect the distinct `downcase_mesh_term` values from the
# browse_conditions dimension; the result is an array of lists in ss.studies row order.
s_meshs = (
    ss.studies
    .reset_index()['nct_id']
    .progress_apply(lambda nct_id: combine_all(bc.data, nct_id, 'downcase_mesh_term'))
    .values
)


100%|██████████| 254139/254139 [02:31<00:00, 1673.24it/s]


In [46]:
# NOTE(review): this binds `df` to `ss.studies` without copying, so the column
# assignments made on `df` afterwards presumably also modify the study set's own frame.
df = ss.studies

In [47]:
# Attach the per-study term lists as new columns. Both arrays were produced by
# walking ss.studies' nct_ids in row order, so this positional assignment lines
# up with df's rows (df is ss.studies).
df['all_conds'] = s_conds
df['all_mesh'] = s_meshs

In [68]:
# Work on an independent copy holding only the two list columns, so the flag
# columns added later do not end up on df
sub_df = df[['all_conds', 'all_mesh']].copy()

In [49]:
# Preview: one row per study with its list of conditions and list of MeSH terms
sub_df.head()

Unnamed: 0_level_0,all_conds,all_mesh
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT02709889,"[malignant melanoma, medullary thyroid cancer,...","[glioblastoma, thyroid neoplasms, carcinoma, n..."
NCT02709876,[retinitis pigmentosa],"[retinitis pigmentosa, retinitis]"
NCT02709863,[pituitary adenoma],"[adenoma, pituitary neoplasms, pituitary disea..."
NCT02709850,"[hypertriglyceridemia, familial hypercholester...","[hyperlipoproteinemia type ii, hypercholestero..."
NCT02709837,[nutrition],[]


## Random item

In [100]:
# Pick one study id at random and print its conditions next to its MeSH terms.
# NOTE(review): no seed is set in the cells shown, so every run draws a different study.
idx = np.random.choice(sub_df.index)
print('Conditions:')
print(sub_df['all_conds'].loc[idx], '\n')

print('MeSHes:')
print(sub_df['all_mesh'].loc[idx])

Conditions:
['hepatitis c', 'human immunodeficiency virus'] 

MeSHes:
['hepatitis a', 'hepatitis c', 'acquired immunodeficiency syndrome', 'hiv infections', 'hepatitis', 'liver cirrhosis', 'immunologic deficiency syndromes', 'fibrosis']


## Let's count the most common conditions and MeSH terms

In [64]:
# Tally, per term, the number of studies listing it (each study's list is
# already de-duplicated, so a term counts at most once per study).
cond_c, mesh_c = Counter(), Counter()

for study_conds in tqdm(sub_df['all_conds'].values):
    cond_c.update(study_conds)

for study_meshes in tqdm(sub_df['all_mesh'].values):
    mesh_c.update(study_meshes)
         

100%|██████████| 254139/254139 [00:00<00:00, 512429.67it/s]
100%|██████████| 254139/254139 [00:00<00:00, 442673.00it/s]


In [101]:
# Show only the head of the ranking: the counter holds tens of thousands of
# distinct condition strings, and an unbounded most_common() dumps every one
# of them into the cell output.
cond_c.most_common(40)

[('healthy', 6557),
 ('breast cancer', 4243),
 ('obesity', 3945),
 ('hiv infections', 2836),
 ('hypertension', 2445),
 ('pain', 2401),
 ('prostate cancer', 2379),
 ('depression', 2276),
 ('asthma', 2162),
 ('schizophrenia', 1965),
 ('stroke', 1936),
 ('diabetes mellitus, type 2', 1819),
 ('diabetes', 1727),
 ('coronary artery disease', 1697),
 ('colorectal cancer', 1629),
 ('cancer', 1589),
 ('lung cancer', 1520),
 ('lymphoma', 1483),
 ('heart failure', 1453),
 ('healthy volunteers', 1382),
 ('type 2 diabetes mellitus', 1367),
 ('multiple myeloma', 1329),
 ('rheumatoid arthritis', 1302),
 ('leukemia', 1264),
 ('type 2 diabetes', 1206),
 ('diabetes mellitus', 1170),
 ('hiv', 1112),
 ('atrial fibrillation', 1053),
 ('ovarian cancer', 1051),
 ('pancreatic cancer', 1013),
 ('multiple sclerosis', 1003),
 ('non-small cell lung cancer', 994),
 ('influenza', 984),
 ('osteoarthritis', 957),
 ('anxiety', 951),
 ('major depressive disorder', 940),
 ('cardiovascular diseases', 930),
 ('chronic obs

In [107]:
# Number of distinct condition strings seen across all studies
len(cond_c)

63932

In [66]:
# Show only the head of the ranking; an unbounded most_common() would print
# every distinct MeSH term in the counter.
mesh_c.most_common(40)

[('diabetes mellitus', 6180),
 ('breast neoplasms', 6042),
 ('syndrome', 5252),
 ('diabetes mellitus, type 2', 4834),
 ('neoplasms', 4573),
 ('carcinoma', 4466),
 ('disease', 4362),
 ('leukemia', 4184),
 ('lymphoma', 4114),
 ('lung neoplasms', 3985),
 ('depression', 3949),
 ('hypertension', 3673),
 ('prostatic neoplasms', 3376),
 ('infection', 3260),
 ('depressive disorder', 3215),
 ('carcinoma, non-small-cell lung', 3179),
 ('hiv infections', 2800),
 ('coronary artery disease', 2560),
 ('stroke', 2503),
 ('colorectal neoplasms', 2456),
 ('heart failure', 2452),
 ('hepatitis', 2297),
 ('wounds and injuries', 2238),
 ('osteoarthritis', 2200),
 ('schizophrenia', 2191),
 ('asthma', 2165),
 ('communicable diseases', 2147),
 ('kidney diseases', 2142),
 ('multiple myeloma', 2130),
 ('pulmonary disease, chronic obstructive', 2028),
 ('arthritis', 2011),
 ('adenocarcinoma', 1946),
 ('leukemia, myeloid', 1923),
 ('obesity', 1910),
 ('neoplasms, plasma cell', 1830),
 ('cardiovascular diseases', 

In [67]:
# Two differently worded condition strings used as a worked example:
# a free-text spelling and a MeSH-style inverted heading
# (the notes at the end treat this pair as the same condition).
cond1 = 'non small cell lung cancer'
cond2 = 'carcinoma, non-small-cell lung'

In [70]:
# Flag studies whose condition list contains each spelling as an exact list element
sub_df['is_cond1'] = sub_df['all_conds'].apply(lambda conds: cond1 in conds)
sub_df['is_cond2'] = sub_df['all_conds'].apply(lambda conds: cond2 in conds)

In [73]:
# Studies whose condition list contains either spelling of the example condition
df_cond_nscl = sub_df[sub_df['is_cond1'] | sub_df['is_cond2']]
# Display the frame just built (the original referenced the undefined name
# `df_cond_nsc`, which raises NameError on a fresh kernel)
df_cond_nscl

Unnamed: 0_level_0,all_conds,all_mesh,is_cond1,is_cond2
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NCT02705339,"[carcinoma, non-small-cell lung, non-small cel...","[lung neoplasms, carcinoma, non-small-cell lung]",False,True
NCT02702921,[non small cell lung cancer],"[carcinoma, non-small-cell lung]",True,False
NCT02695290,"[carcinoma, non-small-cell lung, erbb receptors]","[carcinoma, non-small-cell lung]",False,True
NCT02684461,[non small cell lung cancer],"[carcinoma, non-small-cell lung]",True,False
NCT02679963,[non small cell lung cancer],"[lung neoplasms, carcinoma, non-small-cell lung]",True,False
...,...,...,...,...
NCT02733159,"[carcinoma, non-small-cell lung]","[carcinoma, non-small-cell lung]",False,True
NCT02716311,[non small cell lung cancer],"[lung neoplasms, carcinoma, non-small-cell lung]",True,False
NCT02716116,"[carcinoma, non-small-cell lung]","[carcinoma, non-small-cell lung]",False,True
NCT02716038,"[carcinoma, non-small-cell lung]","[carcinoma, non-small-cell lung]",False,True


In [76]:
# Inspect one specific study by id: print its conditions next to its MeSH terms
idx = 'NCT02710396'
print('Conditions:')
print(sub_df['all_conds'].loc[idx], '\n')

print('MeSHes:')
print(sub_df['all_mesh'].loc[idx])

Conditions:
['esophageal squamous cell carcinoma', 'carcinoma, transitional cell', 'carcinoma, non-small-cell lung', 'cancer of the head and neck', 'urinary bladder neoplasms'] 

MeSHes:
['esophageal squamous cell carcinoma', 'head and neck neoplasms', 'urinary bladder neoplasms', 'carcinoma, transitional cell', 'carcinoma', 'carcinoma, squamous cell', 'carcinoma, non-small-cell lung', 'neoplasms']


### Notes
- build vocabulary on the individual conditions
- if a condition has sufficient similarity to an existing dictionary term, map it to that term
- make matrix of conditions-bags to mesh-terms,
- specifically pay attention to those rows where the condition-bags have the smallest number of conditions (ie none of the conditions in the condition set exists as a single bag)
    - this sub matrix is our conds to mesh mapping dictionary
    
- define: 
    - conditions are sufficiently similar when their mesh mappings are sufficiently similar
        - mesh mapping distance can be calculated using Jaccard distance?
    - non-single condition bags are computed by collecting the mappings on their sub bags
    - manually tag a few "training data" such as 
        - [carcinoma, non-small-cell lung]
        - [non small cell lung cancer]
    - validate the Jaccard distance cut-off such that we balance FPR and TPR
    