In [14]:
import pandas as pd
import numpy as np

import datetime as dt
import pickle
from importlib import reload

import pdaactconn as pc
from trialexplorer import AACTStudySet
from trialexplorer import studysimilarity as ssim

import matplotlib.pyplot as plt
%matplotlib inline

In [87]:
# Re-import the studysimilarity module so edits to its source file take
# effect in this kernel without a restart.
reload(ssim)

<module 'trialexplorer.studysimilarity' from '/home/rmfeng/Documents/GradSchool/capstone/trial-explorer/trialexplorer/studysimilarity.py'>

In [6]:
## Load the pickled Bing results and keep only the entries that have data
## (i.e. those for which ssim.has_no_data(value) is falsy).
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at a file we produced ourselves.
with open('raw_data/all_bing.p', 'rb') as bing_file:
    # the context manager closes the handle; the bare open() call left it open
    bd = pickle.load(bing_file)

# new_bd is kept as a name in case another cell still refers to it.
new_bd = {k: v for k, v in bd.items() if not ssim.has_no_data(v)}
bd = new_bd

In [32]:
# Keys of the filtered Bing dict, in dict order.
all_conds = list(bd)

## Featurize the 2 conditions

In [88]:
# Take the entries at positions 10 and 20 as the two example conditions.
c1, c2 = all_conds[10], all_conds[20]
(c1, c2)

('Stroke', 'Type 2 Diabetes Mellitus')

In [89]:
# Build the feature dict for the first example condition from the Bing data
# and display it.
f1 = ssim.featurize_one_cond(c1, bd)
f1

{'condition': 'stroke',
 'nouns': 'stroke',
 'bing_tokens': {'stroke': 17, 'symptoms': 4, 'causes': 4},
 'bing_links': ['https://www.mayoclinic.org/diseases-conditions/stroke/symptoms-causes/syc-20350113',
  'https://en.wikipedia.org/wiki/Stroke',
  'https://www.cdc.gov/stroke/index.htm',
  'https://www.ocregister.com/2019/11/29/24-years-after-a-stroke-coach-for-oranges-el-modena-high-is-living-his-dream/',
  'https://thehill.com/homenews/media/472278-larry-king-says-he-had-a-stroke-in-march-and-was-in-coma-its-been-a-rough-year',
  'https://www.bbc.co.uk/sport/rowing/50569807',
  'https://www.webmd.com/stroke/default.htm',
  'https://www.healthline.com/health/stroke',
  'https://www.stroke.org/en/about-stroke',
  'https://medlineplus.gov/stroke.html',
  'https://www.medicalnewstoday.com/articles/7624.php',
  'https://www.medicinenet.com/stroke_symptoms_and_treatment/article.htm',
  'https://www.mayoclinic.org/diseases-conditions/stroke/diagnosis-treatment/drc-20350119',
  'https://www

In [90]:
# Build the feature dict for the second example condition from the Bing data
# and display it.
f2 = ssim.featurize_one_cond(c2, bd)
f2

{'condition': 'type 2 diabetes mellitus',
 'nouns': 'type diabetes',
 'bing_tokens': {'diabetes': 20, 'type': 5},
 'bing_links': ['https://www.webmd.com/diabetes/guide/types-of-diabetes-mellitus',
  'https://www.webmd.com/diabetes/type-2-diabetes-treatments',
  'https://www.webmd.com/diabetes/eating-right',
  'https://www.webmd.com/men/guide/diabetes-men',
  'https://www.webmd.com/diabetes/guide/risk-factors-for-diabetes',
  'https://www.drugs.com/health-guide/type-2-diabetes-mellitus.html',
  'https://www.webmd.com/diabetes/type-2-diabetes',
  'https://emedicine.medscape.com/article/117853-overview',
  'https://www.mayoclinic.org/diseases-conditions/type-2-diabetes/symptoms-causes/syc-20351193',
  'https://en.wikipedia.org/wiki/Diabetes_mellitus_type_2',
  'https://www.diabetes.org/diabetes/type-2',
  'https://medlineplus.gov/diabetestype2.html',
  'https://www.mayoclinic.org/diseases-conditions/type-2-diabetes/diagnosis-treatment/drc-20351199',
  'https://www.sharecare.com/health/typ

In [91]:
# Turn the two per-condition feature dicts into one dict of pairwise
# comparison features for the (c1, c2) pair.
ssim.featurize_conditions(f1, f2)

{'full_fuzzy_ratio': 13,
 'noun_fuzzy_ratio': 21,
 'bing_bagoword_dist': 50,
 'bing_link_sim': 0.0,
 'same_wiki': 0.0,
 'stage_dist': 10.0,
 'adj_dist': 0.0,
 'vb_dist': 1.0}

## How long does this take?

In [43]:
# Time featurize_cond_full on two randomly drawn conditions per call.
# The np.random.choice draw is inside the timed expression, so it is included
# in the measurement.
# NOTE(review): featurize_cond_full presumably featurizes both conditions and
# then compares them -- confirm against studysimilarity.py.
%timeit ssim.featurize_cond_full(*np.random.choice(all_conds, 2), bd)

8.92 ms ± 46.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## How many pairs are in our entire pair space?

In [48]:
# Size of the pair space, displayed in millions of pairs.
# NOTE(review): 65144 is presumably the number of conditions, in which case
# n * (n + 1) / 2 counts unordered pairs including each condition paired with
# itself -- confirm against len(all_conds).
tot_pairs = (65144 * 65145) / 2
tot_pairs / 1e6

2121.90294

> roughly 2 billion pairs of cond1, cond2

In [55]:
# How long would it take to brute-force featurize every pair?
# 8.92 ms is the mean per-call time reported by the %timeit cell; / 1000
# converts milliseconds to seconds.
secs_per_pair = 8.92 / 1000
# total seconds -> minutes -> hours
tot_pairs * secs_per_pair / 60 / 60

5257.6039513333335

> roughly 5,260 hours (about 219 days) on my Unix desktop; not impossible if we can parallelize

# Example: features of 10 random conditions

In [62]:
# Draw 10 distinct conditions to inspect.
# np.random.choice samples WITH replacement by default, so without
# replace=False the same condition could show up more than once.
# NOTE(review): no seed is set, so the sample differs on every run.
rand_conds = np.random.choice(all_conds, 10, replace=False)
rand_conds

array(['Brachiobasilic Arteriovenous Fistula',
       'Catheter-related Bloodstream Infection (CRBSI) Nos',
       'Lyme Disease', 'Intraoperative Neurophysiological Monitoring',
       'Laparoscopic Gynecological Surgery', 'Sympathetic Integrity',
       'Endovascular Treatment', 'Latent Tuberculosis',
       'Adult Erythroleukemia', 'Type-2 Diabetes Mellitus'], dtype='<U158')

In [92]:
# Featurize each sampled condition, cache the result under the condition
# name, and print every feature except the raw link list.
cur_featurized = {}
for i, cond in enumerate(rand_conds):
    cur_f = cur_featurized[cond] = ssim.featurize_one_cond(cond, bd)
    print("\n ###### STUDY %s ###### " % i)
    for k, v in cur_f.items():
        if k == 'bing_links':
            # the link list is long, so only its length is reported
            print("num_links (not printed): %s" % len(v))
            continue
        print("'%s': %s" % (k, v))


 ###### STUDY 0 ###### 
'condition': brachiobasilic arteriovenous fistula
'nouns': fistula
'bing_tokens': {'fistula': 12, 'arteriovenous': 11, 'brachiobasilic': 2}
num_links (not printed): 22
'bing_wiki': ['Arteriovenous fistula - Wikipedia']
'stages': None
'adj_and_verbs': (['brachiobasilic', 'arteriovenous'], [])

 ###### STUDY 1 ###### 
'condition': catheter-related bloodstream infection (crbsi) nos
'nouns': bloodstream infection crbsi nos
'bing_tokens': {'bloodstream': 7, 'catheterrelated': 6, 'infections': 4, 'infection': 2, 'crbsicom': 1, 'home': 1, '2020': 1, 'icd10cm': 1, 'diagnosis': 1, 'code': 1}
num_links (not printed): 13
'bing_wiki': ['Taurolidine - Wikipedia']
'stages': None
'adj_and_verbs': (['catheter-related'], [])

 ###### STUDY 2 ###### 
'condition': lyme disease
'nouns': lyme disease
'bing_tokens': {'lyme': 23, 'disease': 2}
num_links (not printed): 32
'bing_wiki': ['Lyme disease - Wikipedia']
'stages': None
'adj_and_verbs': ([], [])

 ###### STUDY 3 ###### 
'condi

# Example distances of random features

In [93]:
# Print the pairwise comparison features for every unordered pair of the
# sampled conditions; j starts at i, so each condition is also compared with
# itself.
used_conds = rand_conds
for i in range(len(used_conds)):
    # Loop-local names (not f1/f2) so the feature dicts built for c1/c2 in the
    # earlier cells are not silently overwritten; the row-side features depend
    # only on i, so they are looked up once per row.
    feats_i = cur_featurized[used_conds[i]]
    for j in range(i, len(used_conds)):
        feats_j = cur_featurized[used_conds[j]]
        cur_fdist = ssim.featurize_conditions(feats_i, feats_j)
        print("\n ###### PAIR %s, %s ###### " % (i, j))
        print("c1 =", used_conds[i])
        print("c2 =", used_conds[j])
        print("Distances:")
        for k, v in cur_fdist.items():
            print('-', k, v)


 ###### PAIR 0, 0 ###### 
c1 = Brachiobasilic Arteriovenous Fistula
c2 = Brachiobasilic Arteriovenous Fistula
Distances:
- full_fuzzy_ratio 100
- noun_fuzzy_ratio 100
- bing_bagoword_dist 0
- bing_link_sim 1.0
- same_wiki 1.0
- stage_dist 0.0
- adj_dist 0.0
- vb_dist 0.0

 ###### PAIR 0, 1 ###### 
c1 = Brachiobasilic Arteriovenous Fistula
c2 = Catheter-related Bloodstream Infection (CRBSI) Nos
Distances:
- full_fuzzy_ratio 30
- noun_fuzzy_ratio 16
- bing_bagoword_dist 50
- bing_link_sim 0.0
- same_wiki 0.0
- stage_dist 0.0
- adj_dist 1.0
- vb_dist 0.0

 ###### PAIR 0, 2 ###### 
c1 = Brachiobasilic Arteriovenous Fistula
c2 = Lyme Disease
Distances:
- full_fuzzy_ratio 25
- noun_fuzzy_ratio 32
- bing_bagoword_dist 50
- bing_link_sim 0.0
- same_wiki 0.0
- stage_dist 0.0
- adj_dist 1.0
- vb_dist 0.0

 ###### PAIR 0, 3 ###### 
c1 = Brachiobasilic Arteriovenous Fistula
c2 = Intraoperative Neurophysiological Monitoring
Distances:
- full_fuzzy_ratio 35
- noun_fuzzy_ratio 24
- bing_bagoword_dis

# Examples of "stage", "type", "grade" conditions

In [107]:
# Conditions whose name contains any staging/typing keyword, matched
# case-insensitively. 'genotype' is subsumed by 'type' (any name containing
# it also contains 'type') but is kept so the keyword list mirrors the
# original filter.
typed_conds = [
    x for x in all_conds
    if any(kw in x.lower()
           for kw in ('stage', 'type', 'grade', 'genotype', 'ajcc', 'hepatitis'))
]

In [115]:
# Draw up to 6 distinct staged/typed conditions to inspect.
# np.random.choice samples WITH replacement by default, so replace=False is
# needed to avoid duplicates; the size is capped at the population size
# because sampling without replacement cannot draw more items than exist.
# NOTE(review): no seed is set, so the sample differs on every run.
rand_typed_conds = np.random.choice(
    typed_conds, min(6, len(typed_conds)), replace=False
)
rand_typed_conds

array(['Stage II Small Lymphocytic Lymphoma',
       'Hyperlipoproteinemia Type II',
       'Stage 0a Bladder Urothelial Carcinoma AJCC v6 and v7',
       'Stage III Skin Melanoma', 'Stage IV Grade 1 Follicular Lymphoma',
       'Stage IV Esophageal Cancer'], dtype='<U118')

In [116]:
# Featurize each sampled staged/typed condition, cache the result under the
# condition name, and print every feature except the raw link list.
cur_featurized = {}
for i, cond in enumerate(rand_typed_conds):
    cur_f = cur_featurized[cond] = ssim.featurize_one_cond(cond, bd)
    print("\n ###### STUDY %s ###### " % i)
    for k, v in cur_f.items():
        if k == 'bing_links':
            # the link list is long, so only its length is reported
            print("num_links (not printed): %s" % len(v))
            continue
        print("'%s': %s" % (k, v))


 ###### STUDY 0 ###### 
'condition': stage ii small lymphocytic lymphoma
'nouns': stage lymphoma
'bing_tokens': {'lymphoma': 10, 'lymphocytic': 8, 'small': 6, 'nonhodgkin': 1}
num_links (not printed): 10
'bing_wiki': ['Chronic lymphocytic leukemia - Wikipedia']
'stages': {'type': [], 'genotyp_': [], 'grade': [], 'stage': ['2'], 'ajcc': [], 'hepatitis': []}
'adj_and_verbs': (['small', 'lymphocytic'], ['ii'])

 ###### STUDY 1 ###### 
'condition': hyperlipoproteinemia type ii
'nouns': hyperlipoproteinemia type ii
'bing_tokens': {'hyperlipoproteinemia': 8, 'type': 6, 'ii': 4, 'hyperlipidemia': 2, 'symptoms': 2, 'overview': 1, 'sciencedirect': 1, 'topics': 1}
num_links (not printed): 10
'bing_wiki': ['Hyperlipidemia - Wikipedia']
'stages': {'type': ['2'], 'genotyp_': [], 'grade': [], 'stage': [], 'ajcc': [], 'hepatitis': []}
'adj_and_verbs': ([], [])

 ###### STUDY 2 ###### 
'condition': stage 0a bladder urothelial carcinoma ajcc v6 and v7
'nouns': stage bladder carcinoma ajcc v6 v7
'bing_

In [117]:
# Print the pairwise comparison features for every unordered pair of the
# sampled staged/typed conditions; j starts at i, so each condition is also
# compared with itself.
used_conds = rand_typed_conds
for i in range(len(used_conds)):
    # Loop-local names (not f1/f2) so the feature dicts built for c1/c2 in the
    # earlier cells are not silently overwritten; the row-side features depend
    # only on i, so they are looked up once per row.
    feats_i = cur_featurized[used_conds[i]]
    for j in range(i, len(used_conds)):
        feats_j = cur_featurized[used_conds[j]]
        cur_fdist = ssim.featurize_conditions(feats_i, feats_j)
        print("\n ###### PAIR %s, %s ###### " % (i, j))
        print("c1 =", used_conds[i])
        print("c2 =", used_conds[j])
        print("Distances:")
        for k, v in cur_fdist.items():
            print('-', k, v)


 ###### PAIR 0, 0 ###### 
c1 = Stage II Small Lymphocytic Lymphoma
c2 = Stage II Small Lymphocytic Lymphoma
Distances:
- full_fuzzy_ratio 100
- noun_fuzzy_ratio 100
- bing_bagoword_dist 0
- bing_link_sim 1.0
- same_wiki 1.0
- stage_dist 0.0
- adj_dist 0.0
- vb_dist 0.0

 ###### PAIR 0, 1 ###### 
c1 = Stage II Small Lymphocytic Lymphoma
c2 = Hyperlipoproteinemia Type II
Distances:
- full_fuzzy_ratio 29
- noun_fuzzy_ratio 29
- bing_bagoword_dist 50
- bing_link_sim 0.0
- same_wiki 0.0
- stage_dist 0.0
- adj_dist 1.0
- vb_dist 1.0

 ###### PAIR 0, 2 ###### 
c1 = Stage II Small Lymphocytic Lymphoma
c2 = Stage 0a Bladder Urothelial Carcinoma AJCC v6 and v7
Distances:
- full_fuzzy_ratio 37
- noun_fuzzy_ratio 42
- bing_bagoword_dist 50
- bing_link_sim 0.0
- same_wiki 0.0
- stage_dist 10.0
- adj_dist 1.0
- vb_dist 1.0

 ###### PAIR 0, 3 ###### 
c1 = Stage II Small Lymphocytic Lymphoma
c2 = Stage III Skin Melanoma
Distances:
- full_fuzzy_ratio 55
- noun_fuzzy_ratio 61
- bing_bagoword_dist 50
- 

# Examples of very similar conditions

In [94]:
# First six conditions whose name contains 'parkin' (case-insensitive).
park_conds = list(filter(lambda name: 'parkin' in name.lower(), all_conds))[:6]
park_conds

["Parkinson's Disease",
 'Parkinson Disease',
 "Idiopathic Parkinson's Disease",
 "Parkinson's Disease (PD)",
 'Parkinson',
 'Idiopathic Parkinson Disease']

In [95]:
# Featurize each Parkinson-related condition, cache the result under the
# condition name, and print every feature except the raw link list.
cur_featurized = {}
for i, cond in enumerate(park_conds):
    cur_f = cur_featurized[cond] = ssim.featurize_one_cond(cond, bd)
    print("\n ###### STUDY %s ###### " % i)
    for k, v in cur_f.items():
        if k == 'bing_links':
            # the link list is long, so only its length is reported
            print("num_links (not printed): %s" % len(v))
            continue
        print("'%s': %s" % (k, v))


 ###### STUDY 0 ###### 
'condition': parkinson's disease
'nouns': parkinson disease
'bing_tokens': {'parkinson': 17, 'disease': 8}
num_links (not printed): 14
'bing_wiki': ["Parkinson's disease - Wikipedia"]
'stages': None
'adj_and_verbs': ([], [])

 ###### STUDY 1 ###### 
'condition': parkinson disease
'nouns': parkinson disease
'bing_tokens': {'parkinson': 10, 'disease': 8, '’': 4, 'symptoms': 3}
num_links (not printed): 11
'bing_wiki': ["Parkinson's disease - Wikipedia"]
'stages': None
'adj_and_verbs': ([], [])

 ###### STUDY 2 ###### 
'condition': idiopathic parkinson's disease
'nouns': parkinson disease
'bing_tokens': {'parkinson': 11, 'disease': 8, 'idiopathic': 6}
num_links (not printed): 13
'bing_wiki': ["Parkinson's disease - Wikipedia"]
'stages': None
'adj_and_verbs': (['idiopathic'], [])

 ###### STUDY 3 ###### 
'condition': parkinson's disease (pd)
'nouns': parkinson disease pd
'bing_tokens': {'parkinson': 12, 'disease': 9, 'symptoms': 4}
num_links (not printed): 15
'bing_

In [96]:
# Print the pairwise comparison features for every unordered pair of the
# Parkinson-related conditions; j starts at i, so each condition is also
# compared with itself.
used_conds = park_conds
for i in range(len(used_conds)):
    # Loop-local names (not f1/f2) so the feature dicts built for c1/c2 in the
    # earlier cells are not silently overwritten; the row-side features depend
    # only on i, so they are looked up once per row.
    feats_i = cur_featurized[used_conds[i]]
    for j in range(i, len(used_conds)):
        feats_j = cur_featurized[used_conds[j]]
        cur_fdist = ssim.featurize_conditions(feats_i, feats_j)
        print("\n ###### PAIR %s, %s ###### " % (i, j))
        print("c1 =", used_conds[i])
        print("c2 =", used_conds[j])
        print("Distances:")
        for k, v in cur_fdist.items():
            print('-', k, v)


 ###### PAIR 0, 0 ###### 
c1 = Parkinson's Disease
c2 = Parkinson's Disease
Distances:
- full_fuzzy_ratio 100
- noun_fuzzy_ratio 100
- bing_bagoword_dist 0
- bing_link_sim 1.0
- same_wiki 1.0
- stage_dist 0.0
- adj_dist 0.0
- vb_dist 0.0

 ###### PAIR 0, 1 ###### 
c1 = Parkinson's Disease
c2 = Parkinson Disease
Distances:
- full_fuzzy_ratio 94
- noun_fuzzy_ratio 100
- bing_bagoword_dist 14
- bing_link_sim 0.25
- same_wiki 1.0
- stage_dist 0.0
- adj_dist 0.0
- vb_dist 0.0

 ###### PAIR 0, 2 ###### 
c1 = Parkinson's Disease
c2 = Idiopathic Parkinson's Disease
Distances:
- full_fuzzy_ratio 78
- noun_fuzzy_ratio 100
- bing_bagoword_dist 12
- bing_link_sim 0.08
- same_wiki 1.0
- stage_dist 0.0
- adj_dist 1.0
- vb_dist 0.0

 ###### PAIR 0, 3 ###### 
c1 = Parkinson's Disease
c2 = Parkinson's Disease (PD)
Distances:
- full_fuzzy_ratio 88
- noun_fuzzy_ratio 92
- bing_bagoword_dist 10
- bing_link_sim 0.38095238095238093
- same_wiki 1.0
- stage_dist 0.0
- adj_dist 0.0
- vb_dist 0.0

 ###### PAIR