In [184]:
import pandas as pd
import numpy as np

import datetime as dt
import pickle
from importlib import reload

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm as tqdm_pandas

import pdaactconn as pc
from trialexplorer import AACTStudySet
from trialexplorer import studysimilarity as ssim

import matplotlib.pyplot as plt
%matplotlib inline

In [185]:
tqdm_pandas.pandas()

In [2]:
## let's load bing's data and make sure we have only values that have data
bd = pickle.load(open('raw_data/all_bing.p', 'rb'))
new_bd = {}
for k, v in bd.items():
    if not ssim.has_no_data(v):
        new_bd[k] = v
bd = new_bd

In [21]:
all_conds = pd.Series(list(bd.keys()))

In [22]:
all_conds

0                                               Healthy
1                                         Breast Cancer
2                                               Obesity
3                                        HIV Infections
4                                          Hypertension
                              ...                      
34157                              Tumor of the Thyroid
34158    Trochanteric and Subtrochanteric Hip Fractures
34159                                   Tumors (Others)
34160       Tumor, Desmoplastic Small Round Cell, Adult
34161    Tumor Appearance of Biliary System Obstruction
Length: 34162, dtype: object

In [114]:
SAME = 'same'
NOTSAME = 'notsame'
SAME_DIFF_QUAL = 'same_diff_qual'

In [121]:
master_list = []

In [117]:
def grp2train(df_in):
    grp_dict = df_in['grp'].to_dict()
    all_labels = []
    for a in df_in.index:
        for b in df_in.index:
            if a != b:
                d1, s1 = ssim.digstr2parts(grp_dict[a])
                d2, s2 = ssim.digstr2parts(grp_dict[b])
                if d1 == d2 and s1 == s2:
                    cur_label = SAME
                elif d1 == d2:
                    cur_label = SAME_DIFF_QUAL
                else:
                    cur_label = NOTSAME
                
                all_labels.append({
                    'cond1': a,
                    'cond2': b,
                    'label': cur_label
                })
    return pd.DataFrame(all_labels)

### Parkinsons

In [125]:
cur_cond = "Metastatic Breast Carcinoma"
ssim.featurize_one_cond(cur_cond, bd)

{'condition': 'metastatic breast carcinoma',
 'nouns': 'breast carcinoma',
 'bing_tokens': {'breast': 13, 'cancer': 12},
 'bing_links': ['https://www.breastcancer.org/symptoms/types/recur_metast',
  'https://www.healthline.com/health/breast-cancer/metastatic-prognosis',
  'https://ww5.komen.org/BreastCancer/RecommendedTreatmentsforMetastaticBreastCancer.html',
  'https://www.verywell.com/stage-1-breast-cancer-429888',
  'https://www.nationalbreastcancer.org/metastatic-breast-cancer',
  'https://www.medpagetoday.com/hematologyoncology/breastcancer/83625',
  'https://www.forbes.com/sites/danadovey/2019/11/29/japan-approves-genetic-testing-for-breast-cancer-treatment/',
  'https://www.nature.com/articles/s41598-019-54471-w',
  'https://www.webmd.com/breast-cancer/treatments-metastatic-breast-cancer',
  'https://ww5.komen.org/BreastCancer/MetastaticBreastCancerIntroduction.html',
  'https://www.healthline.com/health/breast-cancer/treatment-breakthroughs-2019',
  'https://www.medicalnewstod

In [99]:
s_park = [x for x in all_conds if 'parkin' in x.lower()]
pd.DataFrame(s_park).to_excel('training_data/p1.xls')

In [116]:
df_in = pd.read_excel('training_data/p1.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Parkinson's Disease,1a
Parkinson Disease,1a
Idiopathic Parkinson's Disease,1b
Parkinson's Disease (PD),1a
Parkinson,1a


In [122]:
master_list.append(grp2train(df_in))
len(master_list)

1

### Breast Cancer

In [123]:
s_bc = [x for x in all_conds if 'breast' in x.lower()]
pd.DataFrame(s_bc).to_excel('training_data/p2.xls')

In [126]:
df_in = pd.read_excel('training_data/p2.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Breast Cancer,1a
Breast Neoplasms,1a
Metastatic Breast Cancer,1b
Stage IV Breast Cancer,1c
Triple Negative Breast Cancer,1d


In [127]:
master_list.append(grp2train(df_in))
len(master_list)

2

# HIV

In [132]:
s_hiv = [x for x in all_conds if 'hiv' in x.lower()]
pd.DataFrame(s_hiv).to_excel('training_data/p3.xls')

In [135]:
df_in = pd.read_excel('training_data/p3.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
- HIV,1a
Acute HIV Infection,1b
AIDS/HIV PROBLEM,1a
Aids/Hiv Problem,1a
Antiretroviral Therapy in HIV-1 Infected Children,2a


In [136]:
master_list.append(grp2train(df_in))
len(master_list)

3

# Colon

In [138]:
s_colon = [x for x in all_conds if 'colon' in x.lower()]
pd.DataFrame(s_colon).to_excel('training_data/p4.xls')

In [139]:
df_in = pd.read_excel('training_data/p4.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Cancer Colon,1a
Cancer of Colon,1a
Cancer of the Colon,1a
Colon Cancer,1a
Colon Cancer Prevention,1a


In [140]:
master_list.append(grp2train(df_in))
len(master_list)

4

# Genotype

In [143]:
s_genotype = [x for x in all_conds if 'genotype' in x.lower()]
pd.DataFrame(s_genotype).to_excel('training_data/p5.xls')

In [144]:
df_in = pd.read_excel('training_data/p5.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Carboxylesterase 1 (CES1) Genotype,1a
Chronic Genotype 1 Hepatitis C Virus Infection,2a
Chronic Hepatics C Virus (HCV) Genotype 1,2a
Chronic Hepatitis C Genotype 1,2a
Chronic Hepatitis C Genotype 1B,2b


In [145]:
master_list.append(grp2train(df_in))
len(master_list)

5

# Grade

In [147]:
s_grade = [x for x in all_conds if 'grade' in x.lower()]
pd.DataFrame(s_grade).to_excel('training_data/p6.xls')

In [148]:
df_in = pd.read_excel('training_data/p6.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
"Lymphoma, Low-Grade",1a
"Lymphoma, Intermediate-Grade",1b
"Lymphoma, High-Grade",1c
Low Grade Lymphoma,1a
Low-Grade Lymphoma,1a


In [149]:
master_list.append(grp2train(df_in))
len(master_list)

6

# Cardio

In [154]:
s_card = [x for x in all_conds if 'cardio' in x.lower()]
pd.DataFrame(s_card).to_excel('training_data/p7.xls')

In [155]:
df_in = pd.read_excel('training_data/p7.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Cardiomyopathy,1a
"Cardiomyopathy, Dilated",1b
Dilated Cardiomyopathy,1b
Ischemic Cardiomyopathy,1c
Cardiomyopathy Ischemic,1c


In [156]:
master_list.append(grp2train(df_in))
len(master_list)

7

In [161]:
### random entries

In [162]:
100000 - 66300

33700

In [168]:
filler_list = []
for i in tqdm(range(40000)):
    a = np.random.choice(all_conds)
    b = np.random.choice(all_conds)
    if fuzz.ratio(a, b) < 50:
        filler_list.append({
            'cond1': a,
            'cond2': b,
            'label': NOTSAME
        })

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




In [170]:
df_filler = pd.DataFrame(filler_list)
df_filler.to_excel('training_data/filler.xls')

In [171]:
df_filler.shape

(39731, 3)

In [172]:
master_list.append(df_filler)

# final blend

In [173]:
df_final = pd.concat(master_list)
df_final.shape

(106031, 3)

In [176]:
df_final

Unnamed: 0,cond1,cond2,label
0,Parkinson's Disease,Parkinson Disease,same
1,Parkinson's Disease,Idiopathic Parkinson's Disease,same_diff_qual
2,Parkinson's Disease,Parkinson's Disease (PD),same
3,Parkinson's Disease,Parkinson,same
4,Parkinson's Disease,Idiopathic Parkinson Disease,same_diff_qual
...,...,...,...
39726,Excess Weight,Congenital Malformations,notsame
39727,Reaction of Autonomic Nervous System,Elderly Patients (>65 Years),notsame
39728,Stage IVC Rectal Cancer AJCC v8,Colistin Resistant ABC,notsame
39729,Prostate Specific Antigen,Anesthesia and Analgesia,notsame


# Featurizing

In [179]:
def featurize(row):
    return ssim.featurize_cond_full(row['cond1'], row['cond2'], bd)

In [188]:
feature_res = df_final.progress_apply(featurize, axis=1)

100%|██████████| 106031/106031 [05:37<00:00, 314.12it/s]


In [193]:
all_s = feature_res.progress_apply(pd.Series)

100%|██████████| 106031/106031 [00:27<00:00, 3895.55it/s]


In [195]:
df_final[[x for x in all_s.columns]] = all_s

In [196]:
df_final

Unnamed: 0,cond1,cond2,label,full_fuzzy_ratio,noun_fuzzy_ratio,bing_bagoword_dist,bing_link_sim,same_wiki,stage_dist,adj_dist,vb_dist
0,Parkinson's Disease,Parkinson Disease,same,94.0,100.0,14.0,0.250000,1.0,0.0,0.0,0.0
1,Parkinson's Disease,Idiopathic Parkinson's Disease,same_diff_qual,78.0,100.0,12.0,0.080000,1.0,0.0,1.0,0.0
2,Parkinson's Disease,Parkinson's Disease (PD),same,88.0,92.0,10.0,0.380952,1.0,0.0,0.0,0.0
3,Parkinson's Disease,Parkinson,same,64.0,69.0,2.0,0.142857,1.0,0.0,0.0,0.0
4,Parkinson's Disease,Idiopathic Parkinson Disease,same_diff_qual,72.0,100.0,18.0,0.080000,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
39726,Excess Weight,Congenital Malformations,notsame,22.0,11.0,50.0,0.000000,0.0,0.0,1.0,0.0
39727,Reaction of Autonomic Nervous System,Elderly Patients (>65 Years),notsame,31.0,48.0,50.0,0.000000,0.0,0.0,1.0,0.0
39728,Stage IVC Rectal Cancer AJCC v8,Colistin Resistant ABC,notsame,45.0,38.0,25.0,0.000000,0.0,10.0,1.0,0.0
39729,Prostate Specific Antigen,Anesthesia and Analgesia,notsame,41.0,44.0,25.0,0.000000,0.0,0.0,1.0,0.0


In [189]:
df_final.head()

Unnamed: 0,cond1,cond2,label
0,Parkinson's Disease,Parkinson Disease,same
1,Parkinson's Disease,Idiopathic Parkinson's Disease,same_diff_qual
2,Parkinson's Disease,Parkinson's Disease (PD),same
3,Parkinson's Disease,Parkinson,same
4,Parkinson's Disease,Idiopathic Parkinson Disease,same_diff_qual


In [197]:
df_final.to_pickle('training_data/all.p')