In [14]:
import pandas as pd
import numpy as np

import datetime as dt
import pickle
from importlib import reload

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm as tqdm_pandas

import pdaactconn as pc
from trialexplorer import AACTStudySet
from trialexplorer import studysimilarity as ssim

import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
tqdm_pandas.pandas()

In [16]:
## let's load bing's data and make sure we have only values that have data
bd = pickle.load(open('raw_data/all_bing.p', 'rb'))
new_bd = {}
for k, v in bd.items():
    if not ssim.has_no_data(v):
        new_bd[k] = v
bd = new_bd

In [17]:
all_conds = pd.Series(list(bd.keys()))

In [18]:
all_conds

0                                               Healthy
1                                         Breast Cancer
2                                               Obesity
3                                        HIV Infections
4                                          Hypertension
                              ...                      
34157                              Tumor of the Thyroid
34158    Trochanteric and Subtrochanteric Hip Fractures
34159                                   Tumors (Others)
34160       Tumor, Desmoplastic Small Round Cell, Adult
34161    Tumor Appearance of Biliary System Obstruction
Length: 34162, dtype: object

In [19]:
SAME = 'same'
NOTSAME = 'notsame'
SAME_DIFF_QUAL = 'same_diff_qual'

In [20]:
master_list = []

In [21]:
def grp2train(df_in):
    grp_dict = df_in['grp'].to_dict()
    all_labels = []
    for a in df_in.index:
        for b in df_in.index:
            if a != b:
                d1, s1 = ssim.digstr2parts(grp_dict[a])
                d2, s2 = ssim.digstr2parts(grp_dict[b])
                if d1 == d2 and s1 == s2:
                    cur_label = SAME
                elif d1 == d2:
                    cur_label = SAME_DIFF_QUAL
                else:
                    cur_label = NOTSAME
                
                all_labels.append({
                    'cond1': a,
                    'cond2': b,
                    'label': cur_label
                })
    return pd.DataFrame(all_labels)

### Parkinsons

In [63]:
cur_cond = "Chronic Hepatitis C Genotype 1B"
ssim.featurize_one_cond(cur_cond, bd)

{'condition': 'chronic hepatitis c genotype 1b',
 'nouns': 'hepatitis c genotype',
 'bing_tokens': {'hepatitis': 16, 'c': 9},
 'bing_links': ['https://generichope.com/hepatitis-c-genotype-1a-1b-treatment-guidelines/',
  'https://www.sharecare.com/health/hepatitis-c/what-prognosis-people-hepatitis-c',
  'https://www.healthline.com/health/hepatitis-c/can-it-be-cured',
  'https://www.sharecare.com/health/hepatitis/what-differences-hepatitis-a-b',
  'https://www.hepatitisc.uw.edu/go/treatment-infection/treatment-genotype-1/core-concept/all',
  'https://www.hepatitis.va.gov/provider/reviews/genotypes.asp',
  'https://www.hcvguidelines.org/treatment-naive/gt1b/no-cirrhosis',
  'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3628125/',
  'https://academic.microsoft.com/search?iq=And(Ty%3d%270%27%2cRId%3d2049611986)&filters=&from=0&sort=0&q=papers%20citing%20early%20dynamics%20of%20viremia%20in%20patients%20with%20genotype%201b%20chronic%20hepatitis%20c%20peg%20ifnalpha2a%20shows%20earlier%20vir

In [23]:
#s_park = [x for x in all_conds if 'parkin' in x.lower()]
#pd.DataFrame(s_park).to_excel('training_data/p1.xls')

In [25]:
df_in = pd.read_excel('training_data/p1.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Parkinson's Disease,1a
Parkinson Disease,1a
Idiopathic Parkinson's Disease,1b
Parkinson's Disease (PD),1a
Parkinson,1a


In [26]:
master_list.append(grp2train(df_in))
len(master_list)

1

### Breast Cancer

In [123]:
s_bc = [x for x in all_conds if 'breast' in x.lower()]
#pd.DataFrame(s_bc).to_excel('training_data/p2.xls')

In [27]:
df_in = pd.read_excel('training_data/p2.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Breast Cancer,1a
Breast Neoplasms,1a
Metastatic Breast Cancer,1b
Stage IV Breast Cancer,1c
Triple Negative Breast Cancer,1d


In [28]:
master_list.append(grp2train(df_in))
len(master_list)

2

# HIV

In [132]:
s_hiv = [x for x in all_conds if 'hiv' in x.lower()]
#pd.DataFrame(s_hiv).to_excel('training_data/p3.xls')

In [29]:
df_in = pd.read_excel('training_data/p3.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
- HIV,1a
Acute HIV Infection,1b
AIDS/HIV PROBLEM,1a
Aids/Hiv Problem,1a
Antiretroviral Therapy in HIV-1 Infected Children,2a


In [136]:
master_list.append(grp2train(df_in))
len(master_list)

3

# Colon

In [138]:
s_colon = [x for x in all_conds if 'colon' in x.lower()]
#pd.DataFrame(s_colon).to_excel('training_data/p4.xls')

In [30]:
df_in = pd.read_excel('training_data/p4.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Cancer Colon,1a
Cancer of Colon,1a
Cancer of the Colon,1a
Colon Cancer,1a
Colon Cancer Prevention,1a


In [31]:
master_list.append(grp2train(df_in))
len(master_list)

3

# Genotype

In [143]:
s_genotype = [x for x in all_conds if 'genotype' in x.lower()]
#pd.DataFrame(s_genotype).to_excel('training_data/p5.xls')

In [32]:
df_in = pd.read_excel('training_data/p5.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Carboxylesterase 1 (CES1) Genotype,1a
Chronic Genotype 1 Hepatitis C Virus Infection,2a
Chronic Hepatics C Virus (HCV) Genotype 1,2a
Chronic Hepatitis C Genotype 1,2a
Chronic Hepatitis C Genotype 1B,2b


In [33]:
master_list.append(grp2train(df_in))
len(master_list)

4

# Grade

In [147]:
s_grade = [x for x in all_conds if 'grade' in x.lower()]
#pd.DataFrame(s_grade).to_excel('training_data/p6.xls')

In [34]:
df_in = pd.read_excel('training_data/p6.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
"Lymphoma, Low-Grade",1a
"Lymphoma, Intermediate-Grade",1b
"Lymphoma, High-Grade",1c
Low Grade Lymphoma,1a
Low-Grade Lymphoma,1a


In [35]:
master_list.append(grp2train(df_in))
len(master_list)

5

# Cardio

In [36]:
s_card = [x for x in all_conds if 'cardio' in x.lower()]
#pd.DataFrame(s_card).to_excel('training_data/p7.xls')

In [38]:
df_in = pd.read_excel('training_data/p7.xls', index_col='cond')
df_in.head()

Unnamed: 0_level_0,grp
cond,Unnamed: 1_level_1
Cardiomyopathy,1a
"Cardiomyopathy, Dilated",1b
Dilated Cardiomyopathy,1b
Ischemic Cardiomyopathy,1c
Cardiomyopathy Ischemic,1c


In [39]:
master_list.append(grp2train(df_in))
len(master_list)

6

In [41]:
df_final = pd.concat(master_list)
df_final.shape

(60448, 3)

In [161]:
### random entries

In [42]:
100000 - 60448

39552

In [43]:
filler_list = []
for i in tqdm(range(40000)):
    a = np.random.choice(all_conds)
    b = np.random.choice(all_conds)
    if fuzz.ratio(a, b) < 50:
        filler_list.append({
            'cond1': a,
            'cond2': b,
            'label': NOTSAME
        })

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))




In [44]:
df_filler = pd.DataFrame(filler_list)
df_filler.to_excel('training_data/filler.xls')

In [45]:
df_filler.shape

(39694, 3)

In [46]:
master_list.append(df_filler)

# final blend

In [47]:
df_final = pd.concat(master_list)
df_final.shape

(100142, 3)

In [48]:
df_final

Unnamed: 0,cond1,cond2,label
0,Parkinson's Disease,Parkinson Disease,same
1,Parkinson's Disease,Idiopathic Parkinson's Disease,same_diff_qual
2,Parkinson's Disease,Parkinson's Disease (PD),same
3,Parkinson's Disease,Parkinson,same
4,Parkinson's Disease,Idiopathic Parkinson Disease,same_diff_qual
...,...,...,...
39689,Adult Intensive Care Patients,Tangier Disease,notsame
39690,Stage IIIA Non-Small Cell Lung Cancer,NON-SMALL CELL LUNG CANCER,notsame
39691,Hidradenitis Suppurativa,Estrogen Receptor Breast Cancer,notsame
39692,Gender Relations,Complicated Intra-abdominal Infections,notsame


# Featurizing

In [49]:
def featurize(row):
    return ssim.featurize_cond_full(row['cond1'], row['cond2'], bd)

In [50]:
feature_res = df_final.progress_apply(featurize, axis=1)

100%|██████████| 100142/100142 [05:17<00:00, 315.58it/s]


In [51]:
all_s = feature_res.progress_apply(pd.Series)

100%|██████████| 100142/100142 [00:25<00:00, 3975.31it/s]


In [52]:
df_final[[x for x in all_s.columns]] = all_s

In [54]:
df_final.head()

Unnamed: 0,cond1,cond2,label,full_fuzzy_ratio,noun_fuzzy_ratio,bing_bagoword_dist,bing_link_sim,same_wiki,stage_dist,adj_dist,vb_dist
0,Parkinson's Disease,Parkinson Disease,same,94.0,100.0,14.0,0.25,1.0,0.0,0.0,0.0
1,Parkinson's Disease,Idiopathic Parkinson's Disease,same_diff_qual,78.0,100.0,12.0,0.08,1.0,0.0,1.0,0.0
2,Parkinson's Disease,Parkinson's Disease (PD),same,88.0,92.0,10.0,0.380952,1.0,0.0,0.0,0.0
3,Parkinson's Disease,Parkinson,same,64.0,69.0,2.0,0.142857,1.0,0.0,0.0,0.0
4,Parkinson's Disease,Idiopathic Parkinson Disease,same_diff_qual,72.0,100.0,18.0,0.08,1.0,0.0,1.0,0.0


In [55]:
df_final.to_pickle('training_data/all.p')

## manual edits

In [4]:
df = pd.read_pickle('training_data/all.p')

In [7]:
df.to_excel('training_data/all.xlsx')

In [9]:
df = pd.read_excel('training_data/all.xlsx')

In [13]:
df.drop('Unnamed: 0', axis=1).to_pickle('training_data/all.p')