In [1]:
import numpy as np
import pandas as pd
from importlib import reload

from bs4 import BeautifulSoup
import urllib
import requests

from tqdm import tqdm_notebook
from tqdm import tqdm
import datetime as dt

import pdaactconn as pc
from trialexplorer import AACTStudySet
from trialexplorer import bing
import pickle
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

tqdm.pandas()

### Example

In [None]:
# Example lookup: query the project-local `bing.bing` helper with a free-text
# condition name. The call returns three values, unpacked here as the
# doc / links / wiki features.
# NOTE(review): the exact contents of each value are defined in trialexplorer.bing — confirm there.
feat_doc, feat_links, feat_wiki = bing.bing('non small cell lung cancer')

In [None]:
# Same lookup with an alternative, comma-inverted wording of the same condition;
# the results are kept under separate names so the two queries can be compared below.
feat_doc2, feat_links2, feat_wiki2 = bing.bing("carcinoma, non-small-cell lung")

In [None]:
# Print the first return value of each example query under its own banner.
# sep="\n" reproduces the newline that separate print() calls would emit
# between the banner and the value.
print("---STRING 1 ---\n", feat_doc, sep="\n")
print("\n---STRING 2 ---\n", feat_doc2, sep="\n")

In [None]:
# Print the second return value (links) of each example query under its own banner.
# sep="\n" reproduces the newline that separate print() calls would emit
# between the banner and the value.
print("---LINK 1 ---\n", feat_links, sep="\n")
print("\n---LINK 2 ---\n", feat_links2, sep="\n")

In [None]:
# Display the third return value (wiki) of both example queries together as a tuple.
feat_wiki, feat_wiki2

# What are all of the terms that we have to classify?

In [2]:
# selecting all interventional studies
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn= conn, tqdm_handler=tqdm_notebook)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

255092 studies loaded!


In [3]:
# loading all dimensional data
ss.add_dimensions('browse_conditions')
ss.add_dimensions('conditions')
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['browse_conditions']
Failed to add these 0 dimensions: []
Successfuly added these 1 dimensions: ['conditions']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=511), HTML(value='')))

Syncing the temp table temp_cur_studies in 511 chunks x 500 records each

Creating index on the temp table
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index


In [4]:
# Short aliases for the two loaded dimensions.
# `c` (conditions) drives everything below; `bc` (browse_conditions) is not
# referenced again in the cells visible here.
bc = ss.dimensions['browse_conditions']
c = ss.dimensions['conditions']

In [5]:
# Number of distinct condition names; dropna=False keeps a missing value
# counted as its own entry, matching len(<series>.unique()).
c.data['name'].nunique(dropna=False)

65144

## Features:

How can we determine whether 2 studies are studying the same condition?
- condition names (lev distance)
- condition bing results (bag of words)
- mesh terms (jaccard distance)
- mesh tree location (tree distance)
- adjective descriptors (such as "chronic")
- type, grade, stage, AJCC (type1, type2) etc.

    

In [6]:
# Save the raw conditions data to a pickle file under raw_data/.
# NOTE: to_pickle does not create missing directories, so raw_data/ must already exist.
c.data.to_pickle('raw_data/all_conditions.p')

### Bing the top 100 most frequent conditions:

In [7]:
# Size of the first Bing batch: how many of the most frequent condition names
# are taken from the top of the frequency-sorted list.
NUM_BINGS = 100

In [8]:
# Row count per condition name, most frequent first.
cond_counts = (
    c.data
    .groupby('name')
    .size()
    .sort_values(ascending=False)
)
# First batch: the NUM_BINGS most frequent condition names, as a plain list.
to_bing = cond_counts.index[:NUM_BINGS].tolist()

In [None]:
# Start a fresh result store and fetch the Bing features for every term in the
# current batch. Results are keyed by condition name.
bing_res = {}
for cur_term in to_bing:
    feat_doc, feat_links, feat_wiki = bing.bing(cur_term)
    bing_res[cur_term] = dict(doc=feat_doc, links=feat_links, wiki=feat_wiki)
    # Pause between requests (the original author's reminder: don't forget to wait).
    bing.do_wait()

### next 900

In [17]:
# Second batch: everything after the first NUM_BINGS terms, up to rank 1000.
# Starting at NUM_BINGS (rather than a hard-coded 100) keeps this slice
# contiguous with the first batch if the constant is ever changed, so no term
# is skipped or fetched twice.
to_bing = list(cond_counts[NUM_BINGS:1000].index)

In [19]:
# Fetch Bing features for the current batch and add them to bing_res.
# Terms already present in bing_res are skipped, so re-running this cell after
# an interruption resumes where it stopped instead of re-querying every term.
for cur_term in tqdm(to_bing):
    if cur_term in bing_res:
        continue
    feat_doc, feat_links, feat_wiki = bing.bing(cur_term, do_print=False)
    bing_res[cur_term] = {
        'doc': feat_doc,
        'links': feat_links,
        'wiki': feat_wiki
    }
    bing.do_wait(multiple=0.5)  # don't forget to wait !!

100%|██████████| 900/900 [32:13<00:00,  2.15s/it]


In [21]:
# Checkpoint everything fetched so far.
# The context manager closes the file handle as soon as the dump finishes, so
# the data is flushed to disk and no handle is left open in the kernel.
with open('raw_data/top1000.p', 'wb') as out_file:
    pickle.dump(bing_res, out_file)

### Top 10000

In [22]:
# Third batch: ranks 1000 through 9999 of the frequency-sorted condition names.
to_bing = cond_counts.index[1000:10000].tolist()

In [23]:
# Fetch Bing features for the current batch and add them to bing_res.
# This batch takes hours to run; terms already present in bing_res are skipped,
# so re-running the cell after an interruption resumes instead of starting over.
for cur_term in tqdm(to_bing):
    if cur_term in bing_res:
        continue
    feat_doc, feat_links, feat_wiki = bing.bing(cur_term, do_print=False)
    bing_res[cur_term] = {
        'doc': feat_doc,
        'links': feat_links,
        'wiki': feat_wiki
    }
    bing.do_wait(multiple=0.5)  # don't forget to wait !!

100%|██████████| 9000/9000 [5:31:04<00:00,  2.21s/it]  


In [24]:
# Checkpoint everything fetched so far.
# The context manager closes the file handle as soon as the dump finishes, so
# the data is flushed to disk and no handle is left open in the kernel.
with open('raw_data/top10000.p', 'wb') as out_file:
    pickle.dump(bing_res, out_file)

### Everything else:

In [26]:
# Final batch: every condition name from rank 10000 onward.
to_bing = cond_counts.index[10000:].tolist()

In [None]:
# Fetch Bing features for the remaining terms and add them to bing_res.
# This is the longest batch (tens of hours). Terms already present in bing_res
# are skipped, so if the run is interrupted, re-running this cell continues
# from the first unfetched term instead of re-querying everything.
for cur_term in tqdm(to_bing):
    if cur_term in bing_res:
        continue
    feat_doc, feat_links, feat_wiki = bing.bing(cur_term, do_print=False)
    bing_res[cur_term] = {
        'doc': feat_doc,
        'links': feat_links,
        'wiki': feat_wiki
    }
    bing.do_wait(multiple=0.5)  # don't forget to wait !!

 18%|█▊        | 9946/55144 [6:17:30<26:20:16,  2.10s/it]

In [None]:
# Save the complete set of Bing results.
# The context manager closes the file handle as soon as the dump finishes, so
# the data is flushed to disk and no handle is left open in the kernel.
with open('raw_data/all_bing.p', 'wb') as out_file:
    pickle.dump(bing_res, out_file)