In [28]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
from tqdm import tqdm
import datetime as dt
import pickle

import pdaactconn as pc
from trialexplorer.mesh_terms import MeSHCatalog
from trialexplorer import AACTStudySet
from trialexplorer import studysimilarity as ssim

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string

import matplotlib.pyplot as plt
%matplotlib inline

# Enable tqdm's pandas integration (adds the `progress_*` methods to pandas objects).
tqdm.pandas()

In [17]:
# English stop words from the NLTK stopwords corpus, held in a set for membership tests.
stop_words = {word for word in stopwords.words('english')}

In [2]:
# Select all interventional studies.
# Open an AACT connection using the LOCAL source setting.
conn = pc.AACTConnection(source=pc.AACTConnection.LOCAL)
# Study set bound to that connection; tqdm_notebook is passed as its progress-bar handler.
ss = AACTStudySet.AACTStudySet(conn= conn, tqdm_handler=tqdm_notebook)
# Constraint on study_type, registered before the studies are loaded.
ss.add_constraint("study_type = 'Interventional'")
# Load the studies (presumably honouring the constraint above -- confirm against trialexplorer).
ss.load_studies()

250890 studies loaded!


In [6]:
# Register both dimension tables on the study set, one call per dimension,
# then refresh once so the data for every registered dimension is loaded.
for dimension in ('conditions', 'browse_conditions'):
    ss.add_dimensions(dimension)
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['conditions']
Failed to add these 0 dimensions: []
Successfuly added these 1 dimensions: ['browse_conditions']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=502), HTML(value='')))

Syncing the temp table temp_cur_studies in 502 chunks x 500 records each

Creating index on the temp table
 - Loading dimension conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index


In [7]:
# Short handles for the two loaded dimensions (conditions first, browse_conditions
# second), followed by a preview of the conditions table.
c, bc = (ss.dimensions[name] for name in ('conditions', 'browse_conditions'))
c.data.head()

Unnamed: 0_level_0,id,name,downcase_name
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,8627542,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia
NCT00000106,8626467,Rheumatic Diseases,rheumatic diseases
NCT00000108,8626464,Cardiovascular Diseases,cardiovascular diseases
NCT00000108,8626465,Coronary Disease,coronary disease
NCT00000110,8626463,Obesity,obesity


In [9]:
# Preview the first rows of the browse_conditions (MeSH term) table.
bc.data.head()

Unnamed: 0_level_0,id,mesh_term,downcase_mesh_term
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,9144384,"Adrenal Hyperplasia, Congenital","adrenal hyperplasia, congenital"
NCT00000102,9144385,Adrenogenital Syndrome,adrenogenital syndrome
NCT00000102,9144386,Adrenocortical Hyperfunction,adrenocortical hyperfunction
NCT00000102,9144387,Hyperplasia,hyperplasia
NCT00000106,9143121,Rheumatic Diseases,rheumatic diseases


# Finding extra stop words in conditions

In [10]:
# Distinct lower-cased condition names and MeSH terms across the loaded studies.
# Missing values are dropped first: a null entry would otherwise survive
# `.unique()` and make the later `' '.join(...)` over these arrays raise a
# TypeError. When the columns contain no nulls the result is unchanged.
all_conds = c.data['downcase_name'].dropna().unique()
all_mesh = bc.data['downcase_mesh_term'].dropna().unique()

In [11]:
# Number of distinct condition names and distinct MeSH terms, in that order.
tuple(len(terms) for terms in (all_conds, all_mesh))

(63257, 3738)

In [19]:
# Collapse each vocabulary into a single space-separated string (MeSH first,
# conditions second) ...
joint_mesh, joint_conds = (' '.join(terms) for terms in (all_mesh, all_conds))

# ... and tokenize both strings with NLTK's word tokenizer, in the same order.
raw_mesh_tokens, raw_cond_tokens = map(nltk.word_tokenize, (joint_mesh, joint_conds))

In [25]:
def strip_tokens(tokens, stop_words):
    """Clean a sequence of tokens for frequency counting.

    Every character listed in ``string.punctuation`` is deleted from each
    token. Tokens left empty by that step are discarded, as is any cleaned
    token that is found in ``stop_words``.

    Args:
        tokens: iterable of string tokens.
        stop_words: container of words to exclude; membership is tested
            against the token *after* punctuation removal.

    Returns:
        list of the surviving cleaned tokens, in their original order.
    """
    # Mapping every punctuation character to None makes translate() delete it.
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    kept = []
    for token in tokens:
        cleaned = token.translate(punctuation_remover)
        if cleaned == '' or cleaned in stop_words:
            continue
        kept.append(cleaned)
    return kept

In [35]:
# Clean each raw token list and count token frequencies. The MeSH and the
# condition vocabularies are processed independently of each other.
stripped_mesh_tokens = strip_tokens(raw_mesh_tokens, stop_words)
fdist_mesh = FreqDist(stripped_mesh_tokens)

stripped_cond_tokens = strip_tokens(raw_cond_tokens, stop_words)
fdist_cond = FreqDist(stripped_cond_tokens)

In [44]:
# Keys of the MeSH frequency distribution, i.e. the distinct cleaned MeSH tokens.
mesh_vocab = fdist_mesh.keys()
# Size of that vocabulary.
len(mesh_vocab)

3221

In [46]:
# Token -> count pairs of the condition distribution, copied into a plain dict.
cond_dict = {token: count for token, count in fdist_cond.items()}

In [48]:
# Condition tokens that do not occur anywhere in the MeSH vocabulary.
cond_extras = [token for token in cond_dict if token not in mesh_vocab]

In [51]:
# Frequency distribution restricted to the condition-only tokens; each token
# keeps the count it has in the full condition distribution.
extras_dict = FreqDist({token: fdist_cond[token] for token in cond_extras})

In [53]:
# Show the condition-only tokens with their counts. most_common() is called
# without a limit, so every entry is listed (highest counts first in the output).
extras_dict.most_common()

[('patients', 1043),
 ('ajcc', 944),
 ('metastatic', 811),
 ('advanced', 732),
 ('v7', 474),
 ('healthy', 467),
 ('v8', 463),
 ('therapy', 416),
 ('solid', 377),
 ('health', 366),
 ('treatment', 356),
 ('care', 355),
 ('risk', 343),
 ('grade', 323),
 ('prostate', 307),
 ('post', 287),
 ('due', 267),
 ('transplant', 260),
 ('children', 260),
 ('nonsmall', 247),
 ('mutation', 243),
 ('non', 233),
 ('moderate', 231),
 ('gene', 229),
 ('patient', 222),
 ('associated', 216),
 ('effect', 208),
 ('locally', 205),
 ('cavity', 202),
 ('study', 201),
 ('transplantation', 199),
 ('relapsed', 197),
 ('subjects', 191),
 ('prevention', 190),
 ('complication', 182),
 ('bleeding', 177),
 ('mild', 176),
 ('total', 159),
 ('unresectable', 155),
 ('women', 154),
 ('lumbar', 153),
 ('volunteers', 147),
 ('medical', 147),
 ('early', 147),
 ('iiib', 146),
 ('lesion', 143),
 ('iva', 141),
 ('malignancies', 137),
 ('cancers', 137),
 ('pharmacokinetics', 136),
 ('training', 136),
 ('ivb', 131),
 ('junction', 1