In [21]:
import pandas as pd
import numpy as numpy
from fuzzywuzzy import fuzz
from tqdm import tqdm_notebook as tqdm

import pdaactconn as pc
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
conn = pc.AACTConnection(source=pc.AACTConnection.LOCAL)
ss = AACTStudySet.AACTStudySet(conn=conn, tqdm_handler=tqdm)

# Understanding the Distribution of Keywords

In [50]:
df = conn.query("SELECT * FROM keywords")
df.head(1)

Unnamed: 0,id,nct_id,name,downcase_name
0,13772176,NCT02140372,colchicine,colchicine


In [15]:
len(df['downcase_name'].unique())

202888

In [56]:
s_counts = df.groupby('downcase_name')['name'].count()
s_counts = s_counts.sort_values(ascending=False)

In [62]:
freq_thres = 100
df_freq = s_counts[s_counts > freq_thres]
df_freq.shape

(1136,)

In [61]:
for kw in df_freq.index:
    print(df_freq[kw], ' - ', kw)

2626  -  quality of life
2521  -  pharmacokinetics
2502  -  obesity
2377  -  pain
2357  -  breast cancer
2289  -  safety
2250  -  cancer
2213  -  exercise
2177  -  depression
1929  -  children
1849  -  hiv
1818  -  diabetes
1626  -  stroke
1401  -  physical activity
1346  -  hypertension
1321  -  inflammation
1291  -  asthma
1284  -  schizophrenia
1271  -  rehabilitation
1244  -  prevention
1208  -  pregnancy
1191  -  treatment
1180  -  copd
1166  -  heart failure
1151  -  prostate cancer
1117  -  chemotherapy
1031  -  surgery
1026  -  randomized controlled trial
1015  -  anxiety
1013  -  pediatric
978  -  vaccine
974  -  immunotherapy
920  -  efficacy
903  -  mri
883  -  elderly
865  -  lung cancer
836  -  type 2 diabetes
832  -  cognition
828  -  healthy volunteers
819  -  coronary artery disease
812  -  diabetes mellitus
792  -  atrial fibrillation
791  -  osteoarthritis
743  -  colorectal cancer
739  -  metastatic
736  -  multiple sclerosis
731  -  radiotherapy
726  -  insulin resi

### Notes: 

need to classify the diseases into various categories of "what is being researched"

# Understanding the Distribution of Meshterms

These terms are on 2 tables:
- browse_interventions
- browse_conditions

### browse_interventions

In [64]:
df_bi = conn.query('SELECT * FROM browse_interventions')
df_bi.shape

(318367, 4)

In [128]:
s_bi_mt = df_bi.groupby('mesh_term')['mesh_term'].count().sort_values(ascending=False)

In [129]:
freq_thres_bi = 100
top_bi = s_bi_mt[s_bi_mt > freq_thres_bi]
top_bi.shape

(637,)

In [130]:
for kw in top_bi.index:
    print(top_bi[kw], ' - ', kw)

6284  -  Vaccines
3800  -  Antibodies, Monoclonal
3270  -  Insulin
3198  -  Anesthetics
2998  -  Paclitaxel
2923  -  Cyclophosphamide
2744  -  Albumin-Bound Paclitaxel
2554  -  Vitamins
2553  -  Cisplatin
2499  -  Antibodies
2438  -  Pharmaceutical Solutions
2301  -  Dexamethasone
2193  -  Insulin, Globin Zinc
2188  -  Carboplatin
2146  -  Gemcitabine
2075  -  Bevacizumab
2013  -  BB 1101
2013  -  Dexamethasone acetate
1932  -  Docetaxel
1869  -  Sirolimus
1823  -  Metformin
1766  -  Everolimus
1762  -  Rituximab
1759  -  Doxorubicin
1702  -  Hormones
1664  -  Vitamin D
1655  -  Immunoglobulins
1645  -  Anti-Bacterial Agents
1508  -  Liposomal doxorubicin
1507  -  Ergocalciferols
1504  -  Oxaliplatin
1434  -  Methotrexate
1432  -  Fluorouracil
1414  -  Capecitabine
1329  -  Bupivacaine
1305  -  Prednisone
1270  -  Lidocaine
1254  -  Interferons
1234  -  Etoposide
1175  -  Ethanol
1168  -  Fludarabine
1137  -  Calcium
1114  -  Irinotecan
1092  -  Cytarabine
1068  -  Propofol
1057  -  Ta

135  -  Ziprasidone
135  -  Nintedanib
135  -  Fibrin Tissue Adhesive
135  -  Haloperidol
134  -  Voriconazole
134  -  Hypnotics and Sedatives
134  -  Digoxin
134  -  Selenium
134  -  Neostigmine
134  -  Nelfinavir
133  -  Resveratrol
133  -  Daratumumab
133  -  Bimatoprost
132  -  Lubricant Eye Drops
132  -  Chloroquine diphosphate
131  -  Amoxicillin-Potassium Clavulanate Combination
131  -  Ceftriaxone
131  -  Atropine
130  -  Trimethoprim
130  -  Medroxyprogesterone Acetate
129  -  Travoprost
129  -  Aluminum Hydroxide
128  -  Epidiolex
128  -  Povidone
128  -  Natriuretic Peptide, Brain
127  -  Linezolid
126  -  Nifedipine
126  -  Colchicine
124  -  Pseudoephedrine
124  -  Liposomal amphotericin B
124  -  Vemurafenib
124  -  Arsenic Trioxide
124  -  Amphotericin B
124  -  Sulfamethoxazole
123  -  Glycerol
123  -  Medroxyprogesterone
123  -  Saxagliptin
123  -  Axitinib
123  -  Solifenacin Succinate
123  -  Adefovir
122  -  Adefovir dipivoxil
122  -  Benzoyl Peroxide
122  -  Indome

### Notes:

These are the kind of drugs used as interventions

# browse_conditions

In [75]:
df_bc = conn.query('SELECT * FROM browse_conditions')
df_bc.shape

(543355, 4)

In [125]:
s_bc_mt = df_bc.groupby('mesh_term')['mesh_term'].count().sort_values(ascending=False)

In [126]:
freq_thres_bc = 100
top_bc = s_bc_mt[s_bc_mt > freq_thres_bc]
top_bc.shape

(815,)

In [127]:
for kw in top_bc.index:
    print(top_bc[kw], ' - ', kw)

7419  -  Diabetes Mellitus
7307  -  Breast Neoplasms
6724  -  Syndrome
5644  -  Diabetes Mellitus, Type 2
5237  -  Disease
5214  -  Neoplasms
5028  -  Carcinoma
4737  -  Lung Neoplasms
4730  -  Leukemia
4553  -  Hypertension
4468  -  Lymphoma
4449  -  Depression
4388  -  Infection
3997  -  Prostatic Neoplasms
3726  -  Coronary Artery Disease
3601  -  Depressive Disorder
3573  -  Carcinoma, Non-Small-Cell Lung
3354  -  HIV Infections
3313  -  Heart Failure
3307  -  Stroke
3307  -  Osteoarthritis
3067  -  Wounds and Injuries
3014  -  Cardiovascular Diseases
2992  -  Colorectal Neoplasms
2959  -  Hepatitis
2933  -  Communicable Diseases
2797  -  Arthritis
2759  -  Kidney Diseases
2752  -  Asthma
2724  -  Heart Diseases
2621  -  Pulmonary Disease, Chronic Obstructive
2532  -  Schizophrenia
2504  -  Lung Diseases
2413  -  Obesity
2406  -  Myocardial Ischemia
2371  -  Multiple Myeloma
2318  -  Sclerosis
2317  -  Parkinson Disease
2296  -  Fibrosis
2271  -  Hepatitis A
2193  -  Diabetes Melli

### Let's classify the top diseases ...

In [80]:
# ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/xmlmesh/desc2020.xml

import xml.etree.ElementTree as ET
tree = ET.parse('raw_data/desc2020.xml')
root = tree.getroot()

In [97]:
len(root.getchildren())

29640

In [136]:
all_text = []
for c in root.getchildren():
    cur_text = c.find('DescriptorName').find('String').text
    all_text.append(cur_text)

In [174]:
num_search = 815
found = []
not_found = []

for i in range(0, num_search):
    cur_find = top_bc.index[i]
    if cur_find in all_text:
        found.append(cur_find)
    else:
        not_found.append(cur_find)
        
print('%.1f%% found! in top %s' % (len(found) / num_search * 100, num_search))

99.9% found! in top 815


In [175]:
not_found

['Infection']

In [177]:
'Infections' in all_text

True

In [182]:
for cur_str in root.iter('String'):
    if cur_str.text == 'Infection':
        print(cur_str.text)
        break

Infection


In [185]:
for c in root.getchildren():
    cur_text = c.find('DescriptorName').find('String').text
    if cur_text == 'Infections':
        break

In [187]:
c.getchildren()

[<Element 'DescriptorUI' at 0x7fabef92fea8>,
 <Element 'DescriptorName' at 0x7fabef92fef8>,
 <Element 'DateCreated' at 0x7fabef92ff98>,
 <Element 'DateRevised' at 0x7fabef934138>,
 <Element 'DateEstablished' at 0x7fabef934278>,
 <Element 'AllowableQualifiersList' at 0x7fabef9343b8>,
 <Element 'Annotation' at 0x7fabef8c5728>,
 <Element 'HistoryNote' at 0x7fabef8c5778>,
 <Element 'OnlineNote' at 0x7fabef8c5818>,
 <Element 'PublicMeSHNote' at 0x7fabef8c5868>,
 <Element 'EntryCombinationList' at 0x7fabef8c5908>,
 <Element 'SeeRelatedList' at 0x7fabef8c5e08>,
 <Element 'TreeNumberList' at 0x7fabef8c9048>,
 <Element 'ConceptList' at 0x7fabef8c90e8>]

In [194]:
c.find('TreeNumberList').find('TreeNumber').text

'C01'

In [202]:
if_terms = c.find('ConceptList').find('Concept').find('TermList').getchildren()

In [204]:
for term in if_terms:
    print(term.find('String').text)

Infections
Infection and Infestation
Infestation and Infection
Infections and Infestations
Infestations and Infections
Infection


### let's see the terms and their mapped concept lists

In [209]:
for term in root[0].find('ConceptList').find('Concept').find('TermList').getchildren():
    print(term.find('String').text)

Calcimycin


In [211]:
term_dict = {}
for c in root.getchildren():
    cur_text = c.find('DescriptorName').find('String').text
    all_terms = c.find('ConceptList').find('Concept').find('TermList').getchildren()
    cur_terms = []
    for term in all_terms:
        cur_terms.append(term.find('String').text)
    term_dict[cur_text] = cur_terms

In [212]:
term_dict

{'Calcimycin': ['Calcimycin'],
 'Temefos': ['Temefos', 'Temephos'],
 'Abattoirs': ['Abattoirs',
  'Abattoir',
  'Slaughter Houses',
  'House, Slaughter',
  'Houses, Slaughter',
  'Slaughter House',
  'Slaughterhouses',
  'Slaughterhouse'],
 'Abbreviations as Topic': ['Abbreviations as Topic'],
 'Abdomen': ['Abdomen', 'Abdomens'],
 'Abdomen, Acute': ['Abdomen, Acute',
  'Abdomens, Acute',
  'Acute Abdomen',
  'Acute Abdomens'],
 'Abdominal Injuries': ['Abdominal Injuries',
  'Injuries, Abdominal',
  'Abdominal Injury',
  'Injury, Abdominal'],
 'Abdominal Neoplasms': ['Abdominal Neoplasms',
  'Abdominal Neoplasm',
  'Neoplasm, Abdominal',
  'Neoplasms, Abdominal'],
 'Abdominal Muscles': ['Abdominal Muscles',
  'Abdominal Muscle',
  'Muscle, Abdominal',
  'Muscles, Abdominal'],
 'Abducens Nerve': ['Abducens Nerve',
  'Nerve, Abducens',
  'Sixth Cranial Nerve',
  'Cranial Nerve, Sixth',
  'Nerve, Sixth Cranial',
  'Nerves, Sixth Cranial',
  'Sixth Cranial Nerves',
  'Nerve VI',
  'Nerve VI