In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from tqdm import tqdm
tqdm.pandas()

import pdaactconn as pc
from trialexplorer.mesh_terms import MeSHCatalog
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Initialize the MeSH catalog object.
# Per the cell output below, on this run no local copy of xml/desc2020.xml was found,
# so the file was fetched from the NLM FTP server, written locally, and then parsed.
mc = MeSHCatalog()  

No Local XML detected at xml/desc2020.xml, Fetching file from FTP server for the first time ...
Remote Server: nlmpubs.nlm.nih.gov
Remote Dir: online/mesh/MESH_FILES/xmlmesh
local file written to xml/desc2020.xml
Parsing MeSH xml: xml/desc2020.xml ...
Parse Complete! (parsed ElementTree root can be found in the .root attribute)


In [59]:
# Select all interventional studies.
# NOTE(review): LOCAL presumably points at a locally hosted copy of the AACT database -- confirm.
conn = pc.AACTConnection(source=pc.AACTConnection.LOCAL)
# tqdm_notebook is handed to the study set as its progress-bar handler
ss = AACTStudySet.AACTStudySet(conn= conn, tqdm_handler=tqdm_notebook)
# restrict the study set to rows whose study_type is 'Interventional'
ss.add_constraint("study_type = 'Interventional'")
# load the matching studies (250890 were loaded on this run, per the cell output)
ss.load_studies()

250890 studies loaded!


In [61]:
# Load the dimension tables needed for this analysis.
# Register the two dimensions of interest on the study set ...
ss.add_dimensions('browse_conditions')
ss.add_dimensions('sponsors')
# ... then load their data; per the cell output this syncs a temp table of the
# current studies in chunks and loads each registered dimension in turn.
ss.refresh_dim_data()

Successfuly added these 1 dimensions: ['browse_conditions']
Failed to add these 0 dimensions: []
Successfuly added these 1 dimensions: ['sponsors']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=502), HTML(value='')))

Syncing the temp table temp_cur_studies in 502 chunks x 500 records each

Creating index on the temp table
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension sponsors
 -- Loading raw data
 -- Sorting index


In [69]:
# Roll each MeSH term of the browse_conditions dimension up to its level 1 MeSH term(s).
# The result is stored per row in a new 'mesh_l1' column; the cell output shows each
# entry is a list, since one term can map to several level 1 terms.
MESH_LEVEL = 1  # MeSH level at which the studies are classified
bc = ss.dimensions['browse_conditions']

raw_terms = bc.data['mesh_term']
bc.data['mesh_l1'] = raw_terms.progress_apply(
    lambda term: mc.lookup_higher_level(term, MESH_LEVEL)
)
bc.data.head()

100%|██████████| 424545/424545 [00:01<00:00, 340864.26it/s]


Unnamed: 0_level_0,id,mesh_term,downcase_mesh_term,mesh_l1
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NCT00000102,9144384,"Adrenal Hyperplasia, Congenital","adrenal hyperplasia, congenital","[Congenital, Hereditary, and Neonatal Diseases..."
NCT00000102,9144385,Adrenogenital Syndrome,adrenogenital syndrome,"[Congenital, Hereditary, and Neonatal Diseases..."
NCT00000102,9144386,Adrenocortical Hyperfunction,adrenocortical hyperfunction,[Endocrine System Diseases]
NCT00000102,9144387,Hyperplasia,hyperplasia,"[Pathological Conditions, Signs and Symptoms]"
NCT00000106,9143121,Rheumatic Diseases,rheumatic diseases,"[Musculoskeletal Diseases, Skin and Connective..."


### Next, collect all of the level 1 MeSH terms from the dimension into the main studies table

In [71]:
# method for combining all of the level mesh values for a specific nct_id
def combine_all_mesh_l1(df, nct_id):
    """Collect the distinct level 1 MeSH terms recorded for one study.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by nct_id (the index may contain repeated ids) with a
        'mesh_l1' column whose entries are iterables of MeSH terms.
    nct_id : hashable
        Study identifier to look up in ``df.index``.

    Returns
    -------
    list
        Terms in first-seen order with duplicates removed; an empty list when
        ``nct_id`` is not present in the index.
    """
    if nct_id not in df.index:
        return []

    # Indexing with a list keeps the selection row-wise even for a single match,
    # so the result is always a Series of per-row term collections.
    term_lists = df.loc[[nct_id], 'mesh_l1']

    unique_terms = []
    for term in (t for terms in term_lists for t in terms):
        if term in unique_terms:
            continue
        unique_terms.append(term)
    return unique_terms

In [84]:
# For every study (in ss.studies row order), gather its de-duplicated level 1 MeSH terms.
# .values yields a plain object array of lists, one entry per study.
nct_ids = ss.studies.reset_index()['nct_id']
s_mesh_l1 = nct_ids.progress_apply(
    lambda nct_id: combine_all_mesh_l1(bc.data, nct_id)
).values

100%|██████████| 250890/250890 [01:27<00:00, 2882.50it/s]


In [85]:
# Preview the first 10 per-study term lists; an empty list means no level 1 terms
# were collected for that study.
s_mesh_l1[:10]

array([list([]), list(['Pathological Conditions, Signs and Symptoms']),
       list(['Diagnosis', 'Nutritional and Metabolic Diseases', 'Pathological Conditions, Signs and Symptoms', 'Physiological Phenomena']),
       list(['Female Urogenital Diseases and Pregnancy Complications', 'Mental Disorders', 'Behavior and Behavior Mechanisms']),
       list(['Neoplasms', 'Pathological Conditions, Signs and Symptoms', 'Chemically-Induced Disorders']),
       list(['Mental Disorders']), list(['Musculoskeletal Diseases']),
       list(['Female Urogenital Diseases and Pregnancy Complications', 'Neoplasms', 'Infections']),
       list(['Neoplasms']), list(['Respiratory Tract Diseases'])],
      dtype=object)

In [86]:
# Attach the combined level 1 term lists to the main studies table.
# s_mesh_l1 is a plain array built (via .values) from ss.studies' own nct_id order,
# so this assignment relies on row position rather than index alignment.
ss.studies['mesh_l1_list'] = s_mesh_l1

In [89]:
# Sanity check: show the new column for the first few studies.
ss.studies[['mesh_l1_list']].head()

Unnamed: 0_level_0,mesh_l1_list
nct_id,Unnamed: 1_level_1
NCT04072757,[]
NCT04073069,"[Pathological Conditions, Signs and Symptoms]"
NCT04073056,"[Diagnosis, Nutritional and Metabolic Diseases..."
NCT04073043,[Female Urogenital Diseases and Pregnancy Comp...
NCT04073017,"[Neoplasms, Pathological Conditions, Signs and..."
