In [1]:
import sys

# Put the parent directory on the import path, presumably so the project
# packages imported further down can be found when the notebook is run from
# its own folder. The guard keeps repeated runs of this cell from appending
# the same entry again.
if sys.path.count("../") == 0:
    sys.path.append("../")

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from tqdm import tqdm
tqdm.pandas()

import pdaactconn as pc
from trialexplorer.mesh_terms import MeSHCatalog
from trialexplorer import AACTStudySet

import matplotlib.pyplot as plt
%matplotlib inline

### Initialization

The MeSH catalog from NIH is saved in the ./xml folder. When initialized for the first time, if the XML is not found locally, the utility will download it from the internet and save it locally.

In [3]:
# Initialize the MeSH catalog object; as the log output below shows, this
# parses the descriptor XML (xml/desc2020.xml in this run).
mc = MeSHCatalog()  

Parsing MeSH xml: xml/desc2020.xml ...
Parse Complete! (parsed ElementTree root can be found in the .root attribute)


### Using the main tool, load some MeSH terms to demonstrate the MeSH Utility

In [5]:
# Open a connection to the remote AACT source and build a study set that is
# restricted to interventional studies before loading it.
conn = pc.AACTConnection(source=pc.AACTConnection.REMOTE)
ss = AACTStudySet.AACTStudySet(conn=conn, tqdm_handler=tqdm_notebook)
ss.add_constraint("study_type = 'Interventional'")
ss.load_studies()

# Register the dimensions used in this notebook one at a time (same order as
# before), then fetch the data for all registered dimensions in one refresh.
for dim_name in ('browse_conditions', 'sponsors'):
    ss.add_dimensions(dim_name)
ss.refresh_dim_data()

266207 studies loaded!
Successfuly added these 1 dimensions: ['browse_conditions']
Failed to add these 0 dimensions: []
Successfuly added these 1 dimensions: ['sponsors']
Failed to add these 0 dimensions: []


HBox(children=(IntProgress(value=0, max=533), HTML(value='')))

Syncing the temp table temp_cur_studies in 533 chunks x 500 records each

Creating index on the temp table
 - Loading dimension browse_conditions
 -- Loading raw data
 -- Sorting index
 - Loading dimension sponsors
 -- Loading raw data
 -- Sorting index


### The dimension browse_conditions stores MeSH terms about diseases/conditions that are associated with each study:

The mesh_term field contains mesh terms at any level

In [6]:
# Loaded dimensions are looked up by name on ss.dimensions; the rows are held
# in the .data attribute (indexed by nct_id, judging by the output below).
bc = ss.dimensions['browse_conditions']
# Preview only the first rows rather than dumping the whole table.
bc.data.head()

Unnamed: 0_level_0,id,mesh_term,downcase_mesh_term
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,5750378,"Adrenal Hyperplasia, Congenital","adrenal hyperplasia, congenital"
NCT00000102,5750379,Adrenogenital Syndrome,adrenogenital syndrome
NCT00000102,5750380,Adrenocortical Hyperfunction,adrenocortical hyperfunction
NCT00000102,5750381,Hyperplasia,hyperplasia
NCT00000106,5749144,Rheumatic Diseases,rheumatic diseases


Let's use the first mesh term as an example:

In [11]:
# Example MeSH term reused by the lookup cells that follow.
cur_mesh = "Adrenal Hyperplasia, Congenital"

For any single MeSH term, we can look at the levels at which it appears in the tree structure

In [12]:
# Tree levels at which cur_mesh occurs; more than one value comes back when
# the term sits on several branches of different depth.
mc.get_levels(cur_mesh)

[3, 5, 6]

MeSH is organized by an index structure as specified by NIH, with each . delimiting a node transition.

This method returns all of the tree indices that lead to the given MeSH term.

Note that often each mesh term appears multiple times on the tree

In [14]:
# Every dot-delimited tree index under which cur_mesh is filed.
mc.get_trees(cur_mesh)

['C12.706.316.090.500',
 'C13.351.875.253.090.500',
 'C16.131.939.316.129.500',
 'C16.320.033',
 'C16.320.565.925.249',
 'C18.452.648.925.249',
 'C19.053.440',
 'C19.391.119.090.500']

With trees, the level of a node is defined as the number of steps it takes to reach the node from the root of the tree. 

For any particular MeSH term, we can look at what is above it in the tree:

In [21]:
# Terms found at level 1 above cur_mesh, i.e. the broadest categories on the
# branches that lead to it (see the output below).
mc.lookup_higher_level(cur_mesh, 1)

['Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'Endocrine System Diseases',
 'Female Urogenital Diseases and Pregnancy Complications',
 'Male Urogenital Diseases',
 'Nutritional and Metabolic Diseases']

In [23]:
# Terms found at level 2 on the branches that lead to cur_mesh.
mc.lookup_higher_level(cur_mesh, 2)

['Adrenal Gland Diseases',
 'Congenital Abnormalities',
 'Female Urogenital Diseases',
 'Genetic Diseases, Inborn',
 'Gonadal Disorders',
 'Metabolic Diseases',
 'Urogenital Abnormalities']

In [24]:
# Terms found at level 3 on the branches that lead to cur_mesh. The term
# itself shows up in the output, presumably because one of its tree indices
# (C16.320.033) already places it at level 3.
mc.lookup_higher_level(cur_mesh, 3)

['Adrenal Hyperplasia, Congenital',
 'Disorders of Sex Development',
 'Metabolism, Inborn Errors',
 'Urogenital Abnormalities']

For any 2 MeSH terms, we can measure their closest distance on the tree structure

In [28]:
# Compare the example term with a term that shares a word with it and ask the
# catalog for the shortest distance between the two on the tree.
m1, m2 = 'Adrenal Hyperplasia, Congenital', 'Hyperplasia'
mc.shortest_mesh_dist(m1, m2)

6

In [29]:
# Same distance measurement, this time against another condition taken from
# the same study's rows in the browse_conditions preview.
m1, m2 = 'Adrenal Hyperplasia, Congenital', 'Adrenogenital Syndrome'
mc.shortest_mesh_dist(m1, m2)

1