In [63]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing them from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# ensure plots don't have a transparent background
plt.rcParams['axes.facecolor']='white' 
plt.rcParams['figure.facecolor']='white'

pd.set_option("display.max_rows",100)

# Using pre-trained medical concept embeddings to associate therapeutic areas with clinical studies

### Overview

![Overview](overview.png)

In [4]:
# Dataset locations.
# The root folder can be overridden with the DATASETS_DIR environment variable so
# that the notebook also runs on another machine; when the variable is not set the
# original absolute paths are used unchanged.
import os
datasets_dir = os.environ.get('DATASETS_DIR', '/home/therrmann/dev/datasets').rstrip('/')
ct_gov_dir = datasets_dir + '/clinical-trials-gov'
mesh_dir = datasets_dir + '/mesh'
umls_dir = datasets_dir + '/umls/2018AB-full/2018AB/2018AB/META'
emb_dir = datasets_dir + '/clinicalml-embeddings/embeddings'

## Load Embeddings

For the following to work you need to download the relevant embeddings files from https://github.com/clinicalml/embeddings

In [5]:
from gensim.models.keyedvectors import KeyedVectors
# Text-format word2vec file whose keys are UMLS CUIs (len(w2v.vocab) = 52102)
w2v_path = '%s/DeVine_etal_200.txt' % emb_dir
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=False)

In [9]:
# Disabled on purpose (`if False`): alternative embeddings file from the same
# embeddings directory, kept for reference. Nothing below is executed, so
# `w2v_stanford` is not defined by this cell.
if False:
    w2v_path = emb_dir + '/stanford_cuis_svd_300.txt' # len(w2v.vocab) = 22705
    w2v_stanford = KeyedVectors.load_word2vec_format(w2v_path, binary=False)

In [10]:
# Disabled on purpose (`if False`): another alternative embeddings file from the
# same embeddings directory, kept for reference. Nothing below is executed, so
# `w2v_claims` is not defined by this cell.
if False:
    w2v_path = emb_dir + '/claims_cuis_hs_300.txt' # len(w2v.vocab) = 14852
    w2v_claims = KeyedVectors.load_word2vec_format(w2v_path, binary=False)

In [11]:
vocab_keys = w2v.vocab.keys()
len(vocab_keys)

52102

In [12]:
# Find concepts similar to C0018681 headache

w2v.most_similar('C0018681',  topn=6)

# Returns:

# [('C0012833', 0.782675564289093),     dizziness
#  ('C0220870', 0.7098520994186401),    Lightheadedness
#  ('C0917801', 0.6731902360916138),    Sleeplessness
#  ('C0043352', 0.669024646282196),     Xerostomia
#  ('C0027497', 0.6656966805458069),    Nausea
#  ('C0221512', 0.6654322147369385)]    Stomach ache

[('C0012833', 0.782675564289093),
 ('C0220870', 0.7098520994186401),
 ('C0917801', 0.6731902360916138),
 ('C0043352', 0.669024646282196),
 ('C0027497', 0.6656966805458069),
 ('C0221512', 0.6654322147369385)]

In [13]:
# Find concepts similar to C0019158 hepatitis

w2v.most_similar('C0019158')

# Returns:

# [('C0037140', 0.6207730770111084),  B Virus Infection
#  ('C0019196', 0.5936840772628784),  Hepatitis C
#  ('C0019163', 0.5609232187271118),  Hepatitis B
#  ('C0019169', 0.5462726950645447),  Hepatitis B Virus
#  ('C0019159', 0.5288532376289368),  Hepatitis A
#  ('C0042721', 0.5198029279708862),  Viral hepatitis
#  ('C0524909', 0.5124529004096985),  Hepatitis B, Chronic
#  ('C1443861', 0.4951326549053192),  Post-Exposure Prophylaxis
#  ('C0011226', 0.4950075149536133),  Hepatitis D Infection
#  ('C0019189', 0.4948180317878723)]  Hepatitis, Chronic

[('C0037140', 0.6207730770111084),
 ('C0019196', 0.5936840772628784),
 ('C0019163', 0.5609232187271118),
 ('C0019169', 0.5462726950645447),
 ('C0019159', 0.5288532376289368),
 ('C0042721', 0.5198029279708862),
 ('C0524909', 0.5124529004096985),
 ('C1443861', 0.4951326549053192),
 ('C0011226', 0.4950075149536133),
 ('C0019189', 0.4948180317878723)]

## 1. Get MeSH terms associated to Studies

### DB Schema

![aact_schema.png](aact_schema.png)


### Utility Functions

In [14]:
def df_mem(df):
    """
    Returns the memory footprint of a DataFrame as a printable string.
    Args:
    - df: the DataFrame to measure (index included, object columns measured deeply)
    Returns:
    - a string such as '2.3 Mb' (bytes divided twice by 1024, one decimal)
    """
    total_bytes = df.memory_usage(index=True, deep=True).values.sum()
    megabytes = total_bytes / 1024 / 1024
    return '%.1f Mb' % megabytes

def load_df(file_name, nrows=1000, header='infer', names=None):
    """
    Reads a pipe-separated ('|') text file into a DataFrame.
    Args:
    - file_name: path of the file to read
    - nrows: maximum number of rows to read (None reads the whole file)
    - header: forwarded as is to pd.read_csv
    - names: forwarded as is to pd.read_csv
    Returns:
    - the loaded DataFrame
    """
    read_options = dict(sep='|', nrows=nrows, low_memory=False, header=header, names=names)
    return pd.read_csv(file_name, **read_options)

### Load / Inspect Studies

For the following to work you need to download CTTI files from https://aact.ctti-clinicaltrials.org/download

In [16]:
df_studies = load_df(ct_gov_dir + '/studies.txt', 300000)
if False:
    display(df_studies[:5].transpose())

### Map Studies to MeSH Terms

In [18]:
df_mesh_ct = load_df(ct_gov_dir + '/browse_conditions.txt', nrows=None)
df_mesh_ct = df_mesh_ct[['nct_id', 'downcase_mesh_term']]
df_mesh_ct[:5]

Unnamed: 0,nct_id,downcase_mesh_term
0,NCT03732872,fibrosis
1,NCT03427424,disease
2,NCT03366129,stroke
3,NCT03304704,malaria
4,NCT03304691,malaria


In [19]:
from collections import defaultdict
# study identifier (nct_id) -> set of lowercased MeSH terms listed for that study
nct_to_mesh_term = defaultdict(set)

for nct_id, mesh_term in df_mesh_ct[['nct_id','downcase_mesh_term']].itertuples(index=False, name=None):
    nct_to_mesh_term[nct_id].add(mesh_term)

In [20]:
nct_to_mesh_term['NCT03366129']

{'cerebrovascular disorders', 'stroke'}

## 2. Convert MeSH Terms to their Unique Identifiers

### Find MeSH Terms and Codes

For the following to work you need to download files (at least `d2018.bin`) from ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/asciimesh/

Format: 
- The various record types: https://www.nlm.nih.gov/mesh/intro_record_types.html  
- Descriptors types: https://www.nlm.nih.gov/mesh/dtype.html  
- SCR types: https://www.nlm.nih.gov/mesh/ctype.html


In [21]:
def parse_mesh_file_as_df(filepath):
    """
    Parses a MeSH ASCII descriptor file (e.g. `d2018.bin`) into a DataFrame.

    One row is produced for each `UI = ` line, using the most recent `MH = `
    (heading, lowercased) and `MN = ` values seen in the same record. The
    per-record state is cleared at every `*NEWRECORD` marker and after each
    emitted row, so a record without an `MN` (or `MH`) line gets a missing value
    instead of silently inheriting the value of the previous record.
    Args:
    - filepath: path of the MeSH ASCII file
    Returns:
    - a DataFrame with columns 'ui', 'name' and 'mesh_number' (the `MN` value).
      When a record has several `MN` lines, only the last one is kept.
    """
    def field_value(line):
        # text after the first '=', without surrounding whitespace / newline
        return line.split('=', 1)[1].strip()

    tups = []
    name, mesh_number = None, None
    # NOTE(review): the MeSH ASCII files are assumed to be UTF-8 encoded - confirm
    with open(filepath, encoding='utf-8') as fp:
        for line in fp:
            if line.startswith('*NEWRECORD'):
                name, mesh_number = None, None
            elif line.startswith('MN ='):
                mesh_number = field_value(line)
            elif line.startswith('MH ='):
                name = field_value(line).lower()
            elif line.startswith('UI ='):
                tups.append((field_value(line), name, mesh_number))
                # never leak this record's values into the next one
                name, mesh_number = None, None
    return pd.DataFrame(tups, columns=['ui','name','mesh_number'])


In [22]:
df_mesh=parse_mesh_file_as_df(mesh_dir + '/d2018.bin')
print(df_mesh[10:20][['ui','name']])

         ui                                name
10  D000011       abelson murine leukemia virus
11  D000012                abetalipoproteinemia
12  D000013            congenital abnormalities
13  D000014         abnormalities, drug-induced
14  D000015             abnormalities, multiple
15  D000016    abnormalities, radiation-induced
16  D000017              abo blood-group system
17  D000018                            abomasum
18  D000019                abortifacient agents
19  D000020  abortifacient agents, nonsteroidal


### Build the MeSH term to ID Dictionary

In [23]:
from collections import defaultdict
# MeSH heading (lowercased) -> MeSH unique identifier;
# if a heading appears several times in df_mesh, the last row wins
mesh_term_to_id = dict(zip(df_mesh['name'], df_mesh['ui']))

mesh_term_to_id['neuroblastoma']

'D009447'

In [24]:
# MeSH terms used by CT.gov studies that are absent from the parsed MeSH file (df_mesh)
ct_gov_terms = set(df_mesh_ct['downcase_mesh_term'].values)
mesh_missing = ct_gov_terms.difference(df_mesh['name'].values)
print('mesh terms in CT.gov but not in MeSH official list: %d' % len(mesh_missing))
# only list them when there are few enough to be readable
if not len(mesh_missing) > 10:
    print(mesh_missing)

mesh terms in CT.gov but not in MeSH official list: 3
{'metabolic syndrome x', 'giant lymph node hyperplasia', 'drug-induced liver injury, chronic'}


## 3. Load UMLS CUIs:
- Map MeSH IDs to CUIs
- Map CUIs to strings

For the following to work you need to download the UMLS Metathesaurus files from the National Library of Medicine (NLM) at https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/ after obtaining a UMLS license from NLM.  
Also reduce the size of `MRCONSO.RRF` with the `cut` command below.

### MRCONSO.RRF

`MRCONSO.RRF` file format:  https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/?report=objectonly

```

1 CUI	Unique identifier for concept
2 LAT	Language of term
3 TS	Term status
4 LUI	Unique identifier for term
5 STT	String type
6 SUI	Unique identifier for string
7 ISPREF	Atom status - preferred (Y) or not (N) for this string within this concept
8 AUI	Unique identifier for atom - variable length field, 8 or 9 characters
9 SAUI	Source asserted atom identifier [optional]
10 SCUI	Source asserted concept identifier [optional]
11 SDUI	Source asserted descriptor identifier [optional]
12 SAB	Abbreviated source name (SAB). Maximum field length is 20 alphanumeric characters. Two source abbreviations are assigned:
13 TTY	Abbreviation for term type in source vocabulary, for example PN (Metathesaurus Preferred Name) or CD (Clinical Drug). Possible values are listed on the Abbreviations Used in Data Elements page.
14 CODE	Most useful source asserted identifier (if the source vocabulary has more than one identifier), or a Metathesaurus-generated source entry identifier (if the source vocabulary has none)
15 STR	String
16 SRL	Source restriction level

CUI,TS,LUI,SUI,AUI,SCUI,SDUI,TTY,CODE,STR

1        2   3  4       5   6       7      8         9    10       11      12  13  14     15
CUI     |lng|TS|LUI    |STT|SUI    |ISPREF|AUI      |SAUI|SCUI    |SDUI   |SAB|TTY|CODE  |STR    ||||||||||||
C0004095|ENG|P|L0004095|PF|S0016438|Y|     A0027325 |    |M0001884|D001248|MSH|MH|D001248|Asthenopia|0|N|256|
C0004095|ENG|S|L0015427|PF|S0040055|Y|     A26649322|    |M0001884|D001248|MSH|ET|D001248|Eyestrain|0|N|256| 
C0004095|ENG|S|L0301164|PF|S0412953|Y|     A26649321|    |M0001884|D001248|MSH|ET|D001248|Visual Fatigue|0|N|256| 
C0004095|ENG|S|L0301164|VW|S0372317|Y|     A26600615|    |M0001884|D001248|MSH|ET|D001248|Fatigue, Visual|0|N|| 
C0004095|ENG|S|L0832617|PF|S0877142|Y|     A26630539|    |M0001884|D001248|MSH|ET|D001248|Eye Fatigue|0|N|256| 
C0004095|ENG|S|L0832617|VW|S0882316|Y|     A0940491 |    |M0001884|D001248|MSH|PM|D001248|Fatigue, Eye|0|N|256|

We need columns: CUI,SAB,CODE,STR
1 CUI	Unique identifier for concept

cut -d'|' -f1,12,14,15  ~/dev/datasets/umls/2018AB-full/2018AB/2018AB/META/MRCONSO.RRF > ~/dev/datasets/umls/2018AB-full/2018AB/2018AB/META/MRCONSO_reduced.RRF

```


In [30]:
# since the file is big, need a special function to read in streaming mode and eliminate on the fly
# CUIs that are not in the embeddings
def load_conso(file_name, vocab_keys):
    """
    Streams a reduced MRCONSO file (one 'CUI|SAB|CODE|STR' record per line) and
    keeps only the rows whose CUI belongs to `vocab_keys`. The STR column of the
    kept rows is lowercased. Prints the number of kept rows, the DataFrame memory
    footprint and the number of lines read.
    Args:
    - file_name: path of the pipe-separated file
    - vocab_keys: collection of the CUIs to keep (e.g. the embeddings vocabulary)
    Returns:
    - a DataFrame with columns 'CUI', 'SAB', 'CODE' and 'STR'
    """
    # a set guarantees constant-time membership tests whatever the caller passes
    vocab = set(vocab_keys)
    rows = []
    nb_lines = 0
    # NOTE(review): the UMLS release files are assumed to be UTF-8 encoded - confirm
    with open(file_name, encoding='utf-8') as fp:
        for line in fp:
            nb_lines += 1
            cols = line.strip().split('|')
            # skip blank / truncated lines and CUIs that are not in the embeddings
            if len(cols) < 4 or cols[0] not in vocab:
                continue
            # lowercase only the rows that are kept (most rows are discarded)
            cols[3] = cols[3].lower()
            rows.append(cols)

    df = pd.DataFrame(rows, columns=['CUI','SAB','CODE','STR'])
    print("loaded '%s', %d rows (%s)" % (file_name, len(df), df_mem(df)))
    print('processed rows: %d' % nb_lines)
    return df

In [31]:
df_c = load_conso(umls_dir + '/MRCONSO_reduced.RRF', vocab_keys)

loaded '/home/therrmann/dev/datasets/umls/2018AB-full/2018AB/2018AB/META/MRCONSO_reduced.RRF', 456262 rows (116.3 Mb)
processed rows: 7148656


In [32]:
df_c[232000:232005]

Unnamed: 0,CUI,SAB,CODE,STR
232000,C0043241,CHV,0000013196,wound infections
232001,C0043241,NDFRT,N0000003168,"infection, wound"
232002,C0043241,MSH,D014946,"infection, wound"
232003,C0043241,MSH,D014946,wound infect
232004,C0043241,CHV,0000013196,infected wounds


In [33]:
df_c[df_c.CODE=='D014947']

Unnamed: 0,CUI,SAB,CODE,STR
232046,C0043250,MSH,D014947,wounds
232048,C0043250,MSH,D014947,wound
232057,C0043251,MSH,D014947,wounds and injuries
232067,C0043251,MSH,D014947,"injuries, wounds"
232068,C0043251,MSH,D014947,injury and wounds
232070,C0043251,MSH,D014947,wounds and injury
232072,C0043251,MSH,D014947,"wounds, injury"
232076,C0043251,MSH,D014947,injuries wounds
232077,C0043251,MSH,D014947,wounds injuries
232080,C0043251,MSH,D014947,injuries and wounds


In [34]:
df_c[df_c.STR=='wound']

Unnamed: 0,CUI,SAB,CODE,STR
232048,C0043250,MSH,D014947,wound
232049,C0043250,LNC,MTHU001326,wound
232050,C0043250,LNC,LP7726-5,wound
232051,C0043250,DXP,U004475,wound
232052,C0043250,AOD,0000004441,wound
232053,C0043250,CHV,0000013199,wound
232054,C0043250,CSP,1583-7938,wound


### 3.1 Map MeSH Codes to CUIs

In [35]:
from collections import defaultdict
# MeSH code -> set of CUIs (only CUIs present in the embeddings vocabulary);
# used to link a study to CUIs (through mesh codes)
mesh_code_to_cui = defaultdict(set)

for code, cui in df_c.loc[df_c['SAB'] == 'MSH', ['CODE','CUI']].itertuples(index=False, name=None):
    if cui not in vocab_keys:
        continue
    mesh_code_to_cui[code].add(cui)

In [36]:
mesh_code_to_cui['D002787']

{'C0008385'}

### Use Above Results to Map Studies to CUIs

In [37]:
# study -> set of CUIs, obtained through the study MeSH terms and their MeSH identifiers.
# A study is "imperfect" as soon as one of its terms is missing from the official MeSH
# terms; imperfect studies are not kept in std_to_cuis.
std_to_cuis = defaultdict(set)
imperfect_studies = set()
for std, terms in nct_to_mesh_term.items():
    study_cuis = set()
    for term in terms:
        mesh_id = mesh_term_to_id.get(term)
        if mesh_id is None:
            imperfect_studies.add(std)
            break
        study_cuis.update(mesh_code_to_cui[mesh_id])
    else:
        # every term was resolved: keep the study when it maps to at least one CUI
        if study_cuis:
            std_to_cuis[std] = study_cuis

print('removing (imperfect) studies containing at least 1 mesh term not in the official mesh list: %d' % len(imperfect_studies))

removing (imperfect) studies containing at least 1 mesh term not in the official mesh list: 1160


In [38]:
# study -> set of CUIs (silent variant: same computation, no summary printed)
std_to_cuis = defaultdict(set)
imperfect_studies = set()
for std, terms in nct_to_mesh_term.items():
    for mesh_id in map(mesh_term_to_id.get, terms):
        if mesh_id is None:
            # unknown MeSH term: flag the study and stop looking at its terms
            imperfect_studies.add(std)
            break
        found_cuis = mesh_code_to_cui[mesh_id]
        if found_cuis:
            std_to_cuis[std].update(found_cuis)

# removing studies containing at least 1 mesh term not in the official mesh terms
for imperfect in imperfect_studies:
    std_to_cuis.pop(imperfect, None)

In [39]:
std_to_cuis['NCT03366129']

{'C0007820', 'C0038454', 'C0241832'}

### 3.2 Map Concepts CUIs to Strings

In [40]:
from collections import defaultdict

# CUI -> set of descriptions (lowercased), taken from the STR column of df_c;
# used later to associate therapeutic areas (from their strings) to CUIs
cui_to_strings = defaultdict(set)
for cui, term in zip(df_c['CUI'], df_c['STR']):
    cui_to_strings[cui].add(term)

In [41]:
from collections import defaultdict

# CUI -> set of its strings (STR column of df_c)
cui_to_strings = defaultdict(set)
for cui, term in df_c[['CUI','STR']].itertuples(index=False, name=None):
    cui_to_strings[cui].add(term)

In [42]:
cui_to_strings['C0007820']

{'brain vascular dis',
 'brain vascular disorder',
 'brain vascular disorders',
 'cerebral vascular disorder',
 'cerebral vascular disorders',
 'cerebral vascular disturbance (nos)',
 'cerebral vascular lesion (nos)',
 'cerebrovasc disease nos',
 'cerebrovascular dis',
 'cerebrovascular disease',
 'cerebrovascular disease or lesion nos',
 'cerebrovascular diseases',
 'cerebrovascular disorder',
 'cerebrovascular disorder (nos)',
 'cerebrovascular disorders',
 'cerebrovascular disorders [disease/finding]',
 'disease, cerebrovascular',
 'diseases, cerebrovascular',
 'disorder cerebrovascular',
 'intracranial vascular dis',
 'intracranial vascular disease',
 'intracranial vascular diseases',
 'intracranial vascular disorder',
 'intracranial vascular disorders',
 'unspecified cerebrovascular disease',
 'vascular dis intracranial',
 'vascular disease, intracranial',
 'vascular diseases, intracranial',
 'vascular disorder, brain',
 'vascular disorder, intracranial',
 'vascular disorders, bra

In [54]:
def get_study_terms(std):
    """
    Gathers all the strings attached to the CUIs of a study.
    Path followed: study -> CUIs (std_to_cuis) -> all strings of each CUI
    (cui_to_strings, built from MRCONSO.RRF).
    Args:
    - std: study identifier
    Returns:
    - a set of strings, empty when the study has no entry in std_to_cuis
    """
    cuis = std_to_cuis.get(std)
    if cuis is None:
        return set()
    return set().union(*(cui_to_strings[cui] for cui in cuis))

In [44]:
# get an idea about study NCT03366129
print(df_studies[df_studies.nct_id=='NCT03366129'].brief_title.values[0])
print(df_studies[df_studies.nct_id=='NCT03366129'].official_title.values[0])# 

Blood-Brain Barrier Disruption in People With White Matter Hyperintensities Who Have Had a Stroke
The Natural History of Blood-Brain Barrier Disruption in Stroke Patients With White Matter Hyperintensities (A Cohort Study)


In [45]:
# check the corresponding terms
list(get_study_terms('NCT03366129'))[:5]

['accident cerebrovascular',
 'syndrome, stroke',
 'cerebral vascular disorder',
 'brain vascular disorder',
 'cerebrovascular accident, (cva)']

## 4. Manually Associate Therapeutic Areas to UMLS Concepts (CUIs)

In function `find_cuis_for_terms()`, uncomment the `print('accepted cui ...')` line to see retained CUIs 

In [52]:
def any_term_in_strings(term_list, strings):
    """
    Tells whether at least one term is a substring of at least one string.
    Args:
    - term_list: list of strings (the terms searched for)
    - strings: other collection of strings (where the terms are searched)
    Returns:
    - True if any term of term_list is contained in any element of strings,
      False otherwise (in particular when either collection is empty).
    """
    return any(term in string for term in term_list for string in strings)

if False: # test
    any_term_in_strings(['virus','viral'], {'aa viral', 'bb'})

def find_cuis_for_terms(term_lists, exclude_terms=None):
    """
    Selects the CUIs of cui_to_strings whose strings match a list of term lists.
    Args:
    - term_lists: list of lists of terms. A CUI is accepted when, for EVERY
      term_list, at least one of its terms is contained in at least one of the
      CUI strings (OR inside a term_list, AND between the term_lists).
    - exclude_terms: optional list of terms; a CUI is rejected when any of them
      is contained in any of its strings.
    Returns:
    - the accepted CUIs, as a set of strings.
    """
    cuis = set()
    for cui, strings in cui_to_strings.items():
        # exclusion first: one excluded term found in one string rejects the cui
        if exclude_terms is not None and any_term_in_strings(exclude_terms, strings):
            #print('cui %s excluded. Strings: %s' % (cui, str(strings)))
            continue

        # every term_list must have at least one of its terms in the strings
        if all(any_term_in_strings(term_list, strings) for term_list in term_lists):
            # THIS IS THE LINE TO UNCOMMENT TO DEBUG CUIs
            #print('accepted cui "%s" that corresponds to strings "%s"\n' % (cui, str(strings)))
            cuis.add(cui)
    return cuis

In [47]:
# One set of CUIs per therapeutic area, selected by substring matching on the CUI strings
# (see find_cuis_for_terms). The term choices below were tuned by hand; the trailing
# comments record what was tried and the known limitations.
cardiology_cuis = find_cuis_for_terms([['cardiolog', 'cardiovascul']])
# Rejected alternatives for the dental area, kept for reference:
#dental_cuis = find_cuis_for_terms([['dental', ]], exclude_terms=['accidental', 'incidental', 'occidental', 'osteodental'])
#dental_cuis = find_cuis_for_terms([['caries', 'cavity', 'cavities', 'orthodon', 'endodon',]]) # worse!
# The excluded terms remove the non-dental strings that also contain 'caries' or 'cavities'.
dental_cuis = find_cuis_for_terms([['tooth cavit', 'caries', 'cavities']], 
                                  exclude_terms=['peccaries', 'cotyloid cavities', 'nasal cavities', 'cavities, glenoid', 'cavities paranasal',
                                                 'pleural cavities', 'cavities, pleural', 'pericardial cavities', 'cavities pelvic',
                                                 'cavities uterine', 'abdominal cavities', 'cavities, tympanic', 'body cavities'])
dermatology_cuis = find_cuis_for_terms([['dermatol', ]]) # 'skin' adds too many matches in other contexts
device_cuis = find_cuis_for_terms([['device', ]])
environ_cuis = find_cuis_for_terms([['environmental', 'environments', 'pollut']])
endocrinology_cuis = find_cuis_for_terms([['endocrinol', ]])
family_med_cuis = find_cuis_for_terms([['family medicine', ]])
gastro_cuis = find_cuis_for_terms([['gastroentero', ]])
genetic_cuis = find_cuis_for_terms([['geneti', ], ['diseas']])
volunteer_cuis = find_cuis_for_terms([['volunteer', ]]) # difficult: 'volunteer' also appears in contexts other than health
hematology_cuis = find_cuis_for_terms([['hematol', ]], exclude_terms=['non-hemato'])
# Rejected alternatives for the hepatology area, kept for reference:
#hepatology_cuis = find_cuis_for_terms([['hepatol', 'hepatic']])
#hepatology_cuis = find_cuis_for_terms([['liver']])
hepatology_cuis = find_cuis_for_terms([['hepatitis']]) # needs to be refined. Might be too restrictive, but liver/hepatol match too much
immunology_cuis = find_cuis_for_terms([['immunolog']])
infect_cuis=find_cuis_for_terms([['infectious', 'infected', 'infection'], ['disease']])
intern_cuis=find_cuis_for_terms([['intern'],['medicin']])
muskuloskel_cuis=find_cuis_for_terms([['musculoskelet']])
nephrology_cuis=find_cuis_for_terms([['nephrolog']])
neurology_cuis=find_cuis_for_terms([['neurolog']])
nutrition_cuis=find_cuis_for_terms([['nutrition', 'body weight', 'weight reduc', 'weight gain', 'overweight']])
obstetrics_cuis=find_cuis_for_terms([['obstetri', 'gynecol']])
oncology_cuis = find_cuis_for_terms([['oncolog', 'cancer']])
occupdisease_cuis = find_cuis_for_terms([['occupational disease']])
ophtalmo_cuis = find_cuis_for_terms([['ophthalmol', 'eye']])
orthopedics_cuis = find_cuis_for_terms([['orthopedi']])
otorino_cuis = find_cuis_for_terms([['otolaryngol']])
pediatrics_cuis = find_cuis_for_terms([['pediatr', 'neonat']])
parasitic_cuis = find_cuis_for_terms([['parasit'],['disease']])
pharmacol_toxicol_cuis = find_cuis_for_terms([['pharmacol', 'toxicol']])
#podiatrics_cuis = find_cuis_for_terms([['podiat', 'foot diseases', 'foot injur']]) # podiatry not found in embeddings, although in UMLS
podiatrics_cuis = find_cuis_for_terms([['podiat']]) # including foot diseases/injur makes it match dental studies
psy_cuis = find_cuis_for_terms([['psychiatr', 'psycholog']])
pulmon_cuis = find_cuis_for_terms([['pulmonar', 'respirat'], ['diseas']])
# Candidate CUIs considered for the rare diseases area (only one is present in the embeddings):
rare_cuis = find_cuis_for_terms([['orphan drug', ]])# C0178786 orphan disease/drug
                                                    # C0178604 drug design/synthesis/production
                                                    # C0013232 Drugs, Orphan        --> THE ONLY ONE PRESENT IN EMBEDDINGS :(
                                                    # C0920627 Orphan Diseases
                                                    # C0029308 Orphan Drug Production
                                                    # C0599036 unprofitable drug development
                                                    # C0678236 Rare Diseases
rheumatology_cuis = find_cuis_for_terms([['rheumat', ]]) # C0035452 (Rheumatology specialty) is badly missing; TODO: should we limit to 'rheumatolog'?
sleep_cuis = find_cuis_for_terms([['sleep', ]])
symptoms_cuis = find_cuis_for_terms([['general manifestation of disorders', ]]) # matches only C1457887: symptoms
traume_cuis = find_cuis_for_terms([['traumas', ]]) # seems more focused than 'trauma' and avoids 'non-trauma' and 'nontrauma' strings
urology_cuis = find_cuis_for_terms([['urology', ]])
vaccine_cuis = find_cuis_for_terms([['vaccine', ]])

# Therapeutic area label -> set of CUIs representing that area (the *_cuis sets built
# with find_cuis_for_terms). find_best_areas() compares the CUIs of a study against
# each of these sets.
areas = {
    'Cardiology/Vascular Diseases' :       cardiology_cuis,
    'Dental and Oral Health' :             dental_cuis,
    'Dermatology' :                        dermatology_cuis,
    'Devices' :                            device_cuis,
    'Disorders of Environmental Origin' :  environ_cuis,
    'Endocrinology' :                      endocrinology_cuis,
    'Family Medicine' :                    family_med_cuis,
    'Gastroenterology' :                   gastro_cuis,
    'Genetic Disease' :                    genetic_cuis,
    'Healthy Volunteers' :                 volunteer_cuis,
    'Hematology' :                         hematology_cuis,
    'Hepatology' :                         hepatology_cuis,
    'Immunology' :                         immunology_cuis,
    'Infections and Infectious Diseases' : infect_cuis,
    'Internal Medicine' :                  intern_cuis,
    'Musculoskeletal' :                    muskuloskel_cuis,
    'Nephrology' :                         nephrology_cuis,
    'Neurology' :                          neurology_cuis,
    'Nutrition and Weight Loss' :          nutrition_cuis,
    'Obstetrics/Gynecology' :              obstetrics_cuis,
    'Oncology' :                           oncology_cuis,
    'Occupational Diseases' :              occupdisease_cuis,
    'Ophthalmology' :                      ophtalmo_cuis,
    'Orthopedics/Orthopedic Surgery' :     orthopedics_cuis,
    'Otolaryngology' :                     otorino_cuis,
    'Pediatrics/Neonatology' :             pediatrics_cuis,
    'Parasitic Diseases' :                 parasitic_cuis,
    'Pharmacology/Toxicology' :            pharmacol_toxicol_cuis,
    'Podiatry' :                           podiatrics_cuis,
    'Psychiatry/Psychology' :              psy_cuis,
    'Pulmonary/Respiratory Diseases' :     pulmon_cuis,
    'Rare Diseases and Disorders' :        rare_cuis,
    'Rheumatology' :                       rheumatology_cuis,
    'Sleep' :                              sleep_cuis,
    'Symptoms and General Pathology' :     symptoms_cuis,
    'Trauma' :                             traume_cuis,
    'Urology' :                            urology_cuis,
    'Vaccines' :                           vaccine_cuis,
}


## 5. Match Studies with Therapeutic Areas using Concept Embeddings

In [48]:
def find_best_areas(std):
    """
    Finds the therapeutic areas having the closest concepts to those of a given study
    Args:
    - std: study identifier
    Returns:
    - the study terms for debugging purposes
    - a list of (at most) 5 tuple2 containing the area and the similarity score with
      the study, in similarity decreasing order
    Raises:
    - ValueError (an Exception subclass) when the study has no CUIs
    """
    # .get() instead of indexing: std_to_cuis is a defaultdict, and indexing it with
    # an unknown study would silently insert an empty entry for that study
    std_cuis = std_to_cuis.get(std)
    if not std_cuis:
        raise ValueError('no cuis for std %s' % std)

    area_list = list(areas.keys())
    sims = [w2v.n_similarity(std_cuis, areas[area]) for area in area_list]
    # indices of the 5 highest similarities, best first
    indices = np.argsort(sims)[::-1][:5]
    best_areas = np.array(area_list)[indices]
    best_sims = np.array(sims)[indices]
    return get_study_terms(std), list(zip(best_areas,best_sims))

In [49]:
find_best_areas('NCT02569047')

({'caries',
  'caries dent',
  'caries dental',
  'caries, dental',
  'cariosity of teeth',
  'carious teeth',
  'cavities',
  'cavity',
  'decay dent',
  'decay dental',
  'decay tooth',
  'decay, dental',
  'decayed teeth',
  'decaying tooth',
  'decays tooth',
  'dent caries',
  'dent decay',
  'dental caries',
  'dental caries [disease/finding]',
  'dental caries nos',
  'dental caries, unspecified',
  'dental cavities',
  'dental cavity',
  'dental decay',
  'early dental caries',
  'frequent caries',
  'saprodontia',
  'tooth caries',
  'tooth cavities',
  'tooth cavity',
  'tooth decay'},
 [('Dental and Oral Health', 0.6765870190595883),
  ('Infections and Infectious Diseases', 0.6115365183823446),
  ('Dermatology', 0.5787846411339332),
  ('Pediatrics/Neonatology', 0.5696169758068482),
  ('Parasitic Diseases', 0.5573589695061624)])

In [57]:
# NOTE: find_best_areas is also defined in an earlier cell of this notebook; this
# definition shadows it when the cells are run in order. Keep a single definition.
def find_best_areas(std):
    """
    Finds the therapeutic areas having the closest concepts to those of a given study
    Args:
    - std: study identifier
    Returns:
    - the study terms for debugging purposes
    - a list of (at most) 5 tuple2 containing the area and the similarity score with
      the study, in similarity decreasing order
    Raises:
    - ValueError (an Exception subclass) when the study has no CUIs
    """
    # .get() instead of indexing: std_to_cuis is a defaultdict, and indexing it with
    # an unknown study would silently insert an empty entry for that study
    std_cuis = std_to_cuis.get(std)
    if not std_cuis:
        raise ValueError('no cuis for std %s' % std)

    area_list = list(areas.keys())
    sims = [w2v.n_similarity(std_cuis, areas[area]) for area in area_list]
    # indices of the 5 highest similarities, best first
    indices = np.argsort(sims)[::-1][:5]
    best_areas = np.array(area_list)[indices]
    best_sims = np.array(sims)[indices]
    return get_study_terms(std), list(zip(best_areas,best_sims))

def classify_studies(nb_studies):
    """
    Finds therapeutic areas for a given number of studies. Prints a basic summary
    of the results
    Args:
    - nb_studies: maximum number of studies to consider; the first `nb_studies` keys of
      std_to_cuis are used, so fewer studies are analyzed if fewer are available.
    Returns:
    - a dictionary (defaultdict) indexed by area whose value is a list of 3-tuples containing:
      - study identifier
      - study strings (see get_study_terms()) for evaluation
      - the list of closest areas order with their similarity score, the first one being
        the corresponding dictionary key.
    """
    from collections import defaultdict
    # stds_by_area: key: area, value: list( (study_id, list(study_term), list( (area,similarity) )) )
    stds_by_area = defaultdict(list)

    std_ids = list(std_to_cuis.keys())[:nb_studies]
    for cnt, std in enumerate(std_ids, start=1):
        std_terms, areas_sim = find_best_areas(std)
        # areas are ranked by decreasing similarity: the first one is the predicted area
        area = areas_sim[0][0]
        if area not in stds_by_area:
            print("study found for area '%s' after analyzing %d studies" % (area, cnt))
        stds_by_area[area].append((std, std_terms, areas_sim))

    # sort results by number of studies in each area, just to print a summary of results
    areas_results = sorted(
        ((area, len(ranks_list)) for area, ranks_list in stds_by_area.items()),
        key=lambda res: res[1],
        reverse=True,
    )
    # report the number of studies actually analyzed, which can be lower than nb_studies
    print('%d studies in %d areas' % (len(std_ids), len(areas_results)))
    for area, nb_in_area in areas_results:
        print('%s: %d' % (area, nb_in_area))

    return stds_by_area

In [58]:
# Classify up to 3000 studies (the first keys of std_to_cuis); progress and a per-area summary are printed
studies_by_area = classify_studies(3000)

study found for area 'Ophthalmology' after analyzing 1 studies
study found for area 'Psychiatry/Psychology' after analyzing 2 studies
study found for area 'Cardiology/Vascular Diseases' after analyzing 3 studies
study found for area 'Parasitic Diseases' after analyzing 4 studies
study found for area 'Hematology' after analyzing 6 studies
study found for area 'Oncology' after analyzing 7 studies
study found for area 'Rheumatology' after analyzing 9 studies
study found for area 'Dental and Oral Health' after analyzing 17 studies
study found for area 'Infections and Infectious Diseases' after analyzing 20 studies
study found for area 'Trauma' after analyzing 21 studies
study found for area 'Nutrition and Weight Loss' after analyzing 26 studies
study found for area 'Immunology' after analyzing 31 studies
study found for area 'Pulmonary/Respiratory Diseases' after analyzing 38 studies
study found for area 'Devices' after analyzing 47 studies
study found for area 'Neurology' after analyzing 

In [60]:
# For every therapeutic area found, show at most 5 of its studies:
# identifier, terms and ranked (area, similarity) list
for area, area_results in studies_by_area.items():
    print('======= %s ==========' % area)
    for std, terms, ranks in area_results[:5]:
        print('study: %s' % std)
        print('terms: %s' % (terms,))
        print('ranks: %s' % (ranks,))
        print()

study: NCT03732872
terms: {'submucous fibroses, oral', 'oral submucous fibrosis', 'fibroplasia', 'fibrosis', 'fibrosis [disease/finding]', 'submucous fibrosis, oral', 'fibroses', 'oral submucous fibrosis [disease/finding]', 'fibrosis (formation of excess fibrous connective tissue)', 'fibrose', 'oral submucous fibroses', 'fibroses, oral submucous', 'desmoplasia', 'fibrosis, oral submucous'}
ranks: [('Ophthalmology', 0.46099383559810203), ('Musculoskeletal', 0.4586235881356827), ('Trauma', 0.44332802130587534), ('Oncology', 0.43512978293404564), ('Rheumatology', 0.42288106100305)]

study: NCT02569892
terms: {'age related macular degeneration (armd)', 'retinal drusen [disease/finding]', 'armd', 'drusen of retina', 'macular degeneration (senile), unspecified', 'drusen, retinal', 'degeneration, macular', 'drusen (degenerative)', 'senile macular degeneration of retina, unspecified', 'macular degeneration [disease/finding]', 'age relat maculopathy', 'maculopathies, age-related', 'age related 

#### When focusing on misclassification, always look at the second and third therapeutic areas and how close they are to the first one.
In this misclassification example, note how close the top-ranked area  
`Urology: 0.50133`  
is to what is probably the best therapeutic area  
`Trauma: 0.49841`

```
study: NCT03677856

terms: {'thoracic diseases', 'anesthetic drug', 'anesthesia agent', 'thoracic diseases [disease/finding]', 'anaesthetics', 'thoracic disorder', 'anaesthetic drugs', 'anaesthetic agent', 'anesthetics drugs', 'anesthetic agent', 'diseases, thoracic', 'anesthetic', 'agents, anesthetic', 'anesthetic agents', '[cn200] anesthetics', 'thoracic disease', 'disease thoracic', 'anesthetic drugs', 'disease, thoracic', 'thoracic dis', 'anesthetics', 'drugs, anesthetic', 'anesthestic drugs', 'drugs causing loss of sensation'}

ranks: [('Urology', 0.5013389765305643), ('Trauma', 0.49841319317069177), ('Neurology', 0.4932273906260839), ('Dermatology', 0.4843776150397936), ('Obstetrics/Gynecology', 0.47121876125844847)]
```

### Some Spectacular Misclassifications

It might be necessary to inspect the concepts CUI by CUI: manually remove or add CUIs in the therapeutic area or in the study until you find the one that causes the mismatch (with the help of gensim's similarity functions). Then try to infer how this CUI interacts with the others to understand the semantic aspect
that causes the disagreement.

```
study: NCT03732872
terms: {'fibrosis [disease/finding]', 'submucous fibroses, oral', 'oral submucous fibrosis [disease/finding]', 'submucous fibrosis, oral', 'desmoplasia', 'fibrosis', 'fibroses', 'fibroses, oral submucous', 'fibroplasia', 'oral submucous fibroses', 'fibrose', 'fibrosis (formation of excess fibrous connective tissue)', 'fibrosis, oral submucous', 'oral submucous fibrosis'}
ranks: [('Ophthalmology', 0.46099380804260986), ('Musculoskeletal', 0.4586235972848014), ('Trauma', 0.44332801909004704), ('Oncology', 0.43512981948530577), ('Rheumatology', 0.4228810632570852)]

study: NCT03731468
terms: {'postoperative pain', 'pain post op', 'pain postop', 'pain, postoperative [disease/finding]', 'postoperative pains', 'postoperative pain nos', 'post-operative pain', 'pain postoperative', 'post-op pain', 'pain post-operative', 'pain, postoperative', 'postop pain', 'pain post-op'}
ranks: [('Sleep', 0.3419794324119675), ('Obstetrics/Gynecology', 0.3029373153734034), ('Pharmacology/Toxicology', 0.29956028966879744), ('Orthopedics/Orthopedic Surgery', 0.29678834520237735), ('Devices', 0.28408312192622875)]

study: NCT03732313
terms: {'nails ingrown', 'ingrown toenail', 'ingrowing nail', 'toe ingrown nail', 'unguis incarnatus', 'onychocryptosis', 'ingrown nails', 'ingrowing toe nail', 'nails, ingrown', 'ingrow toenail', 'ingrown toenails', 'nails, ingrown [disease/finding]', 'ingrown nails toe', 'ingrown toe nail', 'nail, ingrown', 'ingrowing toenails', 'ingrowing toenail', 'ingrowing nails toe', 'ingrown nail'}
ranks: [('Urology', 0.7157004763916529), ('Ophthalmology', 0.712349276520087), ('Musculoskeletal', 0.6882745308485965), ('Trauma', 0.6804783806481205), ('Devices', 0.6529208348451772)]

study: NCT00081523
terms: {'sickle cell anemia', 'anemia sickle-cell', 'disease sickle cell', 'disease hb s', 'sickle cell disorder', 'anaemia cell sickle', 'sickle-cell disease', 'sickle cell disorders', 'cell disorder, sickle', 'sicklemia', 'anemias, sickle cell', 'sickle cell anemias', 'anemia sickle cell', 'cell sickle syndrome', 'sickle cell hemolytic anemia', 'cell diseases, sickle', 'syndrome sickle cell', 'sickle-cell disease, unspecified', 'sickle-cell anemia nos', 'hemoglobin s dis', 'anemia cells sickles', 'disease sickle-cell', 'cell diseases sickle', 'anemia cell disorder sickle', 'anemia cells sickle', 'hemoglobin s diseases', 'scd', 'sickling disorder due to hemoglobin s', 'cell disease, sickle', 'herrick syndrome', 'hb s disease', 'sickle cell diseases', "herrick's anemia", 'anemia, sickle cell [disease/finding]', 'anemia, sickle cell', 'sickle cell syndrome', 'sickle cell disease nos', 'disease, hemoglobin s', 'scds', 'cell disorder sickle', 'cell disorders, sickle', 'hemoglobin s disease', 'anemia sickle celled', 'sickle cell anaemia', 'cells disease sickle', 'hbs disease', 'sickle cell disease', 'cell sickle syndromes', 'sickle-cell anaemia', 'hemoglobin ss disease', 'anemia cell disorders sickle', 'sickle-cell anemia'}
ranks: [('Pediatrics/Neonatology', 0.41464538302978027), ('Hematology', 0.373590769455491), ('Neurology', 0.3539260878498255), ('Nephrology', 0.3450246964629372), ('Pulmonary/Respiratory Diseases', 0.34097418802657)]

study: NCT03687554
terms: {'failures, kidney', 'failures, renal', 'renal failure syndrome', 'failure, kidney', 'polycystic kidney disease, adult', 'kidney diseases, polycystic', 'syndrome renal failure', 'unspecified renal failure', 'adult disease kidney polycystic', 'renal failure nos', 'polycystic kidney, autosomal dominant [disease/finding]', 'polycystic kidney disease, autosomal dominant', 'polycystic kidney disease', 'fibrocystic renal disease', 'kidney failures', 'autosomal dominant polycystic kidney', 'polycystic kidney, autosomal dominant', 'renal disease, polycystic', 'renal failures', 'kidney, polycystic', 'polycystic renal diseases', 'renal diseases, polycystic', 'kidney polycystic', 'disease, polycystic renal', 'polycystic kidney diseases [disease/finding]', 'adult polycystic kidney disease', 'esrd', 'kidney, polycystic, autosomal dominant', 'adpkd', 'kidney disease, polycystic', 'failure kidney', 'kidney failure [disease/finding]', 'kidney failure', 'renal insufficiency', 'disease, polycystic kidney', 'kpad', 'polycystic kidney, unspecified type', 'adpk', 'autosomal dominant polycystic kidney disease', 'polycystic kidneys', 'polycystic kidney', 'renal failure, unspecified', 'polycystic kidney dis', 'kidney, polycystic disease', 'failure, renal', 'polycystic kidney dysplasia', 'renal failure', 'polycystic renal disease', 'pkd - polycystic kidney disease', 'polycystic kidney nos', 'diseases, polycystic renal', 'polycyst kid-autosom dom', 'congenital polycystic kidney', 'polycystic kidney dis autosomal dominant', 'diseases, polycystic kidney', 'adult polycystic kidney dis', 'polycystic kidney diseases'}
ranks: [('Genetic Disease', 0.4044536559670432), ('Pulmonary/Respiratory Diseases', 0.38349711595154196), ('Nephrology', 0.3833275137813915), ('Hematology', 0.3822182095745539), ('Neurology', 0.36639787110844013)]
```

## Conclusions and possible improvements
- Capturing the CUIs for each therapeutic area is critical. It can become tricky, as shown by the examples above (dental or infectious diseases).
If the right compromise cannot be found, one can use NLM's [RRF Browser](https://www.nlm.nih.gov/research/umls/new_users/online_learning/UMLST_009.html) to navigate through related concepts and manually assign each CUI to a problematic therapeutic area. Sometimes concepts overlap and similarity fails to give good results. More tuning/investigation is required, starting from the negative cases.
- Do ensembling: use the two other CUI embeddings available [here](https://github.com/clinicalml/embeddings) (smaller, but possibly still useful), hoping they can make up for concepts missing from the embeddings tried here. No huge improvement is expected from this, as these other embeddings are smaller.
- If it is better to leave a study unclassified than to misclassify it, consider using a threshold on the similarity difference between the first and second areas. Emit a predicted area only if this difference is greater than the threshold, i.e. when the separation between the first and second areas is considered wide enough.