# MetaMap NLP content view
This notebook intends to investigate the utility of MetaMap NLP content view (CVF=256) which claims to help NLP research. \
Here's a complete list of content views:
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/content_views.html

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
mrconso = pd.read_csv('/Users/qinyilong/Desktop/ScAi/MRCONSO.RRF', sep='|', header=None, dtype=object)

Meaning of columns: https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/?report=objectonly

In [3]:
mrconso = mrconso.drop(18, axis=1) # last column is meaningless because the entry ends with '|'
mrconso.columns = ["CUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "AUI", "SAUI", "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS", "CVF"]

In [4]:
# Extract entries within MetaMap view
mrconso_metamap_on = mrconso[mrconso.CVF == "256"]

In [5]:
# Randomly sample 100 entries and display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(mrconso_metamap_on.sample(100))

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
7123265,C1189696,ENG,P,L2609809,PF,S3061078,N,A2736422,,130788,,NCBI,SCN,130788,Sedimentibacter saalensis,0,N,256
12094084,C3494245,ENG,P,L10842059,VO,S13489904,Y,A20901751,,M0560854,D061987,MSH,PM,D061987,Sf9 Cell,0,N,256
1974355,C0189573,ENG,P,L0228806,VO,S9046606,Y,A18652516,63273.0,0000019321,,CHV,PT,0000019321,cardiovascular procedures,0,N,256
7896375,C1412442,ENG,P,L5133263,PF,S5872488,Y,A6898457,,,,MTH,PN,NOCODE,AP2A2 gene,0,N,256
13450158,C4161913,ENG,P,L13192753,PF,S16174600,Y,A26725405,,1713789,,NCBI,SCN,1713789,Haliclona cf. caerulea Hawaii Kan 8,0,N,256
4632034,C0524433,ENG,S,L0723345,PF,S0945647,N,A1001963,,,,SNMI,SY,T-43127,Mid circumflex coronary artery,9,N,256
10581504,C2641837,ENG,P,L8630787,PF,S10735414,Y,A16587703,,537356,,NCBI,SCN,537356,Chalarus fimbriatus,0,N,256
12876809,C3876513,ENG,P,L11803691,PF,S14674327,Y,A24240783,2992470017.0,701210002,,SNOMEDCT_US,PT,701210002,"Mobile cephalometric x-ray system, analog",9,N,256
8073304,C1445505,ENG,P,L13103163,PF,S16056918,Y,A26574894,3285943017.0,411900002,,SNOMEDCT_US,PT,411900002,Horse chestnut pollen diagnostic allergen extract,9,N,256
780821,C0026850,ENG,S,L1412288,VO,S11916484,Y,A18648618,29495.0,0000008362,,CHV,SY,0000008362,myodystrophy,0,N,256


In [6]:
# Frequencies of entries in different languages
mrconso_metamap_on["LAT"].value_counts()

ENG    4802688
Name: LAT, dtype: int64

In [7]:
sources_metamap_on = mrconso_metamap_on["SAB"].value_counts()

In [8]:
# Extract entries without MetaMap view
mrconso_metamap_off = mrconso[mrconso.CVF != "256"]

In [9]:
# Randomly sample 100 entries and display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(mrconso_metamap_off.sample(100))

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
9481806,C2065223,ENG,S,L7273125,PF,S8730711,Y,A13892990,,42222,,MEDCIN,FN,42222,work restrictions no exposure to platinum fume...,3,N,
12431412,C3662551,ENG,P,L14906876,VO,S18100673,Y,A29536913,3660459010.0,609083001,,SNOMEDCT_US,SY,609083001,Adult of Ancylostoma braziliense,9,N,
8945733,C1831490,EST,P,L12669192,PF,S15126651,Y,A25178532,,47444-5,,LNC-ET-EE,LN,47444-5,A-gripi viirus H1 antikehad:Titr:Pt:S:Qn,0,N,
3574148,C0376633,DUT,P,L2051275,PF,S2389497,Y,A8182762,,M0028955,D019464,MSHDUT,MH,D019464,Belastingvrijstelling,3,N,
7207959,C1245153,ENG,P,L2656218,PF,S3123690,Y,A10492711,1576072.0,371116,,RXNORM,SCDF,371116,Bromelains Oral Capsule,0,O,
10144214,C2350514,RUS,S,L8589020,PF,S10683676,Y,A16462733,,M0519505,D055687,MSHRUS,SY,D055687,KOSTNO-MYSHECHNOI I NERVNOI SISTEM FIZIOLOGIIA,3,N,
9361605,C2022698,ENG,P,L9004393,PF,S11200520,Y,A17264186,,19827,,MEDCIN,PT,19827,echocardiography: diastolic turbulence of pros...,3,N,
6322469,C0941961,KOR,P,L12469211,PF,S15869447,Y,A25420544,,25912-7,,LNC-KO-KR,LN,25912-7,불화물:물질농도:24시간:뇨:정량,0,N,
2044581,C0195913,ENG,S,L2884274,PF,S3304076,Y,A3442836,779063010.0,42191001,,SNOMEDCT_US,FN,42191001,Excisional biopsy of brain (procedure),9,N,
5306986,C0693979,SPA,S,L14344087,PF,S17437578,Y,A28661409,378991000209114.0,324847008,,SCTSPA,PT,324847008,saquinavir (base libre) 200 mg por cada cápsul...,9,N,


In [10]:
# Frequencies of entries in different languages
mrconso_metamap_off["LAT"].value_counts()

ENG    5508879
SPA    1470221
FRE     424471
JPN     322983
POR     300417
DUT     286526
ITA     236976
GER     232823
RUS     180945
CZE     180525
HUN     104811
CHI      77622
NOR      61724
TUR      51114
POL      50108
KOR      38660
EST      31204
SWE      29737
FIN      25486
SCR       9981
GRE       2241
LAV       1405
DAN        723
BAQ        695
HEB        485
Name: LAT, dtype: int64

In [11]:
sources_metamap_off = mrconso_metamap_off["SAB"].value_counts()

In [12]:
# Let's what english words are excluded from MetaMap view
mrconso_metamap_off_eng = mrconso_metamap_off[mrconso_metamap_off.LAT == "ENG"]

In [13]:
# Randomly sample 100 entries and display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(mrconso_metamap_off_eng.sample(100))

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
8529859,C1609327,ENG,P,L5642663,PF,S6470341,Y,A8600695,,M0489701,C504009,MSH,NM,C504009,2-(4-hydroxyphenyl)ethyl 4-O-feruloylglucopyra...,0,N,
2054454,C0196965,ENG,S,L2874560,PF,S3263890,Y,A3401147,755591015.0,25291006,,SNOMEDCT_US,FN,25291006,Dissection of orbital fibrous bands (procedure),9,N,
39827,C0001911,ENG,S,L2811869,PF,S18100794,Y,A29558076,3669710017.0,96119002,,SNOMEDCT_US,PT,96119002,Albendazole product,9,N,
11756528,C3246195,ENG,S,L10251028,PF,S12791751,Y,A19864400,,0QHK08Z,,ICD10PCS,PX,0QHK08Z,Medical and Surgical @ Lower Bones @ Insertion...,0,N,
702025,C0024032,ENG,S,L6193676,VO,S0814224,Y,A0873907,,,,CST,GT,BIRTH WEIGHT SUBNORM,BIRTHWEIGHT SUBNORMAL,0,N,
2612794,C0265840,ENG,S,L0550656,PF,S0743230,Y,A0804119,,,,RCD,AB,X201F,TR - Cong tricusp regurgitatn,3,Y,
13983010,C4480320,ENG,P,L14090591,VO,S17139225,Y,A28278445,,364206,,MEDCIN,SY,364206,enteropathic arthropathy other site,3,N,
13264674,C4065413,ENG,S,L13027668,PF,S15964929,Y,A26474389,,379114,,MEDCIN,FN,379114,d-kefs proverb test common proverb achievement...,3,N,
11370215,C2977641,ENG,P,L10347029,PF,S12929449,Y,A20142003,,,Z45.02,ICD10CM,ET,Z45.02,Encounter for adjustment and management of aut...,4,N,
5419800,C0726736,ENG,P,L1333837,PF,S1577502,Y,A9431202,,,,MMSL,MS,NOCODE,"Irrigation Set 97""",1,N,


In [14]:
sources = pd.DataFrame(dict(included = sources_metamap_on, excluded = sources_metamap_off))

In [15]:
# Frequencies of entries from different source vocabularies
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(sources)

Unnamed: 0,included,excluded
AIR,337.0,347
ALT,200.0,8865
AOD,17933.0,2746
AOT,444.0,27
ATC,5506.0,1310
BI,951.0,300
CCC,62.0,348
CCPSS,1764.0,14070
CCS,936.0,681
CCS_10,444.0,436


In [16]:
# Some shapes
print("Entire MRCONSO: {}".format(mrconso.shape))
print("MetaMap view: {}".format(mrconso_metamap_on.shape))
print("MetaMap view excluded: {}".format(mrconso_metamap_off.shape))
print("English entries excluded by MetaMap view: {}".format(mrconso_metamap_off_eng.shape))

Entire MRCONSO: (14433450, 18)
MetaMap view: (4802688, 18)
MetaMap view excluded: (9630762, 18)
English entries excluded by MetaMap view: (5508879, 18)


### Observations:
Metamap NLP content view extracts 4802688 english concepts and discards 5508879 english concepts and all the non-english concepts. \
Source Vocabularies like LONIC (LNC-), which are suggested by reference manual to be detrimental to NLP, are completely excluded. \

### TODO:
What kinds of English concepts were excluded?

#### Check if MetaMap helps to reduce mappings from one CUI to multiple STYs 

In [23]:
import pickle
with open('pickles/STY_to_CUIs.pickle', 'rb') as handle:
    sty_to_cuis = pickle.load(handle)

In [27]:
cui_to_stys_metamap = dict((el, [])for el in mrconso_metamap_on.CUI.drop_duplicates())
for sty in sty_to_cuis:
    for cui in sty_to_cuis[sty]:
        if cui in cui_to_stys_metamap:
            cui_to_stys_metamap[cui] += [sty]

In [29]:
from collections import Counter
ctr = Counter([len(x) for x in cui_to_stys_metamap.values()])
ctr.most_common(10)

[(1, 1975275), (2, 210001), (3, 8652), (4, 45)]

In [30]:
with open('pickles/CUI_to_STYs.pickle', 'rb') as handle:
    cui_to_stys = pickle.load(handle)

In [31]:
ctr1 = Counter([len(x) for x in cui_to_stys.values()])
ctr1.most_common(10)

[(1, 3538314), (2, 271362), (3, 13088), (4, 68)]

#### Check if Metamap affects the integrity of tree structures in MRHIER

In [32]:
mrhier = pd.read_csv('/Users/qinyilong/Desktop/ScAi/MRHIER.RRF', sep='|', header=None, dtype=object)
mrhier = mrhier.drop(9, axis=1)
mrhier.columns = ['CUI', 'AUI', 'CXN', 'PAUI', 'SAB', 'RELA', 'PTR', 'HCD', 'CVF']

In [33]:
mrhier_ncbi = mrhier[mrhier['SAB'] == 'NCBI']
mrhier_ncbi = mrhier_ncbi[mrhier_ncbi['PTR'].notna()]

In [44]:
import networkx as nx
G = nx.DiGraph(name='MRHIER NCBI hierarchy graph')
for index, row in mrhier_ncbi.iterrows():
    parents = row['PTR'].split('.')
    for index, aui in enumerate(parents):
        if aui in mrconso_metamap_on.AUI and parents[index+1] in mrconso_metamap_on.AUI:
            G.add_edge(aui, parents[index+1])
    if parents[-1] in mrconso_metamap_on.AUI and row['AUI'] in mrconso_metamap_on.AUI:
        G.add_edge(parents[-1], row['AUI'])

In [45]:
nx.is_tree(G)

NetworkXPointlessConcept: G has no nodes.

In [46]:
nx.info(G)

'Name: MRHIER NCBI hierarchy graph\nType: DiGraph\nNumber of nodes: 0\nNumber of edges: 0\n'