# MetaMap NLP content view
This notebook intends to investigate the utility of MetaMap NLP content view (CVF=256) which claims to help NLP research. \
Here's a complete list of content views:
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/content_views.html

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
mrconso = pd.read_csv('/Users/qinyilong/Desktop/ScAi/MRCONSO.RRF', sep='|', header=None, dtype=object)

Meaning of columns: https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/?report=objectonly

In [3]:
mrconso = mrconso.drop(18, axis=1) # last column is meaningless because the entry ends with '|'
mrconso.columns = ["CUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "AUI", "SAUI", "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS", "CVF"]

In [4]:
# Extract entries within MetaMap view
mrconso_metamap_on = mrconso[mrconso.CVF == "256"]

In [5]:
# Randomly sample 100 entries and display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(mrconso_metamap_on.sample(100))

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
4144944,C0449546,ENG,P,L0800556,PF,S0849933,N,A0908656,,,,RCD,AT,X903M,Cowan serotype,3,N,256
12005268,C3432640,ENG,P,L10685815,PF,S13205143,Y,A20567502,,1143064,,NCBI,SCN,1143064,Macroderes sp. BMNH679805,0,N,256
3124965,C0329058,ENG,P,L0516177,PF,S0744100,N,A0804924,,,,SNMI,PT,L-C8431,Taractes asper,9,N,256
9132674,C1936187,ENG,P,L6658440,PF,S7688728,Y,A12919250,,385265,,NCBI,SCN,385265,Eurydactylodes agricolae,0,N,256
8980168,C1850055,ENG,P,L6468075,PF,S11794726,Y,A18470743,,M0530778,C536317,MSH,NM,C536317,PEHO syndrome,0,N,256
2880931,C0301304,ENG,P,L0598641,VO,S13956832,N,A23075124,,,,NDDF,IN,003123,phenylmercuric nitrate,3,N,256
12743769,C3829001,ENG,S,L11665161,PF,S14461537,Y,A23939462,,C113507,,NCI,SY,C113507,MLL-PTD,0,N,256
8663880,C1666686,ENG,S,L6395571,PF,S7327381,N,A23088241,3241000009117.0,1671000009105,,SNOMEDCT_VET,OP,1671000009105,Genus Cyanoliseus,9,O,256
10573547,C2635433,ENG,P,L8651374,PF,S10723554,Y,A16598591,,526527,,NCBI,SCN,526527,Aoraki,0,N,256
2228427,C0222167,ENG,S,L2848512,PF,S3546145,Y,A3708206,484648012.0,30598005,,SNOMEDCT_US,PT,30598005,Skin structure of epigastric area,9,N,256


In [6]:
# Frequencies of entries in different languages
mrconso_metamap_on["LAT"].value_counts()

ENG    4802688
Name: LAT, dtype: int64

In [7]:
sources_metamap_on = mrconso_metamap_on["SAB"].value_counts()

In [8]:
# Extract entries without MetaMap view
mrconso_metamap_off = mrconso[mrconso.CVF != "256"]

In [9]:
# Randomly sample 100 entries and display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(mrconso_metamap_off.sample(100))

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
1939584,C0186085,ENG,S,L3108374,PF,S3595776,Y,A3760601,794164013.0,55747005,,SNOMEDCT_US,FN,55747005,Transection of muscle of pelvis (procedure),9,N,
11064088,C2858118,ENG,P,L9370743,PF,S11651081,N,A17857832,,,S72.356F,ICD10CM,PT,S72.356F,Nondisplaced comminuted fracture of shaft of u...,4,N,
5398577,C0717309,ENG,S,L9104624,PF,S11358553,Y,A17399200,,M0540398,C056076,MSH,CE,C056076,"acetaminophen, caffeine, paracetamol drug comb...",0,N,
13112765,C4003259,ENG,S,L11905022,PF,S14891419,Y,A24499805,,1611710,,NCBI,AUN,1611710,"Proctophyllodes tenericaulus Atyeo & Vasilev, ...",0,Y,
7090285,C1177892,ENG,S,L11394887,PF,S17865454,Y,A29254884,,,,NDDF,CDC,048160,L-carnitine fumarate 200 mg ORAL CAPSULE,3,O,
8159633,C1483328,ENG,S,L8691523,PF,S10780139,Y,A16510597,,201563,,NCBI,AUN,201563,Polytrichum subpilosum P.Beauv.,0,Y,
12834106,C3861884,ENG,P,L11765940,PF,S15033924,Y,A24767918,,372782,,MEDCIN,PT,372782,"chest and abdomen x-ray, upright and PA chest ...",3,N,
6152596,C0881262,ITA,P,L12890220,PF,S15339259,Y,A25101439,,23917-8,,LNC-IT-IT,LN,23917-8,"Fragilità osmotica^0,60% cloruro di sodio:NFr:...",0,N,
3256160,C0340424,SPA,P,L4452293,PF,S5135763,Y,A5908758,1292823019.0,195030007,,SCTSPA,PT,195030007,miocardiopatía en la ataxia de Friedreich,9,N,
10943983,C2805902,ENG,S,L9266823,PF,S11488120,Y,A17510432,,692721,,NCBI,AUN,692721,Nicotiana truncata Symon,0,Y,


In [10]:
# Frequencies of entries in different languages
mrconso_metamap_off["LAT"].value_counts()

ENG    5508879
SPA    1470221
FRE     424471
JPN     322983
POR     300417
DUT     286526
ITA     236976
GER     232823
RUS     180945
CZE     180525
HUN     104811
CHI      77622
NOR      61724
TUR      51114
POL      50108
KOR      38660
EST      31204
SWE      29737
FIN      25486
SCR       9981
GRE       2241
LAV       1405
DAN        723
BAQ        695
HEB        485
Name: LAT, dtype: int64

In [11]:
sources_metamap_off = mrconso_metamap_off["SAB"].value_counts()

In [12]:
# Let's what english words are excluded from MetaMap view
mrconso_metamap_off_eng = mrconso_metamap_off[mrconso_metamap_off.LAT == "ENG"]

In [13]:
# Randomly sample 100 entries and display
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(mrconso_metamap_off_eng.sample(100))

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
5407999,C0720580,ENG,S,L6685369,PF,S7783691,Y,A13271797,,,,MMSL,BN,119531,Gas-X Infant Drops,1,N,
9678933,C2137556,ENG,P,L9048167,PF,S11188184,Y,A17220512,,268034,,MEDCIN,PT,268034,corn of distal interphalangeal joint of fifth ...,3,N,
12280226,C3573596,ENG,S,L11161815,PF,S13764253,Y,A21353725,,1197293,,NCBI,AUN,1197293,"Amplirhagada intermedia (Solem, 1981)",0,Y,
10221166,C2384273,ENG,P,L8270604,PF,S10501672,Y,A16330708,,06V40CZ,,ICD10PCS,PT,06V40CZ,Restriction of Hepatic Vein with Extraluminal ...,0,N,
14293578,C4622182,ENG,P,L14680706,PF,S17830071,Y,A29052927,,2060963,,NCBI,SCN,2060963,unclassified Microbulbiferaceae,0,N,
12447436,C3667081,ENG,S,L11688438,PF,S14495436,Y,A24035421,,,,MTHSPL,DP,49035-463,COCOA BUTTER 2211 mg / PHENYLEPHRINE HYDROCHLO...,0,N,4096.0
3311653,C0344928,ENG,S,L2988479,PF,S3473499,Y,A3625058,644326017.0,253554002,,SNOMEDCT_US,FN,253554002,Perimembranous ventricular septal defect with ...,9,N,
5056591,C0592445,ENG,S,L1046707,PF,S1260990,Y,A1219947,,,,RCD,AB,ske6.,Second Nature 2CT57 ost system,3,Y,
13338419,C4087190,ENG,P,L13135913,PF,S16096190,Y,A26715362,,,10077497,MDR,PT,10077497,Sulcus vocalis,3,N,
11444205,C3022403,ENG,P,L9867468,PF,S12285066,Y,A19173416,,824392,,NCBI,SCN,824392,Hymenoptera sp. BOLD:AAB2872,0,N,


In [14]:
sources = pd.DataFrame(dict(included = sources_metamap_on, excluded = sources_metamap_off))

In [15]:
# Frequencies of entries from different source vocabularies
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(sources)

Unnamed: 0,included,excluded
AIR,337.0,347
ALT,200.0,8865
AOD,17933.0,2746
AOT,444.0,27
ATC,5506.0,1310
BI,951.0,300
CCC,62.0,348
CCPSS,1764.0,14070
CCS,936.0,681
CCS_10,444.0,436


In [20]:
# Some shapes
print("Entire MRCONSO: {}".format(mrconso.shape))
print("MetaMap view: {}".format(mrconso_metamap_on.shape))
print("MetaMap view excluded: {}".format(mrconso_metamap_off.shape))
print("English entries excluded by MetaMap view: {}".format(mrconso_metamap_off_eng.shape))

Entire MRCONSO: (14433450, 18)
MetaMap view: (4802688, 18)
MetaMap view excluded: (9630762, 18)
English entries excluded by MetaMap view: (5508879, 18)


### Observations:
Metamap NLP content view extracts 4802688 english concepts and discards 5508879 english concepts and all the non-english concepts. \
Source Vocabularies like LONIC (LNC-), which are suggested by reference manual to be detrimental to NLP, are completely excluded. \

### TODO:
What kinds of English concepts were excluded?