In [None]:
import os
# Show the absolute path of the directory this notebook is running from.
os.path.abspath(os.curdir)

In [None]:
import os, sys
from loguru import logger

# The log file is placed in a "logs" folder under the notebook's working directory.
LOG_ROOT = os.path.abspath("./")
LOG_FILE = f"{LOG_ROOT}/logs/metamap_processing.log"

# loguru level numbers: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
ERROR_LEVEL_NO = 40


def _below_error(record):
    """Accept only records whose level number is lower than ERROR.

    Used as the stdout filter so that ERROR and above are printed by the stderr sink only.
    """
    return record["level"].no < ERROR_LEVEL_NO


# Remove all handlers and reset stderr
logger.remove(handler_id=None)

# File sink: everything from TRACE upwards, file opened with mode "w".
logger.add(
    LOG_FILE,
    level="TRACE",
    mode="w",
    backtrace=False,
    diagnose=True,
    colorize=False,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)

# Emitted before the console sinks are registered, so the banner only reaches the log file.
banner_lines = ["", ">" * 29, ">>> New execution started >>>", ">" * 29]
logger.info("\r\n".join(banner_lines))

# Console sinks: INFO..WARNING on stdout, ERROR and above on stderr.
logger.add(sys.stdout, level="INFO", filter=_below_error, colorize=True)
logger.add(sys.stderr, level="ERROR", backtrace=False, diagnose=True, colorize=True)

# Install Metamap

Follow these instructions:
- Install Metamap2020: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/Installation.html 
- Install additional datasets (2022 Specialist Lexicon, 2022AA UMLS NLM Datasets): https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/additional-tools/DataSetDownload.html

# Processing

## Check if the servers started
- taggerServer
- DisambiguatorServer

In [None]:
import os
# List the running processes whose command line mentions "java"; the servers named in the
# section above are expected to show up here (presumably as Java processes - confirm locally).
# Note: the grep process itself also matches, so one line of output is always present.
cmd = 'ps -ef | grep java'
# Use the pipe as a context manager so it is closed (and the child reaped) once it is read.
with os.popen(cmd) as out:
    print(out.read())

## Check metamap human readable output

In [None]:
import subprocess, shlex
# Sample report text used to eyeball MetaMap's human readable (-I) output.
text =  "There is no focal consolidation, pleural effusion or pneumothorax.  Cardiomediastinal silhouette and hilar contours are otherwise unremarkable."
meta_command = "metamap -V NLM -Z 2022AA -A --silent -I"

# Feed the text to MetaMap's stdin directly instead of piping it through `echo -e {text}`:
# the unquoted interpolation was word-split by shlex (collapsing repeated whitespace and
# failing on apostrophes/quotes in the text) and left the intermediate pipe open.
# The trailing newline mirrors what `echo` appended.
metamap_result = subprocess.run(
    shlex.split(meta_command),
    input=(text + "\n").encode(),
    stdout=subprocess.PIPE,
)
# stderr is not captured here, so `error` is None (same as before).
output, error = metamap_result.stdout, metamap_result.stderr
print(output.decode())

## Load data

In [None]:
import pandas
# JSON-lines file with one report per line.
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pandas.read_json(REPORT_PATH, orient="records", lines=True)
print(df)

# Pull each column of interest out as a plain Python list (same row order as the frame).
id_list, findings_list, impression_list, pfi_list, fai_list = (
    df[column].to_list()
    for column in (
        'sid',
        'findings',
        'impression',
        'provisional_findings_impression',
        'findings_and_impression',
    )
)

# Number of reports loaded; used to clamp batch boundaries.
DATA_SIZE = len(id_list)

## Run multiprocessing in jupyter

Construct the MetaMap command

In [None]:
import subprocess, shlex
# Documentation: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
def get_metamap_command():
    """Build the MetaMap command line used for batch processing.

    Returns:
        str: every enabled argument passed through ``format_command_arg`` and concatenated
        in order.
    """
    enabled_args = (
        "metamap",
        "-V NLM",       # Data Version: -V (--mm data version) [Base, USAbase, NLM]
        "-Z 2022AA",    # Knowledge Source: -Z (--mm data year)
        "-A",           # Data Model: [-A (--strict model), -C (--relaxed model)]
        "--silent",     # Hide Header Output: --silent
        "--JSONn",      # Output format: [-q (--machine output), --XMLf, --XMLn, --XMLf1, --XMLn1, --JSONf, --JSONn, -N (--fielded mmi output), -F (--formal tagger output)]
    )
    # Options that are currently switched off (kept for reference):
    #   --conj                 Turn on Conjunction Processing
    #   -y                     Word-Sense Disambiguation: -y (--word sense disambiguation)
    #   --UDA <file>           User-Defined Acronyms/Abbreviations (word replacement),
    #                          e.g. /home/yuxiangliao/PhD/UMLS/custom-resources/custom-word-replacement
    #   -J <list>              Retain only Concepts with Specified Semantic Types (--restrict to sts),
    #                          e.g. virs,cgab,acab,ffas,bpoc,medd,tmco,qlco,qnco,bsoj,blor,fndg,sosy,topp,ortf,patf,dsyn,inpo
    #   -I                     For human readable output
    command = ""
    for arg in enabled_args:
        command += format_command_arg(arg)
    return command

def format_command_arg(arg):
    """Return ``arg`` prefixed with a single space so it can be appended to a command string."""
    return "".join((" ", arg))

def run_metamap(startIndex,batch_size):
    """Run MetaMap on one batch of findings and return its raw output.

    Args:
        startIndex: index of the first report of the batch in ``findings_list``.
        batch_size: number of reports in the batch; the end index is clamped to ``DATA_SIZE``.

    Returns:
        tuple: ``(stdout_text, [startIndex, endIndex])`` where ``endIndex`` is exclusive.
    """
    endIndex = min(startIndex + batch_size, DATA_SIZE)
    # Empty/missing findings are replaced by the literal "None" so every record still
    # contributes one entry to the input.
    input_list = [(record if record else "None") for record in findings_list[startIndex:endIndex]]
    # Records are separated by a blank line (presumably MetaMap's document separator - the
    # parser relies on one output document per input record). The text is written straight
    # to MetaMap's stdin: the previous `echo -e` + repr() + shlex round trip broke on
    # records containing both quote characters. The trailing newline mirrors `echo`.
    metamap_input = ("\n\n".join(input_list) + "\n").encode()

    completed = subprocess.run(
        shlex.split(get_metamap_command()),
        input=metamap_input,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,  # capture stderr so the error branch below can actually fire
    )
    # Previously this tested an unrelated/undefined name (`error`) instead of this call's stderr.
    if completed.stderr:
        logger.error(completed.stderr.decode(errors="replace"))
    return completed.stdout.decode(), [startIndex,endIndex]

Data Object for JSON output

In [None]:
def resolveTokenIndices(section: "Section", startPos, length) -> list:
    """Map a character span onto the indices of the section tokens it covers.

    The ``Section`` annotation is a string because the class is defined further down;
    an unquoted name would be evaluated at definition time and raise NameError.

    Args:
        section: object exposing ``tokenPos``, the start offset of each token.
        startPos: character offset where the span starts.
        length: number of characters in the span.

    Returns:
        list: indices into ``section.tokenPos`` from the token containing ``startPos``
        up to the token containing the span's last character (``startPos + length - 1``).
    """
    indicesInSection = []
    doInsert = False
    posPointer = startPos  # first the span start, later the span's last character
    tokenPos = section.tokenPos
    for i, currPos in enumerate(tokenPos):
        # The last token has no successor; it is treated as extending 99 characters.
        nextPos = tokenPos[i+1] if i + 1 < len(tokenPos) else currPos + 99
        if not doInsert and currPos <= posPointer < nextPos:
            # Found the token holding the span start; now look for the span end.
            doInsert = True
            posPointer = startPos + length - 1
        elif doInsert and posPointer < currPos:
            break  # this token starts after the span end; stop early
        if doInsert:
            indicesInSection.append(i)
    return indicesInSection

class Concept(object):
    """One concept candidate attached to a phrase.

    ``isHead`` / ``isNegated`` accept either the already-normalised integer (1/0) or the raw
    string form ("yes" / "1"). Previously only the string form was recognised, so callers
    that passed the integer 1 always ended up with 0.
    """
    def __init__(self, sourceTokens:list, startPosList:list, lengthList:list, umlsCUI:str, preferedName:str, hitTerm:str, categories:list, isHead, isNegated):
        self.sourceTokens = sourceTokens
        self.startPosList = startPosList
        self.lengthList = lengthList
        self.indicesInSection = []  # filled by update()
        self.umlsCUI = umlsCUI
        self.preferedName = preferedName
        self.hitTerm = hitTerm
        self.categories = categories
        self.isHead = 1 if isHead in (1, "yes") else 0
        self.isNegated = 1 if isNegated in (1, "1") else 0
    def update(self, section:"Section"):
        """Resolve every (startPos, length) span to token indices of ``section``.

        The annotation is a string because ``Section`` is defined later in the cell.
        """
        for startPos, length in zip(self.startPosList, self.lengthList):
            indicesInSection = resolveTokenIndices(section, startPos, length)
            self.indicesInSection.extend(indicesInSection)
        
class ConceptGroup(object):
    """An ordered collection of concepts that belong to one mapping.

    Project types are annotated as strings: ``Section`` is defined later in the cell, and
    an unquoted annotation would raise NameError when the class body is executed.
    """
    def __init__(self):
        self.concepts = []
    def addConcept(self, concept:"Concept"):
        """Append ``concept`` to the group."""
        self.concepts.append(concept)
    def update(self, section:"Section"):
        """Forward ``update(section)`` to every concept in the group."""
        for obj in self.concepts:
            obj.update(section)
        
class SyntaxChunk:
    """Plain value holder for one syntax unit of a phrase."""

    def __init__(
        self,
        text: str,
        lexicalMatch: str,
        syntaxType: str,
        partOfSpeech: str,
        tokens: list,
    ):
        # `text` keeps the original (case sensitive) form of the input.
        # Attributes are assigned left to right, so their order is unchanged.
        (
            self.text,
            self.lexicalMatch,
            self.syntaxType,
            self.partOfSpeech,
            self.tokens,
        ) = (text, lexicalMatch, syntaxType, partOfSpeech, tokens)
                
class Phrase(object):
    """A phrase: its text span, tokens, syntax chunks and concept mappings.

    Project types are annotated as strings: ``Section`` is defined later in the cell, and
    an unquoted annotation would raise NameError when the class body is executed.
    """
    def __init__(self, text:str, startPos:int, length:int):
        self.text = text
        self.startPos = startPos
        self.length = length
        self.tokens = []            # accumulated from the added syntax chunks
        self.indicesInSection = []  # filled by update()
        self.syntaxChunks = []
        self.mappings = []
    def addSyntaxChunk(self, syntaxChunk:"SyntaxChunk"):
        """Store the chunk and append its tokens to the phrase tokens."""
        self.syntaxChunks.append(syntaxChunk)
        self.tokens.extend(syntaxChunk.tokens)
    def addConceptGroup(self, conceptGroup:"ConceptGroup"):
        """Store one mapping (a group of concepts) for this phrase."""
        self.mappings.append(conceptGroup)
    def update(self, section:"Section"):
        """Resolve the phrase span to token indices of ``section``, then update the mappings."""
        indicesInSection = resolveTokenIndices(section, self.startPos, self.length)
        self.indicesInSection.extend(indicesInSection)
        for obj in self.mappings:
            obj.update(section)
        
        
class Sentence(object):
    """A sentence: its text span, tokens and phrases.

    Project types are annotated as strings: ``Section`` is defined later in the cell, and
    an unquoted annotation would raise NameError when the class body is executed.
    """
    def __init__(self, text:str, startPos:int, length:int):
        self.text = text
        self.startPos = startPos
        self.length = length
        self.tokens = []            # accumulated from the added phrases
        self.indicesInSection = []  # filled by update()
        self.phrases = []
    def addPhrase(self, phrase:"Phrase"):
        """Store the phrase and append its tokens to the sentence tokens."""
        self.phrases.append(phrase)
        self.tokens.extend(phrase.tokens)
    def update(self, section:"Section"):
        """Resolve the sentence span to token indices of ``section``, then update the phrases."""
        indicesInSection = resolveTokenIndices(section, self.startPos, self.length)
        self.indicesInSection.extend(indicesInSection)
        # Update the children objs
        for obj in self.phrases:
            obj.update(section)

class Negation(object):
    """A negation: the trigger span(s) and the negated target span(s).

    The ``Section`` annotation is a string because the class is defined later in the cell;
    an unquoted annotation would raise NameError when the class body is executed.

    NOTE(review): the attribute names ``trgger`` and ``tarrget`` look like typos of
    "trigger"/"target". They are kept as-is because they end up as key names in the
    serialized output - rename only together with every consumer.
    """
    def __init__(self, text:str, triStartPosList:list, triLengthList:list, conceptsCUIs:list, tarStartPosList:list, tarLengthList:list):
        self.trgger = {
            'text': text,
            'startPosList': triStartPosList,
            'lengthList': triLengthList,
            'indicesInSection': []   # filled by update()
        }
        self.tarrget = {
            'conceptsCUIs': conceptsCUIs,
            'startPosList': tarStartPosList,
            'lengthList': tarLengthList,
            'indicesInSection': []   # filled by update()
        }
    def update(self, section:"Section"):
        """Resolve the trigger and target spans to token indices of ``section``."""
        for part in (self.trgger, self.tarrget):
            for startPos, length in zip(part['startPosList'], part['lengthList']):
                part['indicesInSection'].extend(resolveTokenIndices(section, startPos, length))

class Section(object):
    """A report section: concatenated sentence text, tokens, token offsets and negations."""
    def __init__(self, name:str):
        self.name = name
        self.text = "" # context
        self.tokens = []
        self.tokenPos = []   # start offset of each token in self.text (lower-cased search)
        self.sentences = []
        self.negations = []
    def addSentence(self, sentence:"Sentence"):
        """Store the sentence and append its text and tokens to the section."""
        self.sentences.append(sentence)
        self.text += sentence.text
        self.tokens.extend(sentence.tokens)
    def addNegation(self, negation:"Negation"):
        """Store one negation for this section."""
        self.negations.append(negation)
    def update(self):
        """Compute ``tokenPos`` and propagate the update to sentences and negations.

        Each token is searched for in the lower-cased section text, starting right after
        the end of the previously found token. (Searching from the previous token's start
        gave repeated tokens such as "no", "no" the same offset.) A token that cannot be
        found is recorded as -1 and does not move the search position, so later tokens
        are still located.
        """
        lowered = self.text.lower()   # hoisted: identical for every token
        positions = []
        cursor = 0
        for token in self.tokens:
            found = lowered.find(token, cursor)
            positions.append(found)
            if found >= 0:
                cursor = found + len(token)
        self.tokenPos = positions
        # Update the children objs
        for obj in self.sentences:
            obj.update(self)
        for obj in self.negations:
            obj.update(self)
        
class Record:
    """One report, identified by ``sid``, holding its sections."""

    def __init__(self, sid:str):
        self.sid = sid
        self.sections = []

    def addSection(self, section:Section):
        """Call ``section.update()`` and then store the section on this record."""
        section.update()
        self.sections += [section]
        
class Records:
    """Container for the records produced from one batch."""

    def __init__(self):
        self.records = []

    def addRecord(self, record:Record):
        """Append ``record`` to the collection."""
        self.records += [record]

Methods to resolve specific JSON subtags

In [None]:
def resolveSyntaxUnit(syntaxUnit):
    """Convert one syntax-unit dict into a ``SyntaxChunk``.

    Reads 'InputMatch', 'SyntaxType' and 'Tokens'; 'LexMatch' and 'LexCat' are optional and
    both default to "" unless both are present.
    """
    inputMatch = syntaxUnit['InputMatch']
    unitType = syntaxUnit['SyntaxType']
    unitTokens = syntaxUnit['Tokens']
    if not unitTokens:
        # Units with an empty token list (the original note says punctuation) use their text as token.
        logger.trace(f"Empty token detected: SyntaxType:{unitType}, InputMatch:{inputMatch}")
        unitTokens = [inputMatch]
    if 'LexMatch' in syntaxUnit and 'LexCat' in syntaxUnit:
        lexMatch, lexCat = syntaxUnit['LexMatch'], syntaxUnit['LexCat']
    else:
        lexMatch, lexCat = "", ""
    # Trace alphanumeric inputs whose lower-cased text differs from the lexical match.
    if inputMatch.isalnum() and inputMatch.lower() != lexMatch:
        logger.trace(f"text:[{inputMatch}], lexicalMatch:[{lexMatch}]")
    return SyntaxChunk(inputMatch, lexMatch, unitType, lexCat, unitTokens)

def resolveConcept(mappingCandidate):
    """Convert one mapping-candidate dict into a ``Concept``.

    The raw 'IsHead' and 'Negated' values are handed to ``Concept`` unchanged, because its
    constructor performs the "yes" / "1" -> 1/0 normalisation itself. Converting here as
    well (as was done before) fed it integers it did not recognise, so both flags always
    came out as 0.
    """
    conceptSpans = mappingCandidate['ConceptPIs']
    return Concept(
        mappingCandidate['MatchedWords'],
        [int(span['StartPos']) for span in conceptSpans],
        [int(span['Length']) for span in conceptSpans],
        mappingCandidate['CandidateCUI'],
        mappingCandidate['CandidatePreferred'],
        mappingCandidate['CandidateMatched'],
        mappingCandidate['SemTypes'],
        mappingCandidate['IsHead'],
        mappingCandidate['Negated'],
    )

def resolveNegation(negation):
    """Convert one negation dict into a ``Negation`` object.

    Start positions and lengths of the trigger and target spans are converted to int.
    """
    triggerText = negation['NegTrigger']
    triggerSpans = negation['NegTriggerPIs']
    triggerStarts = [int(span['StartPos']) for span in triggerSpans]
    triggerLengths = [int(span['Length']) for span in triggerSpans]
    negatedCUIs = [concept['NegConcCUI'] for concept in negation['NegConcepts']]
    targetSpans = negation['NegConcPIs']
    targetStarts = [int(span['StartPos']) for span in targetSpans]
    targetLengths = [int(span['Length']) for span in targetSpans]
    return Negation(triggerText, triggerStarts, triggerLengths, negatedCUIs, targetStarts, targetLengths)
    

Method to resolve JSON format output

In [None]:
def parseMetamapJSON(json_obj,id_subList) -> Records:
    """Turn one parsed MetaMap JSON output into a ``Records`` collection.

    Args:
        json_obj: decoded JSON with an 'AllDocuments' list.
        id_subList: record ids, one per document, in the same order as 'AllDocuments'.

    Returns:
        Records: one ``Record`` per document, each with a single "findings" section.
    """
    parsed = Records()
    for docIdx, docEntry in enumerate(json_obj['AllDocuments']):
        record = Record(id_subList[docIdx])
        section = Section("findings")
        docBody = docEntry['Document']

        # Utterances -> sentences -> phrases -> syntax chunks / concept mappings
        for utt in docBody['Utterances']:
            sentence = Sentence(
                text=utt['UttText'],
                startPos=int(utt['UttStartPos']),
                length=int(utt['UttLength']),
            )
            for phraseEntry in utt['Phrases']:
                phrase = Phrase(
                    text=phraseEntry['PhraseText'],
                    startPos=int(phraseEntry['PhraseStartPos']),
                    length=int(phraseEntry['PhraseLength']),
                )
                for unit in phraseEntry['SyntaxUnits']:
                    phrase.addSyntaxChunk(resolveSyntaxUnit(unit))
                for mappingEntry in phraseEntry['Mappings']:
                    group = ConceptGroup()
                    for candidate in mappingEntry['MappingCandidates']:
                        group.addConcept(resolveConcept(candidate))
                    phrase.addConceptGroup(group)
                sentence.addPhrase(phrase)
            section.addSentence(sentence)

        # Negations are attached at section level.
        for negEntry in docBody['Negations']:
            section.addNegation(resolveNegation(negEntry))

        # addSection() triggers section.update(), so it must come after everything is added.
        record.addSection(section)
        parsed.addRecord(record)
    return parsed

In [None]:
import jsonpickle

def classToJSON(obj) -> str:
    """Serialize ``obj`` with jsonpickle, using ``unpicklable=False``."""
    encoded = jsonpickle.encode(obj, unpicklable=False)
    return encoded

Execute metamap only

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Lock
import json

# Batch configuration for this run.
BATCH_SIZE = 5
# mp.cpu_count()
WORKERS_NUM = 5
DATA_START_POS = 0
DATA_STOP_POS = 100
# DATA_END_INDEX = DATA_SIZE
OUTPUT_PATH = "/home/yuxiangliao/PhD/output/metamap/metamap_output_100.json"

# The executor is used as a context manager so its worker processes are shut down when the
# cell finishes (it was previously never shut down). The stop position is clamped to
# DATA_SIZE so no batches are submitted past the end of the data.
with ProcessPoolExecutor(max_workers=WORKERS_NUM) as executor, open(OUTPUT_PATH, "w") as f:
    all_task = [
        executor.submit(run_metamap, startIndex, BATCH_SIZE)
        for startIndex in range(DATA_START_POS, min(DATA_STOP_POS, DATA_SIZE), BATCH_SIZE)
    ]

    # Results are consumed one at a time in this (parent) process, so writes to `f` are
    # already serialized and no lock is needed around them.
    for future in as_completed(all_task):
        output, idx_inteval = future.result()
        id_subList = id_list[idx_inteval[0]:idx_inteval[1]]
        # Only the second line is the required JSON string.
        json_output = output.split("\n")[1]
        json_obj = json.loads(json_output)
        records_batch = parseMetamapJSON(json_obj, id_subList)
        # One JSON document per batch, one batch per line.
        f.write(classToJSON(records_batch))
        f.write("\n")
        f.flush()  # was `f.flush` without parentheses, i.e. never actually called

Execute with spaCy

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Lock
import json
import spacy

# Batch configuration for this run (a single record, for inspection).
BATCH_SIZE =1
# mp.cpu_count()
WORKERS_NUM = 1
DATA_START_POS = 376
DATA_END_POS = 377
# DATA_END_INDEX = DATA_SIZE

nlp = spacy.load("en_core_web_md",disable=['ner'])
# NOTE(review): "metamap_processor" must be a registered spaCy component; its registration
# is not visible in this part of the notebook - confirm it is defined before this cell runs.
nlp.add_pipe("metamap_processor", last=True)

executor = ProcessPoolExecutor(max_workers=WORKERS_NUM)
all_task = [executor.submit(run_metamap, startIndex, BATCH_SIZE) for startIndex in range(DATA_START_POS, DATA_END_POS, BATCH_SIZE)]

lock=Lock()
# NOTE(review): nothing is written to `f` in the code visible here (the write block was
# commented out), so the output file is only created/truncated.
with open("/home/yuxiangliao/PhD/output/metamap/metamap_output_test.json","w") as f:
    for future in as_completed(all_task):
        # Metamap
        metamap_output, idx_inteval = future.result()
        id_subList = id_list[idx_inteval[0]:idx_inteval[1]] 
        metamap_json_output = list(metamap_output.split("\n"))[1] # Only the second line is the required JSON string.
        metamap_json_obj = json.loads(metamap_json_output)
        parsed_obj_batch = parseMetamapJSON(metamap_json_obj, id_subList)
    
        # SpaCy: run the pipeline on each record's first section, carrying the record as context.
        text_tuples = [(record.sections[0].text,{"record":record}) for record in parsed_obj_batch.records]
        for doc, context in nlp.pipe(text_tuples, as_tuples=True):
            print(context['record'].sid)
            # Per-token group ids; -1 marks tokens outside any noun chunk / sentence.
            nounChunks = [-1] * len(doc)
            for chunk_id, chunk in enumerate(doc.noun_chunks):
                nounChunks[chunk.start:chunk.end] = [chunk_id] * (chunk.end-chunk.start)
            sentences = [-1] * len(doc)
            for sent_id, sent in enumerate(doc.sents):
                sentences[sent.start:sent.end] = [sent_id] * (sent.end-sent.start)
            # Character offset of each token within the document. spaCy exposes this as
            # Token.idx; the previous code searched the unrelated global `text` (the sample
            # sentence from an earlier cell), which produced wrong offsets.
            offset = [tok.idx for tok in doc]
            data = {
                'token': [tok for tok in doc],
                'tokenOffset': offset,
                'sentenceGroup': sentences,
                'nounChunk': nounChunks,
                'lemma': [tok.lemma_ for tok in doc],
                'pos_core': [f"[{tok.pos_}]{spacy.explain(tok.pos_)}" for tok in doc],
                'pos_feature': [f"[{tok.tag_}]{spacy.explain(tok.tag_)}" for tok in doc],
                'dependency': [f"[{tok.dep_}]{spacy.explain(tok.dep_)}" for tok in doc],
                'dependency_head': [tok.head.text for tok in doc],
                'dependency_children': [[child for child in tok.children] for tok in doc],
                'morphology': [tok.morph for tok in doc],
                'is_alpha': [tok.is_alpha for tok in doc],
                'is_stop': [tok.is_stop for tok in doc],
                'is_pronoun': [True if tok.pos_ == 'PRON' else False for tok in doc],
                'trailing_space': [True if tok.whitespace_ else False for tok in doc]
            }
            # pandas is imported under its full name in this notebook (there is no `pd` alias).
            output = pandas.DataFrame(data=data)