In [None]:
import os

# Show the notebook's current working directory; the log path built in the
# next cell is resolved relative to it.
os.path.abspath("")


In [None]:
import os, sys
from loguru import logger

# Log file location, resolved against the current working directory.
LOG_ROOT = os.path.abspath("./")
LOG_FILE = LOG_ROOT + "/logs/sr-3.log"

# Remove all handlers and reset stderr
logger.remove(handler_id=None)
# File sink: receives every level (TRACE and above) and is overwritten on each
# run because of mode="w".
logger.add(
    LOG_FILE,
    level="TRACE",
    mode="w",
    backtrace=False,
    diagnose=True,
    colorize=False,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
# Emitted while only the file sink is registered, so the banner is written to
# the log file and not to the notebook output.
logger.info("\r\n" + ">" * 29 + "\r\n" + ">>> New execution started >>>" + "\r\n" + ">" * 29)
# To filter log level: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
# stdout gets INFO up to (but excluding) ERROR; stderr gets ERROR and above,
# so no record is printed on both streams.
logger.add(sys.stdout, level="INFO", filter=lambda record: record["level"].no < 40, colorize=True)
logger.add(sys.stderr, level="ERROR", backtrace=False, diagnose=True, colorize=True)


# Install Metamap

Follow these instructions:
- Install Metamap2020: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/Installation.html 
- Install additional datasets (2022 Specialist Lexicon, 2022AA UMLS NLM Datasets): https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/additional-tools/DataSetDownload.html

# Processing

## Check if the servers started
- taggerServer
- DisambiguatorServer

In [None]:
import os

# List the running Java processes so the MetaMap servers named above can be
# spotted in the output. Note that the `grep java` process itself is listed too.
cmd = "ps -ef | grep java"
# The context manager closes the pipe (and reaps the shell) once it has been read.
with os.popen(cmd) as out:
    print(out.read())


## Check metamap human readable output

In [None]:
# import subprocess, shlex
# text =  "There is no focal consolidation, pleural effusion or pneumothorax.  Cardiomediastinal silhouette and hilar contours are otherwise unremarkable."
# input_command = f"echo -e {text}"
# input_process = subprocess.Popen(shlex.split(input_command), stdout=subprocess.PIPE)
# meta_command = "metamap -V NLM -Z 2022AA -A --silent -I"
# metamap_process = subprocess.Popen(shlex.split(meta_command), stdout=subprocess.PIPE, stdin=input_process.stdout)
# output, error = metamap_process.communicate()
# print(output.decode())


## Load data

In [None]:
import pandas as pd

# Line-delimited JSON file holding one report per line.
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pd.read_json(REPORT_PATH, orient="records", lines=True)
print(df)

# One plain Python list per column; all lists share the DataFrame's row order,
# so the same position refers to the same report in each of them.
id_list = df["sid"].to_list()
findings_list = df["findings"].to_list()
impression_list = df["impression"].to_list()
pfi_list = df["provisional_findings_impression"].to_list()
fai_list = df["findings_and_impression"].to_list()

# Total number of reports; used as the upper bound when slicing batches.
DATA_SIZE = len(id_list)


## Run multiprocessing in jupyter

Construct the MetaMap command

In [None]:
import subprocess, shlex

# Documentation: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
def get_metamap_command():
    """Build the MetaMap command line used for every batch.

    Returns:
        str: The executable name followed by the enabled options, each one
        prefixed by ``format_command_arg``.
    """
    enabled_args = (
        "metamap",
        "-V NLM",  # Data Version: -V (--mm data version) [Base, USAbase, NLM]
        "-Z 2022AA",  # Knowledge Source: -Z (--mm data year)
        "-A",  # Data Model: [-A (--strict model), -C (--relaxed model)]
        "--silent",  # Hide Header Output: --silent
        # Output format: [-q (--machine output), --XMLf, --XMLn, --XMLf1, --XMLn1,
        # --JSONf, --JSONn, -N (--fielded mmi output), -F (--formal tagger output)]
        "--JSONn",
    )
    command = ""
    for arg in enabled_args:
        command += format_command_arg(arg)
    # Options that are currently disabled, kept for reference:
    #   --conj                 Turn on Conjunction Processing
    #   -y                     Word-Sense Disambiguation (--word sense disambiguation)
    #   --UDA <file>           User-Defined Acronyms/Abbreviations (word replacement), e.g.
    #                          /home/yuxiangliao/PhD/UMLS/custom-resources/custom-word-replacement
    #   -J <list>              Retain only Concepts with Specified Semantic Types (--restrict to sts), e.g.
    #                          virs,cgab,acab,ffas,bpoc,medd,tmco,qlco,qnco,bsoj,blor,fndg,sosy,topp,ortf,patf,dsyn,inpo
    #   -I                     For human readable output
    return command


def format_command_arg(arg):
    """Return ``arg`` with a single leading space so it can be appended to a command string."""
    return "".join((" ", arg))


def run_metamap(startIndex, batch_size):
    """Run MetaMap on one batch of findings texts.

    Args:
        startIndex: Position of the first report of the batch in ``findings_list``.
        batch_size: Maximum number of reports in the batch; the batch is
            truncated at ``DATA_SIZE``.

    Returns:
        tuple: ``(metamap_stdout, [startIndex, endIndex], input_list)`` where
        ``metamap_stdout`` is the decoded standard output of MetaMap,
        ``endIndex`` is exclusive and ``input_list`` holds the texts that were
        sent (empty records are replaced by the string ``"None"``).
    """
    endIndex = min(startIndex + batch_size, DATA_SIZE)
    input_list = [(record if record else "None") for record in findings_list[startIndex:endIndex]]
    # Records are separated by a blank line. The text is written straight to
    # MetaMap's stdin instead of going through `echo -e` with a repr()-quoted
    # argument: that round trip broke on texts containing both quote characters
    # and left an un-reaped helper process behind. The trailing newline matches
    # what `echo` used to append.
    metamap_input = "\n\n".join(input_list) + "\n"

    completed = subprocess.run(
        shlex.split(get_metamap_command()),
        input=metamap_input.encode(),
        stdout=subprocess.PIPE,
        # stderr must be piped explicitly, otherwise there is nothing to log below.
        stderr=subprocess.PIPE,
    )
    if completed.stderr:
        logger.error(completed.stderr.decode())
    return completed.stdout.decode(), [startIndex, endIndex], input_list



Data objects for the JSON output

In [None]:
class Concept:
    """Plain data holder for one concept candidate.

    All constructor arguments are stored unchanged under attributes of the
    same name.
    """

    def __init__(
        self,
        sourceTokens: list,
        startPosList: list,
        lengthList: list,
        umlsCUI: str,
        preferedName: str,
        hitTerm: str,
        categories: list,
        isHead: int,
        isNegated: int,
    ):
        # Where the concept was matched in the text.
        self.sourceTokens = sourceTokens
        self.startPosList = startPosList
        self.lengthList = lengthList
        # Identity of the concept.
        self.umlsCUI = umlsCUI
        self.preferedName = preferedName
        self.hitTerm = hitTerm
        self.categories = categories
        # Integer flags.
        self.isHead = isHead
        self.isNegated = isNegated


class ConceptGroup:
    """Ordered collection of ``Concept`` objects."""

    def __init__(self):
        # Concepts in the order they were added.
        self.concepts = list()

    def addConcept(self, concept: Concept):
        """Append ``concept`` to the group."""
        self.concepts.append(concept)


class SyntaxChunk:
    """Plain data holder for one syntax unit of a phrase."""

    def __init__(self, text: str, lexicalMatch: str, syntaxType: str, partOfSpeech: str, tokens: list):
        # The original form of the text (case sensitive)
        self.text = text
        self.lexicalMatch = lexicalMatch
        self.syntaxType = syntaxType
        self.partOfSpeech = partOfSpeech
        self.tokens = tokens


class Phrase:
    """A phrase with its position, its syntax chunks and its concept mappings."""

    def __init__(self, text: str, startPos: int, length: int):
        self.text = text
        self.startPos = startPos
        self.length = length
        # Filled through addSyntaxChunk / addConceptGroup.
        self.syntaxChunks = list()
        self.mappings = list()

    def addSyntaxChunk(self, syntaxChunk: SyntaxChunk):
        """Append ``syntaxChunk`` to ``syntaxChunks``."""
        self.syntaxChunks.append(syntaxChunk)

    def addConceptGroup(self, conceptGroup: ConceptGroup):
        """Append ``conceptGroup`` to ``mappings``."""
        self.mappings.append(conceptGroup)


class Sentence:
    """A sentence with its position and the phrases it contains."""

    def __init__(self, text: str, startPos: int, length: int):
        self.text = text
        self.startPos = startPos
        self.length = length
        # Filled through addPhrase.
        self.phrases = list()

    def addPhrase(self, phrase: Phrase):
        """Append ``phrase`` to ``phrases``."""
        self.phrases.append(phrase)


class Negation:
    """A negation, stored as a trigger dictionary and a target dictionary.

    The attribute names ``trgger`` and ``tarrget`` are misspelled but are kept
    as they are because other code reads them under these names.
    """

    def __init__(
        self,
        text: str,
        triStartPosList: list,
        triLengthList: list,
        conceptsCUIs: list,
        tarStartPosList: list,
        tarLengthList: list,
    ):
        # The negation trigger: its text and where it occurs.
        self.trgger = dict(
            text=text,
            startPosList=triStartPosList,
            lengthList=triLengthList,
        )
        # The negated target: the concept CUIs and where they occur.
        self.tarrget = dict(
            conceptsCUIs=conceptsCUIs,
            startPosList=tarStartPosList,
            lengthList=tarLengthList,
        )


class Section:
    """A named report section with its sentences, negations and accumulated text."""

    def __init__(self, name: str):
        self.name = name
        # context: grows by the text of every sentence that is added
        self.text = ""
        self.sentences = list()
        self.negations = list()

    def addSentence(self, sentence: Sentence):
        """Append ``sentence`` and concatenate its text onto ``self.text`` (no separator is inserted)."""
        self.sentences.append(sentence)
        self.text = self.text + sentence.text

    def addNegation(self, negation: Negation):
        """Append ``negation`` to ``negations``."""
        self.negations.append(negation)


class Record:
    """One report, identified by ``sid``, holding a list of sections."""

    def __init__(self, sid: str):
        self.sid = sid
        self.sections = list()

    def addSection(self, section: Section):
        """Append ``section`` to ``sections``."""
        self.sections.append(section)

    def getFindingSection(self) -> Section:
        """Return the first section, asserting that it is named ``"findings"``."""
        firstSection = self.sections[0]
        assert firstSection.name == "findings"
        return self.sections[0]


class Records:
    """Ordered collection of ``Record`` objects."""

    def __init__(self):
        # Records in the order they were added.
        self.records = list()

    def addRecord(self, record: Record):
        """Append ``record`` to the collection."""
        self.records.append(record)


Methods to resolve specific JSON subtags

In [None]:
def resolveSyntaxUnit(syntaxUnit):
    """Convert one syntax-unit dictionary into a ``SyntaxChunk``.

    Args:
        syntaxUnit: Mapping with the keys ``InputMatch``, ``SyntaxType`` and
            ``Tokens``; ``LexMatch`` and ``LexCat`` are optional.

    Returns:
        SyntaxChunk: Built from the fields above. When either optional key is
        missing, both the lexical match and the part of speech are ``""``.
    """
    inputMatch = syntaxUnit["InputMatch"]
    syntaxType = syntaxUnit["SyntaxType"]
    tokens = syntaxUnit["Tokens"]
    # Add punc to token list: a unit with no tokens gets its raw text as the only token.
    if not tokens:
        logger.trace(f"Empty token detected: SyntaxType:{syntaxType}, InputMatch:{inputMatch}")
        tokens = [inputMatch]
    try:
        lexicalMatch, partOfSpeech = syntaxUnit["LexMatch"], syntaxUnit["LexCat"]
    except KeyError:
        lexicalMatch = partOfSpeech = ""
    # Trace alphanumeric units whose lower-cased text differs from the lexical match.
    differsFromLexicon = inputMatch.lower() != lexicalMatch
    if differsFromLexicon and inputMatch.isalnum():
        logger.trace(f"text:[{inputMatch}], lexicalMatch:[{lexicalMatch}]")
    return SyntaxChunk(inputMatch, lexicalMatch, syntaxType, partOfSpeech, tokens)


def resolveConcept(mappingCandidate):
    """Convert one mapping-candidate dictionary into a ``Concept``.

    Positions and lengths are read from every entry of ``ConceptPIs`` and
    converted to ``int``. ``IsHead`` (``"yes"``) and ``Negated`` (``"1"``) are
    turned into the integers 1 or 0.
    """
    sourceTokens = mappingCandidate["MatchedWords"]
    startPosList = []
    for positionInfo in mappingCandidate["ConceptPIs"]:
        startPosList.append(int(positionInfo["StartPos"]))
    lengthList = []
    for positionInfo in mappingCandidate["ConceptPIs"]:
        lengthList.append(int(positionInfo["Length"]))
    return Concept(
        sourceTokens=sourceTokens,
        startPosList=startPosList,
        lengthList=lengthList,
        umlsCUI=mappingCandidate["CandidateCUI"],
        preferedName=mappingCandidate["CandidatePreferred"],
        hitTerm=mappingCandidate["CandidateMatched"],
        categories=mappingCandidate["SemTypes"],
        isHead=int(mappingCandidate["IsHead"] == "yes"),
        isNegated=int(mappingCandidate["Negated"] == "1"),
    )


def resolveNegation(negation):
    """Convert one negation dictionary into a ``Negation``.

    Trigger positions come from ``NegTriggerPIs``, target positions from
    ``NegConcPIs`` and the target CUIs from ``NegConcepts``; all positions and
    lengths are converted to ``int``.
    """
    triggerText = negation["NegTrigger"]
    triggerStarts, triggerLengths = [], []
    for positionInfo in negation["NegTriggerPIs"]:
        triggerStarts.append(int(positionInfo["StartPos"]))
    for positionInfo in negation["NegTriggerPIs"]:
        triggerLengths.append(int(positionInfo["Length"]))
    targetCUIs = [negConcept["NegConcCUI"] for negConcept in negation["NegConcepts"]]
    targetStarts, targetLengths = [], []
    for positionInfo in negation["NegConcPIs"]:
        targetStarts.append(int(positionInfo["StartPos"]))
    for positionInfo in negation["NegConcPIs"]:
        targetLengths.append(int(positionInfo["Length"]))
    return Negation(triggerText, triggerStarts, triggerLengths, targetCUIs, targetStarts, targetLengths)



Method to resolve JSON format output

In [None]:
def parseMetamapJSON(json_obj, id_subList) -> Records:
    """Turn a decoded MetaMap JSON object into a ``Records`` collection.

    Args:
        json_obj: Decoded JSON with an ``AllDocuments`` list.
        id_subList: Record ids; the id at position ``k`` is given to the
            ``k``-th document.

    Returns:
        Records: One ``Record`` per document, each holding a single section
        named ``"findings"`` with its sentences, phrases, syntax chunks,
        concept groups and negations.
    """
    records = Records()
    for docPosition, docEntry in enumerate(json_obj["AllDocuments"]):
        record = Record(id_subList[docPosition])
        section = Section("findings")
        for utterance in docEntry["Document"]["Utterances"]:
            sentence = Sentence(
                text=utterance["UttText"],
                startPos=int(utterance["UttStartPos"]),
                length=int(utterance["UttLength"]),
            )
            for phraseEntry in utterance["Phrases"]:
                phrase = Phrase(
                    text=phraseEntry["PhraseText"],
                    startPos=int(phraseEntry["PhraseStartPos"]),
                    length=int(phraseEntry["PhraseLength"]),
                )
                # Syntax units first, then the concept mappings of the phrase.
                for syntaxUnit in phraseEntry["SyntaxUnits"]:
                    phrase.addSyntaxChunk(resolveSyntaxUnit(syntaxUnit))
                for mapping in phraseEntry["Mappings"]:
                    conceptGroup = ConceptGroup()
                    for candidate in mapping["MappingCandidates"]:
                        conceptGroup.addConcept(resolveConcept(candidate))
                    phrase.addConceptGroup(conceptGroup)
                sentence.addPhrase(phrase)
            section.addSentence(sentence)
        # Negations are listed per document, not per utterance.
        for negationEntry in docEntry["Document"]["Negations"]:
            section.addNegation(resolveNegation(negationEntry))
        record.addSection(section)
        records.addRecord(record)
    return records


Methods to align the metamap output to the spacy output

In [None]:
from spacy.tokens import Doc
from operator import itemgetter


def align(baseList: Doc, inputTokenGroups):
    """Label every position of ``baseList`` with the ordinal of the group covering it.

    Each group must expose ``start`` and ``end`` (exclusive) attributes.
    Positions not covered by any group stay ``-1``; a later group overwrites
    an earlier one on overlap.
    """
    labels = [-1] * len(baseList)
    for groupNo, group in enumerate(inputTokenGroups):
        first, stop = group.start, group.end
        labels[first:stop] = [groupNo] * (stop - first)
    return labels


def align_byIndex(baseList: "Doc", inputIndexGroups):
    """Label every position of ``baseList`` with the ordinal of the index group covering it.

    Args:
        baseList: Sized object; only its length is used.
        inputIndexGroups: Iterable of index lists. Each group labels the whole
            span from its first to its last index.

    Returns:
        list: One entry per position of ``baseList``; ``-1`` where no group
        applies. A later group overwrites an earlier one on overlap.
    """
    alignment = [-1] * len(baseList)
    for groupId, indexGroup in enumerate(inputIndexGroups):
        # A group can be empty when nothing could be matched; it labels nothing
        # but still consumes its ordinal so later groups keep their ids.
        if not indexGroup:
            continue
        first, last = indexGroup[0], indexGroup[-1]
        # Size the replacement by the span, not by len(indexGroup): with a
        # non-contiguous group the two differ and the slice assignment would
        # silently change the length of the result.
        alignment[first : last + 1] = [groupId] * (last - first + 1)
    return alignment


def align_byIndex_individually_withData_noOverlap(baseList: Doc, inputIndexGroups_withData):
    """Label each listed position with ``"<group ordinal>|<extra_str>"``.

    Every group is a dictionary with the keys ``indices`` and ``extra_str``.
    Positions that no group lists stay ``-1``; when two groups list the same
    position the later one wins.
    """
    labels = [-1] * len(baseList)
    for groupNo, group in enumerate(inputIndexGroups_withData):
        positions = group["indices"]
        payload = group["extra_str"]
        for pos in positions:
            labels[pos] = f"{groupNo}|{payload}"
    return labels


def align_byIndex_individually_withData(baseList: Doc, inputIndexGroups_withData):
    """Collect, for each position, the ``extra_str`` of every group that lists it.

    Every group is a dictionary with the keys ``indices`` and ``extra_str``.
    Positions that no group lists stay ``-1``; all others hold a list of
    strings in group order.
    """
    labels = [-1] * len(baseList)
    for group in inputIndexGroups_withData:
        positions = group["indices"]
        payload = group["extra_str"]
        for pos in positions:
            if labels[pos] == -1:
                labels[pos] = [payload]
                continue
            labels[pos].append(payload)
    return labels


def getTokenOffset(baseText: str, inputTokens):
    """Return the character offset of each token inside ``baseText``.

    Tokens are looked up left to right by their ``text`` attribute; the search
    for a token starts right after the end of the previous one, so repeated
    words resolve to successive occurrences.
    """
    offsets = []
    cursor = 0
    for tok in inputTokens:
        found = baseText.find(tok.text, cursor)
        offsets.append(found)
        cursor = found + len(tok.text)
    return offsets


def resolveTokenIndices_byPosition(tokenOffset, startPos, length) -> list:
    """Return the indices of the tokens covered by a character span.

    Args:
        tokenOffset: Start offset of every token, in token order.
        startPos: First character of the span.
        length: Number of characters in the span.

    Returns:
        list: Consecutive token indices, from the token containing
        ``startPos`` up to the last token starting at or before the final
        character of the span. Empty when no token contains ``startPos``.
    """
    matched = []
    collecting = False
    target = startPos  # first the span start, then the span's last character
    lastIdx = len(tokenOffset) - 1
    for i, tokStart in enumerate(tokenOffset):
        if collecting:
            # The span ended before this token begins: nothing more to collect.
            if target < tokStart:
                break
        else:
            # A token extends up to the start of the next one; the last token
            # is given a fixed width of 99 characters.
            tokLimit = tokenOffset[i + 1] if i < lastIdx else tokStart + 99
            if tokStart <= target < tokLimit:
                collecting = True
                target = startPos + length - 1
        if collecting:
            matched.append(i)
    return matched


def resolveTokenIndices_byPosition_multiToken(tokenOffset, startPosList, lengthList) -> list:
    """Resolve several character spans to token indices and concatenate the results.

    ``startPosList`` and ``lengthList`` are paired position by position; the
    returned list keeps the span order and may contain repeated indices.
    """
    flatIndices = []
    for startPos, length in zip(startPosList, lengthList):
        flatIndices.extend(resolveTokenIndices_byPosition(tokenOffset, startPos, length))
    return flatIndices


def trimIndices(_indices, keepNum):
    """Keep the indices that sit on the ``keepNum - 1`` smallest gaps.

    The gap between every pair of neighbouring entries is computed, the
    smallest ``keepNum - 1`` gaps are selected (ties resolved by position) and
    both endpoints of each selected gap are kept.

    Args:
        _indices: Sequence of numbers.
        keepNum: Controls how many gaps are selected; values of 1 or less
            select none.

    Returns:
        list: The kept entries of ``_indices`` in their original order. Empty
        when no gap is selected.
    """
    gaps = [nxt - cur for cur, nxt in zip(_indices, _indices[1:])]
    # Clamp at zero: a negative slice bound would select "all but the last few"
    # gaps instead of none.
    numGaps = max(keepNum - 1, 0)
    closest = sorted(enumerate(gaps), key=itemgetter(1))[:numGaps]
    kept = set()
    for pos, _gap in closest:
        kept.update((pos, pos + 1))
    # Sort explicitly; the iteration order of a set is not something to rely on.
    return [_indices[i] for i in sorted(kept)]


def replPunc(matchObj):
    """Replacement callback: keep a match that spans the whole input string, drop any other match."""
    wholeString = matchObj.string
    return wholeString if wholeString == matchObj.group(0) else ""


def findSubString(sourceText, subStr, subStr_tokens, begin):
    """Locate ``subStr`` in ``sourceText`` (case-insensitively) at or after ``begin``.

    Args:
        sourceText: Text to search in.
        subStr: Text to look for.
        subStr_tokens: Tokens of ``subStr``; used as a fallback when the
            literal text cannot be found.
        begin: Offset at which the search starts.

    Returns:
        tuple: ``(startPos, length)``. When the literal text is found, its
        position and ``len(subStr)``. Otherwise the span running from the
        first token to the end of the last token that could be found. With no
        tokens to fall back on, ``(begin, 0)``.

    Raises:
        AssertionError: If the fallback is used and the first token cannot be
            found, or the resulting span is empty.
    """
    loweredSource = sourceText.lower()
    startPos = loweredSource.find(subStr.lower(), begin)
    if startPos != -1:
        return startPos, len(subStr)
    # Sometimes metamap will rewrite the text, making the subStr differ to the source text.
    # In this case, we use token.
    if not subStr_tokens:
        return begin, 0
    loweredTokens = [tok.lower() for tok in subStr_tokens]
    startPos = loweredSource.find(loweredTokens[0], begin)
    assert startPos != -1, f"first token {loweredTokens[0]!r} not found at or after offset {begin}"
    endPos = startPos + len(loweredTokens[0])
    for tok in loweredTokens[1:]:
        tokPos = loweredSource.find(tok, endPos)
        if tokPos == -1:
            # A rewritten token is absent from the source. Skip it and keep the
            # cursor where it is; adding its length to -1 would move the cursor
            # back towards the start of the text and corrupt the span.
            continue
        endPos = tokPos + len(tok)
    assert endPos - startPos > 0
    return startPos, endPos - startPos


In [None]:
import jsonpickle


def classToJSON(obj) -> str:
    """Serialise ``obj`` with ``jsonpickle.encode`` using ``unpicklable=False``."""
    encoded = jsonpickle.encode(obj, unpicklable=False)
    return encoded



Format the MetaMap outputs so that they can be aligned to the spaCy tokens

In [None]:
def formatMetamapRecord(metamapRecord, tokenOffset=None):
    """Map the MetaMap annotations of one record onto token indices.

    Args:
        metamapRecord: ``Record`` whose findings section is processed.
        tokenOffset: Start offset of every base token, in token order. When
            omitted, the offsets are read from the ``token_offset`` column of
            the module-level ``spacyOutput`` DataFrame, which must then exist
            and belong to the same record.

    Returns:
        tuple: Five lists, in this order:
            * token-index list per phrase,
            * ``{"indices", "extra_str"}`` per syntax chunk,
            * ``{"indices", "extra_str"}`` per concept,
            * ``{"indices", "extra_str"}`` per negation trigger,
            * ``{"indices", "extra_str"}`` per negation target.
    """
    findingSection = metamapRecord.getFindingSection()
    reportText = findingSection.text
    phrases = [phrase for sentence in findingSection.sentences for phrase in sentence.phrases]
    if tokenOffset is None:
        # Backward-compatible fallback on notebook state.
        tokenOffset = spacyOutput.loc[:, SPACY_COLUMN_NAME["token_offset"]].tolist()
    phraseIdxGroups = []
    syntaxChunkIdxGroups_withData = []
    conceptIdxGroup_withData = []
    negTriggerGroups_withData = []
    negTargetGroups_withData = []
    conceptGroupId = 0
    negationGroupId = 0
    # Search cursor into reportText; it only moves forward so that repeated
    # chunk texts resolve to successive occurrences.
    offsetBegin = 0
    for phrase in phrases:
        phraseIdxList = resolveTokenIndices_byPosition(tokenOffset, phrase.startPos, phrase.length)
        phraseIdxGroups.append(phraseIdxList)
        for syntaxChunk in phrase.syntaxChunks:
            # Syntax chunks carry no position, so they are located by text search.
            startPos, length = findSubString(reportText, syntaxChunk.text, syntaxChunk.tokens, offsetBegin)
            offsetBegin = startPos + length
            syntaxChunkIdxGroups_withData.append(
                {
                    "indices": resolveTokenIndices_byPosition(tokenOffset, startPos, length),
                    "extra_str": f"{syntaxChunk.syntaxType}|{syntaxChunk.partOfSpeech}|{syntaxChunk.tokens}",
                }
            )
        for conceptGroup in phrase.mappings:
            for concept in conceptGroup.concepts:
                conceptIdxList_flatten = resolveTokenIndices_byPosition_multiToken(
                    tokenOffset, concept.startPosList, concept.lengthList
                )
                conceptIdxGroup_withData.append(
                    {
                        "indices": conceptIdxList_flatten,
                        "extra_str": f"{conceptGroupId}|{concept.umlsCUI}|{concept.preferedName}({concept.hitTerm})|{','.join(concept.categories)}|{concept.isHead}|{concept.isNegated}",
                    }
                )
            # The group id runs across all phrases of the record.
            conceptGroupId += 1
    for negation in findingSection.negations:
        negTriggerIdxList_flatten = resolveTokenIndices_byPosition_multiToken(
            tokenOffset, negation.trgger["startPosList"], negation.trgger["lengthList"]
        )
        negTargetIdxList_flatten = resolveTokenIndices_byPosition_multiToken(
            tokenOffset, negation.tarrget["startPosList"], negation.tarrget["lengthList"]
        )
        # Trigger rows point at the target token indices and vice versa.
        negTriggerGroups_withData.append(
            {
                "indices": negTriggerIdxList_flatten,
                "extra_str": f"{negationGroupId}|{','.join([str(i) for i in negTargetIdxList_flatten])}|{','.join(negation.tarrget['conceptsCUIs'])}",
            }
        )
        negTargetGroups_withData.append(
            {
                "indices": negTargetIdxList_flatten,
                "extra_str": f"{negationGroupId}|{','.join([str(i) for i in negTriggerIdxList_flatten])}|{','.join(negation.tarrget['conceptsCUIs'])}",
            }
        )
        negationGroupId += 1
    return (
        phraseIdxGroups,
        syntaxChunkIdxGroups_withData,
        conceptIdxGroup_withData,
        negTriggerGroups_withData,
        negTargetGroups_withData,
    )



In [None]:
# Column names of the spaCy-derived DataFrame. Every name is the lookup key
# with the "[sp]" prefix in front of it.
SPACY_PREFIXX = "[sp]"
SPACY_COLUMN_NAME = {
    key: SPACY_PREFIXX + key
    for key in (
        "token",
        "token_offset",
        "sentence_group",
        "noun_chunk",
        "lemma",
        "pos_core",
        "pos_feature",
        "dependency",
        "dependency_head",
        "dependency_children",
        "morphology",
        "is_alpha",
        "is_stop",
        "is_pronoun",
        "trailing_space",
    )
}
# Column names of the MetaMap-derived DataFrame. The part after the prefix
# spells out the "|"-separated fields stored in the cells of that column.
METAMAP_PREFIXX = "[mm]"
METAMAP_COLUMN_NAME = dict(
    phrase=f"{METAMAP_PREFIXX}metamap_phrase",
    syntax_chunk=f"{METAMAP_PREFIXX}syntax_chunk|syntax_type|pos",
    concept=f"{METAMAP_PREFIXX}concept_group|CUI|prefered_name(hit_synonym)|categories|isHead|isNegated",
    neg_trigger=f"{METAMAP_PREFIXX}negation_group|target_token_indices|target_CUI",
    negated_target=f"{METAMAP_PREFIXX}negation_group|trigger_token_indices|target_CUI",
)


In [None]:
import stanza
from stanza.server import CoreNLPClient


Start CoreNLP Server

In [None]:
# import spacy
# from stanza.server import CoreNLPClient, StartServer

# CORENLP_CUSTOM_PROPS = {
#     "annotators": "tokenize, ssplit, pos, lemma, ner, depparse, coref",
#     "coref.algorithm": "statistical",
# }
# client = CoreNLPClient(
#     memory="8G", threads=8, endpoint="http://localhost:8801", be_quiet=True, properties=CORENLP_CUSTOM_PROPS,
#     start_server=StartServer.FORCE_START
# )
# client.start()


Run

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Lock
from IPython.display import display, HTML
import json, time
import spacy
import stanza
from stanza.server import CoreNLPClient
from stanza.pipeline.core import DownloadMethod

# mp.cpu_count()
# Number of worker processes for the MetaMap pool created below.
METAMAP_CORES = 8
# NOTE(review): STANZA_CORES and CORENLP_CORES are not referenced in the visible cells.
STANZA_CORES = 8
CORENLP_CORES = 8
# Number of reports sent to MetaMap per task.
BATCH_SIZE = 1
# Half-open range [DATA_START_POS, DATA_END_POS) of report positions to process.
DATA_START_POS = 8690  # 8690
DATA_END_POS = 8691
# DATA_END_INDEX = DATA_SIZE

SECTION_FLAG = "findings"

# Stanza model package chosen for each processor.
STANZA_PROCESSOR_DICT = {
    "tokenize": "mimic",
    "pos": "mimic",
    "lemma": "mimic",
    "depparse": "mimic",
    # 'sentiment':'sstplus', # Sentiment scores of 0, 1, or 2 (negative, neutral, positive).
    "constituency": "wsj",  # wsj, wsj_bert, wsj_roberta
    "ner": "radiology",
}

# spaCy pipeline with the NER component disabled.
nlp_spacy = spacy.load("en_core_web_md", disable=["ner"])
# Stanza pipeline; REUSE_RESOURCES is passed as the download method.
nlp_stanza = stanza.Pipeline(
    "en", processors=STANZA_PROCESSOR_DICT, package=None, download_method=DownloadMethod.REUSE_RESOURCES, verbose=False
)  # logging_level='WARN'

# Run MetaMap on every batch in a process pool, then build one token-level
# table per report: spaCy columns joined with the aligned MetaMap columns.
# The context manager shuts the pool down when the loop ends or fails, so the
# worker processes do not outlive this cell.
# NOTE: `spacyOutput` and `output` are overwritten for every report; after the
# loop they hold the tables of the last report that was processed.
with ProcessPoolExecutor(max_workers=METAMAP_CORES) as executor:
    all_task = [
        executor.submit(run_metamap, startIndex, BATCH_SIZE)
        for startIndex in range(DATA_START_POS, DATA_END_POS, BATCH_SIZE)
    ]

    # time0 = time.time()
    for future in as_completed(all_task):
        # time1 = time.time()
        # print(f"Get response from Metamap in: {time1-time0}s")
        # Metamap
        metamap_output, idx_inteval, input_list = future.result()
        id_subList = id_list[idx_inteval[0] : idx_inteval[1]]
        # Only the second line is the required JSON string.
        metamap_json_output = metamap_output.split("\n")[1]
        metamap_json_obj = json.loads(metamap_json_output)
        parsed_obj_batch = parseMetamapJSON(metamap_json_obj, id_subList)
        # print(classToJSON(parsed_obj_batch))

        # Stanza
        # time2 = time.time()
        # in_docs = []
        # for record in parsed_obj_batch.records:
        #     stanzaDoc = stanza.Document([], text=record.getFindingSection().text)
        #     stanzaDoc._sid = record.sid
        #     in_docs.append(stanzaDoc)
        # # Call the neural pipeline on this list of documents
        # # The output is also a list of stanza.Document objects, each output corresponding to an input Document object
        # out_docs = nlp_stanza(in_docs)
        # # print([i._sid for i in out_docs])
        # time3 = time.time()
        # print(f"Finish Stanza batch process in: {time3-time2}s")

        # SpaCy
        time4 = time.time()
        # Pair every input text with its parsed MetaMap record so the record
        # travels through nlp.pipe as context.
        text_tuples = [(text, {"record": record}) for text, record in zip(input_list, parsed_obj_batch.records)]
        for doc, context in nlp_spacy.pipe(text_tuples, as_tuples=True):
            data = {
                SPACY_COLUMN_NAME["token"]: [tok.text for tok in doc],
                SPACY_COLUMN_NAME["token_offset"]: getTokenOffset(doc.text, doc),
                SPACY_COLUMN_NAME["sentence_group"]: align(doc, doc.sents),
                SPACY_COLUMN_NAME["noun_chunk"]: align(doc, doc.noun_chunks),
                SPACY_COLUMN_NAME["lemma"]: [tok.lemma_ for tok in doc],
                SPACY_COLUMN_NAME["pos_core"]: [f"[{tok.pos_}]{spacy.explain(tok.pos_)}" for tok in doc],
                SPACY_COLUMN_NAME["pos_feature"]: [f"[{tok.tag_}]{spacy.explain(tok.tag_)}" for tok in doc],
                SPACY_COLUMN_NAME["dependency"]: [f"[{tok.dep_}]{spacy.explain(tok.dep_)}" for tok in doc],
                SPACY_COLUMN_NAME["dependency_head"]: [tok.head.text for tok in doc],
                SPACY_COLUMN_NAME["dependency_children"]: [list(tok.children) for tok in doc],
                SPACY_COLUMN_NAME["morphology"]: [tok.morph for tok in doc],
                SPACY_COLUMN_NAME["is_alpha"]: [tok.is_alpha for tok in doc],
                SPACY_COLUMN_NAME["is_stop"]: [tok.is_stop for tok in doc],
                SPACY_COLUMN_NAME["is_pronoun"]: [tok.pos_ == "PRON" for tok in doc],
                SPACY_COLUMN_NAME["trailing_space"]: [bool(tok.whitespace_) for tok in doc],
            }
            # Must be assigned before formatMetamapRecord is called: that
            # function reads the token offsets from this module-level frame.
            spacyOutput = pd.DataFrame(data=data)

            # Metamap
            metamapRecord = context["record"]  # the Record obj resolved from metamap output
            # print(metamapRecord.sid)
            # display(HTML(spacyOutput.to_html()))
            phraseInfo, syntaxChunkInfo, conceptInfo, negTriggerInfo, negTargetInfo = formatMetamapRecord(metamapRecord)

            metamapOutput = pd.DataFrame(
                {
                    METAMAP_COLUMN_NAME["phrase"]: align_byIndex(doc, phraseInfo),
                    METAMAP_COLUMN_NAME["syntax_chunk"]: align_byIndex_individually_withData_noOverlap(
                        doc, syntaxChunkInfo
                    ),
                    METAMAP_COLUMN_NAME["concept"]: align_byIndex_individually_withData(doc, conceptInfo),
                    METAMAP_COLUMN_NAME["neg_trigger"]: align_byIndex_individually_withData(doc, negTriggerInfo),
                    METAMAP_COLUMN_NAME["negated_target"]: align_byIndex_individually_withData(doc, negTargetInfo),
                }
            )
            # Both frames have one row per spaCy token, so an index join lines them up.
            output = spacyOutput.join(metamapOutput)

            # CoreNLP
            # time41 = time.time()
            # corenlp_doc = client.annotate(metamapRecord.getFindingSection().text)
            # time42 = time.time()
            # print(f"Get response from CoreNLP for single record in: {time42-time41}s")

        # time5 = time.time()
        # print(f"Finish Spacy+CoreNLP batch process in: {time5-time4}s")
        # time0 = time.time()


### Check coreNLP output

In [None]:
import requests

# Send one findings text to the CoreNLP HTTP server on port 8801 and keep the
# raw response body. The annotators and the JSON output format are passed
# through the `properties` query parameter.
# NOTE(review): the report position 8690 is hard-coded here; it equals
# DATA_START_POS only as long as that constant is not changed.
corenlp_out = requests.post(
    'http://[::]:8801/?properties={"annotators":"coref","coref.algorithm":"statistical","outputFormat":"json"}',
    data=findings_list[8690].encode(),
).text

print(corenlp_out)

In [None]:
# Decode the raw CoreNLP response; the cells below read its "sentences" and "corefs" entries.
corenlp_json = json.loads(corenlp_out)

In [None]:
# Column names of the CoreNLP-derived DataFrame, all prefixed with "[co]".
CORENLP_PREFIXX = "[co]"
CORENLP_COLUMN_NAME = {
    "token": CORENLP_PREFIXX + "token",
    "pos": CORENLP_PREFIXX + "pos",
    "lemma": CORENLP_PREFIXX + "lemma",
    "corefMentionIndex": CORENLP_PREFIXX + "corefMentionIndex",
}

# Character offsets of the spaCy tokens, taken from the spacyOutput frame left
# behind by the run cell (i.e. the last report processed there).
tokenOffset_spacy = spacyOutput.loc[:, SPACY_COLUMN_NAME["token_offset"]].tolist()


def align_byIndex_individually(length, inputIndexGroups):
    """Collect, for each position, the ordinals of all index groups that list it.

    Returns a list of ``length`` entries: ``-1`` for positions that no group
    lists, otherwise a list of group ordinals in group order.
    """
    slots = [-1] * length
    for groupNo, members in enumerate(inputIndexGroups):
        for pos in members:
            if slots[pos] == -1:
                slots[pos] = [groupNo]
                continue
            slots[pos].append(groupNo)
    return slots


def align_byIndex_individually_withData_noOverlap1(length, inputIndexGroups_withData):
    """Label each listed position with ``"<group ordinal>|<extra_str>"``.

    Same behaviour as the ``Doc``-based variant, but the size of the result is
    given directly by ``length``. Unlisted positions stay ``-1``; on overlap
    the later group wins.
    """
    slots = [-1] * length
    for groupNo, group in enumerate(inputIndexGroups_withData):
        positions = group["indices"]
        payload = group["extra_str"]
        for pos in positions:
            slots[pos] = f"{groupNo}|{payload}"
    return slots

def align_byIndex_individually_withData_dictInList(tokNum, inputDictList):
    """Collect the ``extra_str`` of every entry at the position named by its ``index``.

    Returns a list of ``tokNum`` entries: ``-1`` for positions without any
    entry, otherwise a list of strings in input order.
    """
    slots = [-1] * tokNum
    for entry in inputDictList:
        pos = entry['index']
        payload = entry['extra_str']
        if slots[pos] == -1:
            slots[pos] = [payload]
            continue
        slots[pos].append(payload)
    return slots

# For every CoreNLP token (flattened over all sentences), the index of the
# first spaCy token covering its character span.
# NOTE(review): the trailing [0] raises IndexError when no spaCy token covers a
# CoreNLP token's span.
referTo_spacy = [
    resolveTokenIndices_byPosition(tokenOffset_spacy, token['characterOffsetBegin'], token['characterOffsetEnd'] - token['characterOffsetBegin'])[0]
    for sentence in corenlp_json['sentences']
    for token in sentence['tokens']
]

# sentenceFirstTokenIndex_offset[k] is the flattened index of the first token
# of sentence k; the list ends with one extra entry equal to the total token
# count, which is also accumulated in tokenTotalNum.
sentenceFirstTokenIndex_offset = [0]
tokenTotalNum = 0
for sentId, sentence in enumerate(corenlp_json['sentences']):
    tokenNum = len(sentence['tokens'])
    tokenTotalNum += tokenNum
    nextOffset = sentenceFirstTokenIndex_offset[sentId] + tokenNum
    sentenceFirstTokenIndex_offset.append(nextOffset)

def _resolveSentenceDeps(depEdges, sentOffset, tokenIdxMap):
    """Resolve the dependency edges of one sentence to document-level indices.

    Args:
        depEdges: Edge dictionaries with the keys ``dep``, ``governor``,
            ``governorGloss``, ``dependent`` and ``dependentGloss``; token
            numbers are 1-based within the sentence and governor 0 is the
            artificial root.
        sentOffset: Flattened index of the sentence's first token.
        tokenIdxMap: Maps a flattened token index to the index of the aligned
            base (spaCy) token.

    Returns:
        list: One dictionary per edge, keyed like the entries of ``dep_list``.
    """
    resolved = []
    for edge in depEdges:
        depTag = edge['dep']
        headTok = edge['governorGloss']
        currentTok = edge['dependentGloss']
        currentTokIdx = edge['dependent'] - 1 + sentOffset
        currentTokIdx_inBase = tokenIdxMap[currentTokIdx]
        if depTag == "ROOT" and headTok == "ROOT" and edge['governor'] == 0:
            # Make the root dep token point to itself, just like what spacy did.
            headTok = currentTok
            headTokIdx = currentTokIdx
            headTokIdx_inBase = currentTokIdx_inBase
        else:
            headTokIdx = edge['governor'] - 1 + sentOffset
            headTokIdx_inBase = tokenIdxMap[headTokIdx]
        resolved.append({
            'tag': depTag,
            'currTok': currentTok,
            'currIdx': currentTokIdx,
            'currIdx_algin': currentTokIdx_inBase,
            'headTok': headTok,
            'headIdx': headTokIdx,
            'headIdx_align': headTokIdx_inBase,
            'index': currentTokIdx,
            # headIdx_align is used as all the rows/tokens will finally align to df_base (df_spacy)
            'extra_str': f"{depTag}|{headTok}|{headTokIdx_inBase}",
        })
    return resolved


# dep_list keeps the full edge description of the basic dependencies; the two
# enhanced variants only keep what the index-based alignment needs.
dep_list = []
depPlus_list = []
depPlusPlus_list = []
for sentId, sentence in enumerate(corenlp_json['sentences']):
    sentOffset = sentenceFirstTokenIndex_offset[sentId]
    dep_list.extend(_resolveSentenceDeps(sentence['basicDependencies'], sentOffset, referTo_spacy))
    for depKey, targetList in (
        ('enhancedDependencies', depPlus_list),
        ('enhancedPlusPlusDependencies', depPlusPlus_list),
    ):
        targetList.extend(
            {'index': edge['index'], 'extra_str': edge['extra_str']}
            for edge in _resolveSentenceDeps(sentence[depKey], sentOffset, referTo_spacy)
        )
# dep_list is read positionally further down (one entry per token row), so it
# has to be in token order. The server is not relied upon to list the edges
# that way (presumably the ROOT edge comes first; verify against real output).
# The sort is stable and a no-op when the edges are already ordered.
dep_list.sort(key=lambda ele: ele['currIdx'])

# corefGroups: one flat list of token indices per coreference chain.
# corefMetionGroups_withData: one {"indices", "extra_str"} entry per mention,
# with the mention type as extra_str.
corefGroups = []
corefMetionGroups_withData = []
for corefChain in corenlp_json['corefs'].values():
    corefGroup = []
    for mention in corefChain:
        # sentNum, startIndex and endIndex are converted from 1-based to
        # 0-based flattened token indices.
        sentenceFirstIndexStart = sentenceFirstTokenIndex_offset[mention['sentNum']-1]
        beginIndex = sentenceFirstIndexStart + mention['startIndex']- 1
        endIndex = sentenceFirstIndexStart + mention['endIndex'] - 1  # The index of the next token of the target token
        mentionIndices = [i for i in range(beginIndex, endIndex)]
        corefGroup.append(mentionIndices)
        mentionType = mention['type']
        corefMetionGroups_withData.append(
            {"indices": mentionIndices, "extra_str": mentionType,}
        )
    # Merge the token indices of all mentions of the chain.
    corefGroup_flatten = [indices for mention in corefGroup for indices in mention]
    corefGroups.append(corefGroup_flatten)

# One row per CoreNLP token.
df_corenlp_length = tokenTotalNum
tokenOffset_spacy = spacyOutput.loc[:, SPACY_COLUMN_NAME["token_offset"]].tolist()

# The frame is indexed by the aligned spaCy token index (referTo_spacy) so it
# can later be joined onto the spaCy-based table.
# NOTE(review): "dep_tag" and "dep_head" are filled positionally from dep_list,
# which is only correct if dep_list has exactly one entry per token and is in
# token order; the "dep", "dep+" and "dep++" columns are placed by index.
df_corenlp = pd.DataFrame(
    {
        # 'referTo_spacy':referTo_spacy,
        CORENLP_COLUMN_NAME["token"]: [
            token['originalText'] for sentence in corenlp_json['sentences'] for token in sentence['tokens']
        ],
        CORENLP_COLUMN_NAME["pos"]: [token['pos'] for sentence in corenlp_json['sentences'] for token in sentence['tokens']],
        CORENLP_COLUMN_NAME["lemma"]: [token['lemma'] for sentence in corenlp_json['sentences'] for token in sentence['tokens']],
        "corefId|corefType": align_byIndex_individually_withData_noOverlap1(
            df_corenlp_length, corefMetionGroups_withData
        ),
        "corefGroup": align_byIndex_individually(df_corenlp_length, corefGroups),
        "dep_tag":[item['tag'] for item in dep_list],
        "dep_head":[f"{item['headTok']}|{item['headIdx_align']}" for item in dep_list],
        "dep":align_byIndex_individually_withData_dictInList(df_corenlp_length, dep_list),
        "dep+":align_byIndex_individually_withData_dictInList(df_corenlp_length, depPlus_list),
        "dep++":align_byIndex_individually_withData_dictInList(df_corenlp_length, depPlusPlus_list),
        # })
    },
    index=referTo_spacy,
)

In [None]:
from IPython.display import display, HTML

# Render the complete CoreNLP table as HTML so that no rows or columns are truncated.
display(HTML(df_corenlp.to_html()))
# df_new = output.join(df_corenlp)
# display(HTML(df_new.to_html()))

### Check metamap output

In [None]:
# Show only the rows whose coarse POS cell contains "PRON".
output[output[SPACY_COLUMN_NAME["pos_core"]].str.contains("PRON")]



In [None]:
# Export the joined table of the last processed report.
# NOTE(review): the destination is an absolute, machine-specific path.
output.to_csv("/home/yuxiangliao/PhD/output/s53741303_test.csv")


In [None]:
from IPython.display import display, HTML

# Render the complete joined table as HTML so that no rows or columns are truncated.
display(HTML(output.to_html()))
