In [None]:
import os
# Display the absolute path of the current working directory.
os.path.abspath("./")

In [None]:
import os, sys
from loguru import logger

# The log file lives under ./logs relative to the current working directory.
LOG_ROOT = os.path.abspath("./")
LOG_FILE = f"{LOG_ROOT}/logs/metamap_processing.log"


def _below_error(record):
    """Accept only records whose level number is below ERROR (40)."""
    return record["level"].no < 40


# Remove all handlers and reset stderr
logger.remove(handler_id=None)

# File sink: records everything from TRACE upwards; opened with mode="w".
logger.add(
    LOG_FILE,
    mode="w",
    level="TRACE",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    colorize=False,
    backtrace=False,
    diagnose=True,
)

# Banner is emitted while the file sink is the only handler attached.
_banner_rule = ">" * 29
logger.info("\r\n".join(["", _banner_rule, ">>> New execution started >>>", _banner_rule]))

# Level numbers: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
# stdout receives INFO up to (but excluding) ERROR; stderr receives ERROR and above.
logger.add(sys.stdout, filter=_below_error, level="INFO", colorize=True)
logger.add(sys.stderr, level="ERROR", colorize=True, backtrace=False, diagnose=True)

# Load Data

In [15]:
import pandas
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"

# The file is read as line-delimited JSON records.
df = pandas.read_json(REPORT_PATH, lines=True, orient="records")
print(df)

# Extract each column of interest as a plain Python list (positions align across lists).
id_list = df["sid"].tolist()
findings_list = df["findings"].tolist()
impression_list = df["impression"].tolist()
pfi_list = df["provisional_findings_impression"].tolist()
fai_list = df["findings_and_impression"].tolist()

# Number of loaded report records.
DATA_SIZE = len(df)

              pid        sid  \
0       p10000032  s50414267   
1       p10000032  s53189527   
2       p10000032  s53911762   
3       p10000032  s56699142   
4       p10000764  s57375967   
...           ...        ...   
227830  p19999442  s58708861   
227831  p19999733  s57132437   
227832  p19999987  s55368167   
227833  p19999987  s58621812   
227834  p19999987  s58971208   

                                                 findings  \
0       There is no focal consolidation, pleural effus...   
1       The cardiac, mediastinal and hilar contours ar...   
2       Single frontal view of the chest provided. \n ...   
3       The lungs are clear of focal consolidation, pl...   
4       PA and lateral views of the chest provided.   ...   
...                                                   ...   
227830  ET tube ends 4.7 cm above the carina.  NG tube...   
227831  The lungs are clear, and the cardiomediastinal...   
227832  There has been interval extubation and improve...   
22783

# Run spaCy

In [None]:
# Shell escape: list the available conda environments.
!conda env list

In [48]:
from spacy.language import Language

@Language.component("metamap_processor")
def metamap_processor_func(doc):
    """Pipeline component registered under the name "metamap_processor".

    Currently a pass-through: the incoming ``doc`` is returned unchanged.
    NOTE(review): presumably a placeholder for MetaMap-based processing — confirm.
    """
    return doc


In [None]:
import spacy
import pandas as pd

BATCH_SIZE = 1
DATA_START_POS = 376
DATA_END_POS = 377  # exclusive upper bound of the records to process

nlp = spacy.load("en_core_web_md",disable=['ner'])
nlp.add_pipe("metamap_processor", last=True)
# The pipeline does not change while iterating, so report its components once.
print(nlp.pipe_names)

# Batch processing
for startPos in range(DATA_START_POS, DATA_END_POS, BATCH_SIZE):
    # Clamp the slice so the last batch never reads past DATA_END_POS when
    # BATCH_SIZE does not divide the range evenly.
    endPos = min(startPos + BATCH_SIZE, DATA_END_POS)
    text_tuples = [
        (text, {"sid": sid})
        for sid, text in zip(id_list[startPos:endPos], findings_list[startPos:endPos])
    ]

    for doc, context in nlp.pipe(text_tuples, as_tuples=True):
        sid = context["sid"]

        # Per-token noun-chunk index; -1 marks tokens outside every noun chunk.
        # (Loop variables avoid the name `id`, which would rebind the builtin
        # for the rest of the kernel session.)
        nounChunks = [-1] * len(doc)
        for chunk_idx, chunk in enumerate(doc.noun_chunks):
            nounChunks[chunk.start:chunk.end] = [chunk_idx] * (chunk.end - chunk.start)

        # Per-token sentence index, built the same way from doc.sents.
        sentences = [-1] * len(doc)
        for sent_idx, sent in enumerate(doc.sents):
            sentences[sent.start:sent.end] = [sent_idx] * (sent.end - sent.start)

        # Character offset of each token within the document text. Token.idx is
        # exact; the previous str.find-based search relied on a `text` variable
        # that is not in scope here (comprehension variables do not leak) and
        # could match inside the preceding token.
        offset = [tok.idx for tok in doc]

        data = {
            'token': [tok for tok in doc],
            'tokenOffset': offset,
            'sentenceGroup': sentences,
            'nounChunk': nounChunks,
            'lemma': [tok.lemma_ for tok in doc],
            'pos_core': [f"[{tok.pos_}]{spacy.explain(tok.pos_)}" for tok in doc],
            'pos_feature': [f"[{tok.tag_}]{spacy.explain(tok.tag_)}" for tok in doc],
            'dependency': [f"[{tok.dep_}]{spacy.explain(tok.dep_)}" for tok in doc],
            'dependency_head': [tok.head.text for tok in doc],
            'dependency_children': [[child for child in tok.children] for tok in doc],
            'morphology': [tok.morph for tok in doc],
            'is_alpha': [tok.is_alpha for tok in doc],
            'is_stop': [tok.is_stop for tok in doc],
            'is_pronoun': [tok.pos_ == 'PRON' for tok in doc],
            'trailing_space': [bool(tok.whitespace_) for tok in doc]
        }
        # NOTE: `output` (and `doc`) keep only the most recently processed
        # document; later cells inspect that last one.
        output = pd.DataFrame(data=data)

Check DataFrame

In [29]:
# Rows whose coarse POS description contains "PRON".
# For just the row indices, append .index.tolist() to the selection below.
pronoun_mask = output['pos_core'].str.contains("PRON")
output.loc[pronoun_mask]

Unnamed: 0,token,tokenOffset,sentenceGroup,nounChunk,lemma,pos_core,pos_feature,dependency,dependency_head,dependency_children,morphology,is_alpha,is_stop,is_pronoun,trailing_space
76,it,379,4,14,it,[PRON]pronoun,"[PRP]pronoun, personal",[nsubj]nominal subject,appears,[],"(Case=Nom, Gender=Neut, Number=Sing, Person=3,...",True,True,True,True


In [None]:
# Hard-coded list of offsets to compare by eye against the computed token offsets.
# NOTE(review): the origin of these reference values is not shown here — confirm.
l1 = [0, 3, 12, 15, 24, 34, 36, 40, 46, 54, 63, 67, 77, 80, 87, 91, 94, 98, 105, 108, 111, 115, 119, 121, 125, 134, 137, 141, 154, 160, 163, 172, 175, 178, 182, 190, 192, 196, 201, 206, 215, 219, 224, 232, 236, 240, 248, 252, 261, 263, 267, 272, 276, 279, 283, 287, 290, 294, 299, 302, 312, 315, 325, 333, 336, 344, 349, 353, 363, 368, 370, 377, 379, 382, 390, 393, 401, 406, 410, 416, 424, 427, 431, 437, 444, 448, 454, 457, 464, 469, 472, 474, 477, 479, 481, 485, 495, 504, 507, 511, 516, 525, 534, 536, 546, 553, 556, 560, 570, 575]
l2 = output['tokenOffset']

# Print both sequences one after the other for side-by-side inspection.
for offsets in (l1, l2.tolist()):
    print(offsets)

In [None]:
# Alternative: lift pandas' display limits instead of rendering HTML.
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
# record

from IPython.display import HTML, display

# Render the complete frame as an HTML table.
full_table_html = output.to_html()
display(HTML(full_table_html))

Visualization

In [None]:
from spacy import displacy
# Render the dependency parse of the last processed document inline.
# displacy.serve() starts a web server and blocks the kernel until it is
# interrupted, which stalls "Run All"; displacy.render() is the notebook-friendly
# equivalent and shows the visualization directly in the cell output.
displacy.render(doc, style="dep")
# To render one parse per sentence instead:
# sentence_spans = list(doc.sents)
# displacy.render(sentence_spans, style="dep")

Explain tag and label

In [None]:
# Look up spaCy's description for the term 'Peri'.
spacy.explain('Peri')