In [1]:
import os
os.path.abspath("./")

'/home/yuxiangliao/PhD/workspace/VSCode_workspace/structured_reporting/notebook'

In [2]:
import os, sys
from loguru import logger

LOG_ROOT = os.path.abspath("./")
LOG_FILE = LOG_ROOT + "/logs/metamap_processing.log"

# Remove all handlers and reset stderr
logger.remove(handler_id=None)
logger.add(
    LOG_FILE,
    level="TRACE",
    mode="w",
    backtrace=False,
    diagnose=True,
    colorize=False,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
logger.info("\r\n" + ">" * 29 + "\r\n" + ">>> New execution started >>>" + "\r\n" + ">" * 29)
# To filter log level: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
logger.add(sys.stdout, level="INFO", filter=lambda record: record["level"].no < 40, colorize=True)
logger.add(sys.stderr, level="ERROR", backtrace=False, diagnose=True, colorize=True)

3

# Load Data

In [3]:
import pandas
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pandas.read_json(REPORT_PATH,orient="records",lines=True)
print(df)

id_list = df.loc[:,'sid'].to_list()
findings_list = df.loc[:,'findings'].to_list()
impression_list = df.loc[:,'impression'].to_list()
pfi_list = df.loc[:,'provisional_findings_impression'].to_list()
fai_list = df.loc[:,'findings_and_impression'].to_list()

DATA_SIZE = len(id_list)

              pid        sid  \
0       p10000032  s50414267   
1       p10000032  s53189527   
2       p10000032  s53911762   
3       p10000032  s56699142   
4       p10000764  s57375967   
...           ...        ...   
227830  p19999442  s58708861   
227831  p19999733  s57132437   
227832  p19999987  s55368167   
227833  p19999987  s58621812   
227834  p19999987  s58971208   

                                                 findings  \
0       There is no focal consolidation, pleural effus...   
1       The cardiac, mediastinal and hilar contours ar...   
2       Single frontal view of the chest provided. \n ...   
3       The lungs are clear of focal consolidation, pl...   
4       PA and lateral views of the chest provided.   ...   
...                                                   ...   
227830  ET tube ends 4.7 cm above the carina.  NG tube...   
227831  The lungs are clear, and the cardiomediastinal...   
227832  There has been interval extubation and improve...   
22783

# Run Spacy

In [4]:
!conda env list

# conda environments:
#
                         /home/yuxiangliao/anaconda3
                         /home/yuxiangliao/anaconda3/envs/coref_hf
base                  *  /home/yuxiangliao/anaconda3/envs/corenlp
                         /home/yuxiangliao/anaconda3/envs/py27
                         /home/yuxiangliao/anaconda3/envs/wl-coref



In [5]:
from spacy.language import Language

@Language.component("metamap_processor")
def metamap_processor_func(doc):
    return doc


In [1]:
import spacy
nlp = spacy.load("en_core_web_md",disable=['ner'])

In [3]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']


In [6]:
import spacy
import pandas as pd

BATCH_SIZE = 1
DATA_START_POS = 8690
DATA_END_POS = 8691

nlp = spacy.load("en_core_web_md",disable=['ner'])

# Batch pocessing
for startPos in range(DATA_START_POS,DATA_END_POS,BATCH_SIZE):
    text_tuples = [(text,{"sid":sid,"text":text}) for sid, text in zip(id_list[startPos:startPos+BATCH_SIZE],findings_list[startPos:startPos+BATCH_SIZE])]
    
    for doc, context in nlp.pipe(text_tuples, as_tuples=True):
        print(nlp.pipe_names)
        sid = context["sid"]
        text = context["text"]
        nounChunks = [-1] * len(doc)
        for id, chunk in enumerate(doc.noun_chunks):
            nounChunks[chunk.start:chunk.end] = [id] * (chunk.end-chunk.start)
        sentences = [-1] * len(doc)
        for id, sent in enumerate(doc.sents):
            sentences[sent.start:sent.end] = [id] * (sent.end-sent.start)
        offset = [0]
        for i,tok in enumerate(doc):
            offset.append(text.find(tok.text,offset[i],len(text)))
        offset = offset[1:]
        data = {
            'token': [tok for tok in doc],
            'tokenOffset': offset,
            'sentenceGroup': sentences,
            'nounChunk': nounChunks,
            'lemma': [tok.lemma_ for tok in doc],
            'pos_core': [f"[{tok.pos_}]{spacy.explain(tok.pos_)}" for tok in doc],
            'pos_feature': [f"[{tok.tag_}]{spacy.explain(tok.tag_)}" for tok in doc],
            'dependency': [f"[{tok.dep_}]{spacy.explain(tok.dep_)}" for tok in doc],
            'dependency_head': [f"{tok.head.text}|{tok.head.i}" for tok in doc],
            'dependency_children': [[f"{child.text}|{child.i}" for child in tok.children] for tok in doc],
            'morphology': [tok.morph for tok in doc],
            'is_alpha': [tok.is_alpha for tok in doc],
            'is_stop': [tok.is_stop for tok in doc],
            'is_pronoun': [True if tok.pos_ == 'PRON' else False for tok in doc],
            'trailing_space': [True if tok.whitespace_ else False for tok in doc]
        }
        output = pd.DataFrame(data=data)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']


Check Dataframe

In [7]:
# output[output['pos_core'].str.contains("PRON")].index.tolist()
# output[output['pos_core'].str.contains("PRON")]

In [8]:
l1 = [0, 3, 12, 15, 24, 34, 36, 40, 46, 54, 63, 67, 77, 80, 87, 91, 94, 98, 105, 108, 111, 115, 119, 121, 125, 134, 137, 141, 154, 160, 163, 172, 175, 178, 182, 190, 192, 196, 201, 206, 215, 219, 224, 232, 236, 240, 248, 252, 261, 263, 267, 272, 276, 279, 283, 287, 290, 294, 299, 302, 312, 315, 325, 333, 336, 344, 349, 353, 363, 368, 370, 377, 379, 382, 390, 393, 401, 406, 410, 416, 424, 427, 431, 437, 444, 448, 454, 457, 464, 469, 472, 474, 477, 479, 481, 485, 495, 504, 507, 511, 516, 525, 534, 536, 546, 553, 556, 560, 570, 575]
l2 = output.loc[:,'tokenOffset']
print(l1)
print(l2.tolist())

[0, 3, 12, 15, 24, 34, 36, 40, 46, 54, 63, 67, 77, 80, 87, 91, 94, 98, 105, 108, 111, 115, 119, 121, 125, 134, 137, 141, 154, 160, 163, 172, 175, 178, 182, 190, 192, 196, 201, 206, 215, 219, 224, 232, 236, 240, 248, 252, 261, 263, 267, 272, 276, 279, 283, 287, 290, 294, 299, 302, 312, 315, 325, 333, 336, 344, 349, 353, 363, 368, 370, 377, 379, 382, 390, 393, 401, 406, 410, 416, 424, 427, 431, 437, 444, 448, 454, 457, 464, 469, 472, 474, 477, 479, 481, 485, 495, 504, 507, 511, 516, 525, 534, 536, 546, 553, 556, 560, 570, 575]
[0, 9, 14, 18, 24, 28, 30, 33, 41, 48, 51, 60, 70, 74, 80, 84, 86, 92, 97, 100, 102, 108, 114, 118, 120, 125, 130, 137, 146, 149, 153, 159, 172, 178, 182, 190, 200, 205, 208, 212, 218, 223, 225, 230, 239, 243, 253, 259, 264, 265, 267, 270, 278, 291, 294, 304, 306, 310, 318, 322, 328, 334, 338, 346, 349, 353, 359, 363, 365, 373, 378, 380, 386, 399, 401, 403, 409, 412, 414, 420, 425, 433, 435, 441, 444, 451, 461, 470, 475, 479, 485, 489, 491, 495, 503, 503, 518, 522,

In [9]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
# record

from IPython.display import display, HTML
display(HTML(output.to_html()))

Unnamed: 0,token,tokenOffset,sentenceGroup,nounChunk,lemma,pos_core,pos_feature,dependency,dependency_head,dependency_children,morphology,is_alpha,is_stop,is_pronoun,trailing_space
0,Compared,0,0,-1,compare,[VERB]verb,"[VBN]verb, past participle",[prep]prepositional modifier,detected|10,[with|1],"(Aspect=Perf, Tense=Past, VerbForm=Part)",True,False,False,True
1,with,9,0,-1,with,[ADP]adposition,"[IN]conjunction, subordinating or preposition",[prep]prepositional modifier,Compared|0,[film|4],(),True,True,False,True
2,the,14,0,0,the,[DET]determiner,[DT]determiner,[det]determiner,film|4,[],"(Definite=Def, PronType=Art)",True,True,False,True
3,prior,18,0,0,prior,[ADJ]adjective,"[JJ]adjective (English), other noun-modifier (Chinese)",[amod]adjectival modifier,film|4,[],(Degree=Pos),True,False,False,True
4,film,24,0,0,film,[NOUN]noun,"[NN]noun, singular or mass",[pobj]object of preposition,with|1,"[the|2, prior|3]",(Number=Sing),True,False,False,False
5,",",28,0,-1,",",[PUNCT]punctuation,"[,]punctuation mark, comma",[punct]punctuation,detected|10,[],(PunctType=Comm),False,False,False,True
6,no,30,0,1,no,[DET]determiner,[DT]determiner,[det]determiner,change|8,[],(),True,True,False,True
7,obvious,33,0,1,obvious,[ADJ]adjective,"[JJ]adjective (English), other noun-modifier (Chinese)",[amod]adjectival modifier,change|8,[],(Degree=Pos),True,False,False,True
8,change,41,0,1,change,[NOUN]noun,"[NN]noun, singular or mass",[nsubjpass]nominal subject (passive),detected|10,"[no|6, obvious|7]",(Number=Sing),True,False,False,True
9,is,48,0,-1,be,[AUX]auxiliary,"[VBZ]verb, 3rd person singular present",[auxpass]auxiliary (passive),detected|10,[],"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, VerbForm=Fin)",True,True,False,True


Visualization

In [10]:
from spacy import displacy
displacy.serve(doc, style="dep")
# sentence_spans = list(doc.sents)
# displacy.serve(sentence_spans, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


Explain tag and label

In [None]:
spacy.explain('Peri')

In [5]:
import spacy
t = ['All the monitoring devices are unchanged and in standard position. Lung volumes persist, low, now with new opacification of the right lung for increased pleural fluid. There is no pleural effusion on the left lung. Heart size is mildly enlarged.']
nlp = spacy.load("en_core_web_md",disable=['ner'])
doc = nlp.pipe(t)