In [None]:
import os
os.path.abspath("./")

In [None]:
import os, sys
from loguru import logger

LOG_ROOT = os.path.abspath("./")
LOG_FILE = LOG_ROOT + "/logs/metamap_processing.log"

# Remove all handlers and reset stderr
logger.remove(handler_id=None)
logger.add(
    LOG_FILE,
    level="TRACE",
    mode="w",
    backtrace=False,
    diagnose=True,
    colorize=False,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
logger.info("\r\n" + ">" * 29 + "\r\n" + ">>> New execution started >>>" + "\r\n" + ">" * 29)
# To filter log level: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
logger.add(sys.stdout, level="INFO", filter=lambda record: record["level"].no < 40, colorize=True)
logger.add(sys.stderr, level="ERROR", backtrace=False, diagnose=True, colorize=True)

# Load Data

In [43]:
import pandas
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pandas.read_json(REPORT_PATH,orient="records",lines=True)
print(df)

id_list = df.loc[:,'sid'].to_list()
findings_list = df.loc[:,'findings'].to_list()
impression_list = df.loc[:,'impression'].to_list()
pfi_list = df.loc[:,'provisional_findings_impression'].to_list()
fai_list = df.loc[:,'findings_and_impression'].to_list()

DATA_SIZE = len(id_list)

              pid        sid  \
0       p10000032  s50414267   
1       p10000032  s53189527   
2       p10000032  s53911762   
3       p10000032  s56699142   
4       p10000764  s57375967   
...           ...        ...   
227830  p19999442  s58708861   
227831  p19999733  s57132437   
227832  p19999987  s55368167   
227833  p19999987  s58621812   
227834  p19999987  s58971208   

                                                 findings  \
0       There is no focal consolidation, pleural effus...   
1       The cardiac, mediastinal and hilar contours ar...   
2       Single frontal view of the chest provided. \n ...   
3       The lungs are clear of focal consolidation, pl...   
4       PA and lateral views of the chest provided.   ...   
...                                                   ...   
227830  ET tube ends 4.7 cm above the carina.  NG tube...   
227831  The lungs are clear, and the cardiomediastinal...   
227832  There has been interval extubation and improve...   
22783

# Run Spacy

In [None]:
!conda env list

POS Tagging:
 - Text: The original word text.
 - Lemma: The base form of the word.
 - POS: The simple UPOS part-of-speech tag.
 - Tag: The detailed part-of-speech tag.
 - Dep: Syntactic dependency, i.e. the relation between tokens.
 - Shape: The word shape – capitalization, punctuation, digits.
 - is alpha: Is the token an alpha character?
 - is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [77]:
import spacy
import pandas as pd

DATA_START_POS = 376
DATA_STOP_POS = 377

nlp = spacy.load("en_core_web_md",disable=['ner'])
print(nlp.pipe_names)
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)
for sid, text in zip(id_list[DATA_START_POS:DATA_STOP_POS],findings_list[DATA_START_POS:DATA_STOP_POS]):
    print(f"[{sid}] {text}")
    doc = nlp(text)
    nounChunks = [-1] * len(doc)
    for id, chunk in enumerate(doc.noun_chunks):
        nounChunks[chunk.start:chunk.end] = [id] * (chunk.end-chunk.start)
    sentences = [-1] * len(doc)
    for id, sent in enumerate(doc.sents):
        sentences[sent.start:sent.end] = [id] * (sent.end-sent.start)
    data = {
        'token': [tok for tok in doc],
        'sentenceGroup': sentences,
        'nounChunk': nounChunks,
        'lemma': [tok.lemma_ for tok in doc],
        'pos_core': [f"[{tok.pos_}]{spacy.explain(tok.pos_)}" for tok in doc],
        'pos_feature': [f"[{tok.tag_}]{spacy.explain(tok.tag_)}" for tok in doc],
        'dependency': [f"[{tok.dep_}]{spacy.explain(tok.dep_)}" for tok in doc],
        'dependency_head': [tok.head.text for tok in doc],
        'dependency_children': [[child for child in tok.children] for tok in doc],
        'morphology': [tok.morph for tok in doc],
        'is_alpha': [tok.is_alpha for tok in doc],
        'is_stop': [tok.is_stop for tok in doc],
        'is_pronoun': [True if tok.pos_ == 'PRON' else False for tok in doc],
        'trailing_space': [True if tok.whitespace_ else False for tok in doc]
    }
    output = pd.DataFrame(data=data)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']
rule
[s57874790] As compared to previous radiograph, the right pleural effusion has decreased in extent and is now minimal.  On the left, the blunting of the costophrenic sinus is unchanged.  In the interval, the Swan-Ganz catheter has been removed and the patient has received a new PICC line.  The tip of the line is difficult to visualize because of overlay with the pacemaker wires. However, it appears to project over the upper aspects of the right atrium and could be pulled back by 2 to 3 cm. 
 Unchanged position of the left pectoral pacemaker, unchanged course of the pacemaker wires.


Check Dataframe

In [76]:
print([f"{i.text}|{i.i}" for i in doc])

['As|0', 'compared|1', 'to|2', 'previous|3', 'radiograph|4', ',|5', 'the|6', 'right|7', 'pleural|8', 'effusion|9', 'has|10', 'decreased|11', 'in|12', 'extent|13', 'and|14', 'is|15', 'now|16', 'minimal|17', '.|18', ' |19', 'On|20', 'the|21', 'left|22', ',|23', 'the|24', 'blunting|25', 'of|26', 'the|27', 'costophrenic|28', 'sinus|29', 'is|30', 'unchanged|31', '.|32', ' |33', 'In|34', 'the|35', 'interval|36', ',|37', 'the|38', 'Swan|39', '-|40', 'Ganz|41', 'catheter|42', 'has|43', 'been|44', 'removed|45', 'and|46', 'the|47', 'patient|48', 'has|49', 'received|50', 'a|51', 'new|52', 'PICC|53', 'line|54', '.|55', ' |56', 'The|57', 'tip|58', 'of|59', 'the|60', 'line|61', 'is|62', 'difficult|63', 'to|64', 'visualize|65', 'because|66', 'of|67', 'overlay|68', 'with|69', 'the|70', 'pacemaker|71', 'wires|72', '.|73', 'However|74', ',|75', 'it|76', 'appears|77', 'to|78', 'project|79', 'over|80', 'the|81', 'upper|82', 'aspects|83', 'of|84', 'the|85', 'right|86', 'atrium|87', 'and|88', 'could|89', 'b

In [60]:
# output[output['pos_core'].str.contains("PRON")].index.tolist()
output[output['pos_core'].str.contains("PRON")]

Unnamed: 0,token,sentenceGroup,nounChunk,lemma,pos_core,pos_feature,dependency,dependency_head,dependency_children,morphology,is_alpha,is_stop,is_pronoun
76,it,4,14,it,[PRON]pronoun,"[PRP]pronoun, personal",[nsubj]nominal subject,appears,[],"(Case=Nom, Gender=Neut, Number=Sing, Person=3,...",True,True,True


In [78]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
# record

from IPython.display import display, HTML
display(HTML(output.to_html()))

Unnamed: 0,token,sentenceGroup,nounChunk,lemma,pos_core,pos_feature,dependency,dependency_head,dependency_children,morphology,is_alpha,is_stop,is_pronoun,trailing_space
0,As,0,-1,as,[SCONJ]subordinating conjunction,"[IN]conjunction, subordinating or preposition",[mark]marker,compared,[],(),True,True,False,True
1,compared,0,-1,compare,[VERB]verb,"[VBN]verb, past participle",[prep]prepositional modifier,decreased,"[As, to]","(Aspect=Perf, Tense=Past, VerbForm=Part)",True,False,False,True
2,to,0,-1,to,[ADP]adposition,"[IN]conjunction, subordinating or preposition",[prep]prepositional modifier,compared,[radiograph],(),True,True,False,True
3,previous,0,0,previous,[ADJ]adjective,"[JJ]adjective (English), other noun-modifier (Chinese)",[amod]adjectival modifier,radiograph,[],(Degree=Pos),True,False,False,True
4,radiograph,0,0,radiograph,[NOUN]noun,"[NN]noun, singular or mass",[pobj]object of preposition,to,[previous],(Number=Sing),True,False,False,False
5,",",0,-1,",",[PUNCT]punctuation,"[,]punctuation mark, comma",[punct]punctuation,decreased,[],(PunctType=Comm),False,False,False,True
6,the,0,1,the,[DET]determiner,[DT]determiner,[det]determiner,effusion,[],"(Definite=Def, PronType=Art)",True,True,False,True
7,right,0,1,right,[ADJ]adjective,"[JJ]adjective (English), other noun-modifier (Chinese)",[amod]adjectival modifier,effusion,[],(Degree=Pos),True,False,False,True
8,pleural,0,1,pleural,[ADJ]adjective,"[JJ]adjective (English), other noun-modifier (Chinese)",[amod]adjectival modifier,effusion,[],(Degree=Pos),True,False,False,True
9,effusion,0,1,effusion,[NOUN]noun,"[NN]noun, singular or mass",[nsubj]nominal subject,decreased,"[the, right, pleural]",(Number=Sing),True,False,False,True


Visualization

In [34]:
from spacy import displacy
displacy.serve(doc, style="dep")
# sentence_spans = list(doc.sents)
# displacy.serve(sentence_spans, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


Explain tag and label

In [6]:
spacy.explain('Peri')

