In [None]:
import os
# Display the absolute path of the kernel's current working directory.
os.path.abspath(os.curdir)

In [None]:
import os, sys
from loguru import logger

# The log file is ./logs/sr-3.log, resolved against the current working directory.
LOG_ROOT = os.path.abspath("./")
LOG_FILE = f"{LOG_ROOT}/logs/sr-3.log"


def _below_error(record):
    """Accept only records whose numeric level is under 40 (i.e. below ERROR)."""
    return record["level"].no < 40


# Start from a clean slate: remove every handler currently registered.
logger.remove(handler_id=None)

# Sink 1 - file: everything from TRACE upwards, opened with mode="w".
logger.add(
    LOG_FILE,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    level="TRACE",
    mode="w",
    colorize=False,
    backtrace=False,
    diagnose=True,
)

# The banner is logged while the file is the only sink, so it is written only there.
logger.info("\r\n".join(["", ">" * 29, ">>> New execution started >>>", ">" * 29]))

# Level numbers: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
# Sink 2 - stdout: INFO and above, but strictly below ERROR.
logger.add(sys.stdout, level="INFO", filter=_below_error, colorize=True)
# Sink 3 - stderr: ERROR and above.
logger.add(sys.stderr, level="ERROR", backtrace=False, diagnose=True, colorize=True)

## Load data

In [1]:
import pandas as pd
# Line-delimited JSON file holding the report records (one JSON object per line).
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pd.read_json(REPORT_PATH, lines=True, orient="records")
print(df)

# Extract each column of interest as a plain Python list, in DataFrame row order,
# so the same index refers to the same record in every list.
id_list = df["sid"].tolist()
findings_list = df["findings"].tolist()
impression_list = df["impression"].tolist()
pfi_list = df["provisional_findings_impression"].tolist()
fai_list = df["findings_and_impression"].tolist()

# Number of report records loaded.
DATA_SIZE = len(id_list)

              pid        sid  \
0       p10000032  s50414267   
1       p10000032  s53189527   
2       p10000032  s53911762   
3       p10000032  s56699142   
4       p10000764  s57375967   
...           ...        ...   
227830  p19999442  s58708861   
227831  p19999733  s57132437   
227832  p19999987  s55368167   
227833  p19999987  s58621812   
227834  p19999987  s58971208   

                                                 findings  \
0       There is no focal consolidation, pleural effus...   
1       The cardiac, mediastinal and hilar contours ar...   
2       Single frontal view of the chest provided. \n ...   
3       The lungs are clear of focal consolidation, pl...   
4       PA and lateral views of the chest provided.   ...   
...                                                   ...   
227830  ET tube ends 4.7 cm above the carina.  NG tube...   
227831  The lungs are clear, and the cardiomediastinal...   
227832  There has been interval extubation and improve...   
22783

# Check CoreNLP

In [None]:
# List the directory named by CORENLP_HOME to check that the CoreNLP files are present.
!ls $CORENLP_HOME

# CoreNLP

In [None]:
import stanza
from stanza.server import CoreNLPClient

In [None]:
import time
# CoreNLP server properties: the annotator chain to run and the coreference algorithm.
CUSTOM_PROPS = {
    'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse, coref',
    "coref.algorithm": "statistical",
}

# Benchmark: time how long the client takes to annotate the first 100 findings.
with CoreNLPClient(memory='8G', threads=16, endpoint='http://localhost:8801', be_quiet=False,
                   properties=CUSTOM_PROPS) as client:
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    for text in findings_list[0:100]:
        document = client.annotate(text)
    done = time.perf_counter()
    elapsed = done - start  # seconds spent in the annotation loop
    print(elapsed)

# Stanza

Biomedical models

In [None]:
import stanza
# Download the English 'mimic' and 'radiology' model packages used by the pipelines below.
for package_name in ('mimic', 'radiology'):
    stanza.download('en', package=package_name)

Stanza processor: https://stanfordnlp.github.io/stanza/pipeline.html#processors

In [None]:
from stanza.pipeline.core import DownloadMethod
# Model package to use for each Stanza processor (processor name -> package name).
# Optional extras noted by the author:
#   - sentiment='sstplus' gives sentiment scores of 0, 1, or 2 (negative, neutral, positive).
#   - constituency alternatives: wsj, wsj_bert, wsj_roberta.
processor_dict = dict(
    tokenize='mimic',
    pos='mimic',
    lemma='mimic',
    depparse='mimic',
    constituency='wsj',
    ner='radiology',
)
# Build the English pipeline from the per-processor packages in `processor_dict`.
pipeline_kwargs = {
    'processors': processor_dict,
    'package': None,
    'download_method': DownloadMethod.REUSE_RESOURCES,
    'verbose': False,  # alternative noted by the author: logging_level='WARN'
}
nlp = stanza.Pipeline('en', **pipeline_kwargs)

# Annotate one findings text and print the resulting Document.
doc = nlp(findings_list[8690])
print(doc)

Processing Multiple Documents

Timing: 28 s to process 100 documents.

In [4]:
import stanza
from stanza.pipeline.core import DownloadMethod
# Model package to use for each Stanza processor (processor name -> package name).
# The four syntactic processors share the 'mimic' package.
# Optional extras noted by the author:
#   - 'sentiment': 'sstplus' gives sentiment scores of 0, 1, or 2 (negative, neutral, positive).
#   - constituency alternatives: wsj, wsj_bert, wsj_roberta.
processor_dict = {
    **dict.fromkeys(('tokenize', 'pos', 'lemma', 'depparse'), 'mimic'),
    'constituency': 'wsj',
    'ner': 'radiology',
}

def set_sid(self, value):
    """Store ``value`` on ``self`` as the ``_sid`` attribute."""
    setattr(self, "_sid", value)
def get_sid(self):
    """Return the ``_sid`` attribute of ``self``.

    Raises:
        AttributeError: if ``_sid`` has never been set on ``self``.
    """
    return getattr(self, "_sid")

# Build the English pipeline from the per-processor packages in `processor_dict`.
pipeline_kwargs = {
    'processors': processor_dict,
    'package': None,
    'download_method': DownloadMethod.REUSE_RESOURCES,
    'verbose': False,  # alternative noted by the author: logging_level='WARN'
}
nlp = stanza.Pipeline('en', **pipeline_kwargs)

# Documents that we are going to process: the first ten findings texts.
documents = findings_list[:10]
# Disabled: registering `sid` as a Document property via get_sid/set_sid.
# stanza.Document.add_property("sid", getter=get_sid, setter=set_sid)


In [9]:
# Wrap each raw text in a stanza.Document so the whole batch can be sent through
# the pipeline in one call, and tag every Document with its `sid` via `_sid`.
# NOTE: the pairing relies on `documents` being the leading slice of
# `findings_list`, so position i in `documents` corresponds to `id_list[i]`.
in_docs = []
# Loop names deliberately avoid `id`: at notebook top level that would replace
# the builtin id() for the rest of the kernel session.
for sid, text in zip(id_list, documents):
    stanza_doc = stanza.Document([], text=text)
    stanza_doc._sid = sid
    in_docs.append(stanza_doc)

# Calling the pipeline on a list of Documents returns a list of Documents,
# each output corresponding to the input at the same position.
out_docs = nlp(in_docs)

# Check: the ids carried through the pipeline should match the expected ids.
print([out_doc._sid for out_doc in out_docs])
print(id_list[:len(documents)])

['s50414267', 's53189527', 's53911762', 's56699142', 's57375967', 's50771383', 's54205396', 's50578979', 's51178377', 's55697293']
['s50414267', 's53189527', 's53911762', 's56699142', 's57375967', 's50771383', 's54205396', 's50578979', 's51178377', 's55697293']
