In [None]:
import os
os.path.abspath("")

In [None]:
import os, sys
from loguru import logger

LOG_ROOT = os.path.abspath("./")
LOG_FILE = LOG_ROOT + "/logs/sr-3.log"

# Remove all handlers and reset stderr
logger.remove(handler_id=None)
logger.add(
    LOG_FILE,
    level="TRACE",
    mode="w",
    backtrace=False,
    diagnose=True,
    colorize=False,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
logger.info("\r\n" + ">" * 29 + "\r\n" + ">>> New execution started >>>" + "\r\n" + ">" * 29)
# To filter log level: TRACE=5, DEBUG=10, INFO=20, SUCCESS=25, WARNING=30, ERROR=40, CRITICAL=50
logger.add(sys.stdout, level="INFO", filter=lambda record: record["level"].no < 40, colorize=True)
logger.add(sys.stderr, level="ERROR", backtrace=False, diagnose=True, colorize=True)

## Load data

In [1]:
import pandas as pd
REPORT_PATH = "/home/yuxiangliao/PhD/data/mimic_cxr_reports_core.json"
df = pd.read_json(REPORT_PATH,orient="records",lines=True)
print(df)

id_list = df.loc[:,'sid'].to_list()
findings_list = df.loc[:,'findings'].to_list()
impression_list = df.loc[:,'impression'].to_list()
pfi_list = df.loc[:,'provisional_findings_impression'].to_list()
fai_list = df.loc[:,'findings_and_impression'].to_list()

DATA_SIZE = len(id_list)

              pid        sid  \
0       p10000032  s50414267   
1       p10000032  s53189527   
2       p10000032  s53911762   
3       p10000032  s56699142   
4       p10000764  s57375967   
...           ...        ...   
227830  p19999442  s58708861   
227831  p19999733  s57132437   
227832  p19999987  s55368167   
227833  p19999987  s58621812   
227834  p19999987  s58971208   

                                                 findings  \
0       There is no focal consolidation, pleural effus...   
1       The cardiac, mediastinal and hilar contours ar...   
2       Single frontal view of the chest provided. \n ...   
3       The lungs are clear of focal consolidation, pl...   
4       PA and lateral views of the chest provided.   ...   
...                                                   ...   
227830  ET tube ends 4.7 cm above the carina.  NG tube...   
227831  The lungs are clear, and the cardiomediastinal...   
227832  There has been interval extubation and improve...   
22783

# Check CoreNLP

In [1]:
!ls $CORENLP_HOME

build.xml				  jollyday.jar
corenlp.sh				  LIBRARY-LICENSES
CoreNLP-to-HTML.xsl			  LICENSE.txt
ejml-core-0.39.jar			  Makefile
ejml-core-0.39-sources.jar		  patterns
ejml-ddense-0.39.jar			  pom-java-11.xml
ejml-ddense-0.39-sources.jar		  pom-java-17.xml
ejml-simple-0.39.jar			  pom.xml
ejml-simple-0.39-sources.jar		  protobuf-java-3.19.2.jar
input.txt				  README.txt
input.txt.out				  RESOURCE-LICENSES
input.txt.xml				  SemgrexDemo.java
istack-commons-runtime-3.0.7.jar	  ShiftReduceDemo.java
istack-commons-runtime-3.0.7-sources.jar  slf4j-api.jar
javax.activation-api-1.2.0.jar		  slf4j-simple.jar
javax.activation-api-1.2.0-sources.jar	  stanford-corenlp-4.4.0.jar
javax.json-api-1.0-sources.jar		  stanford-corenlp-4.4.0-javadoc.jar
javax.json.jar				  stanford-corenlp-4.4.0-models.jar
jaxb-api-2.4.0-b180830.0359.jar		  stanford-corenlp-4.4.0-sources.jar
jaxb-api-2.4.0-b180830.0359-sources.jar   StanfordCoreNlpDemo.java
jaxb-impl-2.4.0-b180830.0438.jar	  StanfordDependenciesManual.p

# CoreNLP

In [22]:
import requests
request_url = 'http://[::]:8801/?properties={"annotators":"coref","coref.algorithm":"statistical","outputFormat":"json"}'
t = "The man call Leo. He is a PhD"
corenlp_out_jsonStr = requests.post(request_url, data=t.encode()).text

In [23]:
import json
json.loads(corenlp_out_jsonStr)

{'sentences': [{'index': 0,
   'basicDependencies': [{'dep': 'ROOT',
     'governor': 0,
     'governorGloss': 'ROOT',
     'dependent': 3,
     'dependentGloss': 'call'},
    {'dep': 'det',
     'governor': 2,
     'governorGloss': 'man',
     'dependent': 1,
     'dependentGloss': 'The'},
    {'dep': 'nsubj',
     'governor': 3,
     'governorGloss': 'call',
     'dependent': 2,
     'dependentGloss': 'man'},
    {'dep': 'obj',
     'governor': 3,
     'governorGloss': 'call',
     'dependent': 4,
     'dependentGloss': 'Leo'},
    {'dep': 'punct',
     'governor': 3,
     'governorGloss': 'call',
     'dependent': 5,
     'dependentGloss': '.'}],
   'enhancedDependencies': [{'dep': 'ROOT',
     'governor': 0,
     'governorGloss': 'ROOT',
     'dependent': 3,
     'dependentGloss': 'call'},
    {'dep': 'det',
     'governor': 2,
     'governorGloss': 'man',
     'dependent': 1,
     'dependentGloss': 'The'},
    {'dep': 'nsubj',
     'governor': 3,
     'governorGloss': 'call',
    

# CoreNLPClient

In [6]:
import stanza
from stanza.server import CoreNLPClient
import time
import json
COREF_PROPS = {
    'annotators':'tokenize, ssplit, pos, lemma, ner, parse, coref',
    "coref.algorithm": "statistical",
    'outputFormat': 'json',
}
DCOREF_PROPS = {
    'annotators':'tokenize, ssplit, pos, lemma, ner, parse, dcoref',
    'outputFormat': 'json',
}
NEURAL_PROPS = {
    'annotators':'tokenize, ssplit, pos, lemma, ner, parse, coref',
    'coref.algorithm': 'neural ',
    'outputFormat': 'json',
}
client = CoreNLPClient(memory='4G', threads=8, endpoint='http://localhost:8802', be_quiet=False,output_format="JSON", properties=NEURAL_PROPS)
t = "The man call Leo. He is a PhD"
client.start()
client.ensure_alive()

2022-08-20 16:23:31 INFO: Writing properties to tmp file: corenlp_server-0e7ab311873c4c85.props
2022-08-20 16:23:31 INFO: Starting server with command: java -Xmx4G -cp /home/yuxiangliao/PhD/Stanford_CoreNLP/stanford-corenlp-4.5.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 8802 -timeout 60000 -threads 8 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-0e7ab311873c4c85.props -preload -outputFormat JSON
[main] INFO CoreNLP - --- StanfordCoreNLPServer#main() called ---
[main] INFO CoreNLP -     Build: x86_64-conda-linux-gnu
[main] INFO CoreNLP - Server default properties:
			(Note: unspecified annotator properties are English defaults)
			annotators = tokenize, ssplit, pos, lemma, ner, parse, coref
			coref.algorithm = neural
			inputFormat = text
			outputFormat = JSON
			prettyPrint = false
			threads = 8
[main] INFO CoreNLP - Threads: 8
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.

In [8]:
import requests
request_url = 'http://0.0.0.0:8802/'
target_sid = ['s51791511']
for sid in target_sid:
    t = impression_list[id_list.index(sid)]
    corenlp_out_jsonStr = requests.post(request_url, data=t.encode()).text
    json.loads(corenlp_out_jsonStr)

[pool-1-thread-5] INFO CoreNLP - [/127.0.0.1:56082] API call w/annotators tokenize,pos,lemma,ner,parse,coref


AP chest reviewed in the absence of prior chest radiographs, at 9:20 a.m., a following series of chest radiographs as late as 8:09 a.m. on ___:   By the time this chest radiograph was officially interpreted by the staff radiologist, at 9:20 a.m., there was subsequent radiographic documentation of partial treatment of bilateral pneumothorax. The findings on this chest radiograph were discussed by Dr. ___ with Dr. ___ ___ Dr. ___, ___ the telephone at 9:50 and 9:55 p.m., respectively.   Large bilateral pneumothorax, pneumomediastinum, pneumopericardium, and extensive subcutaneous emphysema in the neck and less so right chest wall attributable to duodenal perforation.  Since the pneumothorax is bilateral, there is less likelihood of mediastinal shift from hemodynamically significant positive pleural pressure.  Similarly, although the diaphragm is not depressed, there is probably pneumoperitoneum or at least pneumoretroperitoneum which could temper the effects of even hemodynamically signi

In [9]:
client.stop()

[Thread-0] INFO CoreNLP - CoreNLP Server is shutting down.


In [None]:
# import stanza
# from stanza.server import CoreNLPClient

In [None]:
# import time
# CUSTOM_PROPS = {
#     'annotators':'tokenize, ssplit, pos, lemma, ner, depparse, coref',
#     "coref.algorithm": "statistical"
# }
# with CoreNLPClient(memory='8G', threads=16, endpoint='http://localhost:8801', be_quiet=False, 
#                    properties=CUSTOM_PROPS) as client:
#     start = time.time()
#     for text in findings_list[0:100]:
#         document = client.annotate(text)
#     done = time.time()
#     elapsed = done - start
#     print(elapsed)

In [None]:
# import stanza
# from stanza.server import CoreNLPClient

In [None]:
# import time
# CUSTOM_PROPS = {
#     'annotators':'tokenize, ssplit, pos, lemma, ner, depparse, coref',
#     "coref.algorithm": "statistical"
# }
# with CoreNLPClient(memory='8G', threads=16, endpoint='http://localhost:8801', be_quiet=False, 
#                    properties=CUSTOM_PROPS) as client:
#     start = time.time()
#     for text in findings_list[0:100]:
#         document = client.annotate(text)
#     done = time.time()
#     elapsed = done - start
#     print(elapsed)

# Stanza

Biomedical models

In [None]:
# import stanza
# stanza.download('en', package='mimic')
# stanza.download('en', package='radiology')

Stanza processor: https://stanfordnlp.github.io/stanza/pipeline.html#processors

In [None]:
# from stanza.pipeline.core import DownloadMethod
# import stanza

# processor_dict = {
#     'tokenize': 'mimic', 
#     'pos': 'mimic', 
#     'lemma': 'mimic',
#     'depparse': 'mimic',
#     # 'sentiment':'sstplus', # Sentiment scores of 0, 1, or 2 (negative, neutral, positive).
#     'constituency': 'wsj', # wsj, wsj_bert, wsj_roberta
#     'ner': 'radiology',
# }
# nlp = stanza.Pipeline('en', processors=processor_dict, package=None, 
#                       download_method=DownloadMethod.REUSE_RESOURCES,
#                       verbose=False) # logging_level='WARN'
# doc = nlp(findings_list[8690])
# # print out dependency tree
# print(doc)

Processing Multiple Documents

28s -> 100docs

In [10]:
import stanza
from stanza.pipeline.core import DownloadMethod

processor_dict = {
    'tokenize': 'mimic', 
    'pos': 'mimic', 
    'lemma': 'mimic',
    'depparse': 'mimic',
    # 'sentiment':'sstplus', # Sentiment scores of 0, 1, or 2 (negative, neutral, positive).
    'constituency': 'wsj', # wsj, wsj_bert, wsj_roberta
    'ner': 'radiology',
}

def set_sid(self, value):
    self._sid = value
def get_sid(self):
    return self._sid

nlp = stanza.Pipeline('en', processors=processor_dict, package=None, 
                      download_method=DownloadMethod.REUSE_RESOURCES,
                      verbose=False) # logging_level='WARN'
documents = findings_list[8690:8691] # Documents that we are going to process
# stanza.Document.add_property("sid",getter=get_sid, setter=set_sid)


NER tag
- B，Begin
- I，Intermediate
- E，End
- S，Single
- O，Other

In [11]:
in_docs = []
for id,d in enumerate(documents):
    stanzaDoc = stanza.Document([], text=d)
    stanzaDoc._sid = id_list[id]
    in_docs.append(stanzaDoc)
# in_docs = [stanza.Document([], text=d) for d in documents] # Wrap each document with a stanza.Document object
out_docs = nlp(in_docs) # Call the neural pipeline on this list of documents
# print(out_docs[0]) # The output is also a list of stanza.Document objects, each output corresponding to an input Document object
for doc in out_docs:
    print(f"sid:{doc._sid}")
    print(doc)

sid:s50414267
[
  [
    {
      "id": 1,
      "text": "Compared",
      "lemma": "compare",
      "upos": "VERB",
      "xpos": "VBN",
      "head": 5,
      "deprel": "case",
      "start_char": 0,
      "end_char": 8,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "with",
      "lemma": "with",
      "upos": "ADP",
      "xpos": "IN",
      "head": 5,
      "deprel": "case",
      "start_char": 9,
      "end_char": 13,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "the",
      "lemma": "the",
      "upos": "DET",
      "xpos": "DT",
      "head": 5,
      "deprel": "det",
      "start_char": 14,
      "end_char": 17,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "prior",
      "lemma": "prior",
      "upos": "ADJ",
      "xpos": "JJ",
      "head": 5,
      "deprel": "amod",
      "start_char": 18,
      "end_char": 23,
 