## Relation Extraction

In [163]:
import json

# Load the license JSON (path is relative to the notebook's working directory).
# The parsed mapping is kept in `license_keys` and read again by later cells.
with open('spark_nlp_for_healthcare_spark_ocr_4210.json', 'r') as f:
    license_keys = json.load(f)

In [165]:
# Defining license key-value pairs as notebook-level variables
# (later cells read e.g. PUBLIC_VERSION, JSL_VERSION and SECRET by name).
# `globals()` is used instead of `locals()`: both refer to the same dict at
# cell/module scope, but writing through `locals()` is not guaranteed to
# create variables in any other scope, so `globals()` states the intent
# explicitly and keeps working if this code is ever moved into a function.
globals().update(license_keys)

# Adding license key-value pairs to environment variables
import os
os.environ.update(license_keys)

In [166]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [167]:
# Optional: build the Spark session manually with custom params (alternative to sparknlp_jsl.start used below)
from pyspark.sql import SparkSession

def start(SECRET):
    """Build (or reuse) a local Spark session configured for licensed Spark NLP.

    Parameters
    ----------
    SECRET : str
        License secret; it is embedded in the download URL of the licensed jar.

    Returns
    -------
    The object returned by ``SparkSession.builder...getOrCreate()``.

    Notes
    -----
    ``PUBLIC_VERSION`` and ``JSL_VERSION`` are read from the notebook's global
    namespace and must be defined before this function is called.
    """
    public_package = "com.johnsnowlabs.nlp:spark-nlp_2.12:"+PUBLIC_VERSION
    licensed_jar = "https://pypi.johnsnowlabs.com/"+SECRET+"/spark-nlp-jsl-"+JSL_VERSION+".jar"

    # Settings are applied in this exact order, one `.config(...)` call each.
    settings = (
        ("spark.driver.memory", "16G"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "2000M"),
        ("spark.jars.packages", public_package),
        ("spark.jars", licensed_jar),
    )

    builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[*]")
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.getOrCreate()


In [168]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

# Extra Spark configuration passed to the licensed session starter.
params = {"spark.driver.memory":"16G", 
          "spark.kryoserializer.buffer.max":"2000M", 
          "spark.driver.maxResultSize":"2000M"} 

# Start the session through the sparknlp_jsl helper, using the SECRET entry
# of the license mapping loaded earlier in the notebook.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Report the versions of both libraries as seen by this kernel.
print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.4.1
Spark NLP_JSL Version : 3.4.1


In [169]:
import functools 
import numpy as np
from scipy import spatial
import pyspark.sql.functions as F
import pyspark.sql.types as T
from sparknlp.base import *

In [170]:
# Base annotators: raw text -> document -> sentences -> tokens, plus the
# clinical word embeddings and POS tags computed on sentences and tokens.

# Wraps the raw "text" column into a "document" annotation.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Splits each document into "sentences".
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# The tokenizer is referenced through the sparknlp package path rather than
# through the star-imported name.
tokenizer = (
    sparknlp.annotators.Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# Pretrained "embeddings_clinical" word embeddings.
words_embedder = (
    WordEmbeddingsModel()
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("embeddings")
)

# Pretrained "pos_clinical" part-of-speech tagger.
pos_tagger = (
    PerceptronModel()
    .pretrained("pos_clinical", "en", "clinical/models")
    .setInputCols(["sentences", "tokens"])
    .setOutputCol("pos_tags")
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
pos_clinical download started this may take some time.
Approximate size to download 1.5 MB
[OK!]


In [171]:
# Groups the token-level "ner_tags" into entity chunks ("ner_chunks").
# The "ner_tags" column is produced by the NER tagger defined in a later cell.
ner_chunker = (
    NerConverterInternal()
    .setInputCols(["sentences", "tokens", "ner_tags"])
    .setOutputCol("ner_chunks")
)

# Pretrained "dependency_conllu" parser working on sentences, POS tags and tokens.
dependency_parser = (
    DependencyParserModel()
    .pretrained("dependency_conllu", "en")
    .setInputCols(["sentences", "pos_tags", "tokens"])
    .setOutputCol("dependencies")
)

dependency_conllu download started this may take some time.
Approximate size to download 16.7 MB
[OK!]


In [172]:
import pandas as pd

def get_relations_df(results, col='relations'):
    """Flatten the relation annotations of the first result into a DataFrame.

    Parameters
    ----------
    results : sequence
        Annotation results as produced in this notebook by
        ``lmodel.fullAnnotate(...)``. Only ``results[0]`` is read; each
        annotation in ``results[0][col]`` must expose ``.result`` and a
        ``.metadata`` mapping.
    col : str, default 'relations'
        Key of the relation annotations inside ``results[0]``.

    Returns
    -------
    pandas.DataFrame
        One row per relation with the columns ``relation``, ``entity1``,
        ``entity1_begin``, ``entity1_end``, ``chunk1``, ``entity2``,
        ``entity2_begin``, ``entity2_end``, ``chunk2`` and ``confidence``.
        Values are copied from the annotation as-is (no type conversion).
        An empty ``results`` yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If ``col`` is missing from ``results[0]`` or a metadata key is absent.
    """
    columns = ['relation', 'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
               'entity2', 'entity2_begin', 'entity2_end', 'chunk2', 'confidence']
    # Every column except 'relation' is a metadata key of the same name;
    # deriving the keys from `columns` keeps row order and header in sync.
    metadata_keys = columns[1:]

    # Nothing was annotated: return the expected schema instead of failing
    # with an IndexError on results[0].
    if len(results) == 0:
        return pd.DataFrame(columns=columns)

    rel_pairs = [
        (rel.result, *(rel.metadata[key] for key in metadata_keys))
        for rel in results[0][col]
    ]

    return pd.DataFrame(rel_pairs, columns=columns)


In [61]:
# Single-sentence sample used to try out the relation-extraction pipeline.
text ="""Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.
"""

# NOTE(review): `model` is only assigned in a later cell (pipeline.fit(...)),
# so on a fresh kernel that cell has to be executed before this one.
lmodel = LightPipeline(model)
annotations = lmodel.fullAnnotate(text)

rel_df = get_relations_df (annotations)

# Show only rows whose label is not "O" (presumably the "no relation" class — confirm).
rel_df[rel_df.relation!="O"]

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence


In [62]:
# Longer sample with three report sections (radiologic studies, head CT, abdominal CT).
text ="""RADIOLOGIC STUDIES:  Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.  This also
moderate-sized left pleural effusion.

HEAD CT:  Head CT showed no intracranial hemorrhage or mass
effect, but old infarction consistent with past medical
history.

ABDOMINAL CT:  Abdominal CT showed lesions of T10 and sacrum most likely secondary to osteoporosis. These can be followed by repeat imaging as an outpatient.

"""

# NOTE(review): `model` is only assigned in a later cell (pipeline.fit(...)),
# so on a fresh kernel that cell has to be executed before this one.
lmodel = LightPipeline(model)
annotations = lmodel.fullAnnotate(text)

rel_df = get_relations_df(annotations)

# Show only rows whose label is not "O" (presumably the "no relation" class — confirm).
rel_df[rel_df.relation!="O"]

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence
0,TeRP,TEST,220,226,HEAD CT,PROBLEM,248,270,intracranial hemorrhage,0.99999976
1,TeRP,TEST,230,236,Head CT,PROBLEM,248,270,intracranial hemorrhage,1.0
2,TeRP,TEST,230,236,Head CT,PROBLEM,292,305,old infarction,0.9972275
3,TeRP,TEST,361,372,Abdominal CT,PROBLEM,381,405,lesions of T10 and sacrum,1.0


In [25]:
# NOTE(review): out-of-sequence cell (In [25]); it displays whatever `rel_df`
# held at the time it was executed, unfiltered.
rel_df

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence


In [173]:
# Pretrained "ner_clinical" tagger; its "ner_tags" output feeds the NER chunker.
clinical_ner_tagger = (
    MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols("sentences", "tokens", "embeddings")
    .setOutputCol("ner_tags")
)

# Candidate-pair filter placed in front of the relation model.
# We can set the possible relation pairs (if not set, all the relations will be calculated).
clinical_re_ner_chunk_filter = (
    RENerChunksFilter()
    .setInputCols(["ner_chunks", "dependencies"])
    .setOutputCol("re_ner_chunks")
    .setMaxSyntacticDistance(4)
    .setRelationPairs(["problem-test", "problem-treatment"])
)

# Pretrained "redl_clinical_biobert" relation model with a 0.9 prediction threshold.
clinical_re_Model = (
    RelationExtractionDLModel()
    .pretrained('redl_clinical_biobert', "en", "clinical/models")
    .setPredictionThreshold(0.9)
    .setInputCols(["re_ner_chunks", "sentences"])
    .setOutputCol("relations")
)

# Stage order matters: every stage reads columns written by earlier ones.
pipeline = Pipeline(
    stages=[
        documenter,
        sentencer,
        tokenizer,
        words_embedder,
        pos_tagger,
        clinical_ner_tagger,
        ner_chunker,
        dependency_parser,
        clinical_re_ner_chunk_filter,
        clinical_re_Model,
    ]
)

# Fit on a one-row frame holding an empty string in its "text" column to
# obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = pipeline.fit(empty_data)

ner_clinical download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
redl_clinical_biobert download started this may take some time.
Approximate size to download 383.4 MB
[OK!]


In [174]:
import pandas as pd

In [175]:
# Importing the file with discharge summaries (path is relative to the working directory).
# NOTE(review): the preview in the next cell shows "Unnamed: 0.1" / "Unnamed: 0"
# columns, which look like index columns saved into the CSV earlier — confirm
# whether they are needed.
df = pd.read_csv("discharge_df.csv")

In [176]:
# Peek at the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,CATEGORY,TEXT
0,0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...
1,1,Discharge summary,Admission Date: [**2118-6-2**] Discharg...
2,2,Discharge summary,Admission Date: [**2119-5-4**] D...
3,3,Discharge summary,Admission Date: [**2124-7-21**] ...
4,4,Discharge summary,Admission Date: [**2162-3-3**] D...


In [177]:
# Total number of rows in the discharge-summary frame.
len(df)

59652

In [178]:
# Raw text of the first row, including its "\n" line breaks (these are
# replaced by spaces before the text is annotated).
df["TEXT"][0]

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [142]:
# Annotate the first discharge summary; newlines are replaced by spaces first.
# NOTE(review): `lmodel` comes from an earlier cell (LightPipeline(model)).
annotations = lmodel.fullAnnotate((df["TEXT"][0]).replace("\n", " "))
rel_df = get_relations_df (annotations)

# Keep only rows whose relation label is not "O".
rel_df = rel_df[(rel_df.relation!="O")]

rel_df

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence
0,TeRP,TEST,313,319,HEAD CT,PROBLEM,341,363,intracranial hemorrhage,0.9948571
1,TeRP,TEST,323,329,Head CT,PROBLEM,341,363,intracranial hemorrhage,0.99840754
2,TeRP,TEST,323,329,Head CT,PROBLEM,385,398,old infarction,0.99830353
3,TeRP,TEST,454,465,Abdominal CT,PROBLEM,474,498,lesions of T10 and sacrum,0.99864763


In [210]:
# Inspect the column names of the per-document relation frame.
rel_df.columns

Index(['relation', 'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
       'entity2', 'entity2_begin', 'entity2_end', 'chunk2', 'confidence'],
      dtype='object')

In [211]:
# Column schema for the accumulator frame; same names and order as the
# columns produced by get_relations_df.
column_names = ['relation', 'entity1', 'entity1_begin', 'entity1_end', 'chunk1',
       'entity2', 'entity2_begin', 'entity2_end', 'chunk2', 'confidence']

In [212]:
# Empty accumulator with the relation schema; the batch loop further down adds rows to it.
relation_df = pd.DataFrame(columns = column_names)

In [213]:
# Confirm the accumulator starts out empty with the expected columns.
relation_df

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence


In [214]:
from tqdm import tqdm as tq
import time

# Range can be changed as required (row labels of `df` processed in this run).
#
# Per-document results are collected in a list and concatenated once:
# `DataFrame.append` is deprecated (removed in pandas 2.0) and copies the
# whole accumulator on every call, which gets slower as results grow.
new_relation_frames = []
try:
    for a in tq(range(50000,59652)):
        annotations = lmodel.fullAnnotate((df.loc[a, "TEXT"]).replace("\n", " "))
        rel_df = get_relations_df (annotations)

        # Keep only rows whose relation label is not "O".
        rel_df = rel_df[(rel_df.relation!="O")]
        if len(rel_df)!=0:
            new_relation_frames.append(rel_df)
finally:
    # Runs on interruption or error too, so a partially completed run still
    # leaves its results in `relation_df`, as the incremental append did.
    if new_relation_frames:
        # Leave out a still-empty accumulator so that only real data takes
        # part in the concatenation.
        existing_frames = [relation_df] if len(relation_df) != 0 else []
        relation_df = pd.concat(existing_frames + new_relation_frames, ignore_index=True)

100%|██████████| 9652/9652 [14:16:52<00:00,  5.33s/it]   


In [215]:
# Display the relations accumulated by the batch loop.
relation_df

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence
0,TeRP,TEST,1064,1070,an XRay,PROBLEM,1092,1121,a closed fracture of the femur,0.9973917
1,TeRP,TEST,1124,1133,Basic labs,PROBLEM,1161,1175,baseline anemia,0.99217427
2,TeRP,TEST,1472,1474,EKG,PROBLEM,1483,1486,LBBB,0.99847513
3,TrAP,PROBLEM,1483,1486,LBBB,TREATMENT,1518,1525,Morphine,0.9915821
4,TeRP,TEST,6061,6067,CT head,PROBLEM,6103,6131,acute intracranial hemorrhage,0.9977088
...,...,...,...,...,...,...,...,...,...,...
109632,TeCP,PROBLEM,9657,9675,infectious diseases,TEST,9681,9711,monitoring of your blood counts,0.94961625
109633,TeRP,TEST,764,774,examination,PROBLEM,794,798,rales,0.9989237
109634,TeRP,PROBLEM,2816,2832,an ongoing anemia,TEST,2839,2849,hematocrits,0.92045087
109635,TrAP,PROBLEM,3479,3492,your incisions,TREATMENT,3531,3539,ointments,0.94121


In [120]:
from tqdm import tqdm as tq
import time

# NOTE(review): `ner_results` is assigned here but not used in this cell.
ner_results = []
# Single-iteration loop: annotates whatever `text` currently holds, wrapped
# in a progress bar.
for a in tq(range(1)):
    annotations = lmodel.fullAnnotate(text)
    rel_df = get_relations_df (annotations)

    # Keep only rows whose relation label is not "O".
    rel_df = rel_df[(rel_df.relation!="O")]
# NOTE(review): the commented-out lines below are the accumulation step,
# disabled in this cell.
# #     print(len(rel_df))
#     if len(rel_df)!=0:
#         relation_df = relation_df.append(rel_df,ignore_index=True)

100%|██████████| 1/1 [09:50<00:00, 590.23s/it]


In [121]:
# Display the filtered relations produced by the single-iteration cell above.
rel_df

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence
0,TeRP,TEST,313,319,HEAD CT,PROBLEM,341,363,intracranial hemorrhage,0.99725145
1,TeRP,TEST,323,329,Head CT,PROBLEM,341,363,intracranial hemorrhage,0.99826175
2,TeRP,TEST,323,329,Head CT,PROBLEM,385,398,old infarction,0.9983309
3,TeRP,TEST,454,465,Abdominal CT,PROBLEM,474,498,lesions of T10 and sacrum,0.99946445
6,TeRP,TEST,1779,1795,Review of systems,PROBLEM,1829,1834,Fevers,0.9946861
...,...,...,...,...,...,...,...,...,...,...
437,TrAP,PROBLEM,194023,194044,Chronic left knee pain,TREATMENT,194081,194115,debridement of left knee [**2123**],0.9693431
438,TrCP,PROBLEM,194670,194697,2+ pitting edema b/l Strings,TREATMENT,194704,194708,seton,0.9560664
439,TrAP,TREATMENT,195923,195927,PRBCs,PROBLEM,195933,195955,borderline urine output,0.95823085
440,TrAP,TREATMENT,196410,196417,coumadin,PROBLEM,196432,196433,PE,0.9911623


In [90]:
# NOTE(review): out-of-sequence cell (In [90]); it displays whatever
# `relation_df` held at the time it was executed.
relation_df

Unnamed: 0,relation,entity1,entity1_begin,entity1_end,chunk1,entity2,entity2_begin,entity2_end,chunk2,confidence
0,TeRP,TEST,313,319,HEAD CT,PROBLEM,341,363,intracranial hemorrhage,0.9966468
1,TeRP,TEST,323,329,Head CT,PROBLEM,341,363,intracranial hemorrhage,0.9990114
2,TeRP,TEST,323,329,Head CT,PROBLEM,385,398,old infarction,0.998779
3,TeRP,TEST,454,465,Abdominal CT,PROBLEM,474,498,lesions of T10 and sacrum,0.99928623
4,TrAP,PROBLEM,253,261,emphysema,TREATMENT,271,277,home O2,0.91821116
...,...,...,...,...,...,...,...,...,...,...
141,TrAP,TREATMENT,895,903,resection,PROBLEM,908,924,the radionecrosis,0.9967272
142,TrAP,PROBLEM,1066,1080,a wound washout,TREATMENT,1086,1096,craniectomy,0.9099828
143,TrAP,TREATMENT,1228,1240,radio therapy,PROBLEM,1246,1248,avm,0.9863196
144,TrAP,TREATMENT,3774,3786,Acetaminophen,PROBLEM,3856,3868,pain/t>100/HA,0.9954633


In [134]:
# Distinct relation labels present in the accumulated results.
relation_df["relation"].unique()

array(['TeRP', 'TrAP', 'TrCP', 'TeCP', 'TrIP', 'PIP', 'TrNAP', 'TrWP'],
      dtype=object)

In [216]:
# Persist the accumulated relations to CSV.
# NOTE(review): the path is absolute and user-specific, and to_csv writes the
# index as an unnamed first column by default — confirm both are intended.
relation_df.to_csv("/home/akumar/relations-8.csv")