# Analyze Group C

Observations:
1. Cardiomegaly
2. Plueral Effusion
3. Eddema
4. Inspectra Lung Opacity (Infiltration + Consolidation + Lung Opacity)
5. Atelectasis
6. Lung Lesion (Mass + Nodule)

Steps:
1. Import data
2. Select data for each observation (choose reports that have different labelers)
3. Prepare data for finding patterns
4. Find patterns using RegEx and N-grams

## Import Data

In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive

/content/drive


In [5]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

In [6]:
# get data from google sheet
import pandas as pd

worksheet = gc.open('Annotation_group_c_for_n_coop').sheet1

# get_all_values gives a list of rows
rows = worksheet.get_all_values()

# Convert to a DataFrame
df = pd.DataFrame.from_records(rows)
# Create columns name
df.columns = df.iloc[0]
df = df[1:]
# reset index value
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20290676,CXR (PA upright)\n\nPatchy infiltration at RL...,1,0,0,0,0,0,1,0,...,0,0,0,0.6705767571,0.08962687799,0.3223274601,0.008262414654,0.07552811707,0.01909299623,0.007672864324
1,20292772,CXR (PA upright)\n\nReticulonodular infiltrati...,0,0,0,0,0,1,0,0,...,0,0,0,0.9478721681,0.1678770714,0.4644771763,0.08673519902,0.2267119501,0.05830180422,0.878199365
2,20294567,Chest:-\nPa chest study reveals fibronodular i...,0,0,0,0,0,1,0,0,...,0,0,0,0.8226948088,0.1517771742,0.6166349793,0.01350364634,0.235725454,0.00647519908,0.08067476336
3,20294712,CHEST :\nP.A. upright view .\nFibronodular inf...,0,0,0,0,0,1,0,0,...,0,0,0,0.6979679996,0.1366843406,0.6163465142,0.04663362481,0.08208811247,0.02999480347,0.01371037712
4,20294740,CHEST :\nP.A. upright view .\nFibronodular inf...,0,0,0,0,0,1,0,0,...,0,0,1,0.9419731872,0.4525913554,0.4320501349,0.05085499389,0.2342405043,0.007880797901,0.1427466416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135579,25106595,CXR(PA)\n\n IMP:\n No active pulmonary infi...,1,0,0,0,0,0,0,0,...,0,0,0,0.04828104463,0.005975966292,0.08753935783,0.0004548292506,0.01260417629,0.01721901118,0.0006430751514
135580,25106622,CXR PA upright\n\n There is no definite pulmo...,0,0,0,0,0,0,0,0,...,0,0,0,0.04914811821,0.0117086733,0.1039028442,0.01578075513,0.07933877652,0.4247253044,0.007032924475
135581,25106687,CXR PA UPRIGHT\n\nIMP: As compared to previous...,1,0,0,-1,1,0,1,0,...,-1,1,0,0.5108068485,0.2280439488,0.3587409915,0.1286962249,0.2957346634,0.5565393814,0.2747272033
135582,25106812,CXR(PA)\n\n IMP:\n No active pulmonary infi...,1,0,0,0,0,0,0,0,...,0,0,0,0.0397352668,0.00500379011,0.0558833124,0.0008390195919,0.01478381395,0.02319880988,0.0005166731356


## Function

In [7]:
# create dataframe of each observations
def selectData(df, name):
  # choose rows that have different labelers
  name_df = df.loc[df[name + ' BERT Labeler'] != df[name + ' Inspectra Labeler']]
  name_df.reset_index(drop=True, inplace=True)

  # split into 2 dataframes
  BERT_0_Inspec_1_df = name_df[name_df[name + ' BERT Labeler'] == '0']
  BERT_1_Inspec_0_df = name_df[name_df[name + ' BERT Labeler'] == '1']

  return name_df, BERT_0_Inspec_1_df, BERT_1_Inspec_0_df

In [8]:
# create list of reports
import re

def createReportList(df):
  report = []
  for i in range(df['Reports'].shape[0]):
    report.append([s for s in (df['Reports'].iloc[i]).split('\n') if s != '' and s != ' '])
  print('Lenght :', len(report))
  return report

In [9]:
# crate n-grams
from collections import defaultdict
import re
from nltk import ngrams

def createPatternNgrams(data, n, keyword=None):
  pattern_dict = defaultdict(lambda: 0)
  for report in data:
    for s in report[1:]:
      s = re.sub('[,;:=\\-]', ' ', s)
      s = (s.strip()).lower()
      s = re.sub('[.]\Z', '', s)
      s = '<s> ' + s + ' <\s>'
      if (keyword != None and re.search(keyword, s)) or keyword == None:
        words = []
        for w in s.split(): 
          if w != '' and w != ' ':
            words.extend([w[:-1], '<\s>', '<s>']) if re.search('[.]\Z', w) else words.append(w)
        for word in ngrams(words, n):
          pattern_dict[word] += 1    

  pattern_dict = sorted(pattern_dict.items(), key=lambda x: x[1], reverse=True)

  return dict(pattern_dict)

In [10]:
# find word before input word
def findWordBefore(bigrams_dict, word):
  word_before = defaultdict(lambda: 0)
  for k,v in bigrams_dict.items():
    if k[1] == word:
      word_before[k[0]] += v

  word_before = sorted(word_before.items(), key=lambda x: x[1], reverse=True)

  return dict(word_before)

# Cardiomegaly

## Prepare Data

In [11]:
# choose rows
cardio_df, cardio_b0i1_df, cardio_b1i0_df = selectData(df, 'Cardiomegaly')
cardio_df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20294844,CHEST :\nP.A. upright view .\nNo active pulmon...,1,0,0,0,0,0,0,0,...,0,0,0,0.1545549537,0.1134158085,0.1497948813,0.3858737325,0.1863493767,0.8780091842,0.01716235591
1,20296145,CXR PA upright\n The chest shows round calcif...,0,0,0,0,0,0,1,0,...,0,0,0,0.5372333875,0.1580556291,0.5273918368,0.05144614352,0.1072859713,0.03658095785,0.0615121635
2,20298305,CXR PA upright\nNo demonstrable active pulmona...,0,0,0,0,0,0,1,0,...,0,0,0,0.04703057456,0.01075730331,0.04479191569,0.04892595487,0.02976730592,0.2871204495,0.003888655149
3,20305794,CXR \n\nNo clinical history is provided.\n\nNo...,0,0,0,0,0,0,1,0,...,0,0,0,0.05848120402,0.06362653962,0.09207393694,0.3825493607,0.06412955085,0.9474925083,0.006668203767
4,20306919,Chest PA upright;\n R/O lung metastasis.\n\n E...,1,0,0,0,1,1,0,0,...,0,1,1,0.09744055314,0.3018906804,0.909952602,0.01252616605,0.1517874423,0.008666818642,0.7718668526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39236,25105665,CXR PA UPRIGHT\n\nFINDINGS : No detectable pu...,1,0,0,0,0,0,0,0,...,0,0,0,0.6699802947,0.1221750322,0.4594273696,0.01429137321,0.4090773577,0.08064206305,0.05404155913
39237,25106092,CXR PA UPRIGHT\n\nFINDINGS : No detectable pu...,1,0,0,0,0,0,0,0,...,0,0,0,0.08266283671,0.005025856353,0.1745592005,0,0.004885917867,0.001456480401,0.003004806707
39238,25106595,CXR(PA)\n\n IMP:\n No active pulmonary infi...,1,0,0,0,0,0,0,0,...,0,0,0,0.04828104463,0.005975966292,0.08753935783,0.0004548292506,0.01260417629,0.01721901118,0.0006430751514
39239,25106812,CXR(PA)\n\n IMP:\n No active pulmonary infi...,1,0,0,0,0,0,0,0,...,0,0,0,0.0397352668,0.00500379011,0.0558833124,0.0008390195919,0.01478381395,0.02319880988,0.0005166731356


In [12]:
# create report list
cardio_b0i1_report = createReportList(cardio_b0i1_df)
cardio_b1i0_report = createReportList(cardio_b1i0_df)
cardio_b0i1_report[0]

Lenght : 5526
Lenght : 33715


['CXR PA upright',
 '  The chest shows round calcification at cardiac shadow should be calcification at left lower lung.',
 '  The right lung is clear.',
 '  The heart is not enlarged.',
 '  Both costophrenic are clear.',
 '  The bony thorax is intact.']

In [13]:
# define keyword to select sentence from report
cardio_keyword = r'cardi|heart|LV|ventri'

## BERT 0, Inspectra 1

Without keyword

In [14]:
cardio_b0i1_unigram = createPatternNgrams(cardio_b0i1_report, 1)

In [15]:
cardio_b0i1_bigram = createPatternNgrams(cardio_b0i1_report, 2)
list(cardio_b0i1_bigram.items())[:20]

[(('<s>', 'no'), 6498),
 (('pulmonary', 'infiltration'), 2673),
 (('<s>', 'both'), 2412),
 (('active', 'pulmonary'), 2257),
 (('both', 'costophrenic'), 2137),
 (('bony', 'thorax'), 2062),
 (('pleural', 'effusion'), 1983),
 (('cardiothoracic', 'ratio'), 1893),
 (('infiltration', '<\\s>'), 1719),
 (('intact', '<\\s>'), 1662),
 (('costophrenic', 'sulci'), 1622),
 (('<s>', 'bony'), 1587),
 (('<\\s>', '<s>'), 1551),
 (('<s>', 'the'), 1538),
 (('thorax', 'is'), 1486),
 (('is', 'intact'), 1485),
 (('seen', '<\\s>'), 1470),
 (('is', 'seen'), 1437),
 (('are', 'not'), 1367),
 (('not', 'remarkable'), 1365)]

In [16]:
cardio_b0i1_trigram = createPatternNgrams(cardio_b0i1_report, 3)
list(cardio_b0i1_trigram.items())[:20]

[(('<s>', 'both', 'costophrenic'), 2039),
 (('pulmonary', 'infiltration', '<\\s>'), 1561),
 (('both', 'costophrenic', 'sulci'), 1503),
 (('active', 'pulmonary', 'infiltration'), 1493),
 (('is', 'intact', '<\\s>'), 1484),
 (('bony', 'thorax', 'is'), 1482),
 (('<s>', 'bony', 'thorax'), 1432),
 (('thorax', 'is', 'intact'), 1390),
 (('not', 'remarkable', '<\\s>'), 1365),
 (('are', 'not', 'remarkable'), 1364),
 (('is', 'seen', '<\\s>'), 1304),
 (('<s>', 'no', 'demonstrable'), 1167),
 (('no', 'demonstrable', 'active'), 1097),
 (('demonstrable', 'active', 'pulmonary'), 1097),
 (('no', 'active', 'pulmonary'), 1088),
 (('no', 'pleural', 'effusion'), 1081),
 (('pleural', 'effusion', '<\\s>'), 1052),
 (('cardiothoracic', 'ratio', 'is'), 1044),
 (('<s>', 'no', 'pleural'), 1022),
 (('is', 'noted', '<\\s>'), 1022)]

With keyword

In [17]:
cardio_b0i1_bigramk = createPatternNgrams(cardio_b0i1_report, 2, cardio_keyword)
list(cardio_b0i1_bigramk.items())[:20]

[(('cardiothoracic', 'ratio'), 1893),
 (('ratio', 'is'), 1081),
 (('<\\s>', '<s>'), 954),
 (('<s>', 'the'), 877),
 (('<s>', 'prominent'), 821),
 (('is', 'about'), 797),
 (('size', '<\\s>'), 734),
 (('<s>', 'mild'), 730),
 (('heart', 'size'), 697),
 (('ratio', '<\\s>'), 681),
 (('prominent', 'heart'), 678),
 (('<s>', 'no'), 644),
 (('heart', 'is'), 572),
 (('<s>', 'cardiothoracic'), 562),
 (('cardiac', 'shadow'), 519),
 (('mild', 'prominent'), 515),
 (('0.5', '<\\s>'), 467),
 (('about', '0.5'), 453),
 (('<s>', 'borderline'), 438),
 (('shadow', '<\\s>'), 438)]

In [18]:
cardio_b0i1_trigramk = createPatternNgrams(cardio_b0i1_report, 3, cardio_keyword)
list(cardio_b0i1_trigramk.items())[:20]

[(('cardiothoracic', 'ratio', 'is'), 1044),
 (('ratio', 'is', 'about'), 793),
 (('cardiothoracic', 'ratio', '<\\s>'), 677),
 (('<s>', 'cardiothoracic', 'ratio'), 557),
 (('prominent', 'heart', 'size'), 511),
 (('about', '0.5', '<\\s>'), 451),
 (('is', 'about', '0.5'), 446),
 (('heart', 'size', '<\\s>'), 427),
 (('mild', 'prominent', 'heart'), 404),
 (('<s>', 'mild', 'prominent'), 400),
 (('<s>', 'the', 'heart'), 384),
 (('increased', 'cardiothoracic', 'ratio'), 371),
 (('heart', 'is', 'not'), 341),
 (('the', 'heart', 'is'), 334),
 (('<s>', 'the', 'cardiothoracic'), 332),
 (('the', 'cardiothoracic', 'ratio'), 331),
 (('prominent', 'cardiothoracic', 'ratio'), 311),
 (('cardiac', 'shadow', '<\\s>'), 311),
 (('<s>', 'prominent', 'cardiothoracic'), 301),
 (('is', 'seen', '<\\s>'), 297)]

In [19]:
cardio_b0i1_fourgramk = createPatternNgrams(cardio_b0i1_report, 4, cardio_keyword)
list(cardio_b0i1_fourgramk.items())[:20]

[(('cardiothoracic', 'ratio', 'is', 'about'), 774),
 (('<s>', 'cardiothoracic', 'ratio', 'is'), 524),
 (('is', 'about', '0.5', '<\\s>'), 444),
 (('ratio', 'is', 'about', '0.5'), 443),
 (('prominent', 'heart', 'size', '<\\s>'), 342),
 (('<s>', 'the', 'cardiothoracic', 'ratio'), 329),
 (('<s>', 'mild', 'prominent', 'heart'), 329),
 (('the', 'cardiothoracic', 'ratio', 'is'), 327),
 (('<s>', 'the', 'heart', 'is'), 326),
 (('<s>', 'prominent', 'cardiothoracic', 'ratio'), 301),
 (('prominent', 'cardiothoracic', 'ratio', '<\\s>'), 300),
 (('mild', 'prominent', 'heart', 'size'), 271),
 (('the', 'heart', 'is', 'not'), 238),
 (('increased', 'cardiothoracic', 'ratio', '<\\s>'), 217),
 (('<s>', 'borderline', 'cardiac', 'size'), 209),
 (('<s>', 'prominent', 'heart', 'size'), 202),
 (('prominent', 'cardiac', 'shadow', '<\\s>'), 180),
 (('<s>', 'prominent', 'cardiac', 'shadow'), 165),
 (('<s>', 'increased', 'cardiothoracic', 'ratio'), 158),
 (('<\\s>', '<s>', 'the', 'heart'), 148)]

In [20]:
cardio_b0i1_fourgramk = createPatternNgrams(cardio_b0i1_report, 5, cardio_keyword)
list(cardio_b0i1_fourgramk.items())[:20]

[(('<s>', 'cardiothoracic', 'ratio', 'is', 'about'), 455),
 (('ratio', 'is', 'about', '0.5', '<\\s>'), 441),
 (('cardiothoracic', 'ratio', 'is', 'about', '0.5'), 439),
 (('<s>', 'the', 'cardiothoracic', 'ratio', 'is'), 326),
 (('the', 'cardiothoracic', 'ratio', 'is', 'about'), 318),
 (('<s>', 'prominent', 'cardiothoracic', 'ratio', '<\\s>'), 292),
 (('<s>', 'the', 'heart', 'is', 'not'), 233),
 (('<s>', 'mild', 'prominent', 'heart', 'size'), 205),
 (('mild', 'prominent', 'heart', 'size', '<\\s>'), 188),
 (('heart', 'is', 'not', 'enalrged', '<\\s>'), 148),
 (('ratio', 'is', 'about', '0.51', '<\\s>'), 145),
 (('cardiothoracic', 'ratio', 'is', 'about', '0.51'), 144),
 (('<\\s>', '<s>', 'the', 'heart', 'is'), 141),
 (('<s>', 'prominent', 'heart', 'size', '<\\s>'), 130),
 (('<s>', 'borderline', 'cardiac', 'size', 'with'), 127),
 (('cardiothoracic', 'ratio', 'is', 'about', '0.52'), 121),
 (('<s>', 'prominent', 'cardiac', 'shadow', '<\\s>'), 120),
 (('ratio', 'is', 'about', '0.52', '<\\s>'), 1

## BERT 1, Inspectra 0

Without keyword

In [21]:
cardio_b1i0_unigram = createPatternNgrams(cardio_b1i0_report, 1)

In [22]:
cardio_b1i0_bigram = createPatternNgrams(cardio_b1i0_report, 2)
list(cardio_b1i0_bigram.items())[:20]

[(('<s>', 'no'), 73879),
 (('noted', '<\\s>'), 37766),
 (('is', 'noted'), 31942),
 (('no', 'cardiomegaly'), 29722),
 (('cardiomegaly', 'is'), 28628),
 (('both', 'costophrenic'), 26449),
 (('costophrenic', 'angles'), 26029),
 (('angles', 'are'), 26004),
 (('<s>', 'findings'), 25652),
 (('infiltration', 'or'), 25533),
 (('is', 'seen'), 25469),
 (('seen', '<\\s>'), 25460),
 (('pulmonary', 'infiltration'), 25029),
 (('or', 'nodule'), 24697),
 (('nodule', 'is'), 24403),
 (('clear', '<\\s>'), 21671),
 (('are', 'clear'), 21484),
 (('<s>', 'the'), 19354),
 (('no', 'active'), 17529),
 (('findings', '<\\s>'), 16795)]

In [23]:
cardio_b1i0_trigram = createPatternNgrams(cardio_b1i0_report, 3)
list(cardio_b1i0_trigram.items())[:20]

[(('is', 'noted', '<\\s>'), 31763),
 (('<s>', 'no', 'cardiomegaly'), 29339),
 (('no', 'cardiomegaly', 'is'), 27284),
 (('costophrenic', 'angles', 'are'), 25943),
 (('both', 'costophrenic', 'angles'), 25763),
 (('is', 'seen', '<\\s>'), 25116),
 (('infiltration', 'or', 'nodule'), 24634),
 (('or', 'nodule', 'is'), 24228),
 (('are', 'clear', '<\\s>'), 21483),
 (('pulmonary', 'infiltration', 'or'), 19916),
 (('nodule', 'is', 'noted'), 18514),
 (('<s>', 'findings', '<\\s>'), 16792),
 (('no', 'active', 'pulmonary'), 12880),
 (('is', 'observed', '<\\s>'), 12557),
 (('<s>', 'both', 'costophrenic'), 12405),
 (('<s>', 'no', 'pulmonary'), 12345),
 (('angles', 'are', 'clear'), 11976),
 (('no', 'pulmonary', 'infiltration'), 11904),
 (('cardiomegaly', 'is', 'observed'), 10737),
 (('active', 'pulmonary', 'disease'), 10689)]

In [24]:
cardio_b1i0_fourgram = createPatternNgrams(cardio_b1i0_report, 4)
list(cardio_b1i0_fourgram.items())[:20]

[(('<s>', 'no', 'cardiomegaly', 'is'), 27284),
 (('both', 'costophrenic', 'angles', 'are'), 25695),
 (('infiltration', 'or', 'nodule', 'is'), 24198),
 (('pulmonary', 'infiltration', 'or', 'nodule'), 19057),
 (('nodule', 'is', 'noted', '<\\s>'), 18504),
 (('or', 'nodule', 'is', 'noted'), 18444),
 (('angles', 'are', 'clear', '<\\s>'), 11976),
 (('costophrenic', 'angles', 'are', 'clear'), 11926),
 (('<s>', 'no', 'pulmonary', 'infiltration'), 11854),
 (('<s>', 'both', 'costophrenic', 'angles'), 11739),
 (('cardiomegaly', 'is', 'observed', '<\\s>'), 10726),
 (('no', 'cardiomegaly', 'is', 'observed'), 10716),
 (('active', 'pulmonary', 'disease', '<\\s>'), 10606),
 (('no', 'active', 'pulmonary', 'disease'), 10578),
 (('no', 'pulmonary', 'infiltration', 'or'), 10319),
 (('cardiomegaly', 'is', 'seen', '<\\s>'), 9779),
 (('<s>', 'imp', 'no', 'active'), 9758),
 (('no', 'cardiomegaly', 'is', 'seen'), 9749),
 (('imp', 'no', 'active', 'pulmonary'), 9749),
 (('<s>', 'no', 'blunting', 'of'), 8483)]

With keyword

In [25]:
cardio_b1i0_bigramk = createPatternNgrams(cardio_b1i0_report, 2, cardio_keyword)
list(cardio_b1i0_bigramk.items())[:20]

[(('<s>', 'no'), 29802),
 (('no', 'cardiomegaly'), 29722),
 (('cardiomegaly', 'is'), 28628),
 (('is', 'observed'), 10789),
 (('observed', '<\\s>'), 10785),
 (('noted', '<\\s>'), 10639),
 (('is', 'noted'), 10590),
 (('seen', '<\\s>'), 10119),
 (('is', 'seen'), 10103),
 (('<\\s>', '<s>'), 8812),
 (('<s>', 'calcified'), 6177),
 (('aortic', 'knob'), 5328),
 (('calcified', 'aortic'), 5189),
 (('found', '<\\s>'), 5085),
 (('is', 'found'), 5065),
 (('knob', 'is'), 5054),
 (('aorta', 'is'), 3229),
 (('cardiomegaly', '<\\s>'), 2508),
 (('<s>', 'dilate'), 1972),
 (('dilate', 'aorta'), 1794)]

In [26]:
cardio_b1i0_trigramk = createPatternNgrams(cardio_b1i0_report, 3, cardio_keyword)
list(cardio_b1i0_trigramk.items())[:20]

[(('<s>', 'no', 'cardiomegaly'), 29339),
 (('no', 'cardiomegaly', 'is'), 27284),
 (('is', 'observed', '<\\s>'), 10775),
 (('cardiomegaly', 'is', 'observed'), 10737),
 (('is', 'noted', '<\\s>'), 10572),
 (('is', 'seen', '<\\s>'), 10070),
 (('cardiomegaly', 'is', 'seen'), 9788),
 (('<\\s>', '<s>', 'calcified'), 6174),
 (('seen', '<\\s>', '<s>'), 6066),
 (('calcified', 'aortic', 'knob'), 5177),
 (('is', 'found', '<\\s>'), 5065),
 (('<s>', 'calcified', 'aortic'), 5050),
 (('aortic', 'knob', 'is'), 5039),
 (('knob', 'is', 'noted'), 4902),
 (('cardiomegaly', 'is', 'noted'), 4436),
 (('cardiomegaly', 'is', 'found'), 3087),
 (('noted', '<\\s>', '<s>'), 2048),
 (('no', 'cardiomegaly', '<\\s>'), 2033),
 (('<\\s>', '<s>', 'dilate'), 1971),
 (('aorta', 'is', 'found'), 1971)]

In [27]:
cardio_b1i0_fourgramk = createPatternNgrams(cardio_b1i0_report, 4, cardio_keyword)
list(cardio_b1i0_fourgramk.items())[:20]

[(('<s>', 'no', 'cardiomegaly', 'is'), 27284),
 (('cardiomegaly', 'is', 'observed', '<\\s>'), 10726),
 (('no', 'cardiomegaly', 'is', 'observed'), 10716),
 (('cardiomegaly', 'is', 'seen', '<\\s>'), 9779),
 (('no', 'cardiomegaly', 'is', 'seen'), 9749),
 (('is', 'seen', '<\\s>', '<s>'), 6059),
 (('seen', '<\\s>', '<s>', 'calcified'), 5916),
 (('<\\s>', '<s>', 'calcified', 'aortic'), 5048),
 (('<s>', 'calcified', 'aortic', 'knob'), 5040),
 (('calcified', 'aortic', 'knob', 'is'), 5018),
 (('knob', 'is', 'noted', '<\\s>'), 4900),
 (('aortic', 'knob', 'is', 'noted'), 4888),
 (('cardiomegaly', 'is', 'noted', '<\\s>'), 4431),
 (('no', 'cardiomegaly', 'is', 'noted'), 3238),
 (('cardiomegaly', 'is', 'found', '<\\s>'), 3087),
 (('no', 'cardiomegaly', 'is', 'found'), 3085),
 (('is', 'noted', '<\\s>', '<s>'), 2046),
 (('aorta', 'is', 'found', '<\\s>'), 1971),
 (('noted', '<\\s>', '<s>', 'dilate'), 1965),
 (('<\\s>', '<s>', 'dilate', 'aorta'), 1786)]

In [28]:
cardio_b1i0_fourgramk = createPatternNgrams(cardio_b1i0_report, 5, cardio_keyword)
list(cardio_b1i0_fourgramk.items())[:20]

[(('<s>', 'no', 'cardiomegaly', 'is', 'observed'), 10716),
 (('no', 'cardiomegaly', 'is', 'observed', '<\\s>'), 10707),
 (('<s>', 'no', 'cardiomegaly', 'is', 'seen'), 9749),
 (('no', 'cardiomegaly', 'is', 'seen', '<\\s>'), 9742),
 (('cardiomegaly', 'is', 'seen', '<\\s>', '<s>'), 6037),
 (('is', 'seen', '<\\s>', '<s>', 'calcified'), 5915),
 (('<\\s>', '<s>', 'calcified', 'aortic', 'knob'), 5038),
 (('<s>', 'calcified', 'aortic', 'knob', 'is'), 4932),
 (('aortic', 'knob', 'is', 'noted', '<\\s>'), 4886),
 (('calcified', 'aortic', 'knob', 'is', 'noted'), 4873),
 (('seen', '<\\s>', '<s>', 'calcified', 'aortic'), 4861),
 (('<s>', 'no', 'cardiomegaly', 'is', 'noted'), 3238),
 (('no', 'cardiomegaly', 'is', 'noted', '<\\s>'), 3236),
 (('<s>', 'no', 'cardiomegaly', 'is', 'found'), 3085),
 (('no', 'cardiomegaly', 'is', 'found', '<\\s>'), 3085),
 (('cardiomegaly', 'is', 'noted', '<\\s>', '<s>'), 2024),
 (('is', 'noted', '<\\s>', '<s>', 'dilate'), 1965),
 (('noted', '<\\s>', '<s>', 'dilate', 'aorta

## Find Word Before

In [29]:
before_cardiot = findWordBefore(cardio_b0i1_bigram, 'cardiothoracic')
print('cardiothoracic :', cardio_b0i1_unigram['cardiothoracic',])
print('word before cardiothoracic :', sum(before_cardiot.values()))
list(before_cardiot.items())[:10]

cardiothoracic : 1939
word before cardiothoracic : 1939


[('<s>', 562),
 ('increased', 403),
 ('the', 334),
 ('prominent', 311),
 ('normal', 96),
 ('increase', 84),
 ('borderline', 64),
 ('of', 20),
 ('mild', 6),
 ('noted', 5)]

In [30]:
before_cardiac = findWordBefore(cardio_b0i1_bigram, 'cardiac')
print('cardiac :', cardio_b0i1_unigram['cardiac',])
print('word before cardiac :', sum(before_cardiac.values()))
list(before_cardiac.items())[:10]

cardiac : 1025
word before cardiac : 1025


[('prominent', 318),
 ('borderline', 211),
 ('enlarged', 107),
 ('<s>', 62),
 ('of', 62),
 ('normal', 38),
 ('enlarge', 31),
 ('the', 31),
 ('left', 18),
 ('right', 11)]

In [31]:
before_heart = findWordBefore(cardio_b0i1_bigram, 'heart')
print('heart :', cardio_b0i1_unigram['heart',])
print('word before heart :', sum(before_heart.values()))
list(before_heart.items())[:10]

heart : 2044
word before heart : 2044


[('prominent', 678),
 ('the', 406),
 ('<s>', 323),
 ('of', 102),
 ('normal', 44),
 ('noraml', 41),
 ('prosthetic', 41),
 ('left', 39),
 ('metallic', 36),
 ('normla', 26)]

In [32]:
before_cardiom = findWordBefore(cardio_b1i0_bigram, 'cardiomegaly')
print('cardiomegaly :', cardio_b1i0_unigram['cardiomegaly',])
print('word before cardiomegaly :', sum(before_cardiom.values()))
list(before_cardiom.items())[:10]

cardiomegaly : 32041
word before cardiomegaly : 32041


[('no', 29722),
 ('<s>', 1320),
 ('mild', 534),
 ('or', 198),
 ('of', 111),
 ('borderlined', 41),
 ('borderline', 39),
 ('significant', 25),
 ('definite', 10),
 ('marked', 6)]

# Pleural Effusion

## Prepare Data

In [33]:
# choose rows
pleural_df, pleural_b0i1_df, pleural_b1i0_df = selectData(df, 'Pleural Effusion')
pleural_df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20306927,"Chest PA upright;\n follow up film , compared ...",1,0,0,0,0,0,1,0,...,0,1,0,0.3424621841,0.2581134735,0.2577210474,0.190172492,0.4214429448,0.2302414994,0.8176410373
1,20308456,Chest;\n\n Minimal fibrosis at RUL .\n The l...,0,0,0,0,1,0,0,0,...,0,0,0,0.6295449147,0.08568040545,0.368485034,0.0263565687,0.0401865423,0.01557852071,0.01015650975
2,20309045,CHEST\n\nReticulonodular infiltration at RUL. ...,0,0,0,0,0,1,0,0,...,0,1,0,0.9400387571,0.2294832248,0.4388088888,0.0455660058,0.4155477142,0.03378489308,0.690142133
3,20309306,CHEST\n\nRt hydropneumothorax is seen.\nLeft l...,0,0,0,0,0,0,0,0,...,0,1,0,0.1334314838,0.07123852055,0.1448232296,0.06152678248,0.2577919648,0.104567395,0.1711762308
4,20313861,CXR PA upright\n\nPulmonary infiltration at RL...,0,0,1,0,0,0,0,0,...,0,1,0,0.8783668403,0.1155542338,0.2204621596,0.04706450117,0.4382139311,0.02227249072,0.4959128867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4557,25100320,CXR\n\nNo interval change of fibrosis and trac...,0,0,0,0,0,0,0,0,...,0,1,0,0.6749139683,0.2361295826,0.407994873,0.06792504001,0.4530317066,0.2317463295,0.4591987993
4558,25100471,CXR PA upright\n\nComparison to prior radiogra...,0,0,0,0,0,0,0,0,...,0,1,0,0.7881276103,0.2617268099,0.5536876881,0.05206789979,0.4764402244,0.1293609039,0.3912950119
4559,25100802,CXR PA\n\nCompared with previous study: 29/03/...,0,0,0,0,0,1,0,0,...,0,1,1,0.8443707129,0.5786528594,0.6166951418,0.05021092042,0.4110298796,0.009381104377,0.5988893739
4560,25103594,Chest CXR\n\nFibrotic and interstitial infiltr...,1,0,1,0,1,1,1,0,...,0,0,1,0.8953606978,0.117052136,0.5153906885,0.384864837,0.43920031,0.9686396125,0.1145209179


In [34]:
# create report list
pleural_b0i1_report = createReportList(pleural_b0i1_df)
pleural_b1i0_report = createReportList(pleural_b1i0_df)
pleural_b0i1_report[0]

Lenght : 3039
Lenght : 1523


['Chest PA upright;',
 ' follow up film , compared to film on 22/3/2005',
 ' No significant changed of rt.pleural effusion, opacity of RLL.',
 ' L.t, lung is clear.',
 ' remaining thickening of Rt.paratracheal soft tissue is also shown.',
 ' Heart is mild prominent in size. Tortuous thoracic aorta is seen.',
 'ds']

In [35]:
# define keyword to select sentence from report
pleural_keyword = r'pleural|costophrenic'

## BERT 0, Inspectra 1

Without keyword

In [36]:
pleural_b0i1_unigram = createPatternNgrams(pleural_b0i1_report, 1)

In [37]:
pleural_b0i1_bigram = createPatternNgrams(pleural_b0i1_report, 2)
list(pleural_b0i1_bigram.items())[:20]

[(('<s>', 'no'), 4014),
 (('change', 'of'), 3442),
 (('pleural', 'effusion'), 2759),
 (('effusion', '<\\s>'), 1840),
 (('no', 'change'), 1800),
 (('<s>', 'the'), 1799),
 (('right', 'pleural'), 1310),
 (('no', 'significant'), 1290),
 (('significant', 'change'), 1195),
 (('bony', 'thorax'), 1172),
 (('at', 'right'), 997),
 (('<\\s>', '<s>'), 982),
 (('noted', '<\\s>'), 974),
 (('of', 'right'), 939),
 (('thorax', 'is'), 925),
 (('seen', '<\\s>'), 921),
 (('left', 'pleural'), 907),
 (('is', 'seen'), 900),
 (('compared', 'with'), 885),
 (('intact', '<\\s>'), 881)]

In [38]:
pleural_b0i1_trigram = createPatternNgrams(pleural_b0i1_report, 3)
list(pleural_b0i1_trigram.items())[:20]

[(('no', 'change', 'of'), 1724),
 (('pleural', 'effusion', '<\\s>'), 1671),
 (('right', 'pleural', 'effusion'), 1252),
 (('no', 'significant', 'change'), 1191),
 (('significant', 'change', 'of'), 1131),
 (('<s>', 'no', 'change'), 1102),
 (('<s>', 'no', 'significant'), 959),
 (('bony', 'thorax', 'is'), 919),
 (('left', 'pleural', 'effusion'), 870),
 (('is', 'seen', '<\\s>'), 794),
 (('<s>', 'there', 'is'), 743),
 (('is', 'intact', '<\\s>'), 732),
 (('is', 'noted', '<\\s>'), 728),
 (('thorax', 'is', 'intact'), 698),
 (('compared', 'with', 'previous'), 672),
 (('<s>', 'bony', 'thorax'), 669),
 (('<s>', 'compared', 'with'), 642),
 (('of', 'right', 'pleural'), 621),
 (('<s>', 'no', 'cardiomegaly'), 604),
 (('change', 'of', 'right'), 569)]

In [39]:
pleural_b0i1_sixgram = createPatternNgrams(pleural_b0i1_report, 6)
list(pleural_b0i1_sixgram.items())[:20]

[(('<s>', 'bony', 'thorax', 'is', 'intact', '<\\s>'), 377),
 (('<s>', 'the', 'bony', 'thorax', 'is', 'intact'), 313),
 (('the', 'bony', 'thorax', 'is', 'intact', '<\\s>'), 313),
 (('<s>', 'the', 'heart', 'is', 'not', 'enlarged'), 287),
 (('the', 'heart', 'is', 'not', 'enlarged', '<\\s>'), 287),
 (('change', 'of', 'right', 'pleural', 'effusion', '<\\s>'), 270),
 (('<s>', 'compared', 'with', 'previous', 'chest', 'on'), 241),
 (('<s>', 'the', 'chest', 'shows', 'no', 'change'), 239),
 (('the', 'chest', 'shows', 'no', 'change', 'of'), 236),
 (('no', 'significant', 'change', 'of', 'right', 'pleural'), 203),
 (('<s>', 'bony', 'thorax', 'is', 'unremarkable', '<\\s>'), 201),
 (('significant', 'change', 'of', 'right', 'pleural', 'effusion'), 198),
 (('no', 'change', 'of', 'right', 'pleural', 'effusion'), 194),
 (('<s>', 'there', 'is', 'no', 'change', 'of'), 186),
 (('as', 'compared', 'to', 'prior', 'film', 'on'), 185),
 (('<s>', 'as', 'compared', 'to', 'prior', 'film'), 181),
 (('change', 'of', 

With keyword

In [40]:
pleural_b0i1_bigramk = createPatternNgrams(pleural_b0i1_report, 2, pleural_keyword)
list(pleural_b0i1_bigramk.items())[:20]

[(('pleural', 'effusion'), 2759),
 (('change', 'of'), 2148),
 (('<s>', 'no'), 1947),
 (('effusion', '<\\s>'), 1740),
 (('right', 'pleural'), 1310),
 (('no', 'change'), 1132),
 (('left', 'pleural'), 907),
 (('no', 'significant'), 813),
 (('significant', 'change'), 753),
 (('of', 'right'), 717),
 (('<\\s>', '<s>'), 657),
 (('costophrenic', 'angle'), 542),
 (('there', 'is'), 512),
 (('of', 'left'), 487),
 (('<s>', 'there'), 454),
 (('<s>', 'the'), 446),
 (('clear', '<\\s>'), 427),
 (('effusion', 'is'), 424),
 (('is', 'no'), 402),
 (('pleural', 'thickening'), 385)]

In [41]:
pleural_b0i1_trigramk = createPatternNgrams(pleural_b0i1_report, 3, pleural_keyword)
list(pleural_b0i1_trigramk.items())[:20]

[(('pleural', 'effusion', '<\\s>'), 1671),
 (('right', 'pleural', 'effusion'), 1252),
 (('no', 'change', 'of'), 1113),
 (('left', 'pleural', 'effusion'), 870),
 (('no', 'significant', 'change'), 750),
 (('significant', 'change', 'of'), 743),
 (('<s>', 'no', 'change'), 670),
 (('of', 'right', 'pleural'), 621),
 (('<s>', 'no', 'significant'), 603),
 (('change', 'of', 'right'), 533),
 (('<s>', 'there', 'is'), 435),
 (('pleural', 'effusion', 'is'), 415),
 (('there', 'is', 'no'), 393),
 (('of', 'left', 'pleural'), 378),
 (('change', 'of', 'left'), 325),
 (('left', 'costophrenic', 'angle'), 319),
 (('bilateral', 'pleural', 'effusion'), 299),
 (('costophrenic', 'angle', 'is'), 285),
 (('<s>', 'as', 'compared'), 271),
 (('is', 'clear', '<\\s>'), 247)]

In [42]:
pleural_b0i1_fourgramk = createPatternNgrams(pleural_b0i1_report, 4, pleural_keyword)
list(pleural_b0i1_fourgramk.items())[:20]

[(('right', 'pleural', 'effusion', '<\\s>'), 798),
 (('no', 'significant', 'change', 'of'), 743),
 (('<s>', 'no', 'change', 'of'), 657),
 (('of', 'right', 'pleural', 'effusion'), 598),
 (('<s>', 'no', 'significant', 'change'), 560),
 (('left', 'pleural', 'effusion', '<\\s>'), 543),
 (('change', 'of', 'right', 'pleural'), 510),
 (('of', 'left', 'pleural', 'effusion'), 360),
 (('<s>', 'there', 'is', 'no'), 320),
 (('change', 'of', 'left', 'pleural'), 301),
 (('<s>', 'as', 'compared', 'to'), 230),
 (('no', 'change', 'of', 'right'), 223),
 (('<\\s>', '<s>', 'there', 'is'), 223),
 (('significant', 'change', 'of', 'right'), 206),
 (('as', 'compared', 'to', 'prior'), 195),
 (('of', 'bilateral', 'pleural', 'effusion'), 190),
 (('change', 'of', 'bilateral', 'pleural'), 182),
 (('costophrenic', 'angle', 'is', 'clear'), 175),
 (('angle', 'is', 'clear', '<\\s>'), 175),
 (('left', 'costophrenic', 'angle', 'is'), 174)]

In [43]:
pleural_b0i1_fivegramk = createPatternNgrams(pleural_b0i1_report, 5, pleural_keyword)
list(pleural_b0i1_fivegramk.items())[:20]

[(('<s>', 'no', 'significant', 'change', 'of'), 556),
 (('change', 'of', 'right', 'pleural', 'effusion'), 489),
 (('of', 'right', 'pleural', 'effusion', '<\\s>'), 335),
 (('change', 'of', 'left', 'pleural', 'effusion'), 290),
 (('no', 'significant', 'change', 'of', 'right'), 206),
 (('no', 'change', 'of', 'right', 'pleural'), 204),
 (('significant', 'change', 'of', 'right', 'pleural'), 203),
 (('of', 'left', 'pleural', 'effusion', '<\\s>'), 196),
 (('<s>', 'as', 'compared', 'to', 'prior'), 191),
 (('costophrenic', 'angle', 'is', 'clear', '<\\s>'), 174),
 (('compared', 'to', 'prior', 'film', 'on'), 172),
 (('change', 'of', 'bilateral', 'pleural', 'effusion'), 171),
 (('as', 'compared', 'to', 'prior', 'film'), 170),
 (('<s>', 'the', 'chest', 'shows', 'no'), 165),
 (('pleural', 'effusion', 'is', 'seen', '<\\s>'), 162),
 (('there', 'is', 'no', 'change', 'of'), 150),
 (('the', 'chest', 'shows', 'no', 'change'), 149),
 (('chest', 'shows', 'no', 'change', 'of'), 148),
 (('<s>', 'there', 'is',

In [44]:
pleural_b0i1_sixgramk = createPatternNgrams(pleural_b0i1_report, 6, pleural_keyword)
list(pleural_b0i1_sixgramk.items())[:20]

[(('change', 'of', 'right', 'pleural', 'effusion', '<\\s>'), 270),
 (('no', 'significant', 'change', 'of', 'right', 'pleural'), 203),
 (('significant', 'change', 'of', 'right', 'pleural', 'effusion'), 198),
 (('no', 'change', 'of', 'right', 'pleural', 'effusion'), 194),
 (('as', 'compared', 'to', 'prior', 'film', 'on'), 170),
 (('<s>', 'as', 'compared', 'to', 'prior', 'film'), 166),
 (('change', 'of', 'left', 'pleural', 'effusion', '<\\s>'), 153),
 (('<s>', 'the', 'chest', 'shows', 'no', 'change'), 149),
 (('the', 'chest', 'shows', 'no', 'change', 'of'), 148),
 (('<s>', 'there', 'is', 'no', 'change', 'of'), 139),
 (('<s>', 'no', 'significant', 'change', 'of', 'right'), 135),
 (('<s>', 'no', 'change', 'of', 'right', 'pleural'), 127),
 (('no', 'change', 'of', 'left', 'pleural', 'effusion'), 127),
 (('left', 'costophrenic', 'angle', 'is', 'clear', '<\\s>'), 120),
 (('no', 'significant', 'change', 'of', 'left', 'pleural'), 112),
 (('significant', 'change', 'of', 'left', 'pleural', 'effusio

## BERT 1, Inspectra 0

Without keyword

In [45]:
pleural_b1i0_unigram = createPatternNgrams(pleural_b1i0_report, 1)

In [46]:
pleural_b1i0_bigram = createPatternNgrams(pleural_b1i0_report, 2)
list(pleural_b1i0_bigram.items())[:20]

[(('<s>', 'no'), 899),
 (('<s>', 'the'), 673),
 (('<s>', 'normal'), 604),
 (('<s>', 'both'), 479),
 (('both', 'costophrenic'), 463),
 (('<s>', 'bony'), 449),
 (('clear', '<\\s>'), 438),
 (('bony', 'thorax'), 406),
 (('intact', '<\\s>'), 396),
 (('<s>', 'unremarkable'), 372),
 (('<s>', 'there'), 364),
 (('noted', '<\\s>'), 355),
 (('unremarkable', '<\\s>'), 349),
 (('are', 'clear'), 342),
 (('there', 'is'), 335),
 (('fibrosis', 'at'), 330),
 (('is', 'intact'), 322),
 (('<\\s>', '<s>'), 322),
 (('pleural', 'effusion'), 317),
 (('<s>', 'findings'), 315)]

In [47]:
pleural_b1i0_trigram = createPatternNgrams(pleural_b1i0_report, 3)
list(pleural_b1i0_trigram.items())[:20]

[(('<s>', 'both', 'costophrenic'), 415),
 (('are', 'clear', '<\\s>'), 342),
 (('is', 'intact', '<\\s>'), 321),
 (('<s>', 'there', 'is'), 311),
 (('bony', 'thorax', 'is'), 304),
 (('<s>', 'findings', '<\\s>'), 291),
 (('thorax', 'is', 'intact'), 287),
 (('both', 'costophrenic', 'angles'), 271),
 (('is', 'noted', '<\\s>'), 269),
 (('costophrenic', 'angles', 'are'), 266),
 (('effusion', 'nor', 'pneumothorax'), 264),
 (('nor', 'pneumothorax', '<\\s>'), 262),
 (('<s>', 'impression', '<\\s>'), 256),
 (('angles', 'are', 'clear'), 249),
 (('<s>', 'heart', 'and'), 242),
 (('<s>', 'normal', 'heart'), 232),
 (('normal', 'heart', 'size'), 232),
 (('heart', 'size', '<\\s>'), 226),
 (('pulmonary', 'infiltration', '<\\s>'), 221),
 (('<s>', 'bony', 'thorax'), 213)]

In [48]:
pleural_b1i0_sixgram = createPatternNgrams(pleural_b1i0_report, 6)
list(pleural_b1i0_sixgram.items())[:20]

[(('both', 'costophrenic', 'angles', 'are', 'clear', '<\\s>'), 228),
 (('<s>', 'both', 'costophrenic', 'angles', 'are', 'clear'), 227),
 (('<s>', 'neither', 'plerual', 'effusion', 'nor', 'pneumothorax'), 191),
 (('neither', 'plerual', 'effusion', 'nor', 'pneumothorax', '<\\s>'), 191),
 (('<s>', 'bony', 'thorax', 'is', 'intact', '<\\s>'), 163),
 (('costophrenic', 'sulci', 'are', 'not', 'remarkable', '<\\s>'), 130),
 (('<s>', 'the', 'bony', 'thorax', 'is', 'intact'), 120),
 (('the', 'bony', 'thorax', 'is', 'intact', '<\\s>'), 120),
 (('<s>', 'both', 'costophrenic', 'sulci', 'are', 'not'), 106),
 (('both', 'costophrenic', 'sulci', 'are', 'not', 'remarkable'), 106),
 (('<s>', 'the', 'heart', 'is', 'not', 'enlarged'), 101),
 (('the', 'heart', 'is', 'not', 'enlarged', '<\\s>'), 101),
 (('<s>', 'neither', 'pleural', 'effusion', 'nor', 'pneumothorax'), 72),
 (('neither', 'pleural', 'effusion', 'nor', 'pneumothorax', '<\\s>'), 71),
 (('heart', 'and', 'great', 'vessels', 'are', 'unremarkable'), 

With keyword

In [49]:
pleural_b1i0_bigramk = createPatternNgrams(pleural_b1i0_report, 2, pleural_keyword)
list(pleural_b1i0_bigramk.items())[:20]

[(('both', 'costophrenic'), 463),
 (('<s>', 'both'), 440),
 (('clear', '<\\s>'), 340),
 (('pleural', 'effusion'), 317),
 (('are', 'clear'), 302),
 (('costophrenic', 'angles'), 283),
 (('angles', 'are'), 266),
 (('costophrenic', 'sulci'), 235),
 (('sulci', 'are'), 182),
 (('pleural', 'thickening'), 177),
 (('are', 'not'), 169),
 (('not', 'remarkable'), 169),
 (('remarkable', '<\\s>'), 169),
 (('right', 'pleural'), 158),
 (('costophrenic', 'angle'), 158),
 (('effusion', '<\\s>'), 136),
 (('<\\s>', '<s>'), 126),
 (('left', 'pleural'), 123),
 (('<s>', 'no'), 123),
 (('apical', 'pleural'), 97)]

In [50]:
pleural_b1i0_trigramk = createPatternNgrams(pleural_b1i0_report, 3, pleural_keyword)
list(pleural_b1i0_trigramk.items())[:20]

[(('<s>', 'both', 'costophrenic'), 415),
 (('are', 'clear', '<\\s>'), 302),
 (('both', 'costophrenic', 'angles'), 271),
 (('costophrenic', 'angles', 'are'), 266),
 (('angles', 'are', 'clear'), 235),
 (('both', 'costophrenic', 'sulci'), 185),
 (('costophrenic', 'sulci', 'are'), 182),
 (('not', 'remarkable', '<\\s>'), 169),
 (('are', 'not', 'remarkable'), 167),
 (('sulci', 'are', 'not'), 130),
 (('pleural', 'effusion', '<\\s>'), 130),
 (('pleural', 'thickening', '<\\s>'), 89),
 (('right', 'pleural', 'effusion'), 85),
 (('apical', 'pleural', 'thickening'), 83),
 (('left', 'costophrenic', 'angle'), 74),
 (('<s>', 'neither', 'pleural'), 72),
 (('neither', 'pleural', 'effusion'), 72),
 (('pleural', 'effusion', 'nor'), 72),
 (('effusion', 'nor', 'pneumothorax'), 72),
 (('nor', 'pneumothorax', '<\\s>'), 71)]

In [51]:
pleural_b1i0_fourgramk = createPatternNgrams(pleural_b1i0_report, 4, pleural_keyword)
list(pleural_b1i0_fourgramk.items())[:20]

[(('both', 'costophrenic', 'angles', 'are'), 255),
 (('costophrenic', 'angles', 'are', 'clear'), 235),
 (('angles', 'are', 'clear', '<\\s>'), 235),
 (('<s>', 'both', 'costophrenic', 'angles'), 231),
 (('<s>', 'both', 'costophrenic', 'sulci'), 181),
 (('are', 'not', 'remarkable', '<\\s>'), 167),
 (('both', 'costophrenic', 'sulci', 'are'), 138),
 (('costophrenic', 'sulci', 'are', 'not'), 130),
 (('sulci', 'are', 'not', 'remarkable'), 130),
 (('<s>', 'neither', 'pleural', 'effusion'), 72),
 (('neither', 'pleural', 'effusion', 'nor'), 72),
 (('pleural', 'effusion', 'nor', 'pneumothorax'), 72),
 (('effusion', 'nor', 'pneumothorax', '<\\s>'), 71),
 (('apical', 'pleural', 'thickening', '<\\s>'), 58),
 (('right', 'pleural', 'effusion', '<\\s>'), 54),
 (('costophrenic', 'sulci', 'are', 'clear'), 48),
 (('sulci', 'are', 'clear', '<\\s>'), 48),
 (('<s>', 'the', 'chest', 'shows'), 41),
 (('left', 'pleural', 'effusion', '<\\s>'), 39),
 (('diaphragm', 'are', 'not', 'remarkable'), 36)]

In [52]:
pleural_b1i0_fivegramk = createPatternNgrams(pleural_b1i0_report, 5, pleural_keyword)
list(pleural_b1i0_fivegramk.items())[:20]

[(('costophrenic', 'angles', 'are', 'clear', '<\\s>'), 235),
 (('<s>', 'both', 'costophrenic', 'angles', 'are'), 231),
 (('both', 'costophrenic', 'angles', 'are', 'clear'), 228),
 (('<s>', 'both', 'costophrenic', 'sulci', 'are'), 138),
 (('costophrenic', 'sulci', 'are', 'not', 'remarkable'), 130),
 (('sulci', 'are', 'not', 'remarkable', '<\\s>'), 130),
 (('both', 'costophrenic', 'sulci', 'are', 'not'), 106),
 (('<s>', 'neither', 'pleural', 'effusion', 'nor'), 72),
 (('neither', 'pleural', 'effusion', 'nor', 'pneumothorax'), 72),
 (('pleural', 'effusion', 'nor', 'pneumothorax', '<\\s>'), 71),
 (('costophrenic', 'sulci', 'are', 'clear', '<\\s>'), 48),
 (('diaphragm', 'are', 'not', 'remarkable', '<\\s>'), 36),
 (('and', 'diaphragm', 'are', 'not', 'remarkable'), 34),
 (('<s>', 'both', 'costophrenic', 'sulci', 'hili'), 31),
 (('both', 'costophrenic', 'sulci', 'hili', 'and'), 30),
 (('hili', 'and', 'diaphragm', 'are', 'not'), 30),
 (('costophrenic', 'sulci', 'hili', 'and', 'diaphragm'), 29),

In [53]:
pleural_b1i0_sixgramk = createPatternNgrams(pleural_b1i0_report, 6, pleural_keyword)
list(pleural_b1i0_sixgramk.items())[:20]

[(('both', 'costophrenic', 'angles', 'are', 'clear', '<\\s>'), 228),
 (('<s>', 'both', 'costophrenic', 'angles', 'are', 'clear'), 227),
 (('costophrenic', 'sulci', 'are', 'not', 'remarkable', '<\\s>'), 130),
 (('<s>', 'both', 'costophrenic', 'sulci', 'are', 'not'), 106),
 (('both', 'costophrenic', 'sulci', 'are', 'not', 'remarkable'), 106),
 (('<s>', 'neither', 'pleural', 'effusion', 'nor', 'pneumothorax'), 72),
 (('neither', 'pleural', 'effusion', 'nor', 'pneumothorax', '<\\s>'), 71),
 (('and', 'diaphragm', 'are', 'not', 'remarkable', '<\\s>'), 34),
 (('<s>', 'both', 'costophrenic', 'sulci', 'hili', 'and'), 30),
 (('hili', 'and', 'diaphragm', 'are', 'not', 'remarkable'), 30),
 (('both', 'costophrenic', 'sulci', 'hili', 'and', 'diaphragm'), 29),
 (('costophrenic', 'sulci', 'hili', 'and', 'diaphragm', 'are'), 29),
 (('sulci', 'hili', 'and', 'diaphragm', 'are', 'not'), 29),
 (('<s>', 'both', 'costophrenic', 'sulci', 'are', 'clear'), 28),
 (('both', 'costophrenic', 'sulci', 'are', 'clear'

## Find Word Before

In [54]:
before_pleural = findWordBefore(pleural_b0i1_bigram, 'pleural')
print('pleural :', pleural_b0i1_unigram['pleural',])
print('word before pleural :', sum(before_pleural.values()))
list(before_pleural.items())[:10]

pleural : 3323
word before pleural : 3323


[('right', 1310),
 ('left', 907),
 ('bilateral', 322),
 ('apical', 113),
 ('or', 99),
 ('of', 91),
 ('loculated', 62),
 ('and', 48),
 ('<s>', 25),
 ('be', 24)]

In [55]:
before_costo = findWordBefore(pleural_b0i1_bigram, 'costophrenic')
print('costophrenic :', pleural_b0i1_unigram['costophrenic',])
print('word before costophrenic :', sum(before_costo.values()))
list(before_costo.items())[:10]

costophrenic : 660
word before costophrenic : 660


[('left', 341),
 ('right', 230),
 ('both', 70),
 ('bilateral', 11),
 ('blunting', 3),
 ('lt', 2),
 ('and', 1),
 ('at', 1),
 ('is', 1)]

In [56]:
before_pleural = findWordBefore(pleural_b1i0_bigram, 'pleural')
print('pleural :', pleural_b1i0_unigram['pleural',])
print('word before pleural :', sum(before_pleural.values()))
list(before_pleural.items())[:10]

pleural : 754
word before pleural : 754


[('right', 158),
 ('left', 123),
 ('apical', 97),
 ('neither', 72),
 ('bilateral', 67),
 ('<s>', 25),
 ('no', 23),
 ('with', 20),
 ('of', 19),
 ('and', 15)]

In [57]:
before_costo = findWordBefore(pleural_b1i0_bigram, 'costophrenic')
print('costophrenic :', pleural_b1i0_unigram['costophrenic',])
print('word before costophrenic :', sum(before_costo.values()))
list(before_costo.items())[:10]

costophrenic : 696
word before costophrenic : 696


[('both', 463),
 ('left', 83),
 ('right', 83),
 ('<s>', 35),
 ('bilateral', 24),
 ('of', 2),
 ('oth', 1),
 ('and', 1),
 ('eflt', 1),
 ('blunt', 1)]

# Edema

## Prepare Data

In [58]:
# choose rows
edema_df, edema_b0i1_df, edema_b1i0_df = selectData(df, 'Edema')
edema_df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20306491,CXR\nClinical history: DM with CRF\n - No ...,0,1,0,0,0,0,0,0,...,0,0,0,0.07303039028,0.02831693732,0.05593879763,0.1569643002,0.02970892043,0.7074314112,0.006373987574
1,20367612,CXR (PA upright)\nHx: CRF\n\nRt.side double l...,1,1,0,0,0,0,1,0,...,0,0,0,0.06688237412,0.02683061355,0.08159709045,0.134561279,0.04077892724,0.7485450798,0.00708947854
2,20538714,Chest PA upright\n compared to film on 23/2/20...,1,1,0,0,1,0,1,0,...,0,1,0,0.8160845905,0.2388284012,0.4375885272,0.4242215093,0.3860107819,0.8392846532,0.8781929561
3,20567665,"Chest PA upright;\nknown case CRF, on dialysis...",1,1,0,0,0,0,1,0,...,0,0,1,0.3251284437,0.1103513617,0.695227569,0.383227961,0.2636212862,0.8826890272,0.03996030814
4,20573024,Chest (PA upriht view)\n Hisory of gouthy ...,1,1,0,0,0,0,1,0,...,0,0,0,0.4592965131,0.2299608036,0.1490851245,0.4304440733,0.09100974211,0.9092212189,0.05209171506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,24825847,"CXR PA upright\n\nHistory : recent NSTEMI,CHF...",1,1,0,0,0,0,1,0,...,0,0,0,0.1945962246,0.06639665845,0.1987265551,0.07629760857,0.06828867431,0.6542565385,0.009900805846
280,24880152,Chest PA upright\n Tracheostomy tube is seen ...,0,1,0,0,0,0,0,0,...,0,0,0,0.2708123262,0.1016328766,0.2261318131,0.05663187802,0.1682433776,0.05214255897,0.04047236025
281,24999286,CXR PA SUPINE\n\nFinding :\n Trachea is ...,0,1,0,0,0,0,0,0,...,0,0,0,0.3077559492,0.1225547196,0.3755704353,0.06682741663,0.6055474943,0.1771505959,0.1524861504
282,25013257,CXR PA\n\nCompared with previous study: None\n...,1,1,0,0,0,0,1,0,...,0,0,0,0.3339428041,0.07436111775,0.1817781537,0.3825516524,0.07831878347,0.9304516914,0.008723032663


In [59]:
# create report list
edema_b0i1_report = createReportList(edema_b0i1_df)
edema_b1i0_report = createReportList(edema_b1i0_df)
edema_b1i0_report[0]

Lenght : 0
Lenght : 284


['CXR',
 'Clinical history: DM with CRF',
 '     - No definite pulmonary infiltration.',
 '     - The heart and great vessels appear normal.',
 '     - Bilateral costophrenic angles are unremarkable.',
 '     - Bony thorax is intact.',
 'IMP: Normal chest']

In [60]:
# define keyword to select sentence from report
edema_keyword = r'pleural'

## BERT 0, Inspectra 1

No report

## BERT 1, Inspectra 0

Without keyword

In [74]:
edema_b1i0_unigram = createPatternNgrams(edema_b1i0_report, 1)

In [62]:
edema_b1i0_bigram = createPatternNgrams(edema_b1i0_report, 2)
list(edema_b1i0_bigram.items())[:20]

[(('<s>', 'no'), 289),
 (('pleural', 'effusion'), 161),
 (('seen', '<\\s>'), 136),
 (('is', 'seen'), 128),
 (('pulmonary', 'infiltration'), 118),
 (('bony', 'thorax'), 116),
 (('noted', '<\\s>'), 105),
 (('intact', '<\\s>'), 100),
 (('<s>', 'the'), 97),
 (('<s>', 'bony'), 96),
 (('is', 'noted'), 92),
 (('effusion', '<\\s>'), 87),
 (('<\\s>', '<s>'), 85),
 (('<s>', 'findings'), 85),
 (('<s>', 'history'), 81),
 (('thorax', 'is'), 78),
 (('<s>', 'cardiomegaly'), 78),
 (('there', 'is'), 70),
 (('tube', 'is'), 69),
 (('effusion', 'is'), 67)]

In [63]:
edema_b1i0_trigram = createPatternNgrams(edema_b1i0_report, 3)
list(edema_b1i0_trigram.items())[:20]

[(('is', 'seen', '<\\s>'), 108),
 (('is', 'noted', '<\\s>'), 84),
 (('pleural', 'effusion', '<\\s>'), 83),
 (('bony', 'thorax', 'is'), 78),
 (('<s>', 'bony', 'thorax'), 73),
 (('<s>', 'no', 'pleural'), 61),
 (('no', 'pleural', 'effusion'), 61),
 (('is', 'intact', '<\\s>'), 58),
 (('pleural', 'effusion', 'is'), 58),
 (('thorax', 'is', 'intact'), 55),
 (('costophrenic', 'angles', 'are'), 54),
 (('tracheostomy', 'tube', 'is'), 54),
 (('<s>', 'there', 'is'), 54),
 (('both', 'costophrenic', 'angles'), 51),
 (('no', 'definite', 'pulmonary'), 50),
 (('pulmonary', 'infiltration', '<\\s>'), 48),
 (('<s>', 'both', 'costophrenic'), 48),
 (('<s>', 'tracheostomy', 'tube'), 46),
 (('<s>', 'findings', '<\\s>'), 46),
 (('<s>', 'no', 'cardiomegaly'), 44)]

In [64]:
edema_b1i0_trigram = createPatternNgrams(edema_b1i0_report, 4)
list(edema_b1i0_trigram.items())[:20]

[(('<s>', 'no', 'pleural', 'effusion'), 59),
 (('<s>', 'bony', 'thorax', 'is'), 55),
 (('bony', 'thorax', 'is', 'intact'), 55),
 (('thorax', 'is', 'intact', '<\\s>'), 55),
 (('both', 'costophrenic', 'angles', 'are'), 48),
 (('no', 'definite', 'pulmonary', 'infiltration'), 43),
 (('<s>', 'tracheostomy', 'tube', 'is'), 41),
 (('angles', 'are', 'clear', '<\\s>'), 38),
 (('in', 'proper', 'position', '<\\s>'), 36),
 (('<s>', 'both', 'costophrenic', 'angles'), 36),
 (('no', 'pleural', 'effusion', '<\\s>'), 35),
 (('costophrenic', 'angles', 'are', 'clear'), 35),
 (('no', 'significant', 'change', 'of'), 33),
 (('effusion', 'is', 'seen', '<\\s>'), 33),
 (('<s>', 'no', 'definite', 'pulmonary'), 29),
 (('pleural', 'effusion', 'is', 'seen'), 29),
 (('tracheostomy', 'tube', 'is', 'in'), 29),
 (('vein', 'and', 'its', 'tip'), 28),
 (('and', 'its', 'tip', 'is'), 28),
 (('no', 'active', 'pulmonary', 'infiltration'), 27)]

With keyword

In [65]:
edema_b1i0_bigramk = createPatternNgrams(edema_b1i0_report, 2, edema_keyword)
list(edema_b1i0_bigramk.items())[:20]

[(('pleural', 'effusion'), 161),
 (('<s>', 'no'), 111),
 (('effusion', '<\\s>'), 85),
 (('no', 'pleural'), 63),
 (('effusion', 'is'), 61),
 (('bilateral', 'pleural'), 36),
 (('seen', '<\\s>'), 33),
 (('is', 'seen'), 31),
 (('or', 'pleural'), 24),
 (('infiltration', 'or'), 20),
 (('pulmonary', 'infiltration'), 18),
 (('change', 'of'), 17),
 (('of', 'bilateral'), 14),
 (('no', 'significant'), 13),
 (('<s>', 'bilateral'), 13),
 (('noted', '<\\s>'), 13),
 (('right', 'pleural'), 12),
 (('<\\s>', '<s>'), 12),
 (('significant', 'change'), 12),
 (('detected', '<\\s>'), 11)]

In [66]:
edema_b1i0_trigramk = createPatternNgrams(edema_b1i0_report, 3, edema_keyword)
list(edema_b1i0_trigramk.items())[:20]

[(('pleural', 'effusion', '<\\s>'), 83),
 (('<s>', 'no', 'pleural'), 61),
 (('no', 'pleural', 'effusion'), 61),
 (('pleural', 'effusion', 'is'), 58),
 (('bilateral', 'pleural', 'effusion'), 33),
 (('is', 'seen', '<\\s>'), 31),
 (('effusion', 'is', 'seen'), 30),
 (('or', 'pleural', 'effusion'), 22),
 (('infiltration', 'or', 'pleural'), 19),
 (('pulmonary', 'infiltration', 'or'), 18),
 (('<s>', 'bilateral', 'pleural'), 13),
 (('right', 'pleural', 'effusion'), 12),
 (('no', 'significant', 'change'), 12),
 (('significant', 'change', 'of'), 12),
 (('<s>', 'no', 'significant'), 10),
 (('<s>', 'no', 'definite'), 10),
 (('of', 'bilateral', 'pleural'), 10),
 (('of', 'pleural', 'effusion'), 9),
 (('is', 'detected', '<\\s>'), 8),
 (('no', 'definite', 'pulmonary'), 8)]

In [67]:
edema_b1i0_fourgramk = createPatternNgrams(edema_b1i0_report, 4, edema_keyword)
list(edema_b1i0_fourgramk.items())[:20]

[(('<s>', 'no', 'pleural', 'effusion'), 59),
 (('no', 'pleural', 'effusion', '<\\s>'), 35),
 (('effusion', 'is', 'seen', '<\\s>'), 30),
 (('pleural', 'effusion', 'is', 'seen'), 29),
 (('no', 'pleural', 'effusion', 'is'), 25),
 (('or', 'pleural', 'effusion', '<\\s>'), 20),
 (('infiltration', 'or', 'pleural', 'effusion'), 19),
 (('pulmonary', 'infiltration', 'or', 'pleural'), 18),
 (('<s>', 'bilateral', 'pleural', 'effusion'), 13),
 (('bilateral', 'pleural', 'effusion', '<\\s>'), 13),
 (('no', 'significant', 'change', 'of'), 12),
 (('bilateral', 'pleural', 'effusion', 'is'), 12),
 (('<s>', 'no', 'significant', 'change'), 9),
 (('of', 'bilateral', 'pleural', 'effusion'), 9),
 (('<s>', 'no', 'definite', 'pulmonary'), 8),
 (('no', 'definite', 'pulmonary', 'infiltration'), 8),
 (('definite', 'pulmonary', 'infiltration', 'or'), 8),
 (('right', 'pleural', 'effusion', 'is'), 7),
 (('<s>', 'no', 'evidence', 'of'), 7),
 (('no', 'evidence', 'of', 'pleural'), 7)]

In [68]:
edema_b1i0_fivegramk = createPatternNgrams(edema_b1i0_report, 5, edema_keyword)
list(edema_b1i0_fivegramk.items())[:20]

[(('<s>', 'no', 'pleural', 'effusion', '<\\s>'), 33),
 (('pleural', 'effusion', 'is', 'seen', '<\\s>'), 29),
 (('<s>', 'no', 'pleural', 'effusion', 'is'), 25),
 (('no', 'pleural', 'effusion', 'is', 'seen'), 18),
 (('pulmonary', 'infiltration', 'or', 'pleural', 'effusion'), 18),
 (('infiltration', 'or', 'pleural', 'effusion', '<\\s>'), 18),
 (('<s>', 'no', 'significant', 'change', 'of'), 9),
 (('<s>', 'no', 'definite', 'pulmonary', 'infiltration'), 8),
 (('no', 'definite', 'pulmonary', 'infiltration', 'or'), 8),
 (('definite', 'pulmonary', 'infiltration', 'or', 'pleural'), 8),
 (('<s>', 'no', 'evidence', 'of', 'pleural'), 7),
 (('no', 'evidence', 'of', 'pleural', 'effusion'), 7),
 (('pleural', 'effusion', 'is', 'detected', '<\\s>'), 7),
 (('pleural', 'effusion', 'is', 'noted', '<\\s>'), 7),
 (('of', 'bilateral', 'pleural', 'effusion', 'is'), 7),
 (('<s>', 'no', 'pulmonary', 'infiltration', 'or'), 6),
 (('no', 'pulmonary', 'infiltration', 'or', 'pleural'), 6),
 (('<s>', 'no', 'detectable

## Find Word Before

In [75]:
before_pleural = findWordBefore(edema_b1i0_bigram, 'pleural')
print('pleural :', edema_b1i0_unigram['pleural',])
print('word before pleural :', sum(before_pleural.values()))
list(before_pleural.items())[:10]

pleural : 178
word before pleural : 178


[('no', 63),
 ('bilateral', 36),
 ('or', 24),
 ('right', 12),
 ('of', 9),
 ('left', 7),
 ('detectable', 6),
 ('apical', 3),
 ('minimal', 3),
 ('definite', 2)]

# Inspectra Lung Opacity v1
Infiltration + Consolidation + Lung Opacity

## Prepare Data

In [76]:
# choose rows
v1_df, v1_b0i1_df, v1_b1i0_df = selectData(df, 'Inspectra Lung Opacity v1')
v1_df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20290676,CXR (PA upright)\n\nPatchy infiltration at RL...,1,0,0,0,0,0,1,0,...,0,0,0,0.6705767571,0.08962687799,0.3223274601,0.008262414654,0.07552811707,0.01909299623,0.007672864324
1,20292772,CXR (PA upright)\n\nReticulonodular infiltrati...,0,0,0,0,0,1,0,0,...,0,0,0,0.9478721681,0.1678770714,0.4644771763,0.08673519902,0.2267119501,0.05830180422,0.878199365
2,20294567,Chest:-\nPa chest study reveals fibronodular i...,0,0,0,0,0,1,0,0,...,0,0,0,0.8226948088,0.1517771742,0.6166349793,0.01350364634,0.235725454,0.00647519908,0.08067476336
3,20294712,CHEST :\nP.A. upright view .\nFibronodular inf...,0,0,0,0,0,1,0,0,...,0,0,0,0.6979679996,0.1366843406,0.6163465142,0.04663362481,0.08208811247,0.02999480347,0.01371037712
4,20294740,CHEST :\nP.A. upright view .\nFibronodular inf...,0,0,0,0,0,1,0,0,...,0,0,1,0.9419731872,0.4525913554,0.4320501349,0.05085499389,0.2342405043,0.007880797901,0.1427466416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76656,25106097,CXR\n\n \nUnchanged size of soft tissue densit...,0,0,0,1,0,1,0,0,...,1,0,1,0.6114081889,0.3188176683,0.2926796391,0.1410927658,0.4182457805,0.9265471478,0.3421297222
76657,25106159,CXR(PA)\n \n Reticulocalcified infiltration...,1,0,0,0,0,0,1,0,...,0,0,0,0.7535559355,0.0997864652,0.6272998,0.1504372268,0.2217993614,0.8989995311,0.06295422414
76658,25106503,CXR(PA)\n\n IMP: Retioculonodular infiltratio...,1,0,0,0,0,1,1,0,...,0,0,0,0.8797720788,0.2601079604,0.7077789726,0.1511556734,0.2263483271,0.700018838,0.1066745699
76659,25106622,CXR PA upright\n\n There is no definite pulmo...,0,0,0,0,0,0,0,0,...,0,0,0,0.04914811821,0.0117086733,0.1039028442,0.01578075513,0.07933877652,0.4247253044,0.007032924475


In [77]:
# create report list
v1_b0i1_report = createReportList(v1_b0i1_df)
v1_b1i0_report = createReportList(v1_b1i0_df)
v1_b0i1_report[0]

Lenght : 75554
Lenght : 1107


['CXR  (PA upright)',
 'Patchy infiltration at RLL, LUL and LLL. ',
 'Mild cardiomegaly. ',
 'No perihilar or mediastinal mass.',
 'No pleural effusion.',
 'Bony thorax is intact.']

In [78]:
infil_keyword = r'infil|densit|opaci|'
conso_keyword = 'consolidat|'
opaci_keyword = 'opaci|translu|air|mark|pattern|lung|reticul|scar|thicken|densit|patchy'
v1_keyword = infil_keyword + conso_keyword + opaci_keyword

## BERT 0, Inspectra 1

Without keyword

In [79]:
v1_b0i1_unigram = createPatternNgrams(v1_b0i1_report, 1)

In [80]:
v1_b0i1_bigram = createPatternNgrams(v1_b0i1_report, 2)
list(v1_b0i1_bigram.items())[:20]

[(('<s>', 'no'), 77218),
 (('<s>', 'the'), 50107),
 (('bony', 'thorax'), 38975),
 (('pulmonary', 'infiltration'), 35747),
 (('clear', '<\\s>'), 35669),
 (('intact', '<\\s>'), 33675),
 (('both', 'costophrenic'), 33162),
 (('thorax', 'is'), 32734),
 (('are', 'clear'), 31922),
 (('noted', '<\\s>'), 30789),
 (('pleural', 'effusion'), 30554),
 (('change', 'of'), 30318),
 (('is', 'intact'), 30193),
 (('<s>', 'both'), 30107),
 (('costophrenic', 'angles'), 29993),
 (('angles', 'are'), 29594),
 (('is', 'seen'), 29065),
 (('seen', '<\\s>'), 27704),
 (('infiltration', 'at'), 27013),
 (('is', 'noted'), 26912)]

In [81]:
v1_b0i1_trigram = createPatternNgrams(v1_b0i1_report, 3)
list(v1_b0i1_trigram.items())[:20]

[(('bony', 'thorax', 'is'), 32687),
 (('are', 'clear', '<\\s>'), 31867),
 (('is', 'intact', '<\\s>'), 30185),
 (('costophrenic', 'angles', 'are'), 29375),
 (('thorax', 'is', 'intact'), 29316),
 (('both', 'costophrenic', 'angles'), 27975),
 (('<s>', 'both', 'costophrenic'), 26840),
 (('is', 'seen', '<\\s>'), 24821),
 (('<s>', 'there', 'is'), 24427),
 (('is', 'noted', '<\\s>'), 23952),
 (('angles', 'are', 'clear'), 23562),
 (('there', 'is', 'no'), 18362),
 (('<s>', 'no', 'cardiomegaly'), 18145),
 (('<s>', 'the', 'bony'), 17654),
 (('pleural', 'effusion', '<\\s>'), 17399),
 (('the', 'bony', 'thorax'), 17345),
 (('no', 'pleural', 'effusion'), 17257),
 (('<s>', 'bony', 'thorax'), 16456),
 (('<s>', 'no', 'pleural'), 16177),
 (('pulmonary', 'infiltration', 'or'), 16083)]

In [82]:
v1_b0i1_fourgram = createPatternNgrams(v1_b0i1_report, 4)
list(v1_b0i1_fourgram.items())[:20]

[(('thorax', 'is', 'intact', '<\\s>'), 29309),
 (('bony', 'thorax', 'is', 'intact'), 29301),
 (('both', 'costophrenic', 'angles', 'are'), 27485),
 (('angles', 'are', 'clear', '<\\s>'), 23529),
 (('costophrenic', 'angles', 'are', 'clear'), 23355),
 (('<s>', 'both', 'costophrenic', 'angles'), 21913),
 (('<s>', 'there', 'is', 'no'), 17426),
 (('<s>', 'the', 'bony', 'thorax'), 17344),
 (('the', 'bony', 'thorax', 'is'), 17307),
 (('<s>', 'no', 'pleural', 'effusion'), 16090),
 (('pulmonary', 'infiltration', 'or', 'nodule'), 15504),
 (('<s>', 'bony', 'thorax', 'is'), 15207),
 (('no', 'definite', 'pulmonary', 'infiltration'), 13797),
 (('<s>', 'cardiac', 'shadow', 'is'), 13731),
 (('is', 'normal', 'size', '<\\s>'), 13512),
 (('there', 'is', 'no', 'definite'), 12961),
 (('infiltration', 'or', 'nodule', '<\\s>'), 12857),
 (('definite', 'pulmonary', 'infiltration', 'or'), 12839),
 (('no', 'active', 'pulmonary', 'infiltration'), 12758),
 (('cardiac', 'shadow', 'is', 'normal'), 12731)]

With keyword

In [83]:
v1_b0i1_bigramk = createPatternNgrams(v1_b0i1_report, 2, v1_keyword)
list(v1_b0i1_bigramk.items())[:20]

[(('pulmonary', 'infiltration'), 35747),
 (('infiltration', 'at'), 27013),
 (('change', 'of'), 23599),
 (('<s>', 'no'), 23553),
 (('there', 'is'), 21300),
 (('<s>', 'there'), 20648),
 (('infiltration', '<\\s>'), 17335),
 (('is', 'no'), 16728),
 (('infiltration', 'or'), 16511),
 (('active', 'pulmonary'), 15710),
 (('or', 'nodule'), 15695),
 (('no', 'definite'), 15206),
 (('at', 'right'), 14696),
 (('at', 'both'), 14379),
 (('definite', 'pulmonary'), 14051),
 (('nodule', '<\\s>'), 13535),
 (('<s>', 'imp'), 13485),
 (('imp', 'no'), 12981),
 (('no', 'active'), 12975),
 (('<\\s>', '<s>'), 12631)]

In [84]:
v1_b0i1_trigramk = createPatternNgrams(v1_b0i1_report, 3, v1_keyword)
list(v1_b0i1_trigramk.items())[:20]

[(('<s>', 'there', 'is'), 19887),
 (('there', 'is', 'no'), 16453),
 (('pulmonary', 'infiltration', 'or'), 16083),
 (('infiltration', 'or', 'nodule'), 15662),
 (('active', 'pulmonary', 'infiltration'), 15603),
 (('pulmonary', 'infiltration', '<\\s>'), 15138),
 (('no', 'definite', 'pulmonary'), 14038),
 (('definite', 'pulmonary', 'infiltration'), 13809),
 (('<s>', 'imp', 'no'), 12981),
 (('is', 'no', 'definite'), 12951),
 (('or', 'nodule', '<\\s>'), 12872),
 (('no', 'active', 'pulmonary'), 12778),
 (('no', 'change', 'of'), 11477),
 (('imp', 'no', 'active'), 11170),
 (('no', 'significant', 'change'), 10277),
 (('significant', 'change', 'of'), 9563),
 (('is', 'noted', '<\\s>'), 8836),
 (('reticulonodular', 'infiltration', 'at'), 8115),
 (('<s>', 'no', 'significant'), 7921),
 (('infiltration', 'at', 'both'), 7332)]

In [85]:
v1_b0i1_fourgramk = createPatternNgrams(v1_b0i1_report, 4, v1_keyword)
list(v1_b0i1_fourgramk.items())[:20]

[(('<s>', 'there', 'is', 'no'), 15580),
 (('pulmonary', 'infiltration', 'or', 'nodule'), 15504),
 (('no', 'definite', 'pulmonary', 'infiltration'), 13797),
 (('there', 'is', 'no', 'definite'), 12947),
 (('infiltration', 'or', 'nodule', '<\\s>'), 12857),
 (('definite', 'pulmonary', 'infiltration', 'or'), 12839),
 (('no', 'active', 'pulmonary', 'infiltration'), 12758),
 (('is', 'no', 'definite', 'pulmonary'), 12488),
 (('active', 'pulmonary', 'infiltration', '<\\s>'), 12322),
 (('<s>', 'imp', 'no', 'active'), 11170),
 (('imp', 'no', 'active', 'pulmonary'), 11163),
 (('no', 'significant', 'change', 'of'), 9540),
 (('<s>', 'no', 'significant', 'change'), 7444),
 (('<s>', 'no', 'change', 'of'), 5739),
 (('<s>', 'the', 'chest', 'shows'), 5121),
 (('are', 'not', 'remarkable', '<\\s>'), 4384),
 (('the', 'chest', 'shows', 'no'), 3724),
 (('<s>', 'both', 'costophrenic', 'sulci'), 3634),
 (('thorax', 'is', 'unremarkable', '<\\s>'), 3266),
 (('bony', 'thorax', 'is', 'unremarkable'), 3265)]

In [86]:
v1_b0i1_fivegramk = createPatternNgrams(v1_b0i1_report, 5, v1_keyword)
list(v1_b0i1_fivegramk.items())[:20]

[(('pulmonary', 'infiltration', 'or', 'nodule', '<\\s>'), 12836),
 (('no', 'definite', 'pulmonary', 'infiltration', 'or'), 12836),
 (('definite', 'pulmonary', 'infiltration', 'or', 'nodule'), 12712),
 (('<s>', 'there', 'is', 'no', 'definite'), 12567),
 (('there', 'is', 'no', 'definite', 'pulmonary'), 12487),
 (('is', 'no', 'definite', 'pulmonary', 'infiltration'), 12472),
 (('no', 'active', 'pulmonary', 'infiltration', '<\\s>'), 11631),
 (('<s>', 'imp', 'no', 'active', 'pulmonary'), 11163),
 (('imp', 'no', 'active', 'pulmonary', 'infiltration'), 11161),
 (('<s>', 'no', 'significant', 'change', 'of'), 7170),
 (('<s>', 'the', 'chest', 'shows', 'no'), 3723),
 (('bony', 'thorax', 'is', 'unremarkable', '<\\s>'), 3265),
 (('sulci', 'are', 'not', 'remarkable', '<\\s>'), 3174),
 (('costophrenic', 'sulci', 'are', 'not', 'remarkable'), 3167),
 (('the', 'chest', 'shows', 'no', 'change'), 3005),
 (('chest', 'shows', 'no', 'change', 'of'), 2967),
 (('<s>', 'both', 'costophrenic', 'sulci', 'are'), 2

In [87]:
v1_b0i1_sixgramk = createPatternNgrams(v1_b0i1_report, 6, v1_keyword)
list(v1_b0i1_sixgramk.items())[:20]

[(('no', 'definite', 'pulmonary', 'infiltration', 'or', 'nodule'), 12710),
 (('definite', 'pulmonary', 'infiltration', 'or', 'nodule', '<\\s>'), 12671),
 (('there', 'is', 'no', 'definite', 'pulmonary', 'infiltration'), 12471),
 (('<s>', 'there', 'is', 'no', 'definite', 'pulmonary'), 12401),
 (('is', 'no', 'definite', 'pulmonary', 'infiltration', 'or'), 12349),
 (('<s>', 'imp', 'no', 'active', 'pulmonary', 'infiltration'), 11161),
 (('imp', 'no', 'active', 'pulmonary', 'infiltration', '<\\s>'), 11161),
 (('costophrenic', 'sulci', 'are', 'not', 'remarkable', '<\\s>'), 3165),
 (('<s>', 'the', 'chest', 'shows', 'no', 'change'), 3004),
 (('the', 'chest', 'shows', 'no', 'change', 'of'), 2967),
 (('<s>', 'bony', 'thorax', 'is', 'unremarkable', '<\\s>'), 2399),
 (('<s>', 'both', 'costophrenic', 'sulci', 'are', 'not'), 2295),
 (('both', 'costophrenic', 'sulci', 'are', 'not', 'remarkable'), 2295),
 (('infiltration', 'or', 'nodule', 'is', 'noted', '<\\s>'), 2251),
 (('pulmonary', 'infiltration', 

## BERT 1, Inspectra 0

Without keyword

In [88]:
v1_b1i0_unigram = createPatternNgrams(v1_b1i0_report, 1)

In [89]:
v1_b1i0_bigram = createPatternNgrams(v1_b1i0_report, 2)
list(v1_b1i0_bigram.items())[:20]

[(('<s>', 'no'), 722),
 (('pleural', 'effusion'), 361),
 (('<\\s>', '<s>'), 342),
 (('bony', 'thorax'), 337),
 (('intact', '<\\s>'), 336),
 (('seen', '<\\s>'), 334),
 (('<s>', 'the'), 330),
 (('is', 'seen'), 310),
 (('clear', '<\\s>'), 296),
 (('noted', '<\\s>'), 294),
 (('<s>', 'bony'), 268),
 (('thorax', 'is'), 263),
 (('is', 'noted'), 260),
 (('<s>', 'both'), 252),
 (('is', 'intact'), 252),
 (('both', 'costophrenic'), 244),
 (('effusion', '<\\s>'), 231),
 (('<s>', 'normal'), 228),
 (('infiltration', 'at'), 207),
 (('<s>', 'mild'), 202)]

In [90]:
v1_b1i0_trigram = createPatternNgrams(v1_b1i0_report, 3)
list(v1_b1i0_trigram.items())[:20]

[(('is', 'seen', '<\\s>'), 268),
 (('bony', 'thorax', 'is'), 262),
 (('is', 'intact', '<\\s>'), 252),
 (('thorax', 'is', 'intact'), 237),
 (('is', 'noted', '<\\s>'), 226),
 (('<s>', 'bony', 'thorax'), 203),
 (('pleural', 'effusion', '<\\s>'), 199),
 (('<s>', 'both', 'costophrenic'), 197),
 (('are', 'clear', '<\\s>'), 185),
 (('<s>', 'no', 'cardiomegaly'), 156),
 (('both', 'costophrenic', 'angles'), 149),
 (('<s>', 'there', 'is'), 145),
 (('costophrenic', 'angles', 'are'), 143),
 (('not', 'enlarged', '<\\s>'), 137),
 (('is', 'not', 'enlarged'), 136),
 (('heart', 'is', 'not'), 131),
 (('<s>', 'compared', 'to'), 120),
 (('<s>', 'heart', 'is'), 119),
 (('no', 'pleural', 'effusion'), 118),
 (('angles', 'are', 'clear'), 118)]

In [91]:
v1_b1i0_fourgram = createPatternNgrams(v1_b1i0_report, 4)
list(v1_b1i0_fourgram.items())[:20]

[(('bony', 'thorax', 'is', 'intact'), 237),
 (('thorax', 'is', 'intact', '<\\s>'), 237),
 (('<s>', 'bony', 'thorax', 'is'), 189),
 (('is', 'not', 'enlarged', '<\\s>'), 136),
 (('both', 'costophrenic', 'angles', 'are'), 130),
 (('heart', 'is', 'not', 'enlarged'), 125),
 (('angles', 'are', 'clear', '<\\s>'), 118),
 (('costophrenic', 'angles', 'are', 'clear'), 112),
 (('<s>', 'no', 'pleural', 'effusion'), 111),
 (('compared', 'to', 'film', 'on'), 109),
 (('<s>', 'both', 'costophrenic', 'angles'), 107),
 (('<s>', 'compared', 'to', 'film'), 104),
 (('<s>', 'heart', 'is', 'not'), 91),
 (('<s>', 'both', 'costophrenic', 'sulci'), 87),
 (('<s>', 'no', 'cardiomegaly', '<\\s>'), 85),
 (('<s>', 'normal', 'cardiothoracic', 'ratio'), 84),
 (('are', 'not', 'remarkable', '<\\s>'), 83),
 (('cardiomegaly', 'is', 'noted', '<\\s>'), 77),
 (('normal', 'cardiothoracic', 'ratio', '<\\s>'), 77),
 (('no', 'pleural', 'effusion', '<\\s>'), 74)]

With keyword

In [92]:
v1_b1i0_bigramk = createPatternNgrams(v1_b1i0_report, 2, v1_keyword)
list(v1_b1i0_bigramk.items())[:20]

[(('<s>', 'no'), 239),
 (('<\\s>', '<s>'), 219),
 (('infiltration', 'at'), 207),
 (('at', 'both'), 154),
 (('is', 'seen'), 131),
 (('<s>', 'the'), 129),
 (('pulmonary', 'infiltration'), 129),
 (('seen', '<\\s>'), 124),
 (('clear', '<\\s>'), 124),
 (('<s>', 'both'), 122),
 (('unremarkable', '<\\s>'), 108),
 (('<s>', 'there'), 106),
 (('both', 'lungs'), 103),
 (('costophrenic', 'sulci'), 102),
 (('there', 'is'), 100),
 (('left', 'lung'), 99),
 (('interstitial', 'infiltration'), 95),
 (('both', 'costophrenic'), 95),
 (('at', 'right'), 94),
 (('infiltration', '<\\s>'), 94)]

In [93]:
v1_b1i0_trigramk = createPatternNgrams(v1_b1i0_report, 3, v1_keyword)
list(v1_b1i0_trigramk.items())[:20]

[(('is', 'seen', '<\\s>'), 98),
 (('<s>', 'there', 'is'), 95),
 (('<s>', 'both', 'costophrenic'), 85),
 (('both', 'costophrenic', 'sulci'), 84),
 (('not', 'remarkable', '<\\s>'), 84),
 (('are', 'not', 'remarkable'), 83),
 (('costophrenic', 'sulci', 'are'), 74),
 (('is', 'clear', '<\\s>'), 71),
 (('is', 'noted', '<\\s>'), 67),
 (('interstitial', 'infiltration', 'at'), 62),
 (('active', 'pulmonary', 'infiltration'), 60),
 (('lung', 'is', 'clear'), 59),
 (('pulmonary', 'infiltration', '<\\s>'), 57),
 (('sulci', 'are', 'not'), 56),
 (('<s>', 'no', 'new'), 55),
 (('infiltration', 'at', 'both'), 53),
 (('no', 'new', 'lung'), 49),
 (('new', 'lung', 'lesion'), 48),
 (('<s>', 'no', 'active'), 47),
 (('both', 'lungs', '<\\s>'), 47)]

In [94]:
v1_b1i0_fourgramk = createPatternNgrams(v1_b1i0_report, 4, v1_keyword)
list(v1_b1i0_fourgramk.items())[:20]

[(('<s>', 'both', 'costophrenic', 'sulci'), 83),
 (('are', 'not', 'remarkable', '<\\s>'), 83),
 (('both', 'costophrenic', 'sulci', 'are'), 60),
 (('lung', 'is', 'clear', '<\\s>'), 57),
 (('costophrenic', 'sulci', 'are', 'not'), 56),
 (('sulci', 'are', 'not', 'remarkable'), 56),
 (('<s>', 'no', 'new', 'lung'), 49),
 (('no', 'new', 'lung', 'lesion'), 48),
 (('new', 'lung', 'lesion', 'is'), 44),
 (('no', 'active', 'pulmonary', 'infiltration'), 43),
 (('<s>', 'there', 'is', 'no'), 35),
 (('<s>', 'no', 'active', 'pulmonary'), 34),
 (('active', 'pulmonary', 'infiltration', '<\\s>'), 31),
 (('<\\s>', '<s>', 'there', 'is'), 31),
 (('<s>', 'the', 'chest', 'shows'), 30),
 (('left', 'lung', 'is', 'clear'), 29),
 (('no', 'definite', 'pulmonary', 'infiltration'), 28),
 (('interstitial', 'infiltration', 'at', 'both'), 26),
 (('at', 'both', 'lower', 'lungs'), 26),
 (('of', 'lung', 'markings', 'at'), 25)]

In [95]:
v1_b1i0_fivegramk = createPatternNgrams(v1_b1i0_report, 5, v1_keyword)
list(v1_b1i0_fivegramk.items())[:20]

[(('<s>', 'both', 'costophrenic', 'sulci', 'are'), 60),
 (('costophrenic', 'sulci', 'are', 'not', 'remarkable'), 56),
 (('sulci', 'are', 'not', 'remarkable', '<\\s>'), 56),
 (('<s>', 'no', 'new', 'lung', 'lesion'), 48),
 (('no', 'new', 'lung', 'lesion', 'is'), 44),
 (('both', 'costophrenic', 'sulci', 'are', 'not'), 42),
 (('<s>', 'no', 'active', 'pulmonary', 'infiltration'), 33),
 (('left', 'lung', 'is', 'clear', '<\\s>'), 27),
 (('bony', 'thorax', 'is', 'unremarkable', '<\\s>'), 24),
 (('and', 'diaphragm', 'are', 'not', 'remarkable'), 23),
 (('diaphragm', 'are', 'not', 'remarkable', '<\\s>'), 23),
 (('new', 'lung', 'lesion', 'is', 'seen'), 22),
 (('lung', 'lesion', 'is', 'seen', '<\\s>'), 22),
 (('<s>', 'bony', 'thorax', 'is', 'unremarkable'), 22),
 (('no', 'active', 'pulmonary', 'infiltration', '<\\s>'), 21),
 (('<s>', 'no', 'definite', 'pulmonary', 'infiltration'), 21),
 (('<s>', 'both', 'costophrenic', 'sulci', 'hili'), 21),
 (('both', 'costophrenic', 'sulci', 'hili', 'and'), 20),


## Find Word Before

In [96]:
before_infiltration = findWordBefore(v1_b0i1_bigram, 'infiltration')
print('infiltration :', v1_b0i1_unigram['infiltration',])
print('word before infiltration :', sum(before_infiltration.values()))
list(before_infiltration.items())[:10]

infiltration : 75096
word before infiltration : 75096


[('pulmonary', 35747),
 ('reticulonodular', 11375),
 ('interstitial', 6340),
 ('fibronodular', 3414),
 ('fibrotic', 3364),
 ('nodular', 1921),
 ('fibrocalcific', 1575),
 ('patchy', 1312),
 ('fibrointerstitial', 893),
 ('reticular', 877)]

In [97]:
before_infiltrate = findWordBefore(v1_b0i1_bigram, 'infiltrate')
print('infiltrate :', v1_b0i1_unigram['infiltrate',])
print('word before infiltrate :', sum(before_infiltrate.values()))
list(before_infiltrate.items())[:10]

infiltrate : 1353
word before infiltrate : 1353


[('reticulonodular', 928),
 ('nodular', 71),
 ('fibrocalcific', 68),
 ('fibronodular', 42),
 ('reticular', 36),
 ('<s>', 31),
 ('interstitial', 30),
 ('with', 16),
 ('reticulnodular', 15),
 ('minimal', 13)]

In [98]:
before_infiltration = findWordBefore(v1_b1i0_bigram, 'infiltration')
print('infiltration :', v1_b1i0_unigram['infiltration',])
print('word before infiltration :', sum(before_infiltration.values()))
list(before_infiltration.items())[:10]

infiltration : 513
word before infiltration : 513


[('pulmonary', 129),
 ('interstitial', 95),
 ('patchy', 68),
 ('rll', 17),
 ('fibrotic', 16),
 ('rul', 16),
 ('reticulonodular', 11),
 ('no', 10),
 ('active', 10),
 ('of', 9)]

# Atelectasis

## Prepare Data

In [99]:
# choose rows
atelec_df, atelec_b0i1_df, atelec_b1i0_df = selectData(df, 'Atelectasis')
atelec_df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20306056,CXR\n Suboptimal inspiration is noted.\n Inc...,-1,0,0,1,0,0,-1,0,...,0,0,0,0.1212930209,0.06004886437,0.06782658038,0.3911514027,0.1170704572,0.7822361718,0.007802768752
1,20344856,Chest film\nReticulonodular infiltration both ...,0,0,0,1,0,1,0,0,...,0,0,0,0.2955181881,0.2323869381,0.8824456146,0.006998682049,0.06093342448,0.009307021363,0.06180102524
2,20353706,CXR PA upright\nFibropathy and atelectasis at ...,0,0,0,0,0,1,0,0,...,1,0,0,0.9336188886,0.1687630824,0.5583767301,0.03142648212,0.7349457878,0.05296808628,0.1119734273
3,20359793,"CXR\n Compare with film on 6/5/05, less expan...",0,0,0,0,0,0,0,0,...,1,0,0,0.6829327517,0.1301421545,0.2440125346,0.388215461,0.4157071961,0.2981577734,0.06305989021
4,20361595,CXR\n\nRUL ateletasis.\nFibrocalcific infiltra...,1,0,1,1,0,0,1,0,...,0,0,0,0.5468756785,0.2285589693,0.3596826504,0.07667310766,0.8331698199,0.6703110048,0.1585522988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2165,25094580,CXR PA UPRIGHT\nFINDINGS:\nIncrease in amount ...,-1,0,0,1,1,0,-1,0,...,0,1,0,0.7094685716,0.2294285033,0.2662446658,0.1714694755,0.4353038826,0.02299523286,0.6082483764
2166,25094828,CXR PA UPRIGHT\n\nFINDINGS:\nFurther increase ...,-1,0,0,1,1,0,-1,0,...,0,1,0,0.758041559,0.2478894476,0.26763224,0.1229577586,0.4709005516,0.02349641865,0.4537408123
2167,25097701,"CHEST FILM\n\nCompare to CXR on 22/2/2018, no ...",1,0,0,0,1,0,1,0,...,1,1,0,0.895257505,0.227974605,0.6194419396,0.383797026,0.2936593018,0.817986079,0.4619188675
2168,25100972,CXR PA\n\nCompared study: 18/04/2018\nSuspicio...,0,0,0,1,1,1,0,0,...,0,1,1,0.3408991231,0.3123909012,0.8802979532,0.05109197491,0.2459659047,0.03043722218,0.5723631177


In [100]:
# create report list
atelec_b0i1_report = createReportList(atelec_b0i1_df)
atelec_b1i0_report = createReportList(atelec_b1i0_df)
atelec_b0i1_report[0]

Lenght : 810
Lenght : 1360


['CXR PA upright',
 'Fibropathy and atelectasis at RUL',
 'Nodular infiltration LUL',
 'Normal cardiothoracic ratio',
 'Bony thorax is intact',
 'Both costophrenic sulci are not remarkable']

In [101]:
atelec_keyword = r'atelec|collap|volume'

## BERT 0, Inspectra 1

Without keyword

In [102]:
atelec_b0i1_unigram = createPatternNgrams(atelec_b0i1_report, 1)

In [103]:
atelec_b0i1_bigram = createPatternNgrams(atelec_b0i1_report, 2)
list(atelec_b0i1_bigram.items())[:20]

[(('<s>', 'no'), 1231),
 (('change', 'of'), 746),
 (('no', 'change'), 550),
 (('atelectasis', 'at'), 489),
 (('<s>', 'the'), 385),
 (('plate', 'atelectasis'), 369),
 (('bony', 'thorax'), 345),
 (('both', 'costophrenic'), 342),
 (('<s>', 'both'), 332),
 (('intact', '<\\s>'), 304),
 (('thorax', 'is'), 295),
 (('is', 'intact'), 292),
 (('is', 'noted'), 287),
 (('noted', '<\\s>'), 287),
 (('<s>', 'normal'), 282),
 (('pulmonary', 'infiltration'), 274),
 (('active', 'pulmonary'), 260),
 (('at', 'right'), 258),
 (('pleural', 'effusion'), 245),
 (('at', 'left'), 232)]

In [104]:
atelec_b0i1_trigram = createPatternNgrams(atelec_b0i1_report, 3)
list(atelec_b0i1_trigram.items())[:20]

[(('no', 'change', 'of'), 536),
 (('<s>', 'no', 'change'), 378),
 (('plate', 'atelectasis', 'at'), 315),
 (('<s>', 'both', 'costophrenic'), 309),
 (('bony', 'thorax', 'is'), 293),
 (('is', 'intact', '<\\s>'), 292),
 (('thorax', 'is', 'intact'), 277),
 (('is', 'noted', '<\\s>'), 241),
 (('normal', 'cardiothoracic', 'ratio'), 221),
 (('<s>', 'normal', 'cardiothoracic'), 217),
 (('not', 'remarkable', '<\\s>'), 208),
 (('cardiothoracic', 'ratio', '<\\s>'), 205),
 (('are', 'not', 'remarkable'), 205),
 (('active', 'pulmonary', 'infiltration'), 204),
 (('both', 'costophrenic', 'sulci'), 201),
 (('<s>', 'bony', 'thorax'), 198),
 (('is', 'seen', '<\\s>'), 190),
 (('of', 'plate', 'atelectasis'), 184),
 (('are', 'clear', '<\\s>'), 166),
 (('no', 'significant', 'change'), 164)]

With keyword

In [105]:
atelec_b0i1_bigramk = createPatternNgrams(atelec_b0i1_report, 2, atelec_keyword)
list(atelec_b0i1_bigramk.items())[:20]

[(('change', 'of'), 508),
 (('atelectasis', 'at'), 489),
 (('<s>', 'no'), 476),
 (('no', 'change'), 392),
 (('plate', 'atelectasis'), 369),
 (('of', 'plate'), 200),
 (('at', 'left'), 171),
 (('lung', '<\\s>'), 138),
 (('at', 'right'), 131),
 (('at', 'lll'), 128),
 (('no', 'significant'), 124),
 (('changed', 'of'), 106),
 (('<s>', 'plate'), 105),
 (('significant', 'change'), 102),
 (('of', 'atelectasis'), 95),
 (('no', 'changed'), 86),
 (('lll', '<\\s>'), 83),
 (('noted', '<\\s>'), 81),
 (('lower', 'lung'), 80),
 (('and', 'atelectasis'), 77)]

In [106]:
atelec_b0i1_trigramk = createPatternNgrams(atelec_b0i1_report, 3, atelec_keyword)
list(atelec_b0i1_trigramk.items())[:20]

[(('no', 'change', 'of'), 390),
 (('plate', 'atelectasis', 'at'), 315),
 (('<s>', 'no', 'change'), 279),
 (('of', 'plate', 'atelectasis'), 184),
 (('change', 'of', 'plate'), 156),
 (('atelectasis', 'at', 'left'), 145),
 (('atelectasis', 'at', 'lll'), 112),
 (('no', 'significant', 'change'), 102),
 (('significant', 'change', 'of'), 102),
 (('<s>', 'plate', 'atelectasis'), 100),
 (('<s>', 'no', 'significant'), 87),
 (('no', 'changed', 'of'), 86),
 (('atelectasis', 'at', 'right'), 85),
 (('change', 'of', 'atelectasis'), 80),
 (('at', 'lll', '<\\s>'), 74),
 (('<s>', 'no', 'changed'), 73),
 (('is', 'noted', '<\\s>'), 69),
 (('of', 'atelectasis', 'at'), 51),
 (('<s>', 'the', 'chest'), 47),
 (('the', 'chest', 'shows'), 47)]

In [107]:
atelec_b0i1_fourgramk = createPatternNgrams(atelec_b0i1_report, 4, atelec_keyword)
list(atelec_b0i1_fourgramk.items())[:20]

[(('<s>', 'no', 'change', 'of'), 279),
 (('of', 'plate', 'atelectasis', 'at'), 147),
 (('no', 'change', 'of', 'plate'), 143),
 (('change', 'of', 'plate', 'atelectasis'), 143),
 (('plate', 'atelectasis', 'at', 'left'), 116),
 (('no', 'significant', 'change', 'of'), 102),
 (('plate', 'atelectasis', 'at', 'lll'), 99),
 (('<s>', 'plate', 'atelectasis', 'at'), 96),
 (('<s>', 'no', 'significant', 'change'), 73),
 (('<s>', 'no', 'changed', 'of'), 73),
 (('atelectasis', 'at', 'lll', '<\\s>'), 68),
 (('no', 'change', 'of', 'atelectasis'), 58),
 (('<s>', 'the', 'chest', 'shows'), 47),
 (('the', 'chest', 'shows', 'no'), 46),
 (('shows', 'no', 'change', 'of'), 43),
 (('atelectasis', 'at', 'left', 'lower'), 43),
 (('chest', 'shows', 'no', 'change'), 42),
 (('at', 'left', 'lower', 'lung'), 42),
 (('change', 'of', 'atelectasis', 'at'), 40),
 (('changed', 'of', 'plate', 'atelectasis'), 39)]

In [108]:
atelec_b0i1_fivegramk = createPatternNgrams(atelec_b0i1_report, 5, atelec_keyword)
list(atelec_b0i1_fivegramk.items())[:20]

[(('no', 'change', 'of', 'plate', 'atelectasis'), 130),
 (('<s>', 'no', 'change', 'of', 'plate'), 117),
 (('change', 'of', 'plate', 'atelectasis', 'at'), 106),
 (('<s>', 'no', 'significant', 'change', 'of'), 73),
 (('plate', 'atelectasis', 'at', 'lll', '<\\s>'), 59),
 (('of', 'plate', 'atelectasis', 'at', 'lll'), 58),
 (('<s>', 'plate', 'atelectasis', 'at', 'left'), 51),
 (('<s>', 'the', 'chest', 'shows', 'no'), 46),
 (('the', 'chest', 'shows', 'no', 'change'), 42),
 (('chest', 'shows', 'no', 'change', 'of'), 42),
 (('of', 'plate', 'atelectasis', 'at', 'left'), 41),
 (('atelectasis', 'at', 'left', 'lower', 'lung'), 41),
 (('plate', 'atelectasis', 'at', 'left', 'lower'), 40),
 (('changed', 'of', 'plate', 'atelectasis', 'at'), 39),
 (('no', 'changed', 'of', 'plate', 'atelectasis'), 37),
 (('<s>', 'no', 'changed', 'of', 'plate'), 35),
 (('plate', 'atelectasis', 'at', 'lll', 'is'), 32),
 (('no', 'change', 'of', 'atelectasis', 'at'), 29),
 (('fibrosis', 'or', 'plate', 'atelectasis', 'at'), 

In [109]:
atelec_b0i1_sixgramk = createPatternNgrams(atelec_b0i1_report, 6, atelec_keyword)
list(atelec_b0i1_sixgramk.items())[:20]

[(('<s>', 'no', 'change', 'of', 'plate', 'atelectasis'), 110),
 (('no', 'change', 'of', 'plate', 'atelectasis', 'at'), 99),
 (('change', 'of', 'plate', 'atelectasis', 'at', 'lll'), 50),
 (('<s>', 'the', 'chest', 'shows', 'no', 'change'), 42),
 (('the', 'chest', 'shows', 'no', 'change', 'of'), 42),
 (('plate', 'atelectasis', 'at', 'left', 'lower', 'lung'), 39),
 (('no', 'changed', 'of', 'plate', 'atelectasis', 'at'), 37),
 (('<s>', 'no', 'changed', 'of', 'plate', 'atelectasis'), 35),
 (('of', 'plate', 'atelectasis', 'at', 'lll', '<\\s>'), 28),
 (('plate', 'atelectasis', 'at', 'left', 'mid', 'lung'), 25),
 (('change', 'of', 'plate', 'atelectasis', 'at', 'left'), 25),
 (('of', 'plate', 'atelectasis', 'at', 'lll', 'is'), 23),
 (('plate', 'atelectasis', 'at', 'left', 'basal', 'lung'), 20),
 (('<s>', 'plate', 'atelectasis', 'at', 'lll', '<\\s>'), 20),
 (('chest', 'shows', 'no', 'change', 'of', 'atelectasis'), 19),
 (('plate', 'atelectasis', 'at', 'lll', 'is', 'noted'), 19),
 (('atelectasis',

## BERT 1, Inspectra 0

Without keyword

In [110]:
atelec_b1i0_unigram = createPatternNgrams(atelec_b1i0_report, 1)

In [111]:
atelec_b1i0_bigram = createPatternNgrams(atelec_b1i0_report, 2)
list(atelec_b1i0_bigram.items())[:20]

[(('<s>', 'no'), 1511),
 (('noted', '<\\s>'), 818),
 (('is', 'seen'), 783),
 (('seen', '<\\s>'), 782),
 (('<s>', 'the'), 745),
 (('is', 'noted'), 729),
 (('pleural', 'effusion'), 606),
 (('<\\s>', '<s>'), 526),
 (('change', 'of'), 486),
 (('pulmonary', 'infiltration'), 479),
 (('clear', '<\\s>'), 421),
 (('both', 'costophrenic'), 381),
 (('cardiomegaly', 'is'), 355),
 (('<s>', 'both'), 331),
 (('degenerative', 'change'), 329),
 (('effusion', '<\\s>'), 315),
 (('are', 'clear'), 308),
 (('at', 'right'), 306),
 (('no', 'active'), 304),
 (('costophrenic', 'angles'), 300)]

In [112]:
atelec_b1i0_trigram = createPatternNgrams(atelec_b1i0_report, 3)
list(atelec_b1i0_trigram.items())[:20]

[(('is', 'seen', '<\\s>'), 698),
 (('is', 'noted', '<\\s>'), 652),
 (('are', 'clear', '<\\s>'), 306),
 (('costophrenic', 'angles', 'are'), 290),
 (('pleural', 'effusion', '<\\s>'), 280),
 (('both', 'costophrenic', 'angles'), 278),
 (('<s>', 'no', 'cardiomegaly'), 263),
 (('<s>', 'both', 'costophrenic'), 245),
 (('pleural', 'effusion', 'is'), 228),
 (('degenerative', 'change', 'of'), 210),
 (('<s>', 'degenerative', 'change'), 204),
 (('is', 'intact', '<\\s>'), 198),
 (('bony', 'thorax', 'is'), 192),
 (('is', 'observed', '<\\s>'), 192),
 (('cardiomegaly', 'is', 'seen'), 177),
 (('thorax', 'is', 'intact'), 176),
 (('angles', 'are', 'clear'), 170),
 (('pulmonary', 'infiltration', 'or'), 169),
 (('no', 'cardiomegaly', 'is'), 168),
 (('calcified', 'aortic', 'knob'), 167)]

With keyword

In [113]:
atelec_b1i0_bigramk = createPatternNgrams(atelec_b1i0_report, 2, atelec_keyword)
list(atelec_b1i0_bigramk.items())[:20]

[(('<s>', 'collapsed'), 209),
 (('atelectasis', 'at'), 187),
 (('<\\s>', '<s>'), 186),
 (('change', 'of'), 183),
 (('noted', '<\\s>'), 164),
 (('left', 'lung'), 149),
 (('degenerative', 'change'), 136),
 (('is', 'noted'), 135),
 (('<s>', 'the'), 133),
 (('right', 'lung'), 125),
 (('is', 'seen'), 119),
 (('<s>', 'plate'), 117),
 (('with', 'collapsed'), 116),
 (('seen', '<\\s>'), 106),
 (('<s>', 'degenerative'), 105),
 (('at', 'left'), 102),
 (('collapse', 'of'), 99),
 (('<s>', 'no'), 87),
 (('lung', '<\\s>'), 85),
 (('at', 'right'), 82)]

In [114]:
atelec_b1i0_trigramk = createPatternNgrams(atelec_b1i0_report, 3, atelec_keyword)
list(atelec_b1i0_trigramk.items())[:20]

[(('is', 'noted', '<\\s>'), 105),
 (('<s>', 'degenerative', 'change'), 95),
 (('degenerative', 'change', 'of'), 94),
 (('is', 'seen', '<\\s>'), 83),
 (('plate', 'like', 'atelectasis'), 77),
 (('like', 'atelectasis', 'at'), 76),
 (('<s>', 'plate', 'like'), 71),
 (('collapsed', 'compression', 'fracture'), 68),
 (('atelectasis', 'at', 'lll'), 66),
 (('plate', 'atelectasis', 'at'), 65),
 (('change', 'of', 'spine'), 64),
 (('compression', 'fracture', 'at'), 63),
 (('atelectasis', 'at', 'left'), 57),
 (('of', 'spine', 'with'), 54),
 (('<s>', 'the', 'chest'), 52),
 (('are', 'noted', '<\\s>'), 52),
 (('the', 'chest', 'shows'), 51),
 (('of', 'left', 'lung'), 47),
 (('no', 'significant', 'change'), 46),
 (('<s>', 'collapsed', 't12'), 43)]

In [115]:
atelec_b1i0_fourgramk = createPatternNgrams(atelec_b1i0_report, 4, atelec_keyword)
list(atelec_b1i0_fourgramk.items())[:20]

[(('<s>', 'degenerative', 'change', 'of'), 88),
 (('plate', 'like', 'atelectasis', 'at'), 76),
 (('<s>', 'plate', 'like', 'atelectasis'), 71),
 (('degenerative', 'change', 'of', 'spine'), 63),
 (('collapsed', 'compression', 'fracture', 'at'), 63),
 (('<s>', 'the', 'chest', 'shows'), 51),
 (('change', 'of', 'spine', 'with'), 50),
 (('like', 'atelectasis', 'at', 'left'), 40),
 (('no', 'significant', 'change', 'of'), 39),
 (('atelectasis', 'at', 'lll', 'is'), 33),
 (('the', 'chest', 'shows', 'no'), 31),
 (('atelectasis', 'at', 'left', 'lower'), 30),
 (('of', 'spine', 'with', 'collapsed'), 29),
 (('plate', 'atelectasis', 'at', 'lll'), 29),
 (('<s>', 'the', 'visualized', 'bony'), 29),
 (('the', 'visualized', 'bony', 'structures'), 29),
 (('<s>', 'no', 'significant', 'change'), 28),
 (('visualized', 'bony', 'structures', 'are'), 28),
 (('bony', 'structures', 'are', 'degenerative'), 26),
 (('structures', 'are', 'degenerative', 'change'), 26)]

In [116]:
atelec_b1i0_fivegramk = createPatternNgrams(atelec_b1i0_report, 5, atelec_keyword)
list(atelec_b1i0_fivegramk.items())[:20]

[(('<s>', 'plate', 'like', 'atelectasis', 'at'), 70),
 (('<s>', 'degenerative', 'change', 'of', 'spine'), 60),
 (('degenerative', 'change', 'of', 'spine', 'with'), 49),
 (('plate', 'like', 'atelectasis', 'at', 'left'), 40),
 (('<s>', 'the', 'chest', 'shows', 'no'), 31),
 (('<s>', 'the', 'visualized', 'bony', 'structures'), 29),
 (('change', 'of', 'spine', 'with', 'collapsed'), 28),
 (('the', 'visualized', 'bony', 'structures', 'are'), 27),
 (('visualized', 'bony', 'structures', 'are', 'degenerative'), 26),
 (('bony', 'structures', 'are', 'degenerative', 'change'), 26),
 (('<s>', 'no', 'significant', 'change', 'of'), 25),
 (('<s>', 'collapsed', 'compression', 'fracture', 'at'), 24),
 (('the', 'chest', 'shows', 'no', 'change'), 23),
 (('like', 'atelectasis', 'at', 'left', 'lower'), 23),
 (('structures', 'are', 'degenerative', 'change', 'and'), 22),
 (('chest', 'shows', 'no', 'change', 'of'), 21),
 (('<s>', 'plate', 'liked', 'atelectasis', 'at'), 20),
 (('plate', 'like', 'atelectasis', 'a

In [117]:
atelec_b1i0_sixgramk = createPatternNgrams(atelec_b1i0_report, 6, atelec_keyword)
list(atelec_b1i0_sixgramk.items())[:20]

[(('<s>', 'degenerative', 'change', 'of', 'spine', 'with'), 49),
 (('<s>', 'plate', 'like', 'atelectasis', 'at', 'left'), 39),
 (('degenerative', 'change', 'of', 'spine', 'with', 'collapsed'), 28),
 (('<s>', 'the', 'visualized', 'bony', 'structures', 'are'), 27),
 (('visualized', 'bony', 'structures', 'are', 'degenerative', 'change'), 26),
 (('the', 'visualized', 'bony', 'structures', 'are', 'degenerative'), 25),
 (('<s>', 'the', 'chest', 'shows', 'no', 'change'), 23),
 (('plate', 'like', 'atelectasis', 'at', 'left', 'lower'), 23),
 (('bony', 'structures', 'are', 'degenerative', 'change', 'and'), 22),
 (('the', 'chest', 'shows', 'no', 'change', 'of'), 21),
 (('<s>', 'plate', 'like', 'atelectasis', 'at', 'lll'), 15),
 (('degenerative', 'change', 'of', 'spine', 'with', 'some'), 13),
 (('<s>', 'the', 'chest', 'shows', 'total', 'collapse'), 13),
 (('like', 'atelectasis', 'at', 'left', 'lower', 'lung'), 12),
 (('<s>', 'plate', 'liked', 'atelectasis', 'at', 'lll'), 12),
 (('plate', 'like', '

## Find Word Before

In [118]:
before_atelec = findWordBefore(atelec_b0i1_bigram, 'atelectasis')
print('atelectasis :', atelec_b0i1_unigram['atelectasis',])
print('word before atelectasis :', sum(before_atelec.values()))
list(before_atelec.items())[:10]

atelectasis : 766
word before atelectasis : 766


[('plate', 369),
 ('of', 95),
 ('and', 77),
 ('rul', 32),
 ('like', 31),
 ('subsegmental', 24),
 ('partial', 20),
 ('or', 14),
 ('lul', 11),
 ('<s>', 10)]

In [119]:
before_plate = findWordBefore(atelec_b0i1_bigram, 'plate')
print('plate :', atelec_b0i1_unigram['plate',])
print('word before plate :', sum(before_plate.values()))
list(before_plate.items())[:10]

plate : 420
word before plate : 420


[('of', 200),
 ('<s>', 110),
 ('or', 42),
 ('and', 22),
 ('small', 10),
 ('few', 7),
 ('fibrosis/', 6),
 ('minimal', 6),
 ('/', 2),
 ('the', 2)]

In [120]:
before_atelec = findWordBefore(atelec_b1i0_bigram, 'atelectasis')
print('atelectasis :', atelec_b1i0_unigram['atelectasis',])
print('word before atelectasis :', sum(before_atelec.values()))
list(before_atelec.items())[:10]

atelectasis : 270
word before atelectasis : 270


[('plate', 79),
 ('like', 77),
 ('liked', 20),
 ('subsegmental', 19),
 ('or', 18),
 ('rul', 7),
 ('lung', 6),
 ('and', 5),
 ('rml', 5),
 ('change', 4)]

# Lung Lesion
Mass + Nodule

## Prepare Data

In [121]:
# choose rows
lesion_df, lesion_b0i1_df, lesion_b1i0_df = selectData(df, 'Lung Lesion')
lesion_df

Unnamed: 0,Image Index,Reports,Cardiomegaly BERT Labeler,Edema BERT Labeler,Inspectra Lung Opacity v1 BERT Labeler,Atelectasis BERT Labeler,Pleural Effusion BERT Labeler,Lung Lesion BERT Labeler,Cardiomegaly Inspectra Labeler,Edema Inspectra Labeler,...,Atelectasis Inspectra Labeler,Pleural Effusion Inspectra Labeler,Lung Lesion Inspectra Labeler,Inspectra Lung Opacity v1 Balanced Score,Mass Balanced Score,Nodule Balanced Score,Edema Balanced Score,Atelectasis Balanced Score,Cardiomegaly Balanced Score,Pleural Effusion Balanced Score
0,20292772,CXR (PA upright)\n\nReticulonodular infiltrati...,0,0,0,0,0,1,0,0,...,0,0,0,0.9478721681,0.1678770714,0.4644771763,0.08673519902,0.2267119501,0.05830180422,0.878199365
1,20294567,Chest:-\nPa chest study reveals fibronodular i...,0,0,0,0,0,1,0,0,...,0,0,0,0.8226948088,0.1517771742,0.6166349793,0.01350364634,0.235725454,0.00647519908,0.08067476336
2,20294712,CHEST :\nP.A. upright view .\nFibronodular inf...,0,0,0,0,0,1,0,0,...,0,0,0,0.6979679996,0.1366843406,0.6163465142,0.04663362481,0.08208811247,0.02999480347,0.01371037712
3,20295457,CXR\n No previous film to be compared.\n RLL...,0,0,1,0,1,1,0,0,...,0,1,0,0.8582449093,0.5941714662,0.5386270446,0.2007227667,0.4146865418,0.2195140952,0.3749837166
4,20296088,CXR PA upright\n The chest shows mild reticul...,0,0,0,0,0,1,0,0,...,0,0,0,0.7284609589,0.07017927657,0.3137481618,0.04706920547,0.04304903915,0.005609614713,0.01186107229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34343,25105659,CXR PA upright\n\n DIffuse reticulonodular in...,0,0,0,0,0,1,0,0,...,0,0,0,0.8751587523,0.1217781402,0.6192859137,0.05592964319,0.2191973016,0.1200685221,0.05588470611
34344,25105743,CXR PA \n\nDiffuse reticulonodular infiltratio...,0,0,0,0,0,1,0,0,...,0,0,0,0.02014802623,0.2061536583,0.9158935925,0,0.03189752104,0.01101295625,0.05947037651
34345,25105767,Chest PA upright \n\nknown CA breast\n\nSurgic...,0,0,0,0,0,1,0,0,...,0,0,0,0.8246030918,0.1140795483,0.5280247845,0.004297864794,0.427581139,0.0260741439,0.05032755883
34346,25105998,CXR(PA)\n \n Retilocalcified nodules of bot...,1,0,0,0,0,0,1,0,...,0,0,1,0.8039141853,0.1314715573,0.774282467,0.3601595743,0.1181119851,0.9613563218,0.08755137394


In [122]:
# create report list
lesion_b0i1_report = createReportList(lesion_b0i1_df)
lesion_b1i0_report = createReportList(lesion_b1i0_df)
lesion_b0i1_report[0]

Lenght : 14773
Lenght : 19575


['Chest PA upright',
 ' chronic bilateral apical lung infiltration with subsegmental both upper lungs atelectasis is shown. Prominent nodular component of active infiltration at LUL is noted.',
 ' RUL bleb is probable.',
 ' Blunt both costophrenic angles are observed, could be represented of thick basal pleurae or minimal pleural effusion.']

In [123]:
# define keyword to select sentence from report
lesion_keyword = r'nodule|mass|granu'

## BERT 0, Inspectra 1

Without keyword

In [124]:
lesion_b0i1_unigram = createPatternNgrams(lesion_b0i1_report, 1)

In [125]:
lesion_b0i1_bigram = createPatternNgrams(lesion_b0i1_report, 2)
list(lesion_b0i1_bigram.items())[:20]

[(('<s>', 'no'), 20289),
 (('change', 'of'), 10289),
 (('<s>', 'the'), 9031),
 (('bony', 'thorax'), 7013),
 (('both', 'costophrenic'), 6267),
 (('intact', '<\\s>'), 6206),
 (('thorax', 'is'), 6016),
 (('pleural', 'effusion'), 5907),
 (('<s>', 'both'), 5891),
 (('noted', '<\\s>'), 5671),
 (('is', 'intact'), 5635),
 (('is', 'seen'), 5461),
 (('clear', '<\\s>'), 5371),
 (('no', 'change'), 5366),
 (('is', 'noted'), 5245),
 (('seen', '<\\s>'), 5221),
 (('are', 'clear'), 4668),
 (('pulmonary', 'infiltration'), 4658),
 (('<s>', 'bony'), 4570),
 (('at', 'right'), 4426)]

In [126]:
lesion_b0i1_trigram = createPatternNgrams(lesion_b0i1_report, 3)
list(lesion_b0i1_trigram.items())[:20]

[(('bony', 'thorax', 'is'), 6011),
 (('is', 'intact', '<\\s>'), 5633),
 (('thorax', 'is', 'intact'), 5381),
 (('<s>', 'both', 'costophrenic'), 5302),
 (('no', 'change', 'of'), 5147),
 (('is', 'seen', '<\\s>'), 4776),
 (('is', 'noted', '<\\s>'), 4689),
 (('are', 'clear', '<\\s>'), 4652),
 (('<s>', 'bony', 'thorax'), 4107),
 (('costophrenic', 'angles', 'are'), 4003),
 (('both', 'costophrenic', 'angles'), 3688),
 (('<s>', 'no', 'cardiomegaly'), 3653),
 (('no', 'significant', 'change'), 3505),
 (('significant', 'change', 'of'), 3413),
 (('pleural', 'effusion', '<\\s>'), 3344),
 (('angles', 'are', 'clear'), 3209),
 (('active', 'pulmonary', 'infiltration'), 2804),
 (('<s>', 'there', 'is'), 2789),
 (('<s>', 'no', 'significant'), 2731),
 (('no', 'pleural', 'effusion'), 2651)]

With keyword

In [127]:
lesion_b0i1_bigramk = createPatternNgrams(lesion_b0i1_report, 2, lesion_keyword)
list(lesion_b0i1_bigramk.items())[:20]

[(('change', 'of'), 6152),
 (('<s>', 'no'), 4685),
 (('nodule', 'at'), 3702),
 (('no', 'change'), 3462),
 (('at', 'right'), 2612),
 (('no', 'significant'), 2510),
 (('significant', 'change'), 2306),
 (('calcified', 'nodule'), 2202),
 (('nodules', 'at'), 2017),
 (('<s>', 'the'), 1997),
 (('<\\s>', '<s>'), 1775),
 (('calcified', 'granuloma'), 1774),
 (('at', 'left'), 1717),
 (('at', 'rul'), 1613),
 (('the', 'chest'), 1556),
 (('chest', 'shows'), 1552),
 (('shows', 'no'), 1484),
 (('lung', '<\\s>'), 1455),
 (('there', 'is'), 1397),
 (('at', 'both'), 1342)]

In [128]:
lesion_b0i1_trigramk = createPatternNgrams(lesion_b0i1_report, 3, lesion_keyword)
list(lesion_b0i1_trigramk.items())[:20]

[(('no', 'change', 'of'), 3390),
 (('no', 'significant', 'change'), 2302),
 (('significant', 'change', 'of'), 2274),
 (('calcified', 'nodule', 'at'), 1751),
 (('<s>', 'no', 'significant'), 1718),
 (('<s>', 'the', 'chest'), 1555),
 (('the', 'chest', 'shows'), 1552),
 (('chest', 'shows', 'no'), 1445),
 (('shows', 'no', 'change'), 1310),
 (('<s>', 'no', 'change'), 1275),
 (('<s>', 'there', 'is'), 1139),
 (('calcified', 'granuloma', 'at'), 1001),
 (('change', 'of', 'multiple'), 1000),
 (('nodule', 'at', 'right'), 955),
 (('change', 'of', 'a'), 948),
 (('there', 'is', 'no'), 946),
 (('nodules', 'at', 'both'), 862),
 (('is', 'noted', '<\\s>'), 785),
 (('calcific', 'nodule', 'at'), 781),
 (('both', 'lungs', '<\\s>'), 724)]

In [129]:
lesion_b0i1_fourgramk = createPatternNgrams(lesion_b0i1_report, 4, lesion_keyword)
list(lesion_b0i1_fourgramk.items())[:20]

[(('no', 'significant', 'change', 'of'), 2272),
 (('<s>', 'no', 'significant', 'change'), 1590),
 (('<s>', 'the', 'chest', 'shows'), 1552),
 (('the', 'chest', 'shows', 'no'), 1445),
 (('shows', 'no', 'change', 'of'), 1308),
 (('chest', 'shows', 'no', 'change'), 1302),
 (('<s>', 'no', 'change', 'of'), 1262),
 (('<s>', 'there', 'is', 'no'), 743),
 (('no', 'change', 'of', 'a'), 648),
 (('nodules', 'at', 'both', 'lungs'), 569),
 (('at', 'both', 'lungs', '<\\s>'), 532),
 (('small', 'calcified', 'nodule', 'at'), 528),
 (('no', 'change', 'of', 'multiple'), 516),
 (('significant', 'change', 'of', 'multiple'), 429),
 (('calcified', 'nodule', 'at', 'right'), 421),
 (('small', 'calcific', 'nodule', 'at'), 418),
 (('mass', 'or', 'atelectasis', '<\\s>'), 388),
 (('at', 'right', 'lower', 'lung'), 386),
 (('<s>', 'no', 'changed', 'of'), 385),
 (('<\\s>', '<s>', 'there', 'is'), 379)]

In [130]:
lesion_b0i1_fivegramk = createPatternNgrams(lesion_b0i1_report, 5, lesion_keyword)
list(lesion_b0i1_fivegramk.items())[:20]

[(('<s>', 'no', 'significant', 'change', 'of'), 1584),
 (('<s>', 'the', 'chest', 'shows', 'no'), 1445),
 (('the', 'chest', 'shows', 'no', 'change'), 1302),
 (('chest', 'shows', 'no', 'change', 'of'), 1301),
 (('nodules', 'at', 'both', 'lungs', '<\\s>'), 462),
 (('no', 'significant', 'change', 'of', 'multiple'), 429),
 (('shows', 'no', 'change', 'of', 'multiple'), 342),
 (('pulmonary', 'infiltration', 'mass', 'or', 'atelectasis'), 340),
 (('<s>', 'no', 'demonstrable', 'active', 'pulmonary'), 337),
 (('infiltration', 'mass', 'or', 'atelectasis', '<\\s>'), 337),
 (('no', 'demonstrable', 'active', 'pulmonary', 'infiltration'), 336),
 (('active', 'pulmonary', 'infiltration', 'mass', 'or'), 313),
 (('demonstrable', 'active', 'pulmonary', 'infiltration', 'mass'), 307),
 (('<s>', 'as', 'compared', 'to', 'prior'), 305),
 (('as', 'compared', 'to', 'prior', 'film'), 277),
 (('there', 'is', 'no', 'change', 'of'), 274),
 (('compared', 'to', 'prior', 'film', 'on'), 272),
 (('a', 'small', 'calcified'

In [131]:
lesion_b0i1_sixgramk = createPatternNgrams(lesion_b0i1_report, 6, lesion_keyword)
list(lesion_b0i1_sixgramk.items())[:20]

[(('<s>', 'the', 'chest', 'shows', 'no', 'change'), 1302),
 (('the', 'chest', 'shows', 'no', 'change', 'of'), 1301),
 (('chest', 'shows', 'no', 'change', 'of', 'multiple'), 341),
 (('pulmonary', 'infiltration', 'mass', 'or', 'atelectasis', '<\\s>'), 337),
 (('<s>', 'no', 'demonstrable', 'active', 'pulmonary', 'infiltration'), 336),
 (('active', 'pulmonary', 'infiltration', 'mass', 'or', 'atelectasis'), 309),
 (('no', 'demonstrable', 'active', 'pulmonary', 'infiltration', 'mass'), 305),
 (('demonstrable', 'active', 'pulmonary', 'infiltration', 'mass', 'or'), 296),
 (('<s>', 'as', 'compared', 'to', 'prior', 'film'), 271),
 (('as', 'compared', 'to', 'prior', 'film', 'on'), 271),
 (('<s>', 'no', 'significant', 'change', 'of', 'multiple'), 256),
 (('<s>', 'there', 'is', 'no', 'change', 'of'), 238),
 (('chest', 'shows', 'no', 'change', 'of', 'a'), 233),
 (('there', 'is', 'no', 'significant', 'change', 'of'), 203),
 (('change', 'of', 'multiple', 'nodules', 'at', 'both'), 191),
 (('<s>', 'no',

## BERT 1, Inspectra 0

Without keyword

In [132]:
lesion_b1i0_unigram = createPatternNgrams(lesion_b1i0_report, 1)

In [133]:
lesion_b1i0_bigram = createPatternNgrams(lesion_b1i0_report, 2)
list(lesion_b1i0_bigram.items())[:20]

[(('<s>', 'no'), 16163),
 (('infiltration', 'at'), 11318),
 (('bony', 'thorax'), 10081),
 (('<s>', 'the'), 9771),
 (('pleural', 'effusion'), 9473),
 (('reticulonodular', 'infiltration'), 9355),
 (('intact', '<\\s>'), 7992),
 (('thorax', 'is'), 7532),
 (('is', 'intact'), 7244),
 (('effusion', '<\\s>'), 6764),
 (('<s>', 'bony'), 6493),
 (('<\\s>', '<s>'), 6321),
 (('is', 'seen'), 6126),
 (('no', 'pleural'), 6060),
 (('seen', '<\\s>'), 6027),
 (('clear', '<\\s>'), 5844),
 (('noted', '<\\s>'), 5508),
 (('<s>', 'normal'), 5106),
 (('at', 'both'), 5105),
 (('both', 'costophrenic'), 5083)]

In [134]:
lesion_b1i0_trigram = createPatternNgrams(lesion_b1i0_report, 3)
list(lesion_b1i0_trigram.items())[:20]

[(('bony', 'thorax', 'is'), 7514),
 (('is', 'intact', '<\\s>'), 7241),
 (('thorax', 'is', 'intact'), 6870),
 (('reticulonodular', 'infiltration', 'at'), 6634),
 (('pleural', 'effusion', '<\\s>'), 6390),
 (('no', 'pleural', 'effusion'), 6029),
 (('<s>', 'bony', 'thorax'), 5841),
 (('<s>', 'no', 'pleural'), 5688),
 (('is', 'seen', '<\\s>'), 5219),
 (('are', 'clear', '<\\s>'), 4800),
 (('<s>', 'no', 'cardiomegaly'), 4353),
 (('<s>', 'both', 'costophrenic'), 4318),
 (('is', 'noted', '<\\s>'), 4196),
 (('<s>', 'there', 'is'), 3826),
 (('costophrenic', 'angles', 'are'), 3668),
 (('both', 'costophrenic', 'angles'), 3280),
 (('<s>', 'reticulonodular', 'infiltration'), 3272),
 (('<s>', 'the', 'heart'), 3120),
 (('angles', 'are', 'clear'), 3061),
 (('infiltration', 'at', 'both'), 3033)]

With keyword

In [135]:
lesion_b1i0_bigramk = createPatternNgrams(lesion_b1i0_report, 2, lesion_keyword)
list(lesion_b1i0_bigramk.items())[:20]

[(('<s>', 'no'), 333),
 (('<\\s>', '<s>'), 172),
 (('pulmonary', 'infiltration'), 171),
 (('infiltration', 'or'), 159),
 (('nodule', 'at'), 155),
 (('or', 'nodule'), 150),
 (('nodule', 'is'), 132),
 (('nodule', '<\\s>'), 127),
 (('noted', '<\\s>'), 125),
 (('is', 'noted'), 121),
 (('<s>', 'history'), 120),
 (('pulmonary', 'nodule'), 107),
 (('is', 'seen'), 101),
 (('<s>', 'the'), 99),
 (('seen', '<\\s>'), 95),
 (('at', 'right'), 87),
 (('mass', '<\\s>'), 85),
 (('<s>', 'there'), 79),
 (('there', 'is'), 75),
 (('active', 'pulmonary'), 71)]

In [136]:
lesion_b1i0_trigramk = createPatternNgrams(lesion_b1i0_report, 3, lesion_keyword)
list(lesion_b1i0_trigramk.items())[:20]

[(('infiltration', 'or', 'nodule'), 143),
 (('pulmonary', 'infiltration', 'or'), 117),
 (('is', 'noted', '<\\s>'), 107),
 (('is', 'seen', '<\\s>'), 79),
 (('or', 'nodule', 'is'), 78),
 (('<s>', 'there', 'is'), 67),
 (('active', 'pulmonary', 'infiltration'), 64),
 (('nodule', 'is', 'seen'), 50),
 (('nodule', 'is', 'noted'), 50),
 (('<s>', 'no', 'pulmonary'), 47),
 (('no', 'definite', 'pulmonary'), 47),
 (('no', 'detectable', 'pulmonary'), 47),
 (('or', 'nodule', '<\\s>'), 45),
 (('<s>', 'findings', 'no'), 43),
 (('<s>', 'no', 'definite'), 41),
 (('detectable', 'pulmonary', 'infiltration'), 41),
 (('findings', 'no', 'detectable'), 40),
 (('there', 'is', 'no'), 38),
 (('at', 'both', 'lungs'), 38),
 (('pulmonary', 'nodules', 'at'), 35)]

In [137]:
lesion_b1i0_fourgramk = createPatternNgrams(lesion_b1i0_report, 4, lesion_keyword)
list(lesion_b1i0_fourgramk.items())[:20]

[(('pulmonary', 'infiltration', 'or', 'nodule'), 108),
 (('infiltration', 'or', 'nodule', 'is'), 75),
 (('nodule', 'is', 'noted', '<\\s>'), 48),
 (('infiltration', 'or', 'nodule', '<\\s>'), 42),
 (('nodule', 'is', 'seen', '<\\s>'), 42),
 (('no', 'detectable', 'pulmonary', 'infiltration'), 41),
 (('detectable', 'pulmonary', 'infiltration', 'or'), 41),
 (('<s>', 'findings', 'no', 'detectable'), 40),
 (('or', 'nodule', 'is', 'noted'), 39),
 (('<s>', 'no', 'definite', 'pulmonary'), 33),
 (('findings', 'no', 'detectable', 'pulmonary'), 33),
 (('active', 'pulmonary', 'infiltration', 'or'), 33),
 (('or', 'nodule', 'is', 'seen'), 33),
 (('<s>', 'there', 'is', 'no'), 32),
 (('<s>', 'no', 'infiltration', 'or'), 32),
 (('no', 'infiltration', 'or', 'nodule'), 32),
 (('<s>', 'the', 'chest', 'shows'), 31),
 (('<s>', 'no', 'active', 'pulmonary'), 28),
 (('no', 'active', 'pulmonary', 'infiltration'), 27),
 (('<s>', 'no', 'pulmonary', 'nodule'), 26)]

In [138]:
lesion_b1i0_fivegramk = createPatternNgrams(lesion_b1i0_report, 5, lesion_keyword)
list(lesion_b1i0_fivegramk.items())[:20]

[(('pulmonary', 'infiltration', 'or', 'nodule', 'is'), 45),
 (('no', 'detectable', 'pulmonary', 'infiltration', 'or'), 41),
 (('pulmonary', 'infiltration', 'or', 'nodule', '<\\s>'), 40),
 (('infiltration', 'or', 'nodule', 'is', 'noted'), 39),
 (('or', 'nodule', 'is', 'noted', '<\\s>'), 39),
 (('detectable', 'pulmonary', 'infiltration', 'or', 'nodule'), 38),
 (('<s>', 'findings', 'no', 'detectable', 'pulmonary'), 33),
 (('findings', 'no', 'detectable', 'pulmonary', 'infiltration'), 33),
 (('infiltration', 'or', 'nodule', 'is', 'seen'), 33),
 (('or', 'nodule', 'is', 'seen', '<\\s>'), 33),
 (('<s>', 'no', 'infiltration', 'or', 'nodule'), 32),
 (('active', 'pulmonary', 'infiltration', 'or', 'nodule'), 30),
 (('no', 'infiltration', 'or', 'nodule', 'is'), 30),
 (('<s>', 'the', 'chest', 'shows', 'no'), 25),
 (('<s>', 'no', 'demonstrable', 'active', 'pulmonary'), 24),
 (('no', 'demonstrable', 'active', 'pulmonary', 'infiltration'), 24),
 (('no', 'active', 'pulmonary', 'infiltration', 'or'), 23

In [139]:
lesion_b1i0_sixgramk = createPatternNgrams(lesion_b1i0_report, 6, lesion_keyword)
list(lesion_b1i0_sixgramk.items())[:20]

[(('pulmonary', 'infiltration', 'or', 'nodule', 'is', 'noted'), 39),
 (('infiltration', 'or', 'nodule', 'is', 'noted', '<\\s>'), 39),
 (('no', 'detectable', 'pulmonary', 'infiltration', 'or', 'nodule'), 38),
 (('<s>', 'findings', 'no', 'detectable', 'pulmonary', 'infiltration'), 33),
 (('findings', 'no', 'detectable', 'pulmonary', 'infiltration', 'or'), 33),
 (('infiltration', 'or', 'nodule', 'is', 'seen', '<\\s>'), 33),
 (('<s>', 'no', 'infiltration', 'or', 'nodule', 'is'), 30),
 (('no', 'infiltration', 'or', 'nodule', 'is', 'seen'), 30),
 (('<s>', 'no', 'demonstrable', 'active', 'pulmonary', 'infiltration'), 24),
 (('no', 'active', 'pulmonary', 'infiltration', 'or', 'nodule'), 20),
 (('no', 'demonstrable', 'active', 'pulmonary', 'infiltration', 'mass'), 20),
 (('<s>', 'no', 'active', 'pulmonary', 'infiltration', 'or'), 19),
 (('detectable', 'pulmonary', 'infiltration', 'or', 'nodule', 'is'), 18),
 (('demonstrable', 'active', 'pulmonary', 'infiltration', 'mass', 'or'), 18),
 (('active

## Find Word Before

In [140]:
before_mass = findWordBefore(lesion_b0i1_bigram, 'mass')
print('mass :', lesion_b0i1_unigram['mass',])
print('word before mass :', sum(before_mass.values()))
list(before_mass.items())[:10]

mass : 2281
word before mass : 2281


[('of', 433),
 ('infiltration', 358),
 ('hilar', 149),
 ('tissue', 128),
 ('or', 99),
 ('mediastinal', 84),
 ('large', 64),
 ('defined', 55),
 ('pulmonary', 54),
 ('paratracheal', 53)]

In [141]:
before_nodule = findWordBefore(lesion_b0i1_bigram, 'nodules')
print('nodules :', lesion_b0i1_unigram['nodules',])
print('word before nodules :', sum(before_nodule.values()))
list(before_nodule.items())[:10]

nodules : 3245
word before nodules : 3245


[('calcified', 811),
 ('calcific', 615),
 ('pulmonary', 601),
 ('multiple', 338),
 ('small', 254),
 ('of', 72),
 ('lung', 65),
 ('two', 44),
 ('round', 42),
 ('few', 34)]

In [142]:
before_granu = findWordBefore(lesion_b0i1_bigram, 'granuloma')
print('granuloma :', lesion_b0i1_unigram['granuloma',])
print('word before granuloma :', sum(before_granu.values()))
list(before_granu.items())[:10]

granuloma : 2488
word before granuloma : 2488


[('calcified', 1774),
 ('old', 431),
 ('small', 62),
 ('be', 34),
 ('calcific', 18),
 ('of', 15),
 ('<s>', 14),
 ('likely', 10),
 ('calcify', 10),
 ('benign', 9)]

In [143]:
before_nodular = findWordBefore(lesion_b1i0_bigram, 'nodular')
print('nodular :', lesion_b1i0_unigram['nodular',])
print('word before nodular :', sum(before_nodular.values()))
list(before_nodular.items())[:10]

nodular : 2060
word before nodular : 2060


[('<s>', 356),
 ('and', 252),
 ('reticulo', 143),
 ('minimal', 142),
 ('of', 126),
 ('diffuse', 124),
 ('small', 122),
 ('fibro', 63),
 ('multiple', 54),
 ('fine', 50)]

In [144]:
before_retino = findWordBefore(lesion_b1i0_bigram, 'reticulonodular')
print('reticulonodular :', lesion_b1i0_unigram['reticulonodular',])
print('word before reticulonodular :', sum(before_retino.values()))
list(before_retino.items())[:10]

reticulonodular : 11845
word before reticulonodular : 11845


[('<s>', 4241),
 ('of', 1321),
 ('diffuse', 1076),
 ('minimal', 748),
 ('mild', 535),
 ('is', 465),
 ('and', 415),
 ('increased', 316),
 ('decreased', 296),
 ('unchanged', 282)]

In [145]:
before_fibrono = findWordBefore(lesion_b1i0_bigram, 'fibronodular')
print('fibronodular :', lesion_b1i0_unigram['fibronodular',])
print('word before fibronodular :', sum(before_fibrono.values()))
list(before_fibrono.items())[:10]

fibronodular : 3284
word before fibronodular : 3284


[('<s>', 1512),
 ('minimal', 500),
 ('unchanged', 268),
 ('of', 230),
 ('and', 93),
 ('reveals', 75),
 ('is', 70),
 ('rul', 42),
 ('lul', 39),
 ('mild', 32)]