In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas(desc='My bar!')
import numpy as np
import time
import matplotlib.pyplot as plt 

In [None]:
# Load the hourly clinical notes workbook and list its columns.
# The path is relative, so the file is resolved against the notebook's working directory.
df = pd.read_excel('clinical_notes_hourly.xlsx')
print(df.columns)

Index(['icustay_id', 'hr', 'category', 'description', 'text'], dtype='object')


In [None]:
# Discard rows where BOTH 'category' and 'description' are missing
# (how='all'); rows with at least one of the two present are kept.
df = df.dropna(how='all', subset=['category', 'description'])

In [None]:
# Preview the first 20 rows of the notes table.
df.head(20)

Unnamed: 0,icustay_id,hr,category,description,text
1,200039,2,Nursing/other,Report,CCU NSG ADMIT NOTE-NSICU BORDER.\n69 YO [**Hos...
11,200039,12,Radiology,CT HEAD W/O CONTRAST,[**2121-12-26**] 4:56 PM\n CT HEAD W/O CONTRAS...
13,200039,14,Nursing/other,Report,"npn ccu\nn-fc's, mae's, slow to respond but do..."
40,200052,17,Nursing/other,Report,NURSING NOTE (1900-0700)\n\nMrs. [**Known last...
43,200052,20,Radiology,CT HEAD W/O CONTRAST,[**2193-9-19**] 8:41 AM\n CT HEAD W/O CONTRAST...
53,200072,6,Nursing,Nursing Progress Note,Intracerebral hemorrhage (ICH)\n Assessment:...
54,200072,6,Nursing,Nursing Progress Note,Intracerebral hemorrhage (ICH)\n Assessment:...
55,200072,6,Nursing,Nursing Progress Note,Intracerebral hemorrhage (ICH)\n Assessment:...
56,200072,6,Radiology,CT HEAD W/O CONTRAST,[**2106-3-4**] 3:26 AM\n CT HEAD W/O CONTRAST ...
57,200072,7,Physician,Intensivist Note,SICU\n HPI:\n 55M with no PMHx who reports...


In [None]:
# The 'text' column has dtype object but not every value is a str, so coerce
# each entry to str before string methods (split, regex) are applied to it.
# NOTE(review): astype(str) turns a missing value into the literal string 'nan' —
# confirm that is acceptable for downstream cleaning.
df['text'] = df['text'].astype(str)

In [None]:
# Take the note stored under index label 77 as a worked example and show its raw text.
report = df.loc[77, 'text']
print(report)

[**2106-3-25**] 1:18 PM
 CHEST (PORTABLE AP)                                             Clip # [**Clip Number (Radiology) 53995**]
 Reason: eval for pneumothorax s/p chest tube removal
 Admitting Diagnosis: CEREBELLAR HEMORRHAGE
 ______________________________________________________________________________
 [**Hospital 2**] MEDICAL CONDITION:
  55 year old man s/p CABG
 REASON FOR THIS EXAMINATION:
  eval for pneumothorax s/p chest tube removal
 ______________________________________________________________________________
                                 FINAL REPORT
 PORTABLE CHEST, [**2106-3-25**]

 COMPARISON:  [**2105-3-23**].

 INDICATION:  Chest tube removal.

 Various indwelling devices have been removed in the interval and a right
 internal jugular catheter has been placed within the superior vena cava.  No
 pneumothorax.  Cardiomediastinal contours are widened but unchanged in the
 post-operative setting.  A crescenteric lucency is present adjacent to the
 aortic knob and m

In [None]:
# Whitespace-delimited word count of the raw example note.
len(report.split())

142

In [None]:
# Keep only the 'Radiology' notes: collect the index labels of every row whose
# category differs from 'Radiology' and drop those labels.
df = df.drop(index=df.index[df['category'] != 'Radiology'])
df.head()

Unnamed: 0,icustay_id,hr,category,description,text
11,200039,12,Radiology,CT HEAD W/O CONTRAST,[**2121-12-26**] 4:56 PM\n CT HEAD W/O CONTRAS...
43,200052,20,Radiology,CT HEAD W/O CONTRAST,[**2193-9-19**] 8:41 AM\n CT HEAD W/O CONTRAST...
56,200072,6,Radiology,CT HEAD W/O CONTRAST,[**2106-3-4**] 3:26 AM\n CT HEAD W/O CONTRAST ...
59,200072,8,Radiology,MR HEAD W & W/O CONTRAST,[**2106-3-4**] 5:26 AM\n MR HEAD W & W/O CONTR...
60,200072,8,Radiology,MR HEAD W & W/O CONTRAST,"[**Last Name (LF) 477**],[**First Name3 (LF) 4..."


In [None]:
# (rows, columns) of the notes table at this point.
df.shape

(4882, 5)

# Cleaning step 1 - removing non-informative text

In [None]:
import re

def custom_cleaner(text):
    """Strip bracketed tags, clock times and header boilerplate from a note.

    The steps run in this order:

    1. Collapse every run of whitespace (newlines included) into one space
       and trim both ends.
    2. Delete every ``[...]`` span, e.g. ``[**2106-3-25**]``.
    3. Delete clock times such as ``1:18 PM``.
    4. Delete every character that is not a letter, a digit, ``.``, ``:``
       or whitespace.
    5. If the text holds at least two colons, delete everything between the
       first and the second colon, together with the second colon itself.
       With a single colon the text is left as is (this used to raise
       ``IndexError``).
    6. Drop all words that come before the first word containing a colon.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    str
        The cleaned text. When some word contains a colon, the result is the
        kept words each followed by one space (so it ends with a space);
        otherwise it is the text as it stands after step 4.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text).strip()
    # The former `re.sub(r'\n\.', ' ', text)` step was removed: no newline can
    # remain after the whitespace collapse above, so it never matched.

    # Non-greedy match so each bracketed span is removed on its own.
    text = re.sub(r"\[.*?\]", "", text)

    # Remove time stamps
    text = re.sub(r'\d+:\d+\s*(?:AM|PM)', '', text)

    # Keep only letters, digits, full stops, colons and whitespace.
    text = re.sub(r'[^a-zA-Z0-9.:\s]', '', text)

    # Splice out the span between the first and second colon, but only when a
    # second colon actually exists.
    head, first_colon, rest = text.partition(':')
    if first_colon:
        _, second_colon, tail = rest.partition(':')
        if second_colon:
            text = head + ':' + tail

    # Keep everything from the first colon-bearing word onwards.
    words = text.split()
    for position, word in enumerate(words):
        if ':' in word:
            return ''.join(kept + ' ' for kept in words[position:])
    return text


In [None]:
# Run the cleaner on the example note.
report_clean2 = custom_cleaner(report)

In [None]:
# Word count after cleaning, for comparison with the raw note's count.
len(report_clean2.split())

116

In [None]:
# Display the cleaned example note.
report_clean2

'Reason: CEREBELLAR HEMORRHAGE MEDICAL CONDITION: 55 year old man sp CABG REASON FOR THIS EXAMINATION: eval for pneumothorax sp chest tube removal FINAL REPORT PORTABLE CHEST COMPARISON: . INDICATION: Chest tube removal. Various indwelling devices have been removed in the interval and a right internal jugular catheter has been placed within the superior vena cava. No pneumothorax. Cardiomediastinal contours are widened but unchanged in the postoperative setting. A crescenteric lucency is present adjacent to the aortic knob and may represent an area of normally aerated lung adjacent to linear left suprahilar atelectasis. Localized pneumomediastinum is considered less likely. Worsening bibasilar atelectasis and new small left pleural effusion are present as well as new moderate gastric distention. '

In [None]:
# Load the radiology notes table that already carries a 'cleaned_text' column.
# NOTE(review): no cell shown in this notebook writes cleaned_radiology.csv —
# presumably it was produced by applying custom_cleaner elsewhere; confirm.
df2 = pd.read_csv('cleaned_radiology.csv')
print(df2.columns)

Index(['Unnamed: 0', 'icustay_id', 'hr', 'category', 'description', 'text',
       'mortality', 'cleaned_text'],
      dtype='object')


In [None]:
# Take the cleaned text stored under index label 5 and count its words.
report2 = df2.loc[5, 'cleaned_text'] # original note here read "60"; meaning unclear — TODO confirm or remove
len(report2.split())

116

In [None]:
# Show the cleaned example text loaded from the CSV.
print(report2)

Reason: CEREBELLAR HEMORRHAGE MEDICAL CONDITION: 55 year old man sp CABG REASON FOR THIS EXAMINATION: eval for pneumothorax sp chest tube removal FINAL REPORT PORTABLE CHEST COMPARISON: . INDICATION: Chest tube removal. Various indwelling devices have been removed in the interval and a right internal jugular catheter has been placed within the superior vena cava. No pneumothorax. Cardiomediastinal contours are widened but unchanged in the postoperative setting. A crescenteric lucency is present adjacent to the aortic knob and may represent an area of normally aerated lung adjacent to linear left suprahilar atelectasis. Localized pneumomediastinum is considered less likely. Worsening bibasilar atelectasis and new small left pleural effusion are present as well as new moderate gastric distention. 


# Running the T5-base model on all reports

In [None]:
import transformers
import torch

# Load the pretrained 't5-base' checkpoint and its tokenizer.
# model_max_length=512 is handed to the tokenizer; the encode calls further
# down pass their own max_length=500.
model = transformers.T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = transformers.T5Tokenizer.from_pretrained('t5-base', model_max_length=512)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Use the cleaned example report as the text to summarize.
input_text2 = report2


In [None]:
# Prefix the text with the "summarize: " prompt and encode it to token ids:
# truncated to at most 500 ids, padded up to exactly 500, returned as a PyTorch tensor.
input_ids = tokenizer.encode("summarize: " +input_text2, max_length=500,truncation=True, padding='max_length', return_tensors='pt')

# Generate with beam search (4 beams); max_length=500 is passed to generate().
# Only the first returned sequence is decoded, with special tokens stripped.
summary_ids = model.generate(input_ids, num_beams=4, max_length=500, early_stopping=False)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print("Summary:", summary)

Summary: 55 year old man sp CABG eval for pneumothorax sp chest tube removal. a crescenteric lucency is present adjacent to the aortic knob and may represent an area of normally aerated lung adjacent to linear left suprahilar atelectasis.


In [None]:
def sumarizer(text, max_input_tokens=500, max_summary_tokens=500, num_beams=4):
    """Summarize one report with the module-level T5 ``model`` and ``tokenizer``.

    Parameters
    ----------
    text : str
        Text to summarize. A value that is not a ``str`` (for example the NaN
        pandas yields for an empty CSV cell) or that is blank gives ``""``
        without calling the model.
    max_input_tokens : int, default 500
        Prompt token ids beyond this count are truncated.
    max_summary_tokens : int, default 500
        Passed to ``model.generate`` as ``max_length``.
    num_beams : int, default 4
        Beam width passed to ``model.generate``.

    Returns
    -------
    str
        The first generated sequence, decoded with special tokens removed.
    """
    # Nothing to summarize: avoid a TypeError on `"summarize: " + text` and
    # avoid running the model on the bare prompt.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Encode a single sequence without padding it to a fixed length, and keep
    # the attention mask so generate() receives it explicitly.
    encoded = tokenizer(
        "summarize: " + text,
        max_length=max_input_tokens,
        truncation=True,
        return_tensors='pt',
    )

    # Generate the summary
    summary_ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        num_beams=num_beams,
        max_length=max_summary_tokens,
        early_stopping=False,
    )

    # Return the decoded text of the first (and only) generated sequence.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Trial run of the summarizer on the single example row (index label 5).
tentative = sumarizer(df2.loc[5, 'cleaned_text'])

In [None]:
# Word count of the trial summary.
len(tentative.split())

38

In [None]:
# Summarize every cleaned report, one row at a time; progress_apply is the
# tqdm-wrapped apply enabled by tqdm.pandas(...) in the imports cell.
# NOTE(review): the recorded run below took 8h47m for 4882 rows and nothing is
# cached — consider guarding this cell behind the saved t5results.csv.
df2['summary_T5'] = df2['cleaned_text'].progress_apply(sumarizer)

My bar!: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4882/4882 [8:47:18<00:00,  6.48s/it]


In [None]:
# Preview the first rows, now including the 'summary_T5' column.
df2.head()

Unnamed: 0.1,Unnamed: 0,icustay_id,hr,category,description,text,mortality,cleaned_text,summary_T5
0,0,200039,12,Radiology,CT HEAD W/O CONTRAST,[**2121-12-26**] 4:56 PM\n CT HEAD W/O CONTRAS...,0,Reason: 69 year old woman sp decompressive sur...,69 year old woman sp decompressive surgery for...
1,1,200052,20,Radiology,CT HEAD W/O CONTRAST,[**2193-9-19**] 8:41 AM\n CT HEAD W/O CONTRAST...,0,Reason: CEREBELLER HEMORRHAGE MEDICAL CONDITIO...,57 year old woman with cerebellar hemorrhage. ...
2,2,200072,6,Radiology,CT HEAD W/O CONTRAST,[**2106-3-4**] 3:26 AM\n CT HEAD W/O CONTRAST ...,0,Reason: CEREBELLAR HEMORRHAGE MEDICAL CONDITIO...,MEDICAL CONDITION: 55 year old male with left ...
3,3,200072,8,Radiology,MR HEAD W & W/O CONTRAST,[**2106-3-4**] 5:26 AM\n MR HEAD W & W/O CONTR...,0,Reason: CEREBELLAR HEMORRHAGE Contrast: MAGNEV...,a 55 year old man with acute onset diplopia an...
4,4,200072,8,Radiology,MR HEAD W & W/O CONTRAST,"[**Last Name (LF) 477**],[**First Name3 (LF) 4...",0,Reason: CEREBELLAR HEMORRHAGE Contrast: MAGNEV...,man with acute onset diplopia and imbalance hy...


In [None]:
# Whitespace-delimited word count of each original note in the 'text' column.
df2["Number_Words"] = df2["text"].map(lambda note: len(note.split()))
df2.head()

Unnamed: 0.1,Unnamed: 0,icustay_id,hr,category,description,text,mortality,cleaned_text,summary_T5,Number_Words
0,0,200039,12,Radiology,CT HEAD W/O CONTRAST,[**2121-12-26**] 4:56 PM\n CT HEAD W/O CONTRAS...,0,Reason: 69 year old woman sp decompressive sur...,69 year old woman sp decompressive surgery for...,184
1,1,200052,20,Radiology,CT HEAD W/O CONTRAST,[**2193-9-19**] 8:41 AM\n CT HEAD W/O CONTRAST...,0,Reason: CEREBELLER HEMORRHAGE MEDICAL CONDITIO...,57 year old woman with cerebellar hemorrhage. ...,141
2,2,200072,6,Radiology,CT HEAD W/O CONTRAST,[**2106-3-4**] 3:26 AM\n CT HEAD W/O CONTRAST ...,0,Reason: CEREBELLAR HEMORRHAGE MEDICAL CONDITIO...,MEDICAL CONDITION: 55 year old male with left ...,202
3,3,200072,8,Radiology,MR HEAD W & W/O CONTRAST,[**2106-3-4**] 5:26 AM\n MR HEAD W & W/O CONTR...,0,Reason: CEREBELLAR HEMORRHAGE Contrast: MAGNEV...,a 55 year old man with acute onset diplopia an...,244
4,4,200072,8,Radiology,MR HEAD W & W/O CONTRAST,"[**Last Name (LF) 477**],[**First Name3 (LF) 4...",0,Reason: CEREBELLAR HEMORRHAGE Contrast: MAGNEV...,man with acute onset diplopia and imbalance hy...,96


In [None]:
# Word counts for the original note and for its T5 summary, one new column each.
for source_col, count_col in (("text", "Number_Words"), ("summary_T5", "Number_Words_T5")):
    df2[count_col] = df2[source_col].apply(lambda n: len(n.split()))
df2.head(20)

Unnamed: 0.1,Unnamed: 0,icustay_id,hr,category,description,text,mortality,cleaned_text,summary_T5,Number_Words,Number_Words_T5
0,0,200039,12,Radiology,CT HEAD W/O CONTRAST,[**2121-12-26**] 4:56 PM\n CT HEAD W/O CONTRAS...,0,Reason: 69 year old woman sp decompressive sur...,69 year old woman sp decompressive surgery for...,184,22
1,1,200052,20,Radiology,CT HEAD W/O CONTRAST,[**2193-9-19**] 8:41 AM\n CT HEAD W/O CONTRAST...,0,Reason: CEREBELLER HEMORRHAGE MEDICAL CONDITIO...,57 year old woman with cerebellar hemorrhage. ...,141,21
2,2,200072,6,Radiology,CT HEAD W/O CONTRAST,[**2106-3-4**] 3:26 AM\n CT HEAD W/O CONTRAST ...,0,Reason: CEREBELLAR HEMORRHAGE MEDICAL CONDITIO...,MEDICAL CONDITION: 55 year old male with left ...,202,29
3,3,200072,8,Radiology,MR HEAD W & W/O CONTRAST,[**2106-3-4**] 5:26 AM\n MR HEAD W & W/O CONTR...,0,Reason: CEREBELLAR HEMORRHAGE Contrast: MAGNEV...,a 55 year old man with acute onset diplopia an...,244,40
4,4,200072,8,Radiology,MR HEAD W & W/O CONTRAST,"[**Last Name (LF) 477**],[**First Name3 (LF) 4...",0,Reason: CEREBELLAR HEMORRHAGE Contrast: MAGNEV...,man with acute onset diplopia and imbalance hy...,96,22
5,5,200072,22,Radiology,CHEST (PORTABLE AP),[**2106-3-25**] 1:18 PM\n CHEST (PORTABLE AP) ...,0,Reason: CEREBELLAR HEMORRHAGE MEDICAL CONDITIO...,55 year old man sp CABG eval for pneumothorax ...,142,38
6,6,200103,15,Radiology,SEL CATH 3RD ORDER THOR,[**2201-5-18**] 10:55 AM\n CAROT/CEREB [**Hosp...,0,Reason: SUBARACHNOID HEMORRHAGE Contrast: OPTI...,58-year-old man with sAH had a 19gauge single ...,665,62
7,7,200131,16,Radiology,CT HEAD W/O CONTRAST,[**2176-10-31**] 5:25 AM\n CT HEAD W/O CONTRAS...,0,Reason: STROKETELEMETRYTRANSIENT ISCHEMIC ATTA...,pontine hemorrhage appears slightly larger in ...,164,20
8,8,200131,21,Radiology,CHEST (PORTABLE AP),[**2176-10-31**] 10:07 AM\n CHEST (PORTABLE AP...,0,Reason: STROKETELEMETRYTRANSIENT ISCHEMIC ATTA...,69 year old woman with pontine hemorrhage and ...,168,27
9,9,200131,21,Radiology,CHEST (PORTABLE AP),"[**Last Name (LF) **],[**First Name8 (NamePatt...",0,Reason: STROKETELEMETRYTRANSIENT ISCHEMIC ATTA...,69 year old woman with pontine hemorrhage feve...,79,25


In [None]:
# Persist the table with summaries and word counts; index=False leaves the row index out of the file.
df2.to_csv("t5results.csv", index=False)

# Increasing the vocabulary of the tokenizer