In [1]:
# Attach Google Drive to the Colab runtime so the project files are reachable.
import os
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
# Switch to the project folder using an absolute path under the Drive mount
# point ('/content/drive'). A relative './drive/...' path only resolves from
# the initial working directory, so re-running the cell after the first chdir
# would fail; the absolute form makes the cell safe to re-run.
# NOTE(review): assumes the notebook starts in /content (where the original
# relative path resolved from) - confirm if run outside Colab.
os.chdir('/content/drive/MyDrive/SFU/CMPT713/Project')


In [3]:
!pip install simplet5
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install rouge_score
!pip install sacrebleu
!pip install accelerate -U

Collecting datasets
  Using cached datasets-2.18.0-py3-none-any.whl (510 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl (134 kB)
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16 xxhash-3.4.1
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0
Collecting rouge_score
  Downloading roug

In [4]:
import sacrebleu
import rouge_score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch

import nltk
import evaluate

from datasets import load_dataset, DatasetDict, Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Trainer, TrainingArguments
import tqdm
from tqdm.auto import tqdm
from simplet5 import SimpleT5

INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [5]:
# Ask PyTorch to release cached, currently unused GPU memory before the model is loaded.
torch.cuda.empty_cache()

In [6]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [7]:
# load data
df_iCliniq_train = pd.read_csv('./train_datasets/iCliniq_train.csv')
df_iCliniq_test = pd.read_csv('./test_datasets/iCliniq_test.csv')

df_MedQuAD_train = pd.read_csv('./train_datasets/MedQuAD_train.csv')
df_MedQuAD_test = pd.read_csv('./test_datasets/MedQuAD_test.csv')


df_combined_train = pd.read_csv('./train_datasets/combined_train.csv')
df_combined_test = pd.read_csv('./test_datasets/combined_test.csv')

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

def _split_and_format(frame):
    """Split ``frame`` 80/20 into train/validation parts and shape both for T5.

    Each part has 'Question' renamed to 'source_text' and 'Answer_cut' renamed
    to 'target_text', and every source text is prefixed with "[Question] ".
    Returns the pair ``(train_part, val_part)``.
    """
    column_map = {'Question': 'source_text', 'Answer_cut': 'target_text'}
    formatted = []
    for part in train_test_split(frame, test_size=0.2, random_state=123):
        # rename() returns a new frame, so the prefix is written to that copy
        part = part.rename(columns=column_map)
        part['source_text'] = "[Question] " + part['source_text']
        formatted.append(part)
    train_part, val_part = formatted
    return train_part, val_part


train_iCliniq_df, val_iCliniq_df = _split_and_format(df_iCliniq_train)

train_MedQuAD_df, val_MedQuAD_df = _split_and_format(df_MedQuAD_train)

train_combined_df, val_combined_df = _split_and_format(df_combined_train)

In [11]:
# Load the pretrained t5-small checkpoint and its tokenizer, then move the
# model to the selected device. The Hugging Face model is used directly for
# inference, so no SimpleT5 wrapper instance is created here (it would be
# overwritten by the assignment below without ever being used).
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [12]:
def generate_answers_batch(questions, batch_size=32, max_input_length=512,
                           max_length=500, min_length=50, length_penalty=2.0,
                           **generate_kwargs):
    """Generate one answer per question using the notebook-level model.

    Relies on the globals ``tokenizer``, ``model`` and ``device`` defined in
    earlier cells.

    Args:
        questions: Sliceable sequence of question strings; each one is
            prefixed with "[Question] " before tokenization.
        batch_size: Number of questions tokenized and generated per step.
        max_input_length: Token limit the tokenizer truncates inputs to.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.
        length_penalty: ``length_penalty`` passed to ``model.generate``.
            NOTE(review): per the transformers generation docs this applies to
            beam-based generation, and no ``num_beams`` is passed by default -
            confirm whether beam search was intended.
        **generate_kwargs: Extra keyword arguments forwarded unchanged to
            ``model.generate`` (e.g. ``num_beams``).

    Returns:
        List of decoded answer strings, in the same order as ``questions``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # a negative step would make range() empty and silently return no answers
        raise ValueError("batch_size must be a positive integer")

    # answers accumulated across all batches, in input order
    batch_generated_answers = []
    for start in tqdm(range(0, len(questions), batch_size), desc="Generating answers"):
        batch_questions = ["[Question] " + q for q in questions[start:start + batch_size]]
        batch_inputs = tokenizer(
            batch_questions,
            padding=True,
            truncation=True,
            max_length=max_input_length,
            return_tensors="pt",
        ).to(device)
        # inference only: no gradients are needed
        with torch.no_grad():
            batch_outputs = model.generate(
                **batch_inputs,
                max_length=max_length,
                min_length=min_length,
                length_penalty=length_penalty,
                **generate_kwargs,
            )
        batch_generated_answers.extend(
            tokenizer.decode(output, skip_special_tokens=True) for output in batch_outputs
        )
    return batch_generated_answers


In [13]:
# Inference on MedQuAD
df_test_MedQuAD = df_MedQuAD_test
generated_answers = generate_answers_batch(df_test_MedQuAD['Question'].tolist())
df_test_MedQuAD['Generated_Answer'] = generated_answers
df_test_MedQuAD.to_csv("./Non_Fintuned/MedQuADGenerated_Answer.csv", index = False)

Generating answers:   0%|          | 0/103 [00:00<?, ?it/s]

In [14]:
# One single-element reference list per prediction.
# NOTE(review): references come from the full 'Answer' column, while the
# training split maps 'Answer_cut' to the target text - confirm this is intended.
references = [[answer] for answer in df_test_MedQuAD['Answer'].tolist()]
predictions = df_test_MedQuAD['Generated_Answer'].tolist()
print("====================== MedQuAD ======================")

# NOTE: rebinding `sacrebleu` shadows the module imported at the top of the notebook.
sacrebleu = evaluate.load("sacrebleu")
sacrebleu_results = sacrebleu.compute(references=references, predictions=predictions)
print(f"SacreBLEU Results: {sacrebleu_results}")

rouge = evaluate.load("rouge")
rouge_results = rouge.compute(references=references, predictions=predictions)
print(f"ROUGE Results: {rouge_results}")



Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

SacreBLEU Results: {'score': 0.006326544084991594, 'counts': [13590, 4322, 1976, 944], 'totals': [101122, 97843, 94564, 91285], 'precisions': [13.43921204090109, 4.417280745684413, 2.0895901188613, 1.0341238976830804], 'bp': 0.001879861772476165, 'sys_len': 101122, 'ref_len': 735820}


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Results: {'rouge1': 0.05772365703624971, 'rouge2': 0.022608343075734234, 'rougeL': 0.05402921211070883, 'rougeLsum': 0.05405507228663199}


In [15]:
# Inference on iClinq
df_test_iCliniq = df_iCliniq_test
generated_answers = generate_answers_batch(df_test_iCliniq['Question'].tolist())
df_test_iCliniq['Generated_Answer'] = generated_answers
df_test_iCliniq.to_csv("./Non_Fintuned/iCliniqGenerated_Answer.csv", index = False)

Generating answers:   0%|          | 0/186 [00:00<?, ?it/s]

In [16]:
# One single-element reference list per prediction.
# NOTE(review): references come from the full 'Answer' column, while the
# training split maps 'Answer_cut' to the target text - confirm this is intended.
references = [[answer] for answer in df_test_iCliniq['Answer'].tolist()]
predictions = df_test_iCliniq['Generated_Answer'].tolist()
print("====================== iCliniq ======================")

# NOTE: rebinding `sacrebleu` shadows the module imported at the top of the notebook.
sacrebleu = evaluate.load("sacrebleu")
sacrebleu_results = sacrebleu.compute(references=references, predictions=predictions)
print(f"SacreBLEU Results: {sacrebleu_results}")

rouge = evaluate.load("rouge")
rouge_results = rouge.compute(references=references, predictions=predictions)
print(f"ROUGE Results: {rouge_results}")

SacreBLEU Results: {'score': 0.4650812727389763, 'counts': [50445, 6280, 1476, 485], 'totals': [271304, 265354, 259404, 253454], 'precisions': [18.593533453248018, 2.3666498338069144, 0.5689966230281722, 0.19135622243089476], 'bp': 0.314352413571964, 'sys_len': 271304, 'ref_len': 585268}
ROUGE Results: {'rouge1': 0.11341270996730268, 'rouge2': 0.0155244618553734, 'rougeL': 0.0825143373154435, 'rougeLsum': 0.08249540014202747}


In [18]:
# Inference on Cobimed
df_test_combined = df_combined_test
generated_answers = generate_answers_batch(df_test_combined['Question'].tolist())
df_test_combined['Generated_Answer'] = generated_answers
df_test_combined.to_csv("./Non_Fintuned/combinedGenerated_Answer.csv", index = False)

Generating answers:   0%|          | 0/289 [00:00<?, ?it/s]

In [19]:
# One single-element reference list per prediction.
# NOTE(review): references come from the full 'Answer' column, while the
# training split maps 'Answer_cut' to the target text - confirm this is intended.
references = [[answer] for answer in df_test_combined['Answer'].tolist()]
predictions = df_test_combined['Generated_Answer'].tolist()
print("====================== Combined ======================")

# NOTE: rebinding `sacrebleu` shadows the module imported at the top of the notebook.
sacrebleu = evaluate.load("sacrebleu")
sacrebleu_results = sacrebleu.compute(references=references, predictions=predictions)
print(f"SacreBLEU Results: {sacrebleu_results}")

rouge = evaluate.load("rouge")
rouge_results = rouge.compute(references=references, predictions=predictions)
print(f"ROUGE Results: {rouge_results}")

SacreBLEU Results: {'score': 0.1586736247673811, 'counts': [62950, 10429, 3390, 1368], 'totals': [371916, 362688, 353460, 344232], 'precisions': [16.92586498026436, 2.8754742368095996, 0.9590901374978781, 0.397406400334658], 'bp': 0.07645679417119496, 'sys_len': 371916, 'ref_len': 1328123}
ROUGE Results: {'rouge1': 0.0930517456867453, 'rouge2': 0.01815121672437609, 'rougeL': 0.0723469749221462, 'rougeLsum': 0.07235448062666242}
