In [1]:
!pip install transformers datasets evaluate nltk sentence-transformers rouge-score peft bitsandbytes==0.41.1 accelerate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes==0.41.1
  Downloading bitsandbytes-0.41.1-py3-none-any.whl.metadata (9.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", message="Passing a tuple of `past_key_values` is deprecated")
import evaluate
from nltk.translate.bleu_score import sentence_bleu
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [5]:
ds = load_dataset("StonyBrookNLP/tellmewhy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.76k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/70.1M [00:00<?, ?B/s]

validation.json:   0%|          | 0.00/8.71M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/71892 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8976 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10689 [00:00<?, ? examples/s]

In [None]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['narrative', 'question', 'original_sentence_for_question', 'narrative_lexical_overlap', 'is_ques_answerable', 'answer', 'is_ques_answerable_annotator', 'original_narrative_form', 'question_meta', 'helpful_sentences', 'human_eval', 'val_ann', 'gram_ann'],
        num_rows: 71892
    })
    validation: Dataset({
        features: ['narrative', 'question', 'original_sentence_for_question', 'narrative_lexical_overlap', 'is_ques_answerable', 'answer', 'is_ques_answerable_annotator', 'original_narrative_form', 'question_meta', 'helpful_sentences', 'human_eval', 'val_ann', 'gram_ann'],
        num_rows: 8976
    })
    test: Dataset({
        features: ['narrative', 'question', 'original_sentence_for_question', 'narrative_lexical_overlap', 'is_ques_answerable', 'answer', 'is_ques_answerable_annotator', 'original_narrative_form', 'question_meta', 'helpful_sentences', 'human_eval', 'val_ann', 'gram_ann'],
        num_rows: 10689
    })
})


In [None]:
print(ds['train'].features)

{'narrative': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'original_sentence_for_question': Value(dtype='string', id=None), 'narrative_lexical_overlap': Value(dtype='float64', id=None), 'is_ques_answerable': Value(dtype='string', id=None), 'answer': Value(dtype='string', id=None), 'is_ques_answerable_annotator': Value(dtype='string', id=None), 'original_narrative_form': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'question_meta': Value(dtype='string', id=None), 'helpful_sentences': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'human_eval': Value(dtype='bool', id=None), 'val_ann': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'gram_ann': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [6]:
# Access splits
train_data = ds['train']
val_data = ds['validation']
test_data = ds['test']

In [None]:
def get_model_and_tokenizer(model_name):
    if "t5" in model_name:
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

    # Ensure pad_token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token or '[PAD]'
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))  # Resize embeddings for new token

    return tokenizer, model

?
# Baseline models
models = {
    "Distilled-T5": "google/flan-t5-small",
    "GPT": "distilgpt2",
    "Gemini": "EleutherAI/pythia-70m"
}

In [None]:
# Hyperparameters
batch_size = 32
learning_rate = 1e-5
num_epochs = 2
max_seq_len = 512

In [None]:
# Preprocess the data
def preprocess_function(batch, tokenizer):
    inputs = tokenizer(batch['question'], padding='max_length', truncation=True, max_length=max_seq_len)
    labels = tokenizer(batch['answer'], padding='max_length', truncation=True, max_length=max_seq_len)
    inputs['labels'] = labels['input_ids']
    return inputs

# DataLoader function
def get_dataloader(dataset, tokenizer):
    dataset = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Directory to save checkpoints
save_dir = "/content/drive/MyDrive/CSE354Project"
os.makedirs(save_dir, exist_ok=True)

def save_checkpoint(model, tokenizer, model_name, epoch):
    checkpoint_path = os.path.join(save_dir, f"{model_name}_Epoch{epoch}")
    model.save_pretrained(checkpoint_path)  # Save model
    tokenizer.save_pretrained(checkpoint_path)  # Save tokenizer
    print(f"Checkpoint saved for {model_name} at epoch {epoch} in {checkpoint_path}")

In [None]:
def train_model(model, tokenizer, train_loader, val_loader, device, model_name):
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        epoch_loss = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Training Loss: {epoch_loss / len(train_loader)}")

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                labels = batch['labels'].to(device)
                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()

        print(f"Validation Loss: {val_loss / len(val_loader)}")

        # Save checkpoint after each epoch
        save_checkpoint(model, tokenizer, model_name, epoch + 1)

In [None]:
# Train each model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for model_name, model_id in models.items():
    print(f"\nTraining {model_name} Model")
    tokenizer, model = get_model_and_tokenizer(model_id)
    train_loader = get_dataloader(train_data, tokenizer)
    val_loader = get_dataloader(val_data, tokenizer)
    train_model(model, tokenizer, train_loader, val_loader, device, model_name)


Training Distilled-T5 Model


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Epoch 1/2


  0%|          | 0/2247 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████| 2247/2247 [23:34<00:00,  1.59it/s]


Training Loss: 3.373095768919562
Validation Loss: 0.09402130021849561
Checkpoint saved for Distilled-T5 at epoch 1 in /content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch1
Epoch 2/2


100%|██████████| 2247/2247 [23:32<00:00,  1.59it/s]


Training Loss: 0.11931321023899977
Validation Loss: 0.06189239668093118
Checkpoint saved for Distilled-T5 at epoch 2 in /content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch2

Training GPT Model


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Epoch 1/2


100%|██████████| 2247/2247 [21:26<00:00,  1.75it/s]


Training Loss: 1.174742935171486
Validation Loss: 0.11477526873880434
Checkpoint saved for GPT at epoch 1 in /content/drive/MyDrive/CSE354Project/GPT_Epoch1
Epoch 2/2


100%|██████████| 2247/2247 [21:26<00:00,  1.75it/s]


Training Loss: 0.11406511206280989
Validation Loss: 0.10933454414279435
Checkpoint saved for GPT at epoch 2 in /content/drive/MyDrive/CSE354Project/GPT_Epoch2

Training Gemini Model


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Map:   0%|          | 0/71892 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

Epoch 1/2


100%|██████████| 2247/2247 [12:43<00:00,  2.94it/s]


Training Loss: 3.0370549498947397
Validation Loss: 0.13403758934278082
Checkpoint saved for Gemini at epoch 1 in /content/drive/MyDrive/CSE354Project/Gemini_Epoch1
Epoch 2/2


100%|██████████| 2247/2247 [12:43<00:00,  2.94it/s]


Training Loss: 0.12455731303056825
Validation Loss: 0.11893853049367348
Checkpoint saved for Gemini at epoch 2 in /content/drive/MyDrive/CSE354Project/Gemini_Epoch2


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
checkpoint_dirs = {
    "Distilled-T5_Epoch1": "/content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch1",
    "Distilled-T5_Epoch2": "/content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch2",
    "GPT_Epoch1": "/content/drive/MyDrive/CSE354Project/GPT_Epoch1",
    "GPT_Epoch2": "/content/drive/MyDrive/CSE354Project/GPT_Epoch2",
    "Gemini_Epoch1": "/content/drive/MyDrive/CSE354Project/Gemini_Epoch1",
    "Gemini_Epoch2": "/content/drive/MyDrive/CSE354Project/Gemini_Epoch2"
}


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Extract questions and answers from the batch
    questions = [item['question'] for item in batch]
    answers = [item['answer'] for item in batch]

    # Tokenize questions with dynamic padding
    tokenized_questions = tokenizer(
        questions,
        return_tensors="pt",
        truncation=True,
        padding=True,  # Pad to the longest sequence in the batch
        max_length=512
    )

    # Return a dictionary with tokenized inputs and raw answers
    return {
        "input_ids": tokenized_questions["input_ids"],
        "attention_mask": tokenized_questions["attention_mask"],
        "answer": answers
    }


In [None]:
def generate_predictions(model, tokenizer, test_data, device, max_length=512, num_beams=5, batch_size=16):
    model.to(device)  # Move the model to the specified device
    predictions = []
    references = []

    # Create DataLoader with the custom collate function
    dataloader = DataLoader(
        test_data,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn  # Use the custom collate function
    )

    # Disable gradient calculation for efficiency
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_references = batch["answer"]

            # Ensure pad_token_id is set
            if model.config.pad_token_id is None:
                model.config.pad_token_id = tokenizer.pad_token_id

            # Generate predictions for the batch
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                pad_token_id=model.config.pad_token_id
            )

            # Decode predictions
            batch_predictions = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            # Collect predictions and references
            predictions.extend(batch_predictions)
            references.extend(batch_references)

    return predictions, references


In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Compute BLEU
def compute_bleu(predictions, references):
    # Ensure references are tokenized correctly
    if isinstance(references[0], list):  # If references are already a list of lists
        tokenized_references = references
    else:
        tokenized_references = [[ref.split()] for ref in references]  # Tokenize references

    # Tokenize predictions
    tokenized_predictions = [pred.split() for pred in predictions]

    # Compute BLEU score
    return corpus_bleu(tokenized_references, tokenized_predictions)

In [None]:
from rouge_score import rouge_scorer
# Compute ROUGE
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        for key in scores:
            scores[key].append(score[key].fmeasure)

    return {key: sum(value) / len(value) for key, value in scores.items()}

In [None]:
# Compute Semantic Similarity
def compute_semantic_similarity(predictions, references):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    pred_embeddings = model.encode(predictions, convert_to_tensor=True)
    ref_embeddings = model.encode(references, convert_to_tensor=True)

    similarities = util.cos_sim(pred_embeddings, ref_embeddings)
    return similarities.diagonal().mean().item()


In [None]:
baseline_results = {}

# Loop through each model and its corresponding single checkpoint directory
for model_name, epoch_dir in checkpoint_dirs.items():
    print(f"Evaluating {model_name} from {epoch_dir}")

    # Initialize results for the model
    baseline_results[model_name] = {}

    # Load model and tokenizer
    if "T5" in model_name:
        tokenizer = T5Tokenizer.from_pretrained(epoch_dir)
        model = T5ForConditionalGeneration.from_pretrained(epoch_dir)
    else:
        tokenizer = AutoTokenizer.from_pretrained(epoch_dir)
        model = AutoModelForCausalLM.from_pretrained(epoch_dir)

        # Handle missing pad_token
        if tokenizer.pad_token is None:
            # Set pad_token to eos_token or add "[PAD]" if neither exists
            tokenizer.pad_token = tokenizer.eos_token or "[PAD]"
            tokenizer.add_special_tokens({'pad_token': tokenizer.pad_token})
            model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings

        # Ensure pad_token_id is set for generation
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

    # Move model to the specified device
    model.to(device)

    # Generate predictions
    predictions, references = generate_predictions(
        model=model,
        tokenizer=tokenizer,
        test_data=test_data,
        device=device
    )

    # Compute metrics
    bleu_score = compute_bleu(predictions, references)
    rouge_scores = compute_rouge(predictions, references)
    semantic_similarity = compute_semantic_similarity(predictions, references)

    # Store results
    baseline_results[model_name] = {
        "BLEU": bleu_score,
        "ROUGE": rouge_scores,
        "Semantic Similarity": semantic_similarity
    }

# Optionally print or save the results
print("\nEvaluation Results:")
for model_name, metrics in baseline_results.items():
    print(f"{model_name}: {metrics}")

In [None]:
checkpoint_dirs = {
    "GPT_Epoch1": "/content/drive/MyDrive/CSE354Project/GPT_Epoch1",
    "GPT_Epoch2": "/content/drive/MyDrive/CSE354Project/GPT_Epoch2",
}

In [None]:
baseline_results = {}

# Loop through each model and its corresponding single checkpoint directory
for model_name, epoch_dir in checkpoint_dirs.items():
    try:
        print(f"Evaluating {model_name} from {epoch_dir}")

        # Initialize results for the model
        baseline_results[model_name] = {}

        # Load model and tokenizer
        if "T5" in model_name.lower():
            tokenizer = T5Tokenizer.from_pretrained(epoch_dir)
            model = T5ForConditionalGeneration.from_pretrained(epoch_dir)
        else:
            tokenizer = AutoTokenizer.from_pretrained(epoch_dir)
            model = AutoModelForCausalLM.from_pretrained(epoch_dir)

            # Fix padding for decoder-only models
            tokenizer.padding_side = "left"  # Left padding is required for GPT-based models

            # Handle missing pad_token
            if tokenizer.pad_token is None:
                # Set pad_token to eos_token or add "[PAD]" if neither exists
                tokenizer.pad_token = tokenizer.eos_token or "[PAD]"
                tokenizer.add_special_tokens({'pad_token': tokenizer.pad_token})
                model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings

            # Ensure pad_token_id is set for generation
            if model.config.pad_token_id is None:
                model.config.pad_token_id = tokenizer.pad_token_id

        # Move model to the specified device
        model.to(device)

        # Generate predictions
        predictions, references = generate_predictions(
            model=model,
            tokenizer=tokenizer,
            test_data=test_data,
            device=device
        )

        # Compute metrics
        bleu_score = compute_bleu(predictions, references)
        rouge_scores = compute_rouge(predictions, references)
        semantic_similarity = compute_semantic_similarity(predictions, references)

        # Store results
        baseline_results[model_name] = {
            "BLEU": bleu_score,
            "ROUGE": rouge_scores,
            "Semantic Similarity": semantic_similarity
        }

    except Exception as e:
        print(f"Error evaluating {model_name}: {e}")
        baseline_results[model_name] = {
            "Error": str(e)
        }

# Optionally print or save the results
print("\nEvaluation Results:")
for model_name, metrics in baseline_results.items():
    print(f"{model_name}: {metrics}")


Evaluating GPT_Epoch1 from /content/drive/MyDrive/CSE354Project/GPT_Epoch1


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Evaluating GPT_Epoch2 from /content/drive/MyDrive/CSE354Project/GPT_Epoch2

Evaluation Results:
GPT_Epoch1: {'BLEU': 0.0017288149245950168, 'ROUGE': {'rouge1': 0.08718118738045252, 'rouge2': 0.02561832661940531, 'rougeL': 0.08239192961373057}, 'Semantic Similarity': 0.36161330342292786}
GPT_Epoch2: {'BLEU': 0.04244894174158954, 'ROUGE': {'rouge1': 0.1898501761987738, 'rouge2': 0.058976794810597644, 'rougeL': 0.17838109366696484}, 'Semantic Similarity': 0.4443044066429138}


In [None]:
# Display results
import pandas as pd
results_df = pd.DataFrame.from_dict(
    {(i, j): baseline_results[i][j] for i in baseline_results.keys() for j in baseline_results[i].keys()},
    orient='index'
)
results_df.index.names = ['Model', 'Checkpoint']
results_df.reset_index(inplace=True)

# Display results in a table
import ace_tools as tools; tools.display_dataframe_to_user(name="Baseline Model Evaluation", dataframe=results_df)

## LoRa Finetuning and Prompt Engineering


In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

In [None]:
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, load_in_8bit=True, device_map="auto")

In [None]:
# Use a sample of around 2000 from train
train_dataset = ds['train'].shuffle(seed=42).select(range(50000))
eval_dataset = ds['validation']

In [None]:
def preprocess_func(examples):
    inputs = [
        f"answer the question based on the narrative: {narr} question: {ques}"
        for narr, ques in zip(examples["narrative"], examples["question"])
    ]
    targets = examples["answer"]

    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, truncation=True, padding=False)
    labels = tokenizer(targets, truncation=True, padding=False)

    # T5 expects labels in input_ids
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply the preprocessing
train_dataset = train_dataset.map(preprocess_func, batched=True)
eval_dataset = eval_dataset.map(preprocess_func, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8976 [00:00<?, ? examples/s]

In [None]:
# Configure 8-bit training
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


In [None]:
model = prepare_model_for_kbit_training(model)

accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=-1)
    filtered_labels = [lbl[0] for lbl in labels if lbl[0] != -100]
    filtered_preds = [pred[0] for pred in predictions]
    min_len = min(len(filtered_labels), len(filtered_preds))
    return {"accuracy": accuracy.compute(predictions=filtered_preds[:min_len], references=filtered_labels[:min_len])}


In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["k","q","v","o"],
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [None]:
# Data collator
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/CSE354Project/flan-t5-small-finetuned",
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_dir="/content/drive/MyDrive/CSE354Project/flan-t5-small-finetuned/logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
    report_to="none",  # change this if you want to use W&B
    run_name="flan-t5-small-tellmewhy"
)

In [None]:
peft_model.config.use_cache = False

# Create the trainer
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
1563,1.7191
3126,1.649


  return fn(*args, **kwargs)


TrainOutput(global_step=3126, training_loss=1.6840682075485844, metrics={'train_runtime': 1642.408, 'train_samples_per_second': 60.886, 'train_steps_per_second': 1.903, 'total_flos': 3911128005476352.0, 'train_loss': 1.6840682075485844, 'epoch': 2.0})

In [None]:
!pip install acetools

[31mERROR: Could not find a version that satisfies the requirement acetools (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for acetools[0m[31m
[0m

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from peft import PeftModel

base_model_path = "google/flan-t5-small"  # Original base model
checkpoint_path = "/content/drive/MyDrive/CSE354Project/flan-t5-small-finetuned/checkpoint-3126"

# Load base model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_path)

# Apply adapter from checkpoint
model = PeftModel.from_pretrained(model, checkpoint_path)

# Define test cases and prompts
config = {
    "tests": [
        {
            "input": {
                "narrative": "The plant uses sunlight, water, and carbon dioxide to produce sugars. It releases oxygen as a byproduct. This process helps the plant grow strong and healthy.",
                "question": "Why does the plant grow strong and healthy?"
            },
            "expected": "Because it produces sugars through photosynthesis, which provides it with energy."
        },
        {
            "input": {
                "narrative": "John decided to wake up early every morning to study more. Over time, he improved his grades and became confident in his abilities.",
                "question": "Why did John's grades improve?"
            },
            "expected": "Because he woke up early to study, giving him more time to understand the material."
        }
    ],
    "prompts": [
        {
            "name": "Direct-Explanation",
            "prompt": "Given the narrative:\n{narrative}\n\nAnswer the following 'why' question based solely on the narrative:\nQuestion: {question}\n\nAnswer:"
        },
        {
            "name": "Hierarchical-Reasoning",
            "prompt": "Read the narrative and then answer the 'why' question by following these steps:\n1. Identify all relevant facts from the narrative related to the question.\n2. Connect these facts logically to show cause, motivation, or reason.\n3. Summarize the causal chain in a final, concise answer.\n\nNarrative:\n{narrative}\n\nQuestion (Why):\n{question}\n\nStep-by-step Reasoning:\n1.\n2.\n3.\n\nFinal Answer:"
        },
        {
            "name": "Context-Only",
            "prompt": "You must answer the 'why' question using only the information in the narrative. If the narrative does not provide enough details, say so.\n\nNarrative:\n{narrative}\n\nQuestion (Why):\n{question}\n\nAnswer:"
        }
    ]
}

# Iterate over prompts and test cases
results = []

for prompt_config in config["prompts"]:
    prompt_name = prompt_config["name"]
    prompt_template = prompt_config["prompt"]

    for test_case in config["tests"]:
        narrative = test_case["input"]["narrative"]
        question = test_case["input"]["question"]
        expected = test_case["expected"]

        prompt = prompt_template.format(narrative=narrative, question=question)

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

        outputs = model.generate(**inputs, max_length=100)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        results.append({
            "Prompt Name": prompt_name,
            "Input Narrative": narrative,
            "Input Question": question,
            "Expected Answer": expected,
            "Generated Answer": generated
        })


In [None]:
from IPython.display import display
import pandas as pd

# Assuming 'results' is your data
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Prompt Name,Input Narrative,Input Question,Expected Answer,Generated Answer
0,Direct-Explanation,"The plant uses sunlight, water, and carbon dio...",Why does the plant grow strong and healthy?,Because it produces sugars through photosynthe...,it releases oxygen as a byproduct.
1,Direct-Explanation,John decided to wake up early every morning to...,Why did John's grades improve?,"Because he woke up early to study, giving him ...",he was able to study more.
2,Hierarchical-Reasoning,"The plant uses sunlight, water, and carbon dio...",Why does the plant grow strong and healthy?,Because it produces sugars through photosynthe...,it releases oxygen as a byproduct.
3,Hierarchical-Reasoning,John decided to wake up early every morning to...,Why did John's grades improve?,"Because he woke up early to study, giving him ...",John's grades improved.
4,Context-Only,"The plant uses sunlight, water, and carbon dio...",Why does the plant grow strong and healthy?,Because it produces sugars through photosynthe...,it releases oxygen as a byproduct.
5,Context-Only,John decided to wake up early every morning to...,Why did John's grades improve?,"Because he woke up early to study, giving him ...",he was able to study more.


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute semantic similarity for each row
results_df['Semantic Similarity'] = results_df.apply(
    lambda row: util.pytorch_cos_sim(
        model.encode(row['Generated Answer'], convert_to_tensor=True),
        model.encode(row['Expected Answer'], convert_to_tensor=True)
    ).item(),
    axis=1
)

# Average Semantic Similarity
avg_similarity = results_df['Semantic Similarity'].mean()
print(f"Average Semantic Similarity: {avg_similarity:.2f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Average Semantic Similarity: 0.50


In [None]:
# Group by Prompt Name to get average accuracy and similarity
performance = results_df.groupby('Prompt Name').agg(
    Exact_Match_Accuracy=('Exact Match', 'mean'),
    Average_Semantic_Similarity=('Semantic Similarity', 'mean')
)

print(performance)

                        Exact_Match_Accuracy  Average_Semantic_Similarity
Prompt Name                                                              
Context-Only                             0.0                     0.549594
Direct-Explanation                       0.0                     0.549594
Hierarchical-Reasoning                   0.0                     0.395300


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from peft import PeftModel

base_model_path = "google/flan-t5-small"  # Original base model
checkpoint_path = "/content/drive/MyDrive/CSE354Project/flan-t5-small-finetuned/checkpoint-3126"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_path)

model = PeftModel.from_pretrained(model, checkpoint_path)

best_prompt = """Answer the following 'why' question based only on the information provided in the narrative. Use logical reasoning and ensure your answer is clear and concise.

Reasoning steps:
1. Identify the key details in the narrative relevant to the question.
2. Explain how these details are connected and lead to the answer.
3. Conclude with a concise final answer.

Narrative:
{narrative}

Question (Why):
{question}

Answer:
"""

def generate_answer(narrative, question):
    # Insert the user input (narrative and question) into the chosen prompt
    prompt = best_prompt.format(narrative=narrative, question=question)

    # Tokenize and generate the answer
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=100)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

narrative = "The plant uses sunlight, water, and carbon dioxide to produce sugars. It releases oxygen as a byproduct. This process helps the plant grow strong and healthy."
question = "Why does the plant grow strong and healthy?"

# Generate the answer
result = generate_answer(narrative, question)
print(result)

it releases oxygen as a byproduct.


In [29]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Path to your saved model directory
model_dir = "/content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch2"

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

In [30]:
subset_size = 200

# Take the first 200 test cases directly
test_data_subset = test_data.select(range(subset_size))

# Optionally convert to pandas DataFrame for inspection
df_subset = test_data_subset.to_pandas()

In [31]:
# Preprocess the test data for tokenization
def preprocess_function(example):
    return tokenizer(
        example["question"],
        text_pair=example["narrative"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

# Tokenize test dataset
tokenized_test_data = test_data_subset.map(preprocess_function, batched=True)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [32]:
import torch
import pandas as pd

def generate_predictions_batch(model, tokenizer, test_dataset, device, max_length=128, batch_size=16):
    model.eval()
    predictions = []

    for start_idx in range(0, len(test_dataset), batch_size):
        batch = test_dataset[start_idx: start_idx + batch_size]

        # Instead of list comprehension on dictionaries, directly access columns
        questions = batch["question"]      # list of questions
        narratives = batch["narrative"]    # list of narratives

        inputs = tokenizer(
            questions,
            text_pair=narratives,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=1,
                early_stopping=True
            )

        decoded_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        predictions.extend(decoded_preds)

    return predictions


In [34]:
# Generate predictions
test_questions = test_data_subset["question"]
test_narratives = test_data_subset["narrative"]
test_answers = test_data_subset["answer"]  # Ground truth answers (optional)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = generate_predictions_batch(model, tokenizer, test_data_subset, device)
# Save to a DataFrame
df = pd.DataFrame({
    "question": test_questions,
    "narrative": test_narratives,
    "answer": test_answers,
    "T5-prediction": predictions
})

# Save to CSV
output_path = "/content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch2_predictions.csv"
df.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

Predictions saved to /content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch2_predictions.csv


In [35]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


base_model_path = "google/flan-t5-small"  # Original base model
checkpoint_path = "/content/drive/MyDrive/CSE354Project/flan-t5-small-finetuned/checkpoint-3126"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_path)

model = PeftModel.from_pretrained(model, checkpoint_path)

In [39]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# Model and tokenizer setup
base_model_path = "google/flan-t5-small"  # Original base model
checkpoint_path = "/content/drive/MyDrive/CSE354Project/Distilled-T5_Epoch2"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_path)
# model = PeftModel.from_pretrained(model, checkpoint_path)

# Prompt for generating answers
best_prompt = """Answer the following 'why' question based only on the information provided in the narrative. Use logical reasoning and ensure your answer is clear and concise.

Reasoning steps:
1. Identify the key details in the narrative relevant to the question.
2. Explain how these details are connected and lead to the answer.
3. Conclude with a concise final answer.

Narrative:
{narrative}

Question (Why):
{question}

Answer:
"""

def generate_answer(narrative, question):
    # Insert narrative and question into the prompt
    prompt = best_prompt.format(narrative=narrative, question=question)

    # Tokenize and generate the answer
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=100)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Generate predictions for the test dataset
predictions = []
for example in test_data_subset:
    narrative = example["narrative"]
    question = example["question"]
    ground_truth = example["answer"]  # Optional: Ground truth for comparison

    # Generate the answer
    generated_answer = generate_answer(narrative, question)

    # Append the result
    predictions.append({
        "narrative": narrative,
        "question": question,
        "answer": ground_truth,  # Optional
        "t5-prompt-generated_answer": generated_answer
    })

# Save predictions to a CSV file
df = pd.DataFrame(predictions)
output_path = "/content/drive/MyDrive/CSE354Project/Distill-t5-prompt.csv"
df.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")

Predictions saved to /content/drive/MyDrive/CSE354Project/Distill-t5-prompt.csv
