In [None]:
!pip install transformers datasets sentencepiece rouge-score evaluate accelerate




In [1]:
!pip install scikit-learn




In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
!pip install -U \
  torch==2.2.2 \
  transformers==4.41.2 \
  accelerate==0.30.1 \
  peft==0.10.0 \
  sentence-transformers==5.2.0 \
  evaluate


Collecting torch==2.2.2
  Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.2)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.2)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.2)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0

In [5]:
import torch
import transformers
import accelerate
import peft

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("peft:", peft.__version__)


torch: 2.2.2+cu121
transformers: 4.41.2
accelerate: 0.30.1
peft: 0.10.0


In [4]:
from datasets import Dataset, DatasetDict
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import evaluate
import torch


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

# Next

In [8]:

import os

print(os.path.exists("/content/KLA/samsum_train_cleaned.csv"))
print(os.path.exists("/content/KLA/samsum_test_cleaned.csv"))


True
True


In [9]:
df_train = pd.read_csv("/content/KLA/samsum_train_cleaned.csv")
test = pd.read_csv("/content/KLA/samsum_test_cleaned.csv")

In [None]:
# -----------------------------------------
# Model + Tokenizer (FLAN-T5 - Better for Summarization!)
# -----------------------------------------
model_name = "google/flan-t5-base"  # FLAN-T5 is better than regular T5 for summarization
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU
import torch
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Set seed for reproducibility
import random
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

max_input_length = 768
max_target_length = 80

# -----------------------------------------
# Preprocess Function
# -----------------------------------------
def preprocess(batch):
    inputs = [
        "Summarize the following conversation:\n" + d
        for d in batch["dialogue"]
    ]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    labels = tokenizer(
        text_target=batch["summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    labels["input_ids"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in seq]
        for seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Process with multiple workers for faster data loading
print("Tokenizing datasets...")

dataset = Dataset.from_pandas(df_train)

dataset = dataset.train_test_split(
    test_size=0.1,
    seed=42
)

dataset = DatasetDict({
    "train": dataset["train"],
    "validation": dataset["test"],  # rename test ‚Üí validation
})

tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=2
)


print("Tokenization complete!")

# -----------------------------------------
# Metric (FIXED - Handle decoding properly)
# -----------------------------------------
try:
    rouge = evaluate.load("rouge")
    print("Successfully loaded ROUGE from evaluate library")
except Exception as e:
    print(f"Error loading rouge from evaluate: {e}")
    print("Using rouge_score directly instead...")
    from rouge_score import rouge_scorer
    rouge = None

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    
    # Replace -100 in predictions (shouldn't happen but just in case)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in labels with pad_token_id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Strip whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    if rouge is not None:
        try:
            result = rouge.compute(
                predictions=decoded_preds,
                references=decoded_labels,
                use_stemmer=True
            )
            return result
        except Exception as e:
            print(f"Error using evaluate rouge: {e}")
            pass
    
    # Fallback to rouge_score
    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for pred, ref in zip(decoded_preds, decoded_labels):
        score = scorer.score(ref, pred)
        rouge1_scores.append(score['rouge1'].fmeasure)
        rouge2_scores.append(score['rouge2'].fmeasure)
        rougeL_scores.append(score['rougeL'].fmeasure)
    
    return {
        'rouge1': sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0,
        'rouge2': sum(rouge2_scores) / len(rouge2_scores) if rouge2_scores else 0,
        'rougeL': sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0,
    }

# -----------------------------------------
# Data Collator
# -----------------------------------------
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# -----------------------------------------
# Training Arguments (Matching philschmid/flan-t5-base-samsum)
# -----------------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-dialogue-sum",

    learning_rate=5e-5,

    per_device_train_batch_size=2,     # ‚Üì from 8
    per_device_eval_batch_size=2,      # ‚Üì from 8
    gradient_accumulation_steps=4,     # NEW ‚Üí effective batch = 8

    num_train_epochs=5,
    seed=42,

    optim="adamw_torch",
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,

    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,

    fp16=True,                         # ‚úÖ correct for T4
    bf16=False,                        # ‚ùå must be False
    gradient_checkpointing=True,       # NEW ‚Üí memory saver

    dataloader_num_workers=2,          # ‚Üì safer for Colab
    dataloader_pin_memory=True,

    predict_with_generate=True,
    generation_max_length=80,
    generation_num_beams=4,

    report_to="none",
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,

    group_by_length=False,             # IMPORTANT on Colab
)


# -----------------------------------------
# Trainer
# -----------------------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# -----------------------------------------
# TRAINING
# -----------------------------------------
print("="*80)
print("Starting training with FLAN-T5 (philschmid/flan-t5-base-samsum config)...")
print(f"Model: {model_name}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Total training samples: {len(tokenized_dataset['train'])}")
print(f"Total validation samples: {len(tokenized_dataset['validation'])}")
print("="*80)

trainer.train()
print("\n" + "="*80)
print("Training completed!")
print("="*80)

# -----------------------------------------
# Save the best model
# -----------------------------------------
print("\nSaving best model...")
trainer.save_model("./flan-t5-best-model")
tokenizer.save_pretrained("./flan-t5-best-model")
print("Model saved to ./flan-t5-best-model")


Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB
Tokenizing datasets...


Map (num_proc=2):   0%|          | 0/8166 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/908 [00:00<?, ? examples/s]

Tokenization complete!
Successfully loaded ROUGE from evaluate library




Starting training with FLAN-T5 (philschmid/flan-t5-base-samsum config)...
Model: google/flan-t5-base
Batch size: 2
Learning rate: 5e-05
Epochs: 5
Total training samples: 8166
Total validation samples: 908


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


In [27]:
# -----------------------------------------
# INFERENCE ON TEST SET & SUBMISSION GENERATION
# -----------------------------------------
print("="*80)
print("Starting inference on test set...")
print("="*80)

# Load the best model
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import torch
from tqdm import tqdm

model_path = "./flan-t5-best-model"  # Updated to FLAN-T5 path
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

print(f"Model loaded from {model_path}")
print(f"Device: {device}")

# Prepare test data
test_dialogues = test["dialogue"].fillna("").tolist()
test_ids = test["id"].tolist()

print(f"\nTotal test samples: {len(test_dialogues)}")

# Generate predictions in batches
batch_size = 32  # A100 can handle large batches
predictions = []

print("\nGenerating predictions...")
for i in tqdm(range(0, len(test_dialogues), batch_size)):
    batch_dialogues = test_dialogues[i:i+batch_size]
    
    # Preprocess
    inputs = ["summarize: " + d for d in batch_dialogues]
    
    # Tokenize
    tokenized = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **tokenized,
            max_length=80,
            num_beams=4,              # Use beam search for better quality
            length_penalty=0.6,
            early_stopping=True,
            no_repeat_ngram_size=3,   # Avoid repetition
        )
    
    # Decode
    batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(batch_predictions)

print(f"\nGenerated {len(predictions)} predictions")

# -----------------------------------------
# Create submission.csv
# -----------------------------------------
import pandas as pd

submission_df = pd.DataFrame({
    'id': test_ids,
    'summary': predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)

print("\n" + "="*80)
print("Submission file created successfully!")
print("="*80)
print(f"\nSubmission shape: {submission_df.shape}")
print(f"Saved to: submission.csv")

# Display first few predictions
print("\n" + "="*80)
print("Sample predictions:")
print("="*80)
for i in range(min(5, len(submission_df))):
    print(f"\n--- Sample {i+1} ---")
    print(f"ID: {submission_df.iloc[i]['id']}")
    print(f"Dialogue: {test_dialogues[i][:200]}...")
    print(f"Predicted Summary: {submission_df.iloc[i]['summary']}")
    print()

# Verify submission format
print("="*80)
print("Submission file verification:")
print("="*80)
print(submission_df.head(10))
print("\nSubmission file is ready for upload!")

Starting inference on test set...
Model loaded from ./flan-t5-best-model
Device: cuda

Total test samples: 2210

Generating predictions...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [03:43<00:00,  3.19s/it]


Generated 2210 predictions

Submission file created successfully!

Submission shape: (2210, 2)
Saved to: submission.csv

Sample predictions:

--- Sample 1 ---
ID: 13816165
Dialogue: Reuben: hey, what are you doing?
Lucy: nothing special
Lucy: why are you asking?
Reuben: I want to take Daisy and Norma for a walk
Reuben: I thought you could join us with Marvin
Lucy: great idea, mee...
Predicted Summary: A will take Daisy and Norma for a walk with Marvin.


--- Sample 2 ---
ID: 13731502
Dialogue: John: Hello, I wanted to ask how should I receive my prize.
Walter: In which competition? We have several of them.
John: 23rd of November, the prize was a phone.
Walter: Okay, let me check and contact...
Predicted Summary: B will receive his prize on 23rd of November. The address of the delivery will be in Allentown. B will choose a color for the prize.


--- Sample 3 ---
ID: 13821053
Dialogue: Amanda: Hi! Do you know actually the meaning of your names?
Jeff: I think everybody knows, right?
Aman




In [None]:
# -----------------------------------------
# INFERENCE ON TEST SET & SUBMISSION GENERATION
# -----------------------------------------
print("="*80)
print("Starting inference on test set...")
print("="*80)

# Load the best model
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import torch
from tqdm import tqdm

model_path = "./flan-t5-best-model"  # Updated to FLAN-T5 path
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

print(f"Model loaded from {model_path}")
print(f"Device: {device}")

# Prepare test data
test_dialogues = test["dialogue"].fillna("").tolist()
test_ids = test["id"].tolist()

print(f"\nTotal test samples: {len(test_dialogues)}")

# Generate predictions in batches
batch_size = 32  # A100 can handle large batches
predictions = []

print("\nGenerating predictions...")
for i in tqdm(range(0, len(test_dialogues), batch_size)):
    batch_dialogues = test_dialogues[i:i+batch_size]
    
    # Preprocess
    inputs = ["summarize: " + d for d in batch_dialogues]
    
    # Tokenize
    tokenized = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **tokenized,
            max_length=80,
            num_beams=4,              # Use beam search for better quality
            length_penalty=0.6,
            early_stopping=True,
            no_repeat_ngram_size=3,   # Avoid repetition
        )
    
    # Decode
    batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(batch_predictions)

print(f"\nGenerated {len(predictions)} predictions")

# -----------------------------------------
# Create submission.csv
# -----------------------------------------
import pandas as pd

submission_df = pd.DataFrame({
    'id': test_ids,
    'summary': predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)

print("\n" + "="*80)
print("Submission file created successfully!")
print("="*80)
print(f"\nSubmission shape: {submission_df.shape}")
print(f"Saved to: submission.csv")

# Display first few predictions
print("\n" + "="*80)
print("Sample predictions:")
print("="*80)
for i in range(min(5, len(submission_df))):
    print(f"\n--- Sample {i+1} ---")
    print(f"ID: {submission_df.iloc[i]['id']}")
    print(f"Dialogue: {test_dialogues[i][:200]}...")
    print(f"Predicted Summary: {submission_df.iloc[i]['summary']}")
    print()

# Verify submission format
print("="*80)
print("Submission file verification:")
print("="*80)
print(submission_df.head(10))
print("\nSubmission file is ready for upload!")