In [7]:
import torch, os
print("CUDA available:", torch.cuda.is_available())
print("MPS available:", torch.backends.mps.is_available())
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))
# On Colab with GPU this prints GPU info; locally you'll see MPS True and CUDA False
!nvidia-smi  # works only on machines with Nvidia GPUs (Colab/GPU instances)

CUDA available: True
MPS available: False
CUDA device: NVIDIA A100-SXM4-40GB
Sat Jan  3 13:13:39 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             50W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+--------------------

In [8]:
!pip install transformers datasets sentencepiece rouge-score evaluate accelerate




In [9]:
!pip install scikit-learn




In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from sklearn.model_selection import train_test_split

# Next

In [26]:
# -----------------------------------------
# Model + Tokenizer (FLAN-T5 - Better for Summarization!)
# -----------------------------------------
model_name = "google/flan-t5-base"  # FLAN-T5 is better than regular T5 for summarization
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU
import torch
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Set seed for reproducibility
import random
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

max_input_length = 512
max_target_length = 80

# -----------------------------------------
# Preprocess Function
# -----------------------------------------
def preprocess(batch):
    inputs = ["summarize: " + d for d in batch["dialogue"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["summary"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
        )

    # IMPORTANT: ignore padding tokens in loss
    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Process with multiple workers for faster data loading
print("Tokenizing datasets...")
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=4
)
print("Tokenization complete!")

# -----------------------------------------
# Metric (FIXED - Handle decoding properly)
# -----------------------------------------
try:
    rouge = evaluate.load("rouge")
    print("Successfully loaded ROUGE from evaluate library")
except Exception as e:
    print(f"Error loading rouge from evaluate: {e}")
    print("Using rouge_score directly instead...")
    from rouge_score import rouge_scorer
    rouge = None

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    
    # Replace -100 in predictions (shouldn't happen but just in case)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in labels with pad_token_id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Strip whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    if rouge is not None:
        try:
            result = rouge.compute(
                predictions=decoded_preds,
                references=decoded_labels,
                use_stemmer=True
            )
            return result
        except Exception as e:
            print(f"Error using evaluate rouge: {e}")
            pass
    
    # Fallback to rouge_score
    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for pred, ref in zip(decoded_preds, decoded_labels):
        score = scorer.score(ref, pred)
        rouge1_scores.append(score['rouge1'].fmeasure)
        rouge2_scores.append(score['rouge2'].fmeasure)
        rougeL_scores.append(score['rougeL'].fmeasure)
    
    return {
        'rouge1': sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0,
        'rouge2': sum(rouge2_scores) / len(rouge2_scores) if rouge2_scores else 0,
        'rougeL': sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0,
    }

# -----------------------------------------
# Data Collator
# -----------------------------------------
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# -----------------------------------------
# Training Arguments (Matching philschmid/flan-t5-base-samsum)
# -----------------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-dialogue-sum",
    
    # Hyperparameters from philschmid/flan-t5-base-samsum
    learning_rate=5e-05,                 # Exact match
    per_device_train_batch_size=8,       # Exact match
    per_device_eval_batch_size=8,        # Exact match
    num_train_epochs=5,                  # Exact match (was 3)
    
    # Seed for reproducibility
    seed=42,
    
    # Optimizer - Using AdamW (closest to Adam) with matching betas
    optim="adamw_torch",                 # Adam-like with betas=(0.9,0.999), eps=1e-08
    
    # Learning rate scheduler
    lr_scheduler_type="linear",          # Exact match
    warmup_steps=500,
    weight_decay=0.01,
    
    # Evaluation and logging
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    
    # A100-specific optimizations
    bf16=True,                           # A100 has better bfloat16 support
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    gradient_checkpointing=False,        # A100 has enough memory
    
    # Generation settings
    predict_with_generate=True,
    generation_max_length=80,
    generation_num_beams=4,              # Better quality predictions
    
    # Performance
    report_to="none",
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    
    # Additional optimizations
    group_by_length=True,
    ddp_find_unused_parameters=False,
)

# -----------------------------------------
# Trainer
# -----------------------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# -----------------------------------------
# TRAINING
# -----------------------------------------
print("="*80)
print("Starting training with FLAN-T5 (philschmid/flan-t5-base-samsum config)...")
print(f"Model: {model_name}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Total training samples: {len(tokenized_dataset['train'])}")
print(f"Total validation samples: {len(tokenized_dataset['validation'])}")
print("="*80)

trainer.train()
print("\n" + "="*80)
print("Training completed!")
print("="*80)

# -----------------------------------------
# Save the best model
# -----------------------------------------
print("\nSaving best model...")
trainer.save_model("./flan-t5-best-model")
tokenizer.save_pretrained("./flan-t5-best-model")
print("Model saved to ./flan-t5-best-model")


Using device: cuda
GPU: NVIDIA A100-SXM4-40GB
Memory: 42.47 GB
Tokenizing datasets...


Map (num_proc=4):   0%|          | 0/6522 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/1631 [00:00<?, ? examples/s]



Tokenization complete!


Downloading builder script: 0.00B [00:00, ?B/s]

Successfully loaded ROUGE from evaluate library


  trainer = Seq2SeqTrainer(


Starting training with FLAN-T5 (philschmid/flan-t5-base-samsum config)...
Model: google/flan-t5-base
Batch size: 8
Learning rate: 5e-05
Epochs: 5
Total training samples: 6522
Total validation samples: 1631


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,1.5731,1.377279,0.497,0.249584,0.405677,0.406168
1000,1.5283,1.360534,0.498709,0.251658,0.407285,0.407561
1500,1.4924,1.346205,0.503938,0.251486,0.410301,0.410493
2000,1.3992,1.350046,0.507115,0.251601,0.409631,0.409763
2500,1.3377,1.352412,0.509212,0.256246,0.410548,0.41059
3000,1.334,1.349421,0.507443,0.255063,0.412513,0.412179
3500,1.3095,1.349449,0.503623,0.25169,0.406572,0.406722
4000,1.3408,1.34787,0.507316,0.253907,0.409627,0.409892


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].



Training completed!

Saving best model...
Model saved to ./flan-t5-best-model


In [27]:
# -----------------------------------------
# INFERENCE ON TEST SET & SUBMISSION GENERATION
# -----------------------------------------
print("="*80)
print("Starting inference on test set...")
print("="*80)

# Load the best model
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import torch
from tqdm import tqdm

model_path = "./flan-t5-best-model"  # Updated to FLAN-T5 path
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

print(f"Model loaded from {model_path}")
print(f"Device: {device}")

# Prepare test data
test_dialogues = test["dialogue"].fillna("").tolist()
test_ids = test["id"].tolist()

print(f"\nTotal test samples: {len(test_dialogues)}")

# Generate predictions in batches
batch_size = 32  # A100 can handle large batches
predictions = []

print("\nGenerating predictions...")
for i in tqdm(range(0, len(test_dialogues), batch_size)):
    batch_dialogues = test_dialogues[i:i+batch_size]
    
    # Preprocess
    inputs = ["summarize: " + d for d in batch_dialogues]
    
    # Tokenize
    tokenized = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **tokenized,
            max_length=80,
            num_beams=4,              # Use beam search for better quality
            length_penalty=0.6,
            early_stopping=True,
            no_repeat_ngram_size=3,   # Avoid repetition
        )
    
    # Decode
    batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(batch_predictions)

print(f"\nGenerated {len(predictions)} predictions")

# -----------------------------------------
# Create submission.csv
# -----------------------------------------
import pandas as pd

submission_df = pd.DataFrame({
    'id': test_ids,
    'summary': predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)

print("\n" + "="*80)
print("Submission file created successfully!")
print("="*80)
print(f"\nSubmission shape: {submission_df.shape}")
print(f"Saved to: submission.csv")

# Display first few predictions
print("\n" + "="*80)
print("Sample predictions:")
print("="*80)
for i in range(min(5, len(submission_df))):
    print(f"\n--- Sample {i+1} ---")
    print(f"ID: {submission_df.iloc[i]['id']}")
    print(f"Dialogue: {test_dialogues[i][:200]}...")
    print(f"Predicted Summary: {submission_df.iloc[i]['summary']}")
    print()

# Verify submission format
print("="*80)
print("Submission file verification:")
print("="*80)
print(submission_df.head(10))
print("\nSubmission file is ready for upload!")

Starting inference on test set...
Model loaded from ./flan-t5-best-model
Device: cuda

Total test samples: 2210

Generating predictions...


100%|██████████| 70/70 [03:43<00:00,  3.19s/it]


Generated 2210 predictions

Submission file created successfully!

Submission shape: (2210, 2)
Saved to: submission.csv

Sample predictions:

--- Sample 1 ---
ID: 13816165
Dialogue: Reuben: hey, what are you doing?
Lucy: nothing special
Lucy: why are you asking?
Reuben: I want to take Daisy and Norma for a walk
Reuben: I thought you could join us with Marvin
Lucy: great idea, mee...
Predicted Summary: A will take Daisy and Norma for a walk with Marvin.


--- Sample 2 ---
ID: 13731502
Dialogue: John: Hello, I wanted to ask how should I receive my prize.
Walter: In which competition? We have several of them.
John: 23rd of November, the prize was a phone.
Walter: Okay, let me check and contact...
Predicted Summary: B will receive his prize on 23rd of November. The address of the delivery will be in Allentown. B will choose a color for the prize.


--- Sample 3 ---
ID: 13821053
Dialogue: Amanda: Hi! Do you know actually the meaning of your names?
Jeff: I think everybody knows, right?
Aman




In [28]:
ls

 colab_setup.ipynb           'SAMSum_Analysis_Complete_(1).ipynb'
 [0m[01;34mflan-t5-base-dialogue-sum[0m/   samsum_test_cleaned.csv
 [01;34mflan-t5-best-model[0m/          samsum_train_cleaned.csv
 KLA_Competition.ipynb        submission.csv
 [01;34mpulse-quest-env26[0m/


In [31]:
cd flan-t5-best-model/

/content/KLA/flan-t5-best-model


In [29]:
df = pd.read_csv("submission.csv")

In [30]:
print(df)

            id                                            summary
0     13816165  A will take Daisy and Norma for a walk with Ma...
1     13731502  B will receive his prize on 23rd of November. ...
2     13821053  Jeff's name is diminutive from Jefferson which...
3     13863137                    B spent a lot of time shopping.
4     13864413  Victor has taken his first steps. He's 10 mont...
...        ...                                                ...
2205  13829361  A and B are going to see The Man that Jack Bui...
2206  13864780  Tom is looking for a new employee for the team...
2207  13728480  A has just got her nails done, hair dyed lilac...
2208  13716555  A, Bradshaw and Cutler will meet for coffee in...
2209  13716284  A, Fred and Becky will have lunch at 1 pm in t...

[2210 rows x 2 columns]


In [None]:
# -----------------------------------------
# INFERENCE ON TEST SET & SUBMISSION GENERATION
# -----------------------------------------
print("="*80)
print("Starting inference on test set...")
print("="*80)

# Load the best model
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import torch
from tqdm import tqdm

model_path = "./flan-t5-best-model"  # Updated to FLAN-T5 path
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

print(f"Model loaded from {model_path}")
print(f"Device: {device}")

# Prepare test data
test_dialogues = test["dialogue"].fillna("").tolist()
test_ids = test["id"].tolist()

print(f"\nTotal test samples: {len(test_dialogues)}")

# Generate predictions in batches
batch_size = 32  # A100 can handle large batches
predictions = []

print("\nGenerating predictions...")
for i in tqdm(range(0, len(test_dialogues), batch_size)):
    batch_dialogues = test_dialogues[i:i+batch_size]
    
    # Preprocess
    inputs = ["summarize: " + d for d in batch_dialogues]
    
    # Tokenize
    tokenized = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt"
    ).to(device)
    
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **tokenized,
            max_length=80,
            num_beams=4,              # Use beam search for better quality
            length_penalty=0.6,
            early_stopping=True,
            no_repeat_ngram_size=3,   # Avoid repetition
        )
    
    # Decode
    batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predictions.extend(batch_predictions)

print(f"\nGenerated {len(predictions)} predictions")

# -----------------------------------------
# Create submission.csv
# -----------------------------------------
import pandas as pd

submission_df = pd.DataFrame({
    'id': test_ids,
    'summary': predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)

print("\n" + "="*80)
print("Submission file created successfully!")
print("="*80)
print(f"\nSubmission shape: {submission_df.shape}")
print(f"Saved to: submission.csv")

# Display first few predictions
print("\n" + "="*80)
print("Sample predictions:")
print("="*80)
for i in range(min(5, len(submission_df))):
    print(f"\n--- Sample {i+1} ---")
    print(f"ID: {submission_df.iloc[i]['id']}")
    print(f"Dialogue: {test_dialogues[i][:200]}...")
    print(f"Predicted Summary: {submission_df.iloc[i]['summary']}")
    print()

# Verify submission format
print("="*80)
print("Submission file verification:")
print("="*80)
print(submission_df.head(10))
print("\nSubmission file is ready for upload!")