In [None]:
import pandas as pd
import numpy as np
import torch
import warnings
warnings.filterwarnings('ignore')


# Install required packages
print("Installing required packages...")
!pip install transformers datasets rouge-score nltk -q

# Load the dataset
print("Loading dataset...")
# Use the 'python' engine to handle potential parsing issues and skip bad lines
df = pd.read_csv('news_summary_clean.csv', engine='python', on_bad_lines='skip')
print(f"Initial dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Check for missing values
print(f"\nMissing values:")
print(df.isnull().sum())

# Data cleaning
print("\nCleaning data...")
# Drop rows where 'article' or 'summary' is missing
df = df.dropna(subset=['article', 'summary'])
print(f"After dropping missing values: {df.shape}")

# Remove rows where either text has fewer than 10 words
def count_words(text):
    return len(str(text).split())

df['article_word_count'] = df['article'].apply(count_words)
df['summary_word_count'] = df['summary'].apply(count_words)

# Filter out rows with less than 10 words in either article or summary
df = df[(df['article_word_count'] >= 10) & (df['summary_word_count'] >= 10)]
print(f"After filtering short texts: {df.shape}")

# Clean up temporary columns
df = df.drop(['article_word_count', 'summary_word_count'], axis=1)

# Reset index
df = df.reset_index(drop=True)

print(f"\nFinal dataset shape: {df.shape}")
print(f"Dataset info:")
print(f"- Articles: {len(df)}")
print(f"- Average article length: {df['article'].str.len().mean():.0f} characters")
print(f"- Average summary length: {df['summary'].str.len().mean():.0f} characters")

# Display 3 sample rows
print(f"\n3 Sample rows:")
for i in range(min(3, len(df))):
    print(f"\n--- Sample {i+1} ---")
    print(f"Article: {df.iloc[i]['article'][:200]}...")
    print(f"Summary: {df.iloc[i]['summary']}")

Installing required packages...
Loading dataset...
Initial dataset shape: (65172, 2)
Columns: ['summary', 'article']

Missing values:
summary    0
article    0
dtype: int64

Cleaning data...
After dropping missing values: (65172, 2)
After filtering short texts: (35477, 4)

Final dataset shape: (35477, 2)
Dataset info:
- Articles: 35477
- Average article length: 351 characters
- Average summary length: 59 characters

3 Sample rows:

--- Sample 1 ---
Article: saurav kant, an alumnus of upgrad and iiitb's pg program in machine learning and artificial intelligence, was a sr systems engineer at infosys with almost 5 years of work experience. the program and u...
Summary: upgrad learner switches to career in ml  al with 90 salary hike

--- Sample 2 ---
Article: kunal shah's credit card bill payment platform, cred, gave users a chance to win free food from swiggy for one year. pranav kaushik, a delhi techie, bagged this reward after spending 2000 cred coins. ...
Summary: delhi techie wins fre

In [None]:
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer
from sklearn.model_selection import train_test_split

print("Setting up dataset split and tokenization...")

# Split the data (90% train, 10% validation)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Create DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

print(f"Dataset dict: {dataset_dict}")

# Load T5 tokenizer
print("Loading T5 tokenizer...")
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Define tokenization function
def tokenize_function(examples):
    # T5 expects a task prefix
    inputs = ["summarize: " + doc for doc in examples['article']]
    targets = examples['summary']

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            padding="max_length",
            truncation=True
        )

    # Replace padding token id's of the labels by -100 so it's ignored by loss
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    return model_inputs

# Apply tokenization to both datasets
print("Tokenizing datasets...")
tokenized_datasets = dataset_dict.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names
)

print(f"Tokenized datasets: {tokenized_datasets}")
print(f"Sample tokenized input length: {len(tokenized_datasets['train']['input_ids'][0])}")
print(f"Sample tokenized label length: {len(tokenized_datasets['train']['labels'][0])}")

Setting up dataset split and tokenization...
Train set size: 31929
Validation set size: 3548
Dataset dict: DatasetDict({
    train: Dataset({
        features: ['summary', 'article', '__index_level_0__'],
        num_rows: 31929
    })
    validation: Dataset({
        features: ['summary', 'article', '__index_level_0__'],
        num_rows: 3548
    })
})
Loading T5 tokenizer...
Tokenizing datasets...


Map:   0%|          | 0/31929 [00:00<?, ? examples/s]

Map:   0%|          | 0/3548 [00:00<?, ? examples/s]

Tokenized datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 31929
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3548
    })
})
Sample tokenized input length: 512
Sample tokenized label length: 64


In [None]:
from transformers import (
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_metric
import nltk

# Download required NLTK data
print("Downloading NLTK data...")
nltk.download('punkt', quiet=True)

# Load the model
print("Loading T5 model...")
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Check model size
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Load ROUGE metric
print("Loading ROUGE metric...")
rouge_metric = load_metric('rouge')

# Define compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Compute ROUGE scores
    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Extract the median scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    return {k: round(v, 4) for k, v in result.items()}

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

# Training arguments
print("Setting up training arguments...")
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    dataloader_pin_memory=False,
    generation_max_length=64,
    generation_num_beams=4,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    report_to=None,  # Disable wandb/tensorboard
)

# Initialize trainer
print("Initializing trainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully!")
print(f"Training will run for {training_args.num_train_epochs} epochs")
print(f"Batch size: {training_args.per_device_train_batch_size}")

Downloading NLTK data...
Loading T5 model...
Total parameters: 60,506,624
Trainable parameters: 60,506,624
Loading ROUGE metric...
Setting up training arguments...
Initializing trainer...
Trainer initialized successfully!
Training will run for 2 epochs
Batch size: 4


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

# Create local output directory
save_directory = "/content/t5_summarizer_model"
os.makedirs(save_directory, exist_ok=True)

print(f"Model will be saved locally to: {save_directory}")

# Start training
print("Starting training...")
print("="*50)

try:
    # Train the model
    trainer.train()

    print("Training completed successfully!")

    # Evaluate on validation set
    print("\nEvaluating on validation set...")
    eval_results = trainer.evaluate()

    print("Evaluation Results:")
    for key, value in eval_results.items():
        print(f"  {key}: {value}")

    # Save the model and tokenizer
    print(f"\nSaving model and tokenizer to {save_directory}...")
    trainer.save_model(save_directory)
    tokenizer.save_pretrained(save_directory)

    # Save training results
    results_file = os.path.join(save_directory, "training_results.txt")
    with open(results_file, "w") as f:
        f.write("Training Results\n")
        f.write("="*30 + "\n")
        f.write(f"Model: t5-small\n")
        f.write(f"Dataset size: {len(df)}\n")
        f.write(f"Training samples: {len(train_df)}\n")
        f.write(f"Validation samples: {len(val_df)}\n")
        f.write(f"Epochs: {training_args.num_train_epochs}\n")
        f.write(f"Batch size: {training_args.per_device_train_batch_size}\n")
        f.write(f"Learning rate: {training_args.learning_rate}\n")
        f.write("\nEvaluation Results:\n")
        for key, value in eval_results.items():
            f.write(f"  {key}: {value}\n")

    print(f"Training results saved to: {results_file}")

    # Test the model with a sample
    print("\nTesting the model with a sample...")
    sample_article = df.iloc[0]['article']
    sample_summary = df.iloc[0]['summary']

    # Generate summary
    inputs = tokenizer.encode(
        "summarize: " + sample_article,
        return_tensors="pt",
        max_length=512,
        truncation=True
    )

    if torch.cuda.is_available():
        inputs = inputs.to('cuda')
        model = model.to('cuda')

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=64,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\nSample Test:")
    print(f"Original Article: {sample_article[:300]}...")
    print(f"Original Summary: {sample_summary}")
    print(f"Generated Summary: {generated_summary}")

    # Create a zip file for easy download
    print(f"\nCreating zip file for download...")
    import shutil

    zip_filename = "/content/t5_summarizer_model.zip"
    shutil.make_archive("/content/t5_summarizer_model", 'zip', save_directory)

    print(f"Fine-tuning completed successfully!")
    print(f"Model saved locally to: {save_directory}")
    print(f"Zip file created at: {zip_filename}")
    print(f"To download the model:")
    print(f"   1. Go to Files panel in Colab (left sidebar)")
    print(f"   2. Navigate to /content/")
    print(f"   3. Right-click on 't5_summarizer_model.zip' and select 'Download'")
    print(f"   4. Or download individual files from '{save_directory}' folder")

except Exception as e:
    print(f"Error during training: {str(e)}")
    print("Saving current state...")

    # Save whatever we have
    try:
        trainer.save_model(save_directory)
        tokenizer.save_pretrained(save_directory)
        print(f"Model state saved to: {save_directory}")

        # Create zip file even if training failed
        import shutil
        zip_filename = "/content/t5_summarizer_model_partial.zip"
        shutil.make_archive("/content/t5_summarizer_model_partial", 'zip', save_directory)
        print(f"Partial model zip created at: {zip_filename}")

    except Exception as save_error:
        print(f"Error saving model: {str(save_error)}")

print("\nScript completed!")

Model will be saved locally to: /content/t5_summarizer_model
Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


Error during training: You must call wandb.init() before wandb.log()
Saving current state...
Model state saved to: /content/t5_summarizer_model
Partial model zip created at: /content/t5_summarizer_model_partial.zip

Script completed!
