In [19]:
# Install required packages
!pip install transformers datasets rouge-score evaluate

# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Load the fine-tuned T5 model and tokenizer
model_path = '/content/drive/MyDrive/t5_summarizer_model/'
print(f"Loading model from: {model_path}")

try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    model.to(device)
    model.eval()
    print("Model and tokenizer loaded successfully!")
    print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please check the model path and ensure the model files exist.")

Using device: cpu
Loading model from: /content/drive/MyDrive/t5_summarizer_model/
Model and tokenizer loaded successfully!
Model size: 60.5M parameters


In [20]:
# Load the CSV file
csv_path = '/content/news_summary_with_emotion_final.csv'
print(f"Loading dataset from: {csv_path}")

try:
    df = pd.read_csv(csv_path)
    print(f"✅ Dataset loaded successfully!")
    print(f"Original dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please check the CSV file path.")

# Data cleaning
print("\nData Cleaning:")
initial_count = len(df)

# Drop rows with missing values in key columns
df = df.dropna(subset=['article', 'summary'])
print(f"After removing missing values: {len(df)} rows ({initial_count - len(df)} removed)")

# Remove very short articles (< 50 characters) and summaries (< 10 characters)
df = df[df['article'].str.len() >= 50]
df = df[df['summary'].str.len() >= 10]
print(f"After removing short text: {len(df)} rows")

# Optional: Sample for faster testing (remove this line for full evaluation)
if len(df) > 1000:
    df = df.sample(n=1000, random_state=42).reset_index(drop=True)
    print(f"Sampled to {len(df)} rows for faster testing")

# Display basic statistics
print(f"\nDataset Statistics:")
print(f"Average article length: {df['article'].str.len().mean():.0f} characters")
print(f"Average summary length: {df['summary'].str.len().mean():.0f} characters")
print(f"Emotion distribution:")
print(df['emotion_label'].value_counts())

# Show example rows
print(f"\n Example Data:")
for i in range(min(3, len(df))):
    print(f"\n--- Example {i+1} ---")
    print(f"Article (first 200 chars): {df.iloc[i]['article'][:200]}...")
    print(f"Original Summary: {df.iloc[i]['summary']}")
    print(f"Emotion: {df.iloc[i]['emotion_label']}")

Loading dataset from: /content/news_summary_with_emotion_final.csv
✅ Dataset loaded successfully!
Original dataset shape: (98379, 3)
Columns: ['summary', 'article', 'emotion_label']

Data Cleaning:
After removing missing values: 98379 rows (0 removed)
After removing short text: 98378 rows
Sampled to 1000 rows for faster testing

Dataset Statistics:
Average article length: 355 characters
Average summary length: 57 characters
Emotion distribution:
emotion_label
neutral     363
anger       179
sadness     164
joy         118
fear         83
surprise     60
disgust      33
Name: count, dtype: int64

 Example Data:

--- Example 1 ---
Article (first 200 chars): a student foiled an attempt to loot him on sunday when he was held at gunpoint at a petrol pump near delhi's shahdara. karan chowdhary said the thief asked for his motorbike's keys after which he soug...
Original Summary: held at gunpoint at fuel pump, student snatches gun from thief
Emotion: fear

--- Example 2 ---
Article (first 200

In [21]:
def generate_summary(article, model, tokenizer, max_input_length=512, max_output_length=64):
    """Generate summary for a single article"""
    # Prepare input text with T5 prefix
    input_text = f"summarize: {article}"

    # Tokenize input
    inputs = tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=max_input_length,
        truncation=True,
        padding=True
    ).to(device)

    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_output_length,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=2,
            temperature=0.7
        )

    # Decode the generated summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Run inference on all articles
print("Generating summaries...")
predicted_summaries = []
batch_size = 10  # Process in small batches to avoid memory issues

for i in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
    batch_end = min(i + batch_size, len(df))
    batch_articles = df.iloc[i:batch_end]['article'].tolist()

    batch_predictions = []
    for article in batch_articles:
        try:
            prediction = generate_summary(article, model, tokenizer)
            batch_predictions.append(prediction)
        except Exception as e:
            print(f"Error processing article: {e}")
            batch_predictions.append("Error generating summary")

    predicted_summaries.extend(batch_predictions)

# Add predictions to dataframe
df['predicted_summary'] = predicted_summaries
print(f"Generated {len(predicted_summaries)} summaries!")

# Display comparison examples
print(f"\n Summary Comparison Examples:")
for i in range(min(5, len(df))):
    print(f"\n--- Example {i+1} ---")
    print(f"Article (first 150 chars): {df.iloc[i]['article'][:150]}...")
    print(f"Original Summary: {df.iloc[i]['summary']}")
    print(f"Predicted Summary: {df.iloc[i]['predicted_summary']}")
    print(f"Emotion: {df.iloc[i]['emotion_label']}")
    print("-" * 80)

Generating summaries...


Processing batches:   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Th

Generated 1000 summaries!

 Summary Comparison Examples:

--- Example 1 ---
Article (first 150 chars): a student foiled an attempt to loot him on sunday when he was held at gunpoint at a petrol pump near delhi's shahdara. karan chowdhary said the thief ...
Original Summary: held at gunpoint at fuel pump, student snatches gun from thief
Predicted Summary: karan chowdhary says thief asked for his motorbike's keys.
Emotion: fear
--------------------------------------------------------------------------------

--- Example 2 ---
Article (first 150 chars): actor irrfan khan has revealed he has contracted a rare disease while adding, "sometimes you wake up with a jolt with life shaking you up. the last fi...
Original Summary: i have contracted a rare disease, reveals irrfan khan
Predicted Summary: irrfan khan reveals he has contracted a rare disease.
Emotion: sadness
--------------------------------------------------------------------------------

--- Example 3 ---
Article (first 150 chars): 




In [25]:
# Load ROUGE metric
rouge_metric = evaluate.load("rouge")

# Prepare data for ROUGE evaluation
original_summaries = df['summary'].tolist()
predicted_summaries = df['predicted_summary'].tolist()

# Filter out any error cases
valid_pairs = []
for orig, pred in zip(original_summaries, predicted_summaries):
    if pred != "Error generating summary" and len(pred.strip()) > 0:
        valid_pairs.append((orig, pred))

print(f"Evaluating {len(valid_pairs)} valid summary pairs...")

# Calculate ROUGE scores
if len(valid_pairs) > 0:
    references = [pair[0] for pair in valid_pairs]
    predictions = [pair[1] for pair in valid_pairs]

    # Compute ROUGE scores
    rouge_scores = rouge_metric.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True
    )

    print(f"\n ROUGE Evaluation Results:")
    print(f"{'='*50}")

    # Handle different return formats for ROUGE scores
    def get_rouge_score(score_dict, metric_name):
        if hasattr(score_dict[metric_name], 'mid'):
            # Old format with .mid attribute
            return score_dict[metric_name].mid
        else:
            # New format - direct numpy values
            return score_dict[metric_name]

    # ROUGE-1 scores
    rouge1_score = get_rouge_score(rouge_scores, 'rouge1')
    print(f"ROUGE-1 (Unigram Overlap):")
    if hasattr(rouge1_score, 'precision'):
        print(f"  Precision: {rouge1_score.precision:.4f}")
        print(f"  Recall:    {rouge1_score.recall:.4f}")
        print(f"  F1-Score:  {rouge1_score.fmeasure:.4f}")
    else:
        print(f"  F1-Score:  {rouge1_score:.4f}")

    # ROUGE-2 scores
    rouge2_score = get_rouge_score(rouge_scores, 'rouge2')
    print(f"\nROUGE-2 (Bigram Overlap):")
    if hasattr(rouge2_score, 'precision'):
        print(f"  Precision: {rouge2_score.precision:.4f}")
        print(f"  Recall:    {rouge2_score.recall:.4f}")
        print(f"  F1-Score:  {rouge2_score.fmeasure:.4f}")
    else:
        print(f"  F1-Score:  {rouge2_score:.4f}")

    # ROUGE-L scores
    rougeL_score = get_rouge_score(rouge_scores, 'rougeL')
    print(f"\nROUGE-L (Longest Common Subsequence):")
    if hasattr(rougeL_score, 'precision'):
        print(f"  Precision: {rougeL_score.precision:.4f}")
        print(f"  Recall:    {rougeL_score.recall:.4f}")
        print(f"  F1-Score:  {rougeL_score.fmeasure:.4f}")
    else:
        print(f"  F1-Score:  {rougeL_score:.4f}")

    # ROUGE-Lsum scores
    rougeLsum_score = get_rouge_score(rouge_scores, 'rougeLsum')
    print(f"\nROUGE-Lsum (Summary-level LCS):")
    if hasattr(rougeLsum_score, 'precision'):
        print(f"  Precision: {rougeLsum_score.precision:.4f}")
        print(f"  Recall:    {rougeLsum_score.recall:.4f}")
        print(f"  F1-Score:  {rougeLsum_score.fmeasure:.4f}")
    else:
        print(f"  F1-Score:  {rougeLsum_score:.4f}")

    # Additional statistics
    print(f"\n Additional Statistics:")
    print(f"Average original summary length: {np.mean([len(s.split()) for s in references]):.1f} words")
    print(f"Average predicted summary length: {np.mean([len(s.split()) for s in predictions]):.1f} words")

    # Length comparison
    orig_lengths = [len(s.split()) for s in references]
    pred_lengths = [len(s.split()) for s in predictions]

    print(f"Original summaries - Min: {min(orig_lengths)} words, Max: {max(orig_lengths)} words")
    print(f"Predicted summaries - Min: {min(pred_lengths)} words, Max: {max(pred_lengths)} words")

    # Show best and worst examples based on ROUGE-1 F1
    individual_scores = []
    for ref, pred in zip(references, predictions):
        score = rouge_metric.compute(predictions=[pred], references=[ref])
        rouge1_individual = get_rouge_score(score, 'rouge1')
        if hasattr(rouge1_individual, 'fmeasure'):
            individual_scores.append(rouge1_individual.fmeasure)
        else:
            individual_scores.append(rouge1_individual)

    # Best example
    best_idx = np.argmax(individual_scores)
    print(f"\n Best Summary (ROUGE-1 F1: {individual_scores[best_idx]:.4f}):")
    print(f"Original: {references[best_idx]}")
    print(f"Predicted: {predictions[best_idx]}")

    # Worst example
    worst_idx = np.argmin(individual_scores)
    print(f"\n Worst Summary (ROUGE-1 F1: {individual_scores[worst_idx]:.4f}):")
    print(f"Original: {references[worst_idx]}")
    print(f"Predicted: {predictions[worst_idx]}")

else:
    print(" No valid predictions found for evaluation!")

# Save results to CSV
output_path = '/content/drive/MyDrive/evaluation_results.csv'
df.to_csv(output_path, index=False)
print(f"\n Results saved to: {output_path}")

print(f"\n Evaluation Complete!")
print(f"Summary: Evaluated {len(valid_pairs)} summaries with T5 fine-tuned model")

Evaluating 1000 valid summary pairs...

 ROUGE Evaluation Results:
ROUGE-1 (Unigram Overlap):
  F1-Score:  0.3961

ROUGE-2 (Bigram Overlap):
  F1-Score:  0.1731

ROUGE-L (Longest Common Subsequence):
  F1-Score:  0.3472

ROUGE-Lsum (Summary-level LCS):
  F1-Score:  0.3477

 Additional Statistics:
Average original summary length: 9.5 words
Average predicted summary length: 12.5 words
Original summaries - Min: 5 words, Max: 14 words
Predicted summaries - Min: 4 words, Max: 36 words

 Best Summary (ROUGE-1 F1: 1.0000):
Original: trump trolled for misspelling 'hamburgers' as 'hamberders'
Predicted: trump trolled for misspelling hamburgers as 'hamberders'

 Worst Summary (ROUGE-1 F1: 0.0000):
Original: i played too many dot balls, concedes kohli after rcb defeat
Predicted: royal challengers bangalore lost to kolkata knight riders in ipl 2018 opener

 Results saved to: /content/drive/MyDrive/evaluation_results.csv

 Evaluation Complete!
Summary: Evaluated 1000 summaries with T5 fine-tuned mo