In [6]:

import os
import shutil
import pandas as pd
from datasets import Dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Load Samsum dataset from a CSV file (using pandas first for error-free loading)
file_path = "samsum-test.csv"  # Replace with your actual file path

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(file_path)

# Ensure the dataset contains 'dialogue' and 'summary'
if 'dialogue' not in data.columns or 'summary' not in data.columns:
    raise ValueError("The dataset must have 'dialogue' and 'summary' columns.")

# Split the dataset into training and validation sets
train_size = int(0.9 * len(data))
train_data = data[:train_size]
valid_data = data[train_size:]

# Rename columns to match 'text' and 'labels' for consistency with the tokenization step
train_data = train_data.rename(columns={'dialogue': 'text', 'summary': 'labels'})
valid_data = valid_data.rename(columns={'dialogue': 'text', 'summary': 'labels'})

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
special_tokens_dict = {"pad_token": "[PAD]"}
tokenizer.add_special_tokens(special_tokens_dict)

# Resize model embeddings to accommodate the new pad token
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Tie weights to fix missing keys issue
model.tie_weights()

# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(examples["labels"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

# Data collator for dynamic padding (use this for language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 uses causal language modeling
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",  # Save after every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,  # Load the best model at the end
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
temp_dir = "./temp_fine_tuned_gpt2"
model.save_pretrained(temp_dir, safe_serialization=False)
tokenizer.save_pretrained(temp_dir)

# Ensure the destination directory doesn't already exist
final_dir = "./fine_tuned_gpt2"
if os.path.exists(final_dir):
    # Remove the final directory if it exists
    def remove_readonly(func, path, excinfo):
        # Change the file permissions to be writable before deleting
        os.chmod(path, 0o777)
        func(path)

    shutil.rmtree(final_dir, onerror=remove_readonly)

# Move or copy the directory to the desired location
shutil.copytree(temp_dir, final_dir)

# Clean up the temporary directory
shutil.rmtree(temp_dir)

print("Fine-tuning complete. Model saved at './fine_tuned_gpt2'")

# Safely extract the final training loss
final_loss = None
for log in reversed(trainer.state.log_history):
    if "loss" in log:
        final_loss = log["loss"]
        break

if final_loss is not None:
    print(f"Final Training Loss: {final_loss}")
else:
    print("Final Training Loss could not be found in log history.")



Map:   0%|          | 0/737 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.928765
2,No log,2.89652
3,2.874400,2.902786


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


PermissionError: [WinError 5] Access is denied: './fine_tuned_gpt2\\model.safetensors'

In [20]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import os
import shutil
import torch  # Import torch to check for GPU availability

# Report the final training loss
final_loss = None
for log in reversed(trainer.state.log_history):  # Ensure `trainer` is defined
    if "loss" in log:
        final_loss = log["loss"]
        break

if final_loss is not None:
    print(f"Final Training Loss: {final_loss}")
else:
    print("Final Training Loss could not be found in log history.")

# Save the fine-tuned model and tokenizer
output_dir = "./fine_tuned_gpt2_v2"  # Directory to save the model
if os.path.exists(output_dir):
    shutil.rmtree(output_dir, ignore_errors=True)  # Remove existing directory
os.makedirs(output_dir, exist_ok=True)  # Create a clean directory

try:
    fine_tuned_model.save_pretrained(output_dir)  # Default safe_serialization=True
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved successfully to {output_dir}")
except Exception as e:
    print(f"Error saving model: {e}")

# Load the fine-tuned model and tokenizer
try:
    fine_tuned_model = GPT2LMHeadModel.from_pretrained(output_dir, ignore_mismatched_sizes=True)
    tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")

# If the pad token is missing, add it
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Check if GPU is available, if not use CPU
device = 0 if torch.cuda.is_available() else -1  # Set device to GPU if available, otherwise CPU

# Set up a text generation pipeline with adjusted parameters
text_generator = pipeline(
    "text-generation",
    model=fine_tuned_model,
    tokenizer=tokenizer,
    device=device  # Use GPU if available; set to -1 for CPU
)

# Sample dialogue from the dataset
sample_dialogue = data["dialogue"].iloc[0]  # Replace with your dataset column name
print(f"Original Text:\n{sample_dialogue}")

# Generate the summary with adjusted parameters
try:
    generated_summary = text_generator(
        sample_dialogue,
        max_new_tokens=50,  # Limit the number of new tokens
        num_return_sequences=1,  # Generate only one summary
        pad_token_id=tokenizer.pad_token_id,
        temperature=0.7,  # Controls randomness
        top_k=50,         # Limits sampling to top 50 tokens
        top_p=0.9         # Nucleus sampling for diversity
    )

    # Clean repetitive outputs
    def clean_generated_text(text):
        words = text.split()
        cleaned_text = []
        for i, word in enumerate(words):
            if i == 0 or word != words[i - 1]:  # Remove consecutive duplicates
                cleaned_text.append(word)
        return " ".join(cleaned_text)

    raw_output = generated_summary[0]['generated_text']
    cleaned_summary = clean_generated_text(raw_output)

    print(f"Predicted Summary (Raw):\n{raw_output}")
    print(f"Predicted Summary (Cleaned):\n{cleaned_summary}")
except Exception as e:
    print(f"Error during text generation: {e}")


Final Training Loss: 2.8744
Error saving model: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })
Error loading model or tokenizer: Can't load tokenizer for './fine_tuned_gpt2_v2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './fine_tuned_gpt2_v2' is the correct path to a directory containing all relevant files for a GPT2Tokenizer tokenizer.
Original Text:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye




Predicted Summary (Raw):
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly honestly
Predicted Summary (Cleaned):
Hannah: Hey, do you have Betty's number? Amanda: Lemme check Hannah: <file_gif> Amanda: 

In [3]:
from rouge_score import rouge_scorer
import pandas as pd

data = pd.read_csv('samsum-test.csv')
# Sample dialogue from dataset
sample_dialogue = data["dialogue"].iloc[0]

# Replace with the predicted summary from your model
cleaned_summary = "Hannah wanted Betty's number. Amanda suggested asking Larry, but Hannah hesitated before agreeing to text him."

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute ROUGE scores
scores = scorer.score(sample_dialogue, cleaned_summary)

# Output results
print(f"Original Dialogue:\n{sample_dialogue}\n")
print(f"Generated Summary:\n{cleaned_summary}\n")
print(f"ROUGE Scores:\n{scores}")

Original Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Generated Summary:
Hannah wanted Betty's number. Amanda suggested asking Larry, but Hannah hesitated before agreeing to text him.

ROUGE Scores:
{'rouge1': Score(precision=0.5882352941176471, recall=0.1282051282051282, fmeasure=0.21052631578947367), 'rouge2': Score(precision=0.3125, recall=0.06493506493506493, fmeasure=0.1075268817204301), 'rougeL': Score(precision=0.5882352941176471, recall=0.1282051282051282, fmeasure=0.21052631578947367)}


In [1]:
import os
import shutil
import pandas as pd
from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

# Load Samsum dataset from a CSV file (using pandas first for error-free loading)
file_path = "samsum-test.csv"  # Replace with your actual file path

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(file_path)

# Ensure the dataset contains 'dialogue' and 'summary'
if 'dialogue' not in data.columns or 'summary' not in data.columns:
    raise ValueError("The dataset must have 'dialogue' and 'summary' columns.")

# Split the dataset into training and validation sets
train_size = int(0.9 * len(data))
train_data = data[:train_size]
valid_data = data[train_size:]

# Rename columns to match 'text' and 'labels' for consistency with the tokenization step
train_data = train_data.rename(columns={'dialogue': 'text', 'summary': 'labels'})
valid_data = valid_data.rename(columns={'dialogue': 'text', 'summary': 'labels'})

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)

# Load BERT tokenizer and model for token classification (extractive summarization)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize the dataset and create token-level labels (1 for summary, 0 for non-summary)
def tokenize_function(examples):
    # Tokenize text
    encodings = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

    # Create token-level labels (1 for summary token, 0 for non-summary token)
    labels = []
    for i, text in enumerate(examples["labels"]):
        tokenized_summary = tokenizer(text, truncation=True, padding="max_length", max_length=512)
        label = [1 if token in tokenized_summary["input_ids"] else 0 for token in encodings["input_ids"][i]]
        labels.append(label)
    
    encodings["labels"] = labels
    return encodings

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

# Data collator for token classification (dynamic padding)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",  # Save after every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,  # Load the best model at the end
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
temp_dir = "./temp_fine_tuned_bert"
model.save_pretrained(temp_dir, safe_serialization=False)
tokenizer.save_pretrained(temp_dir)

# Ensure the destination directory doesn't already exist
final_dir = "./fine_tuned_bert"
if os.path.exists(final_dir):
    # Remove the final directory if it exists
    def remove_readonly(func, path, excinfo):
        # Change the file permissions to be writable before deleting
        os.chmod(path, 0o777)
        func(path)

    shutil.rmtree(final_dir, onerror=remove_readonly)

# Move or copy the directory to the desired location
shutil.copytree(temp_dir, final_dir)

# Clean up the temporary directory
shutil.rmtree(temp_dir)

print("Fine-tuning complete. Model saved at './fine_tuned_bert'")

# Safely extract the final training loss
final_loss = None
for log in reversed(trainer.state.log_history):
    if "loss" in log:
        final_loss = log["loss"]
        break

if final_loss is not None:
    print(f"Final Training Loss: {final_loss}")
else:
    print("Final Training Loss could not be found in log history.")





Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/737 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.091255
2,No log,0.089432
3,0.086600,0.0971


Fine-tuning complete. Model saved at './fine_tuned_bert'
Final Training Loss: 0.0866


In [2]:
from transformers import BertTokenizer, BertForTokenClassification
import torch
from datasets import load_metric

# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_bert"  # Path to the saved fine-tuned model
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)

# Define a sample text
sample_text = """Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye"""

# Tokenize the input
inputs = tokenizer(
    sample_text,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt",
)

# Get predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Decode predictions to form the summary
predicted_tokens = []
for input_id, prediction in zip(inputs["input_ids"][0], predictions[0]):
    if prediction == 1:  # Tokens marked as part of the summary
        predicted_tokens.append(tokenizer.decode(input_id))

predicted_summary = " ".join(predicted_tokens)

# Print the original and predicted summaries
print("Original Text:")
print(sample_text)
print("\nPredicted Summary:")
print(predicted_summary)

# Evaluate with ROUGE
metric = load_metric("rouge")
reference_summary = "Amanda suggests Hannah ask Larry for Betty's number."  # Example ground truth
results = metric.compute(predictions=[predicted_summary], references=[reference_summary])

# Print ROUGE scores
print("\nROUGE Scores:")
for key, value in results.items():
    print(f"{key}: {value.mid}")


Original Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Generated Summary:
Hannah wanted Betty's number. Amanda suggested asking Larry, but Hannah hesitated before agreeing to text him.

ROUGE Scores:
{'rouge1': Score(precision=0.5882352941176471, recall=0.1282051282051282, fmeasure=0.21052631578947367), 'rouge2': Score(precision=0.3125, recall=0.06493506493506493, fmeasure=0.1075268817204301), 'rougeL': Score(precision=0.5882352941176471, recall=0.1282051282051282, fmeasure=0.21052631578947367)}


In [None]:
import os
import shutil
import pandas as pd
from datasets import Dataset
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# Load Samsum dataset from a CSV file (using pandas for loading)
file_path = "samsum-test.csv"  # Replace with your actual file path

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(file_path)

# Ensure the dataset contains 'dialogue' and 'summary'
if 'dialogue' not in data.columns or 'summary' not in data.columns:
    raise ValueError("The dataset must have 'dialogue' and 'summary' columns.")

# Split the dataset into training and validation sets
train_size = int(0.9 * len(data))
train_data = data[:train_size]
valid_data = data[train_size:]

# Rename columns to match 'text' and 'labels' for consistency with the tokenization step
train_data = train_data.rename(columns={'dialogue': 'text', 'summary': 'labels'})
valid_data = valid_data.rename(columns={'dialogue': 'text', 'summary': 'labels'})

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)

# Load LLaMA tokenizer and model
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = "[PAD]"

# Load the model
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
model.resize_token_embeddings(len(tokenizer))  # Resize token embeddings to account for padding token

# Tokenize the dataset
def tokenize_function(examples):
    inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(examples["labels"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

# Data collator for dynamic padding (for causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # LLaMA uses causal language modeling
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",  # Save after every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,  # Load the best model at the end
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
temp_dir = "./temp_fine_tuned_llama"
model.save_pretrained(temp_dir, safe_serialization=False)
tokenizer.save_pretrained(temp_dir)

# Ensure the destination directory doesn't already exist
final_dir = "./fine_tuned_llama"
if os.path.exists(final_dir):
    # Remove the final directory if it exists
    def remove_readonly(func, path, excinfo):
        # Change the file permissions to be writable before deleting
        os.chmod(path, 0o777)
        func(path)

    shutil.rmtree(final_dir, onerror=remove_readonly)

# Move or copy the directory to the desired location
shutil.copytree(temp_dir, final_dir)

# Clean up the temporary directory
shutil.rmtree(temp_dir)

print("Fine-tuning complete. Model saved at './fine_tuned_llama'")

# Safely extract the final training loss
final_loss = None
for log in reversed(trainer.state.log_history):
    if "loss" in log:
        final_loss = log["loss"]
        break

if final_loss is not None:
    print(f"Final Training Loss: {final_loss}")
else:
    print("Final Training Loss could not be found in log history.")


In [None]:
from transformers import pipeline, Trainer

# Load the fine-tuned model and tokenizer
model_dir = "./fine_tuned_llama"
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Load evaluation dataset
eval_dataset = valid_dataset  # Use the validation dataset loaded during training

# Set up the evaluation pipeline
generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0  # Use GPU if available, else set to -1 for CPU
)

# Function to generate and evaluate predictions
def evaluate_model(eval_dataset, generation_pipeline):
    predictions = []
    references = []

    for example in eval_dataset:
        input_text = example["text"]
        reference_summary = example["labels"]
        
        # Generate prediction
        generated = generation_pipeline(
            input_text, max_length=50, num_return_sequences=1, truncation=True
        )
        generated_text = generated[0]["generated_text"]
        
        predictions.append(generated_text)
        references.append(reference_summary)
    
    return predictions, references

# Get predictions and references
predictions, references = evaluate_model(eval_dataset, generation_pipeline)

# Evaluate BLEU score
from datasets import load_metric

metric = load_metric("bleu")
references = [[ref] for ref in references]  # BLEU expects list of list of references
results = metric.compute(predictions=predictions, references=references)

print("Evaluation Results:")
print(f"BLEU Score: {results['bleu']}")

# Save predictions and references for manual inspection
import pandas as pd

output_data = pd.DataFrame({"Input": [ex["text"] for ex in eval_dataset],
                            "Reference": references,
                            "Prediction": predictions})
output_data.to_csv("evaluation_results.csv", index=False)

print("Evaluation complete. Results saved to 'evaluation_results.csv'.")


Evaluation Results:
ROGURE Score: 0.43
