In [1]:
import torch
from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 
from nltk.translate.gleu_score import sentence_gleu
import matplotlib.pyplot as plt

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Free up memory
torch.cuda.empty_cache()

# Load the tokenizer and model for MarianMT
model_name = "Helsinki-NLP/opus-mt-hi-en"  # Using a Hindi to English model for demonstration purposes
tokenizer = MarianTokenizer.from_pretrained(model_name)
non_finetuned_model = MarianMTModel.from_pretrained(model_name).to(device)

# Load the datasets
def read_sentences(src_path, tgt_path, limit=None):
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        incorrect_sentences = [line.strip() for line in tqdm(src_file.readlines(), desc="Reading Incorrect Sentences")]
        correct_sentences = [line.strip() for line in tqdm(tgt_file.readlines(), desc="Reading Correct Sentences")]
    if limit:
        incorrect_sentences = incorrect_sentences[:limit]
        correct_sentences = correct_sentences[:limit]
    return incorrect_sentences, correct_sentences

# Paths for training, validation, and test datasets
train_src_path = "wikiExtractsData/data/train_merge.src"
train_tgt_path = "wikiExtractsData/data/train_merge.tgt"
valid_src_path = "wikiExtractsData/data/valid.src"
valid_tgt_path = "wikiExtractsData/data/valid.tgt"
test_src_path = "Wiki-edits/hiwiki.extracted.clean.src"
test_tgt_path = "Wiki-edits/hiwiki.extracted.clean.trg"

# Load Training, Validation, and Test Data (Limited subset for faster training initially)
train_incorrect, train_correct = read_sentences(train_src_path, train_tgt_path)
valid_incorrect, valid_correct = read_sentences(valid_src_path, valid_tgt_path)
test_incorrect, test_correct = read_sentences(test_src_path, test_tgt_path)

# Define Dataset class for sentence correction
class SentenceCorrectionDataset(Dataset):
    def __init__(self, incorrect_sentences, correct_sentences, tokenizer, max_len=128):
        self.incorrect_sentences = incorrect_sentences
        self.correct_sentences = correct_sentences
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.incorrect_sentences)

    def __getitem__(self, idx):
        incorrect = self.incorrect_sentences[idx]
        correct = self.correct_sentences[idx]

        # Add a prefix to indicate the task type
        input_text = incorrect
        target_text = correct

        # Tokenize input and target texts
        inputs = self.tokenizer(
            input_text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True, 
            return_tensors="pt"
        )

        targets = self.tokenizer(
            target_text, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True, 
            return_tensors="pt"
        )

        # Squeeze tensors to remove unnecessary dimensions
        input_ids = inputs.input_ids.squeeze()
        attention_mask = inputs.attention_mask.squeeze()
        labels = targets.input_ids.squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Create datasets using the small subset
train_dataset = SentenceCorrectionDataset(train_incorrect, train_correct, tokenizer)
valid_dataset = SentenceCorrectionDataset(valid_incorrect, valid_correct, tokenizer)
test_dataset = SentenceCorrectionDataset(test_incorrect, test_correct, tokenizer)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,  # Standard learning rate for fine-tuning
    per_device_train_batch_size=128,  # Adjust batch size to fit in GPU memory
    per_device_eval_batch_size=64,
    num_train_epochs=5,  # More epochs for better fine-tuning
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Mixed precision training if CUDA is available
)

# Define the Trainer
finetuned_model = MarianMTModel.from_pretrained(model_name).to(device)
trainer = Trainer(
    model=finetuned_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

# Train the Fine-Tuned Model using the subset
trainer.train()

# Save the Fine-Tuned Model
finetuned_model.save_pretrained("./sentence_correction_model")
tokenizer.save_pretrained("./sentence_correction_model")

# Evaluation Function
def evaluate_model(model, tokenizer, test_sentences, correct_sentences):
    model.eval()
    predicted_sentences = []

    with torch.no_grad():
        for incorrect_sentence in tqdm(test_sentences, desc="Evaluating Model"):
            # Prepare the input
            inputs = tokenizer(incorrect_sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128).to(device)

            # Generate the corrected sentence
            output_ids = model.generate(input_ids=inputs["input_ids"], max_length=128, num_beams=5, early_stopping=True)

            # Decode the output
            corrected_sentence = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            predicted_sentences.append(corrected_sentence)

    return predicted_sentences

# Evaluate the Non Fine-Tuned Model on the Test Dataset
non_finetuned_predictions = evaluate_model(non_finetuned_model, tokenizer, test_incorrect, test_correct)

# Evaluate the Fine-Tuned Model on the Test Dataset
finetuned_model = MarianMTModel.from_pretrained("./sentence_correction_model").to(device)
finetuned_predictions = evaluate_model(finetuned_model, tokenizer, test_incorrect, test_correct)


# Smoothing function
smoothing_function = SmoothingFunction().method1

# Function to Calculate Metrics
def calculate_metrics(predictions, references):
    total_bleu_score = 0
    total_gleu_score = 0

    for pred, ref in zip(predictions, references):
        # Tokenize the sentences
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        # Calculate BLEU and GLEU scores for the individual sentence
        total_bleu_score += sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing_function)
        total_gleu_score += sentence_gleu([ref_tokens], pred_tokens)

    # Average scores
    avg_bleu_score = total_bleu_score / len(predictions)
    avg_gleu_score = total_gleu_score / len(predictions)

    return avg_bleu_score, avg_gleu_score

# Get BLEU and GLEU Scores for Both Models
non_finetuned_bleu, non_finetuned_gleu = calculate_metrics(non_finetuned_predictions, test_correct)
finetuned_bleu, finetuned_gleu = calculate_metrics(finetuned_predictions, test_correct)

# Summary of Results
print("=== Model Comparison ===")
print(f"Non Fine-Tuned Model - BLEU Score: {non_finetuned_bleu}, GLEU Score: {non_finetuned_gleu}")
print(f"Fine-Tuned Model - BLEU Score: {finetuned_bleu}, GLEU Score: {finetuned_gleu}")

improvement_bleu = finetuned_bleu - non_finetuned_bleu
improvement_gleu = finetuned_gleu - non_finetuned_gleu

print(f"Improvement in BLEU Score: {improvement_bleu}")
print(f"Improvement in GLEU Score: {improvement_gleu}")

# Plotting the Graphical Comparison of Scores
models = ['Non Fine-Tuned', 'Fine-Tuned']
bleu_scores = [non_finetuned_bleu, finetuned_bleu]
gleu_scores = [non_finetuned_gleu, finetuned_gleu]

fig, ax = plt.subplots()

bar_width = 0.35
index = range(len(models))

# Plotting the bars
ax.bar(index, bleu_scores, bar_width, label='BLEU Score', color='b')
ax.bar([i + bar_width for i in index], gleu_scores, bar_width, label='GLEU Score', color='g')

# Adding details to the plot
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Comparison of BLEU and GLEU Scores for Non Fine-Tuned and Fine-Tuned Models')
ax.set_xticks([i + bar_width / 2 for i in index])
ax.set_xticklabels(models)
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()


  warn(f"Failed to load image Python extension: {e}")


Using device: cuda


Reading Incorrect Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3544697.96it/s]
Reading Correct Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3546149.44it/s]
Reading Incorrect Sentences: 100%|██████████| 521552/521552 [00:00<00:00, 3254969.23it/s]
Reading Correct Sentences: 100%|██████████| 521552/521552 [00:00<00:00, 3241286.71it/s]
Reading Incorrect Sentences: 100%|██████████| 13187/13187 [00:00<00:00, 3134260.04it/s]
Reading Correct Sentences: 100%|██████████| 13187/13187 [00:00<00:00, 3445050.57it/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB. GPU 0 has a total capacity of 19.50 GiB of which 890.94 MiB is free. Process 3037842 has 8.51 GiB memory in use. Including non-PyTorch memory, this process has 18.60 GiB memory in use. Of the allocated memory 14.74 GiB is allocated by PyTorch, and 3.69 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)