In [1]:
!pip install -q transformers datasets evaluate sacrebleu jiwer pandas torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [15]:
from transformers import T5TokenizerFast
from transformers import T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from datasets import Dataset, DatasetDict
import random
import string
import numpy as np
import pandas as pd
import torch
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import evaluate
from sklearn.model_selection import train_test_split

#Cell 2: Load Dataset Files

In [16]:
# Cell 2: Load Dataset Files (using dictionaries)
def load_sentences(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, usecols=[0])
    return {i: sent for i, sent in enumerate(df[0].tolist())}

train_sentences = load_sentences('tune.tsv')
val_sentences = load_sentences('validation.tsv')
test_sentences = load_sentences('test.tsv')

print(f"Loaded {len(train_sentences)} training sentences")
print(f"Loaded {len(val_sentences)} validation sentences")
print(f"Loaded {len(test_sentences)} test sentences")

Loaded 5000 training sentences
Loaded 5000 validation sentences
Loaded 5000 test sentences


#Cell 3: Error Generation Functions


In [17]:
# Cell 3: Error Generation Functions
def introduce_errors(sentence, min_errors=3, max_errors=5):
    words = sentence.split()
    if not words:
        return sentence

    current_words = list(words)
    num_errors = random.randint(min_errors, max_errors)

    for _ in range(num_errors):
        if not current_words:
            break

        word_idx = random.randint(0, len(current_words) - 1)
        word = current_words[word_idx]
        if len(word) < 1:
            continue

        op = random.choice(["delete", "insert", "substitute", "transpose", "duplicate"])

        if op == "delete" and len(word) > 0:
            pos = random.randint(0, len(word)-1)
            word = word[:pos] + word[pos+1:]
        elif op == "insert":
            pos = random.randint(0, len(word))
            word = word[:pos] + random.choice(string.ascii_lowercase) + word[pos:]
        elif op == "substitute":
            pos = random.randint(0, len(word)-1)
            word = word[:pos] + random.choice(string.ascii_lowercase) + word[pos+1:]
        elif op == "transpose" and len(word) > 1:
            pos = random.randint(0, len(word)-2)
            word = word[:pos] + word[pos+1] + word[pos] + word[pos+2:]
        elif op == "duplicate":
            current_words.insert(word_idx + 1, word)
            continue

        current_words[word_idx] = word

    return ' '.join(current_words)

def generate_pairs(sentences_dict, versions=2):
    pairs = []
    for sent_id, sent in sentences_dict.items():
        for _ in range(versions):
            corrupted = introduce_errors(sent)
            if corrupted != sent:
                pairs.append({
                    'id': f"{sent_id}_{_}",
                    'input_text': corrupted,
                    'target_text': sent
                })
    return pairs

train_pairs = generate_pairs(train_sentences)
val_pairs = generate_pairs(val_sentences, versions=1)
test_pairs = generate_pairs(test_sentences, versions=1)

print(f"Generated {len(train_pairs)} training pairs")
print(f"Generated {len(val_pairs)} validation pairs")
print(f"Generated {len(test_pairs)} test pairs")

Generated 10000 training pairs
Generated 5000 validation pairs
Generated 5000 test pairs


#Cell 3.1: Create DatasetDict

In [18]:
datasets = DatasetDict({
    'train': Dataset.from_list(train_pairs),
    'validation': Dataset.from_list(val_pairs),
    'test': Dataset.from_list(test_pairs)
})

print(datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'input_text', 'target_text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'input_text', 'target_text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'input_text', 'target_text'],
        num_rows: 5000
    })
})


#Cell 4: Tokenization and Data Preparation

In [24]:
# Cell 4: Tokenization and Data Preparation
MODEL_NAME = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    # Tokenize input and target sequences
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding="max_length" # Added padding here
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length" # Added padding here
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization using Dataset.map()
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# The data collator will handle batching and converting to tensors
# No need to manually create TensorDatasets

print("Tokenized dataset structure:")
print(tokenized_datasets)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenized dataset structure:
DatasetDict({
    train: Dataset({
        features: ['id', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})


#Cell 5: Initialize Model and Metrics


In [25]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

wer_metric = evaluate.load("wer")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

    return {"wer": wer, "bleu": bleu["score"]}

#Cell 6: Training Configuration



In [28]:
# Cell 6: Training Configuration

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="t5_spelling_correction",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # Use the tokenized datasets instead of the original TensorDatasets
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


#Cell 7: Training and Evaluation



In [29]:
# Cell 8: Training and Evaluation

trainer.train()

test_results = trainer.evaluate(
    # Use the tokenized test dataset
    eval_dataset=tokenized_datasets["test"],
    metric_key_prefix="test"
)

print("\nTest Results:")
print(f"WER: {test_results['test_wer']:.4f}")
print(f"BLEU: {test_results['test_bleu']:.4f}")

trainer.save_model("t5_small_spelling_correction")
print("Model saved successfully!")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Wer,Bleu
500,1.0477,0.122922,0.633578,17.600643
1000,0.1403,0.103471,0.610293,20.327078
1500,0.124,0.096333,0.603981,21.089498
2000,0.1155,0.09222,0.60058,21.498507
2500,0.1108,0.090253,0.599753,21.584418
3000,0.1074,0.088748,0.598886,21.698682
3500,0.105,0.087849,0.598138,21.779551


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Test Results:
WER: 0.5999
BLEU: 21.7090
Model saved successfully!


#Cell 9: Example Corrections






In [30]:
print("Displaying predictions using `trainer.model`...\n")
print("--- Example Predictions on Test Set (first 10 examples) ---")
print("------------------------------------------------------------")

for i in range(10):
    example = datasets["test"][i]
    inputs = tokenizer(example["input_text"], return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=128)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Example {i+1}:")
    print(f"  Input (Misspelled)   : {repr(example['input_text'])}")
    print(f"  Predicted (Corrected): {repr(prediction)}")
    print(f"  Actual (Correct)     : {repr(example['target_text'])}")
    print("------------------------------------------------------------")

Displaying predictions using `trainer.model`...

--- Example Predictions on Test Set (first 10 examples) ---
------------------------------------------------------------
Example 1:
  Input (Misspelled)   : "' Bandolier - Budgie ' , a free iTunes app for iPad , iPhone and iPod touch , released n December 2011 , tells the story of the making of Bandolier in the band s' own words r- including an extensive audio interview with Burke Shelley ."
  Predicted (Corrected): "' Bandolier - Budgie ', a free iTunes app for iPad, iPhone and iPod touch, released in December 2011, tells the story of the making of Bandolier in the band s own words - including an extensive audio interview with Burke Shelley."
  Actual (Correct)     : "' Bandolier - Budgie ' , a free iTunes app for iPad , iPhone and iPod touch , released in December 2011 , tells the story of the making of Bandolier in the band 's own words - including an extensive audio interview with Burke Shelley ."
------------------------------------