In [2]:
!pip install transformers
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import pandas as pd
import random
import re
from datasets import Dataset
from evaluate import load
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    BartForConditionalGeneration, BartTokenizer,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import torch


In [4]:
pip install evaluate



In [5]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [6]:
import requests

base_url = "https://raw.githubusercontent.com/google-research-datasets/wiki-split/master/"
files = ["tune.tsv", "validation.tsv", "test.tsv"]

for file in files:
    url = base_url + file
    response = requests.get(url)
    if response.status_code == 200:
        with open(file, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file}")
    else:
        print(f"Failed to download {file}")


Downloaded tune.tsv
Downloaded validation.tsv
Downloaded test.tsv


In [7]:
# Load datasets
train_df = pd.read_csv("tune.tsv", sep='\t', header=None, names=['original', 'split'])
val_df = pd.read_csv("validation.tsv", sep='\t', header=None, names=['original', 'split'])
test_df = pd.read_csv("test.tsv", sep='\t', header=None, names=['original', 'split'])

In [8]:
sentence = train_df.head(1)

print(sentence.to_string())

                                                                                                                                                             original                                                                                                                                                                      split
0  ' ( 1990 ) was the second sequel to appear , though Hooper did not return to direct due to scheduling conflicts with another film , '' Spontaneous Combustion '' .  ' ( 1990 ) was the second sequel to appear . <::::> Though Hooper did not return to direct due to scheduling conflicts with another film , '' Spontaneous Combustion '' .


In [9]:
# Keep only the original sentence column
train_sentences = train_df['original'].dropna().tolist()
val_sentences = val_df['original'].dropna().tolist()
test_sentences = test_df['original'].dropna().tolist()


In [10]:
# prompt: WHAT IS LENGTH OF EACH dataset

print(f"Length of train dataset: {len(train_sentences)}")
print(f"Length of validation dataset: {len(val_sentences)}")
print(f"Length of test dataset: {len(test_sentences)}")


Length of train dataset: 5000
Length of validation dataset: 5000
Length of test dataset: 5000


In [11]:
# prompt: print first item in train_sentences completely

train_sentences[0]


"' ( 1990 ) was the second sequel to appear , though Hooper did not return to direct due to scheduling conflicts with another film , '' Spontaneous Combustion '' ."

In [12]:
import random
import pandas as pd

# Map of keyboard neighbors for typo simulation
keyboard_adj = {
    'a': 'qwsz', 'b': 'vghn', 'c': 'xdfv', 'd': 'ersfcx',
    'e': 'wsdr', 'f': 'rtgdvc', 'g': 'tyfhvb', 'h': 'yugjnb',
    'i': 'ujko', 'j': 'uikhmn', 'k': 'ijolm', 'l': 'kop',
    'm': 'njk', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol',
    'q': 'wa', 'r': 'edft', 's': 'wedxza', 't': 'rfgy',
    'u': 'yhji', 'v': 'cfgb', 'w': 'qase', 'x': 'zsdc',
    'y': 'tghu', 'z': 'asx',
}

def introduce_typos(sentence, typo_prob=0.2):
    def typo(word):
        if (
            len(word) <= 3 or
            not word.isalpha() or
            word[0].isupper() or
            (len(word) > 1 and word[1].isupper()) or
            random.random() > typo_prob
        ):
            return word

        ops = ['delete', 'swap', 'replace', 'add', 'keyboard']
        op = random.choice(ops)
        i = random.randint(0, len(word) - 1)
        c = word[i].lower()

        if op == 'delete':
            return word[:i] + word[i+1:]
        elif op == 'swap' and i < len(word) - 1:
            return word[:i] + word[i+1] + word[i] + word[i+2:]
        elif op == 'replace':
            return word[:i] + random.choice('abcdefghijklmnopqrstuvwxyz') + word[i+1:]
        elif op == 'add':
            return word[:i] + random.choice('abcdefghijklmnopqrstuvwxyz') + word[i:]
        elif op == 'keyboard' and c in keyboard_adj:
            replacement = random.choice(keyboard_adj[c])
            return word[:i] + replacement + word[i+1:]
        return word

    return ' '.join([typo(w) for w in sentence.split()])


In [13]:
def generate_dataset(sentences, typo_prob=0.2, clean_percent=0.15, n_augmented=2):
    corrupted = []
    targets = []

    for sent in sentences:
        if random.random() < clean_percent:
            corrupted.append(sent)
            targets.append(sent)
        else:
            for _ in range(n_augmented):
                corrupted.append(introduce_typos(sent, typo_prob))
                targets.append(sent)

    return pd.DataFrame({'input': corrupted, 'target': targets})


train_data = generate_dataset(train_sentences, typo_prob=0.2, clean_percent=0.15)
val_data = generate_dataset(val_sentences, typo_prob=0.2, clean_percent=0.15)
test_data = generate_dataset(test_sentences, typo_prob=0.2, clean_percent=0.15)


In [14]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)


In [15]:
train_dataset[0] , val_dataset[0], test_dataset[0]

({'input': "' ( 1990 ) was the econd sequel to appear , though Hooper did not return to direct due to scheduling conflicts qith another film , '' Spontaneous Combustion '' .",
  'target': "' ( 1990 ) was the second sequel to appear , though Hooper did not return to direct due to scheduling conflicts with another film , '' Spontaneous Combustion '' ."},
 {'input': "' Lauren Rose Crace ' , born 25th of May 1986 in Birmingham , England , currently plays the part of '' Ronnie Mitchell 's '' long lost daughter Danielle Jones ( Amy ) in the soap opera '' EastEnders '' .",
  'target': "' Lauren Rose Crace ' , born 25th of May 1986 in Birmingham , England , currently plays the part of '' Ronnie Mitchell 's '' long lost daughter Danielle Jones ( Amy ) in the soap opera '' EastEnders '' ."},
 {'input': "' Bandolier - Budgie ' , a free iTunes app for iPad , iPhone and iPod touch , released in December 2011 , teols the story of the making of Bandolier in the band 's own words - incljding an extens

In [16]:

print(f"Length of train dataset: {len(train_dataset)}")
print(f"Length of validation dataset: {len(val_dataset)}")
print(f"Length of test dataset: {len(test_dataset)}")


Length of train dataset: 9204
Length of validation dataset: 9283
Length of test dataset: 9283


In [17]:
# prompt: keep first 1000 rows in test_dataset

test_dataset = test_dataset.select(range(1000))
print(f"Length of test dataset: {len(test_dataset)}")


Length of test dataset: 1000


In [18]:
# prompt: get the max length of the train and val dataset inputs

max_train_len = 0
for example in train_dataset:
    max_train_len = max(max_train_len, len(example['input']), len(example['target']))

max_val_len = 0
for example in val_dataset:
    max_val_len = max(max_val_len, len(example['input']), len(example['target']))

print(f"Max train length: {max_train_len}")
print(f"Max val length: {max_val_len}")


max_length = max(max_train_len, max_val_len)  # = 346


Max train length: 340
Max val length: 347


In [19]:
def preprocess(tokenizer, dataset):
    def tokenize(example):
        model_inputs = tokenizer(example["input"], max_length=128, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(example["target"], max_length=128, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    return dataset.map(tokenize, batched=True)


In [20]:
import os
def train_model(model_name, tokenizer_cls, model_cls):

    tokenizer = tokenizer_cls.from_pretrained(model_name)
    model = model_cls.from_pretrained(model_name)

    tokenized_train = preprocess(tokenizer, train_dataset)
    tokenized_val = preprocess(tokenizer, val_dataset)

    training_args = Seq2SeqTrainingArguments(
        output_dir=f"./{model_name}-finetuned-spell",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=3,
        predict_with_generate=True,
        logging_dir="./logs",
        fp16=True
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    try:
        model = model_cls.from_pretrained(f"./{model_name}-finetuned-spell")
        tokenizer = tokenizer_cls.from_pretrained(f"./{model_name}-finetuned-spell")
        # Load trainer state directly on instantiation.
        trainer = Seq2SeqTrainer(
            model=model,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
        )
        print("Loading saved model...")
        trainer.train(resume_from_checkpoint=os.path.join(f"./{model_name}-finetuned-spell/checkpoint-3750"))

        print("Returning")
        return trainer, tokenizer, model
    except Exception as e:
        print(f"Error loading saved model: {e}")
        pass # or handle the exception appropriately
    print("Training model...")
    trainer.train()

    # save the model, tokenizer and trainer
    model.save_pretrained(f"./{model_name}-finetuned-spell")
    tokenizer.save_pretrained(f"./{model_name}-finetuned-spell")
    trainer.save_state()


    return trainer, tokenizer, model


In [21]:
bart_trainer, bart_tokenizer, bart_model = train_model("facebook/bart-large", BartTokenizer, BartForConditionalGeneration)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Map:   0%|          | 0/9204 [00:00<?, ? examples/s]



Map:   0%|          | 0/9283 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Error loading saved model: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./facebook/bart-large-finetuned-spell.
Training model...




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkhaledibrahim[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.4532
1000,0.0427
1500,0.035
2000,0.0284
2500,0.0153
3000,0.0118




In [22]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 python-Levenshtein-0.27.1


In [23]:
import Levenshtein

def compute_levenshtein(predictions, references):
    distances = []
    for pred, ref in zip(predictions, references):
        dist = Levenshtein.distance(pred, ref)
        distances.append(dist)
    avg_distance = sum(distances) / len(distances)
    return {"avg_levenshtein_distance": avg_distance}


In [24]:
from evaluate import load
import Levenshtein

cer = load("cer")
wer = load("wer")

def evaluate_model_metrics_batched(trainer, tokenizer, dataset, batch_size=32):
    inputs = dataset['input']
    targets = dataset['target']

    preds = []
    for i in range(0, len(inputs), batch_size):
        batch_inputs = inputs[i:i+batch_size]
        tokenized = tokenizer(batch_inputs, return_tensors="pt", padding=True, truncation=True, max_length=128).to(trainer.model.device)
        output_ids = trainer.model.generate(**tokenized, max_length=128)
        batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds.extend(batch_preds)

    cer_score = cer.compute(predictions=preds, references=targets)
    wer_score = wer.compute(predictions=preds, references=targets)
    levenshtein_score = compute_levenshtein(preds, targets)

    return {
        "CER": cer_score,
        "WER": wer_score,
        "Levenshtein": levenshtein_score
    }, preds


bart_metrics, bart_preds = evaluate_model_metrics_batched(bart_trainer, bart_tokenizer, test_dataset)


Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [25]:
bart_metrics

{'CER': 0.024651637769164042,
 'WER': 0.03176504967855055,
 'Levenshtein': {'avg_levenshtein_distance': 4.489}}

In [26]:
def correct_sentence(model, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


user_input = "Ths is a sentnce with spleling errors."
print("User Input:", user_input)
print("BART Corrected:", correct_sentence(bart_model, bart_tokenizer, user_input))


User Input: Ths is a sentnce with spleling errors.
BART Corrected: Ths is a term with spelling errors.


In [27]:
user_input = "I haate Inter Milan fery much"
print("User Input:", user_input)
print("BART Corrected:", correct_sentence(bart_model, bart_tokenizer, user_input))

User Input: I haate Inter Milan fery much
BART Corrected: I hate Inter Milan very much


In [28]:
examples = [
    "I havw a dream about technology.",
    "The quick brown fox jumped over the lazey dog.",
    "Pleas correct this entire sentence with many typos."
]

for s in examples:
    print(f"\nOriginal:  {s}")
    print(f"Corrected: {correct_sentence(bart_model, bart_tokenizer, s)}")



Original:  I havw a dream about technology.
Corrected: I have a dream about technology.

Original:  The quick brown fox jumped over the lazey dog.
Corrected: The quick brown fox jumped over the lazy dog.

Original:  Pleas correct this entire sentence with many typos.
Corrected: Pleas correct this entire sentence with many typos.


In [29]:
for i in range(5):
    idx = random.randint(0, len(test_dataset))
    print(f"\nOriginal:  {test_dataset[idx]['input']}")
    print(f"Corrected: {correct_sentence(bart_model, bart_tokenizer, test_dataset[idx]['input'])}")


Original:  '' For All Time '' was formally released to radio on February 12 , 2002 .
Corrected: '' For All Time '' was formally released to radio on February 12 , 2002 .

Original:  Artist friends followed him , including painter and illustrator Maxfield Parrish , who designed and butlt his estate , the Oaks , in the area , and the surrouning are became the center of the popular Cornish Art Colony .
Corrected: Artist friends followed him , including painter and illustrator Maxfield Parrish , who designed and

Original:  According to Oswald , he met with four more Soviet officials that same day , who asked if he wanted to return to the United States ; he insisted to yhem that he wwnted to live in the Soviet Union as a Soviet national .
Corrected: According to Oswald , he met with four more Soviet officials that same day , who asked if

Original:  ABS - CBN also used what is probably the biggest touch screen display ever used in a Philippine television show as well as a huge WAR ( Wirel

In [30]:
original_grammer = "I have 3 shrits"
corrected_sentence = correct_sentence(bart_model, bart_tokenizer, original_grammer)
print(f"Original:  {original_grammer}")
print(f"Corrected: {corrected_sentence}")

Original:  I have 3 shrits
Corrected: I have 3 shrits


In [31]:
!pip install rapidfuzz

from rapidfuzz import fuzz

def fuzzy_ratio_test(dataset):

  ratios = []
  for example in dataset:
      ratio = fuzz.ratio(example['input'], example['target'])
      ratios.append(ratio)
  return ratios

fuzzy_ratios = fuzzy_ratio_test(test_dataset)

print(fuzzy_ratios[:10])

[98.93842887473461, 97.43589743589743, 100.0, 98.33729216152018, 99.0521327014218, 98.25581395348837, 98.84393063583815, 100.0, 99.73753280839895, 97.45762711864407]
