In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import matplotlib.pyplot as plt
from datasets import load_dataset
# Load pre-trained model and tokenizer
model_name = 'oliverguhr/spelling-correction-english-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [2]:
# Tokenize input text
input_text = "the shop cloed due to covid 19"
tokens = tokenizer(input_text, return_tensors="pt")

# Make prediction
outputs = model.generate(**tokens)

# Convert token IDs back to words
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Original text:", input_text)
print("Corrected text:", predicted_text)

Original text: the shop cloed due to covid 19
Corrected text: The shop closed due to Covid 19.




In [None]:
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/Ted_talk_sentences.csv')


# Preprocess the dataset
def preprocess_function(examples):
    inputs = [doc for doc in examples['incorrect']]
    targets = [doc for doc in examples['correct']]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer
)


train_result = trainer.train()

model.save_pretrained('./fine_tuned_spelling_model', saved_model=True)
# After training, plot the training and validation loss
training_loss = train_result.training_loss
validation_loss = [log['eval_loss'] for log in trainer.state.log_history if 'eval_loss' in log]

# Let's assume these are your pre-trained model metrics for plotting purposes
pretrained_accuracy = [0.85 - i*0.025 for i in range(10)]  # Mock data
pretrained_loss = [0.6 - i*0.04 for i in range(10)]        # Mock data


# Plotting the accuracy and loss
epochs = range(1, 11)
plt.figure(figsize=(14, 7))

# Accuracy subplot
plt.subplot(1, 2, 1)
plt.plot(epochs, pretrained_accuracy, 'b--', label='Pre-trained Model Accuracy (Curved)')
plt.plot(epochs, [1-l for l in validation_loss], 'b', label='Fine-tuned Model Accuracy (Curved)')  # Mock accuracy from loss
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Accuracy')
plt.legend()



In [9]:
import nltk
from nltk.tokenize import sent_tokenize

# Read input text from file
with open('../Data/myvoice_transcription.txt', 'r', encoding='utf-8') as file:
    input_text = file.read().strip()

# Tokenize input text into sentences
sentences = sent_tokenize(input_text)

corrected_sentences = []
for sentence in sentences:
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
    
    # Set max_length for generation, you can adjust this value
    outputs = model.generate(**tokens, max_length=tokenizer.model_max_length)

    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    corrected_sentences.append(corrected_sentence)

# Combine corrected sentences
predicted_text = ' '.join(corrected_sentences)

print("Original text:", input_text)
print("Corrected text:", predicted_text)

Original text: there don't sien to be any firm ruse above houseman demands correspond to follower accounts is the grantel taquon that the pogram is his early stage and that is still experimenting with the foremad we are continuing to test the payments as we roll up to more creatious and except the twillian san chentel flacture v
Corrected text: There don't seem to be any firm rules above houseman demands correspond to follower accounts. is the grantel taquon that the program is his early stage, and that is still experimenting with the foremad. We are continuing to test the payments as we roll up to more creative and except the twillian san chentel flacture. .


In [10]:
import numpy as np

def calculate_wer(reference, hypothesis):
    r = reference.split()
    h = hypothesis.split()
    # Building the matrix
    d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0: 
                d[0][j] = j
            elif j == 0: 
                d[i][0] = i
    # Calculation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d[len(r)][len(h)] / float(len(r))


In [11]:
def calculate_cer(reference, hypothesis):
    r = reference
    h = hypothesis
    # Building the matrix
    d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i
    # Calculation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d[len(r)][len(h)] / float(len(r))


In [12]:
# Read ground truth text from file
with open('../Data/ground-truth.txt', 'r', encoding='utf-8') as file:
    ground_truth_text = file.read().strip()

# Calculate WER and CER
wer = calculate_wer(ground_truth_text, predicted_text)
cer = calculate_cer(ground_truth_text, predicted_text)

print("Word Error Rate:", wer)
print("Character Error Rate:", cer)

Word Error Rate: 0.5333333333333333
Character Error Rate: 0.3159340659340659
