## Base model test

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import matplotlib as plt
# Load pre-trained model and tokenizer
model_name = 'vennify/t5-base-grammar-correction'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [5]:
# Tokenize input text
input_text = "She don't likes to eat vegetables."
tokens = tokenizer(input_text, return_tensors="pt")

# Make prediction
outputs = model.generate(**tokens)

# Convert token IDs back to words
corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Original text:", input_text)
print("Corrected text:", corrected_text)



Original text: She don't likes to eat vegetables.
Corrected text: She doesn't like to eat vegetables.


In [7]:
!pwd

/content


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Fine Tuning

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Using device: cuda


In [None]:
def preprocess_function(examples):
    input_texts = ["grammar: " + sentence for sentence in examples['incorrect']]
    target_texts = [sentence for sentence in examples['correct']]
    model_inputs = tokenizer(input_texts, max_length=128, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_texts, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# load dataset
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/Ted_talk_sentences.csv')
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# load the model and tokenizer
model_name = 'vennify/t5-base-grammar-correction'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = model.to(device)

# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

# train the model
trainer.train()

metrics = trainer.state.log_history

# Extract the evaluation metrics
eval_metrics = [x for x in metrics if 'eval_loss' in x]

# Separate the accuracy and loss for both pre-trained and fine-tuned models
pretrained_accuracy = []
pretrained_loss = []
finetuned_accuracy = []
finetuned_loss = []

for metric in eval_metrics:
    if 'pretrained' in metric:
        pretrained_accuracy.append(metric['eval_accuracy'])
        pretrained_loss.append(metric['eval_loss'])
    else:
        finetuned_accuracy.append(metric['eval_accuracy'])
        finetuned_loss.append(metric['eval_loss'])

# Plotting
epochs = range(1, len(pretrained_accuracy) + 1)

plt.figure(figsize=(12, 6))
plt.plot(epochs, pretrained_accuracy, 'b--', label='Pre-trained Model Accuracy')
plt.plot(epochs, finetuned_accuracy, 'b-', label='Fine-tuned Model Accuracy')
plt.plot(epochs, pretrained_loss, 'r--', label='Pre-trained Model Loss')
plt.plot(epochs, finetuned_loss, 'r-', label='Fine-tuned Model Loss')
plt.title('Accuracy and Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Metrics')
plt.legend()
plt.show()

# Save the model
#model.save_pretrained("./trained_grammar")
model.to('cpu')
model.save_pretrained('./fine_tuned_grammar_model', saved_model=True)

<img src="../Data/grammar_loss.png" height="500"/>

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Read input text from file
with open('../Data/myvoice_transcription.txt', 'r', encoding='utf-8') as file:
    input_text = file.read().strip()

# Tokenize input text into sentences
sentences = sent_tokenize(input_text)

corrected_sentences = []
for sentence in sentences:
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)

    # Set max_length for generation, you can adjust this value
    outputs = model.generate(**tokens, max_length=tokenizer.model_max_length)

    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    corrected_sentences.append(corrected_sentence)

# Combine corrected sentences
predicted_text = ' '.join(corrected_sentences)

print("Original text:", input_text)
print("Corrected text:", predicted_text)

Original text: there don't sien to be any firm ruse above houseman demands correspond to follower accounts is the grantel taquon that the pogram is his early stage and that is still experimenting with the foremad we are continuing to test the payments as we roll up to more creatious and except the twillian san chentel flacture v
Corrected text: There don't seem to be any firm ruse above houseman demands correspond to follower accounts is the grantel taquon that the program is his early stage and that is still experimenting with the foremad we are continuing to test the payments as we roll up to more creative and except the twillian san chentel flacture vs. the twillian san chentel.


In [None]:
import numpy as np

def calculate_wer(reference, hypothesis):
    r = reference.split()
    h = hypothesis.split()
    # Building the matrix
    d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i
    # Calculation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d[len(r)][len(h)] / float(len(r))


In [None]:
def calculate_cer(reference, hypothesis):
    r = reference
    h = hypothesis
    # Building the matrix
    d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i
    # Calculation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d[len(r)][len(h)] / float(len(r))


In [None]:
# Read ground truth text from file
with open('../Data/ground-truth.txt', 'r', encoding='utf-8') as file:
    ground_truth_text = file.read().strip()

# Calculate WER and CER
wer = calculate_wer(ground_truth_text, predicted_text)
cer = calculate_cer(ground_truth_text, predicted_text)

print("Word Error Rate:", wer)
print("Character Error Rate:", cer)

Word Error Rate: 0.55
Character Error Rate: 0.33516483516483514
