In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load pre-trained model and tokenizer
model_name = 'vennify/t5-base-grammar-correction'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [2]:
# Tokenize input text
input_text = "She don't likes to eat vegetables."
tokens = tokenizer(input_text, return_tensors="pt")

# Make prediction
outputs = model.generate(**tokens)

# Convert token IDs back to words
corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Original text:", input_text)
print("Corrected text:", corrected_text)

Original text: She don't likes to eat vegetables.
Corrected text: She doesn't like to eat vegetables.




In [3]:
# Read input text from file
with open('../Data/myvoice_transcription.txt', 'r', encoding='utf-8') as file:
    input_text = file.read().strip()

# Tokenize input text
tokens = tokenizer(input_text, return_tensors="pt")

# Make prediction
outputs = model.generate(**tokens)

# Convert token IDs back to words
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Original text:", input_text)
print("Corrected text:", predicted_text)

Original text: there don't sien to be any firm ruse above houseman demands correspond to follower accounts is the grantel taquon that the pogram is his early stage and that is still experimenting with the foremad we are continuing to test the payments as we roll up to more creatious and except the twillian san chentel flacture v
Corrected text: There don't seem to be any firm ruse above houseman demands correspond to follow


In [4]:
import numpy as np

def calculate_wer(reference, hypothesis):
    r = reference.split()
    h = hypothesis.split()
    # Building the matrix
    d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0: 
                d[0][j] = j
            elif j == 0: 
                d[i][0] = i
    # Calculation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d[len(r)][len(h)] / float(len(r))


In [5]:
def calculate_cer(reference, hypothesis):
    r = reference
    h = hypothesis
    # Building the matrix
    d = np.zeros((len(r)+1)*(len(h)+1), dtype=np.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i
    # Calculation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d[len(r)][len(h)] / float(len(r))


In [6]:
# Read ground truth text from file
with open('../Data/ground-truth.txt', 'r', encoding='utf-8') as file:
    ground_truth_text = file.read().strip()

# Calculate WER and CER
wer = calculate_wer(ground_truth_text, predicted_text)
cer = calculate_cer(ground_truth_text, predicted_text)

print("Word Error Rate:", wer)
print("Character Error Rate:", cer)

Word Error Rate: 0.8666666666666667
Character Error Rate: 0.23351648351648352
