# Exercise on n-gram language models

Students:

Dimopoulos Vasileios, Christos Katrinakis, Giannis Trantalidis

In [1]:
import nltk
nltk.download('punkt')
nltk.download('webtext')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vassi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\vassi\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

For the purpose of this exercise we used a corpus from the NLTK corpus library, named "Web Text" including web text and messages from the Firefox discussion forums. As we see below, this corpus includes 25733 sentences and comes in a form of tokenized sentences, using webtext.sents().

In [2]:
from nltk.corpus import webtext
len(webtext.sents())

25733

In [3]:
webtext.raw()



In [4]:
sentences = webtext.sents()

In [5]:
for sent in sentences[:10]:
    print(sent)
    print("_________________")

['Cookie', 'Manager', ':', '"', 'Don', "'", 't', 'allow', 'sites', 'that', 'set', 'removed', 'cookies', 'to', 'set', 'future', 'cookies', '"', 'should', 'stay', 'checked', 'When', 'in', 'full', 'screen', 'mode', 'Pressing', 'Ctrl', '-', 'N', 'should', 'open', 'a', 'new', 'browser', 'when', 'only', 'download', 'dialog', 'is', 'left', 'open', 'add', 'icons', 'to', 'context', 'menu', 'So', 'called', '"', 'tab', 'bar', '"', 'should', 'be', 'made', 'a', 'proper', 'toolbar', 'or', 'given', 'the', 'ability', 'collapse', '/', 'expand', '.']
_________________
['[', 'XUL', ']', 'Implement', 'Cocoa', '-', 'style', 'toolbar', 'customization', '.']
_________________
['#', 'ifdefs', 'for', 'MOZ_PHOENIX', 'customize', 'dialog', "'", 's', 'toolbar', 'has', 'small', 'icons', 'when', 'small', 'icons', 'is', 'not', 'checked', 'nightly', 'builds', 'and', 'tinderboxen', 'for', 'Phoenix', 'finish', 'tearing', 'prefs', 'UI', 'to', 'pieces', 'and', 'then', 'make', 'it', 'not', 'suck', '"', 'mozbrowser', '"', 

We split the total sentences into two sets, training and test, for the purpose of the models' training.

In [6]:
from sklearn.model_selection import train_test_split

split_ratio = 0.8
train_set, test_set = train_test_split(sentences, test_size=1 - split_ratio, random_state=42)

We can see below the most frequent words found in our corpus, sorted in a descending order.

In [7]:
from collections import Counter

flat_words = [word for sentence in train_set for word in sentence]
word_freq = Counter(flat_words)
for word, freq in word_freq.items():
    print(f"{word}: {freq} times")

***: 402 times
Very: 191 times
fine: 111 times
,: 9850 times
elegant: 42 times
scented: 3 times
balanced: 39 times
.: 14194 times
Mother: 65 times
:: 11335 times
No: 649 times
way: 193 times
...: 1259 times
She: 228 times
is: 2327 times
singing: 35 times
into: 263 times
the: 5843 times
saw: 64 times
?: 3480 times
Not: 179 times
Rated: 37 times
light: 29 times
and: 3288 times
delicate: 8 times
-: 2660 times
decent: 22 times
but: 902 times
a: 4630 times
bit: 213 times
attenuated: 1 times
Rich: 23 times
sweet: 45 times
palate: 95 times
Woman: 525 times
#: 2987 times
1: 1834 times
Omigod: 6 times
!: 3256 times
In: 81 times
morning: 51 times
8: 104 times
am: 182 times
while: 190 times
I: 6213 times
': 8632 times
m: 933 times
takin: 3 times
shit: 242 times
all: 680 times
fuckin: 44 times
day: 128 times
Chick: 402 times
Because: 55 times
not: 1983 times
in: 3382 times
love: 127 times
with: 1556 times
him: 342 times
Is: 116 times
it: 2089 times
Connecticut: 4 times
Teen: 431 times
girl: 965 ti

We will keep words found more than 10 times in our corpus to create our vocabulary

In [8]:
filtered_words = {word: freq for word, freq in word_freq.items() if freq > 10}
vocabulary = filtered_words.keys()
vocabulary = list(vocabulary)
vocabulary.append('<e>')
vocabulary[:10]

['***', 'Very', 'fine', ',', 'elegant', 'balanced', '.', 'Mother', ':', 'No']

We create the function replace_oov_with_unk to replace all the out-of-vocabulary-words of our training and test sets with the special token **UNK**.

In [9]:
def replace_oov_with_unk(vocabulary, sentences):
    replaced_sentences = []

    for sentence in sentences:
        replaced_sentence = [word if word in vocabulary else "*UNK*" for word in sentence]
        replaced_sentences.append(replaced_sentence)

    return replaced_sentences

In [10]:
train_set_2 = replace_oov_with_unk(vocabulary, train_set)
train_set_2[:3]

[['***', 'Very', 'fine', ',', 'elegant', ',', '*UNK*', ',', 'balanced', '.'],
 ['Mother',
  ':',
  'No',
  'way',
  '...',
  'She',
  'is',
  'singing',
  'into',
  'the',
  'saw',
  '?'],
 ['Not',
  'Rated',
  'Very',
  'light',
  'and',
  '*UNK*',
  '-',
  'decent',
  'but',
  'a',
  'bit',
  '*UNK*',
  '.']]

In [11]:
test_set_2 = replace_oov_with_unk(vocabulary, test_set)
test_set_2[:3]

[['and', 'looking', 'for', '*UNK*', '/', 'friendship', '.'],
 ['And',
  'here',
  'in',
  '*UNK*',
  '*UNK*',
  ',',
  'we',
  'have',
  'but',
  'one',
  '*UNK*',
  'for',
  'setting',
  '*UNK*',
  'the',
  '*UNK*',
  '-',
  '*UNK*',
  '*UNK*',
  '.'],
 ['FATHER', ':', 'Right', '.']]

Below we train our ngram models (unigram, bigram and trigram) and print the 5 most frequent word combinations of each model. 

In [12]:
from collections import Counter
from nltk.util import ngrams
import pprint

N = 5

unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in train_set_2:
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>',right_pad_symbol='<e>') ])

In [13]:
print("Most Frequent Unigrams:")    
print(unigram_counter.most_common(N))
print("------------------------------------------------")
print("Most Frequent Bigrams:") 
print(bigram_counter.most_common(N))
print("------------------------------------------------")
print("Most Frequent Trigrams:") 
print(trigram_counter.most_common(N))

Most Frequent Unigrams:
[(('*UNK*',), 37459), (('.',), 14194), ((':',), 11335), ((',',), 9850), (("'",), 8632)]
------------------------------------------------
Most Frequent Bigrams:
[(('.', '<e>'), 12872), (('*UNK*', '*UNK*'), 5230), (('*UNK*', '.'), 4073), (('?', '<e>'), 3477), (('!', '<e>'), 3255)]
------------------------------------------------
Most Frequent Trigrams:
[(('.', '<e>', '<e>'), 12872), (('*UNK*', '.', '<e>'), 3701), (('?', '<e>', '<e>'), 3477), (('!', '<e>', '<e>'), 3255), (('<s>', '<s>', '*UNK*'), 2979)]


## Bigram and Trigram Cross Entropy and Perplexity

We calculate our bigrams LM Cross Entropy and Perplexity, using Laplace a-smoothing (a = 0.001) for the calculations of the bigram probabilities. We also ignore the probabilities including the start token (P(*start*|…)).

In [14]:
import math
from itertools import pairwise

alpha = 0.001
vocab_size = len(set(vocabulary))
sum_prob = 0
bigram_cnt = 0

for sent in test_set_2:
    sent = ['<s>'] + sent + ['<e>']
    for first_token, second_token in pairwise(sent):
        bigram_prob = (bigram_counter[(first_token, second_token)] + alpha) / (unigram_counter[(first_token,)] + alpha*vocab_size)
        sum_prob += math.log2(bigram_prob)
        bigram_cnt += 1

HC = -sum_prob / bigram_cnt
perpl = math.pow(2, HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("perplexity: {0:.3f}".format(perpl))

Cross Entropy: 5.366
perplexity: 41.253


We do the same for our trigram model ignoring again P(*start1*|…), P(*start2*|…).

In [15]:
from more_itertools import windowed

sum_prob = 0
trigram_cnt = 0

for sent in test_set_2:
    sent = ['<s>'] + ['<s>'] + sent + ['<e>']

    for first_token, second_token, third_token in windowed(sent, n=3):
        trigram_prob = (trigram_counter[(first_token, second_token, third_token)] + alpha) / (bigram_counter[(first_token, second_token)] + alpha*vocab_size)
        sum_prob += math.log2(trigram_prob)
        trigram_cnt+=1

HC = -sum_prob / trigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("perplexity: {0:.3f}".format(perpl))

Cross Entropy: 6.351
perplexity: 81.629


## Text Auto-Completion

Below we include code that generates the most probable next words, using our bigram and trigram frequencies, to complete the missing words of a sentence.

In [16]:
def generate_candidates_bi(state):
    # Given state , generate possible next words
    last_word = state[-1]
    next_words = [word for (prev_word, word) in bigram_counter if prev_word == last_word]
    my_list = [state + [next_word] for next_word in next_words]
    return my_list

def generate_candidates_tri(state):
    # Given state , generate possible next words
    last_word = state[-1]
    last_2_word = state[-2]
    next_words = [word for (prev_2_word, prev_word, word) in trigram_counter if (prev_word == last_word and prev_2_word == last_2_word)]
    my_list = [state + [next_word] for next_word in next_words]
    return my_list

def score_bi(state):
    # Calculate the probability of the word sequence using the bigram model
    probability = 0.0
    for i in range(1, len(state)):
        prev_word, word = state[i-1], state[i]
        probability += math.log2(bigram_counter.get((prev_word, word), 0.0) + 0.001)
    return probability

def score_tri(state):
    # Calculate the probability of the word sequence using the bigram model
    probability = 0.0
    for i in range(2, len(state)):
        prev_2_word, prev_word, word = state[i-2],state[i-1], state[i]
        probability += math.log2(trigram_counter.get((prev_2_word, prev_word, word), 0.0) + 0.001)
    return probability

def beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_fn, score_fn):
    candidates = [(initial_state, 0.0)]
    string_to_ignore = "*UNK*"
    
    for depth in range(max_depth):
        new_candidates = []
        for candidate, prob in candidates:
            for next_state in generate_candidates_fn(candidate):
                if string_to_ignore in next_state:
                    continue
                else:
                    new_prob = prob + score_fn(next_state)
                    new_candidates.append((next_state, new_prob))

        new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
        '''print('\n***** NEW candidates *****')
        pprint(new_candidates)
        new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
        pprint('***** Sorted')
        pprint(new_candidates)'''
        print(f'***** Chosen candidates (top-{beam_width})')
        candidates = new_candidates[:beam_width]
        print(candidates)

    best_sequence, best_prob = max(candidates, key=lambda x: x[1])
    return best_sequence

### Bigram auto-completion example

In [17]:
initial_state = "<s> try to help"
initial_state = initial_state.split()
max_depth = 15
beam_width = 2
best_sequence = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_bi, score_bi)

print("Best sequence:", best_sequence[1:])  # Excluding the "<start>" token

***** Chosen candidates (top-2)
[(['<s>', 'try', 'to', 'help', 'me'], 2.9966432875315396), (['<s>', 'try', 'to', 'help', 'you'], 2.20820261153887)]
***** Chosen candidates (top-2)
[(['<s>', 'try', 'to', 'help', 'me', '.'], 13.25068384710427), (['<s>', 'try', 'to', 'help', 'me', ','], 12.838789171127178)]
***** Chosen candidates (top-2)
[(['<s>', 'try', 'to', 'help', 'me', '.', '<e>'], 37.15667312948054), (['<s>', 'try', 'to', 'help', 'me', ',', 'I'], 32.68375147646471)]
***** Chosen candidates (top-2)
[(['<s>', 'try', 'to', 'help', 'me', ',', 'I', "'"], 62.908093231977475), (['<s>', 'try', 'to', 'help', 'me', ',', 'I', 'don'], 61.24981638921399)]
***** Chosen candidates (top-2)
[(['<s>', 'try', 'to', 'help', 'me', ',', 'I', "'", 't'], 104.5821000417672), (['<s>', 'try', 'to', 'help', 'me', ',', 'I', "'", 's'], 104.46447209546972)]
***** Chosen candidates (top-2)
[(['<s>', 'try', 'to', 'help', 'me', ',', 'I', "'", 's', 'a'], 153.78240883975536), (['<s>', 'try', 'to', 'help', 'me', ',', 

### Trigram auto-completion example

In [18]:
initial_state = "<s> <s> try to help"
initial_state = initial_state.split()
max_depth = 6
beam_width = 2
best_sequence = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_tri, score_tri)

print("Best sequence:", best_sequence[1:])  # Excluding the "<start>" token

***** Chosen candidates (top-2)
[(['<s>', '<s>', 'try', 'to', 'help', 'you'], -28.89663168674261), (['<s>', '<s>', 'try', 'to', 'help', ','], -28.89663168674261)]
***** Chosen candidates (top-2)
[(['<s>', '<s>', 'try', 'to', 'help', 'you', '?'], -56.20782005454902), (['<s>', '<s>', 'try', 'to', 'help', 'you', 'with'], -56.79254220624156)]
***** Chosen candidates (top-2)
[(['<s>', '<s>', 'try', 'to', 'help', 'you', '?', '<e>'], -77.23358770752148), (['<s>', '<s>', 'try', 'to', 'help', 'you', 'with', 'anything'], -84.68701075156662)]
***** Chosen candidates (top-2)
[(['<s>', '<s>', 'try', 'to', 'help', 'you', '?', '<e>', '<e>'], -86.49572759384114), (['<s>', '<s>', 'try', 'to', 'help', 'you', 'with', 'anything', 'else'], -112.58003732271776)]
***** Chosen candidates (top-2)
[(['<s>', '<s>', 'try', 'to', 'help', 'you', 'with', 'anything', 'else', '.'], -138.8876205749327), (['<s>', '<s>', 'try', 'to', 'help', 'you', 'with', 'anything', 'else', '?'], -140.471621919695)]
***** Chosen candid

##  Context-aware spelling corrector

We used our ngram models and the normalized Levenshtein edit distance measure, to predict which is the actual word the user meant to type, for each word of a sentence. Specifically, the word chosen was the one that minimized: -log2P(bigram or trigram probability of word with previous words selected) + log2(edit distance of actual word with candidate word). All words of the vocabulary are checked as candidates each time and a beam search method is used that, for each step, selects the top 2 candidates that minimize the score shown above. That way for each word of the mispelled sentence, the top 2 candidates are selected and in the final step the best scored sentence is choosed. We us l1, l2 to balance the importance of the two factors (bigram prob and edit distance) and tune them to get better performance.

In [19]:
import Levenshtein
from itertools import product

def normalize_dict(original_dict):
    # Extract values from the dictionary
    values = list(original_dict.values())

    # Calculate min and max values
    min_value = min(values)
    max_value = max(values)

    # Normalize each value in the dictionary
    normalized_dict = {key: (value - min_value) / (max_value - min_value) for key, value in original_dict.items()}

    return normalized_dict

def generate_candidates_levenshtein_bi(state,word_list):
    # Given state , generate possible next words
    last_word = state[-1]
    distances = {(word, voc_token):Levenshtein.distance(word, voc_token) for word, voc_token in product(word_list, vocabulary)}
    next_words = [word for (prev_word, word) in bigram_counter if prev_word == last_word]
    return [state + [next_word] for next_word in next_words], distances

def score_levenshtein_bi(state,distance):
    l1 = 0.4
    l2 = 0.6
    probability = 0.0
    for i in range(1, len(state)):
        prev_word, word = state[i-1], state[i]
        bigram_prob = (bigram_counter[(prev_word, word)] + 0.001) / (unigram_counter[(prev_word,)] + 0.001*vocab_size)
        probability += -l1*math.log2(bigram_prob) + l2*math.log2(distance + 0.001)
    return probability


def beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_fn, score_fn,word_list):
    candidates = [(initial_state, 0.0)]
    string_to_ignore = "*UNK*"
    string_to_ignore_2 = "<e>"
    
    j=0
    for depth in range(max_depth):
        new_candidates = []
        for candidate, prob in candidates:
            next_state,distances = generate_candidates_fn(candidate,word_list)
            distances = normalize_dict(distances)
            for i,_ in enumerate(next_state):
                if (string_to_ignore in next_state[i]) or (string_to_ignore_2 in next_state[i]):
                    continue
                else:
                    new_prob = prob + score_fn(next_state[i],distances[(word_list[j],next_state[i][-1])])
                    new_candidates.append((next_state[i], new_prob))
        j += 1

        new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=False)
        candidates = new_candidates[:beam_width]
    if candidates == []:
        best_sequence = []
        return best_sequence 
    best_sequence, best_prob = min(candidates, key=lambda x: x[1])
    return best_sequence

### Type 1 error example (wrong spelling)

In [20]:
word_list_1st_type = 'Noot Rted Ver ligt'
word_list_1st_type = word_list_1st_type.split()
word_list_1st_type

['Noot', 'Rted', 'Ver', 'ligt']

### Type 2 error example (logical error)

In [21]:
word_list_2nd_type = 'He is going two the class'
word_list_2nd_type = word_list_2nd_type.split()
word_list_2nd_type

['He', 'is', 'going', 'two', 'the', 'class']

### Bigram spelling corrector examples

In [22]:
initial_state = "<s>"
initial_state = initial_state.split()
max_depth = 4
beam_width = 2
best_sequence_1 = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_levenshtein_bi, score_levenshtein_bi,word_list_1st_type)
print("Best sequence:", best_sequence_1[1:])  # Excluding the "<start>" token
max_depth = 6
best_sequence_2 = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_levenshtein_bi, score_levenshtein_bi,word_list_2nd_type)
print("Best sequence:", best_sequence_2[1:])  # Excluding the "<start>" token

Best sequence: ['Not', 'Rated', 'Very', 'light']
Best sequence: ['He', 'is', 'going', 'to', 'the', 'class']


In [23]:
def generate_candidates_levenshtein_tri(state,word_list):
    # Given state , generate possible next words
    last_word = state[-1]
    last_2_word = state[-2]
    distances = {(word, voc_token):Levenshtein.distance(word, voc_token) for word, voc_token in product(word_list, vocabulary)}
    next_words = [word for (prev_2_word, prev_word, word) in trigram_counter if (prev_word == last_word and prev_2_word == last_2_word)]
    return [state + [next_word] for next_word in next_words], distances

def score_levenshtein_tri(state,distance):
    l1 = 0.4
    l2 = 0.6
    probability = 0.0
    for i in range(2, len(state)):
        prev_2_word, prev_word, word = state[i-2], state[i-1], state[i]
        trigram_prob = (trigram_counter[(prev_2_word, prev_word, word)] + 0.001) / (bigram_counter[(prev_2_word, prev_word)] + 0.001*vocab_size)
        probability += -l1*math.log2(trigram_prob) + l2*math.log2(distance + 0.001)
    return probability

## Trigram spelling corrector example

In [24]:
initial_state = "<s> <s>"
initial_state = initial_state.split()
max_depth = 4
beam_width = 2
best_sequence_1 = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_levenshtein_tri, score_levenshtein_tri,word_list_1st_type)
print("Best sequence:", best_sequence_1[2:])  # Excluding the "<start>" token
max_depth = 6
best_sequence_2 = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_levenshtein_tri, score_levenshtein_tri,word_list_2nd_type)
print("Best sequence:", best_sequence_2[2:])  # Excluding the "<start>" token

Best sequence: ['Not', 'Rated', 'Very', 'light']
Best sequence: ['He', 'is', 'gonna', 'be', 'the', 'best']


## Evaluation

Below we keep part of the sentences of our test set to intentionally make them wrongly written and see how our models correct them. We use average word and character error rates to see how often our modles make a mistake in the word selection.

In [25]:
def filter_by_word_count(texts, max_words):
    filtered_texts = [text for text in texts if len(text) <= max_words]
    return filtered_texts

max_words = 8
filtered_texts = filter_by_word_count(test_set_2, max_words)

In [26]:
import random

error_probability = 0.05

def introduce_errors(sentence, error_probability):
    modified_sentence = []
    for word in sentence:
        if word != "*UNK*":
            modified_word = ""
            for char in word:
                if char != ' ' and random.random() < error_probability:
                    modified_word += random.choice(['a', 'e', 'i', 'o', 'u', 's', 'k', 'r'])
                else:
                    modified_word += char
        else:
            modified_word = "*UNK*"
        modified_sentence.append(modified_word)
    return modified_sentence

tests_set_modified = [introduce_errors(sent, error_probability) for sent in filtered_texts]

In [27]:
for sent in tests_set_modified:
    sentence = ' '.join(sent)
    print(sentence)
    

and loosing for *UNK* / friendship .
FATHER : Right .
HEAD KNIGHT : *UNK* !
ruy # 2 : Who ?
Then you ' re *UNK* .
JACK ePARROW : *UNK* .
I oon ' t eat food .
*UNK* but niie .
*UNK* .
Dad : Did you bring your book ?
*UNK* , we ' re *UNK* .
JACK SPAaROW : Ah .
I don ' t like *UNK* .
[ clang o Bring out your dead !
Get a damn job !
And she had ' em with me !
I mean *UNK* !
They ' d be a *UNK* .
Bare ***(*) *UNK* .
rYU student # 1 : *UNK* *UNK* .
FATHER : Listen , *UNK* .
*UNK* *UNK* u
Burn !
*UNK* !
Tourist guy : Where are the *UNK* ?
*UNK* : And me .
Old woman : " "?
I hate you .
No no !
Burn her !
We ' ve found a witch !
Boyfriend : What are aou looking for ?
Shh !
Hipster chick # 2 k No !
a *UNK* .
Teen girl # 2 : That sucks .
Good *** Veou dark colour .
Ysur dog looks like a *UNK* .
I see your poino .
Bare *** Lovely mature Claret .
*UNK* !
I have *UNK* ID .
I ' m not afraid .
*UNK* .
*a* Very fine *UNK* .
*UNK* modul : I know .
Tsree years ?
That a s 5 years r
I can ' t hakg oua now 

In [28]:
corrected = []
for sent in tests_set_modified:
    max_depth = len(sent)
    initial_state = ['<s>']
    corrected.append(beam_search_decode(initial_state, max_depth, beam_width, generate_candidates_levenshtein_bi, score_levenshtein_bi,sent)[1:])
corrected

[['Guy', ':', 'for', 'the', '/', 'ship', '.'],
 ['Girl', ':', 'Right', '.'],
 ['HEAD', 'KNIGHT', ':', 'No', '!'],
 ['Guy', '#', '2', ':', 'Who', '?'],
 ['Then', 'you', "'", 're', 'not', '.'],
 ['JACK', 'SPARROW', ':', 'No', '.'],
 ['I', 'don', "'", 't', 'eat', 'dog', '.'],
 ['Girl', '#', '1', '.'],
 ['Guy', '.'],
 ['Guy', ':', 'Did', 'you', 'bring', 'your', 'boobs', '?'],
 ['Girl', ',', 'we', "'", 're', 'not', '.'],
 ['JACK', 'SPARROW', ':', 'Ah', '.'],
 ['I', 'don', "'", 't', 'like', 'a', '.'],
 ['[', 'clang', ']', 'Bring', 'out', 'your', 'dead', '!'],
 ['Get', 'a', 'damn', 'hot', '!'],
 ['And', 'she', 'had', 'a', 'new', 'tab', 'is', '!'],
 ['I', 'mean', ',', '!'],
 ['They', "'", 'd', 'be', 'a', 'bit', '.'],
 ['Bare', '***(*)', 'A', '.'],
 ['Guy', '#', '1', ':', 'I', "'", 't', '.'],
 ['Girl', ':', 'Listen', ',', 'I', '.'],
 ['Girl', '#', '1'],
 ['Burn', '!'],
 ['Girl', '#'],
 ['Tourist', 'guy', ':', 'Where', 'are', 'the', 'URL', "'"],
 ['Guy', ':', 'And', 'me', '.'],
 ['Old', 'woman',

In [29]:
import jiwer

total_wer = 0
total_cer = 0
total_sentences = len(filtered_texts)

for original, corrected_sentence in zip(filtered_texts[:len(filtered_texts)], corrected):
    
    original = ' '.join(original)
    corrected_sentence = ' '.join(corrected_sentence)
    
    # Calculate Word Error Rate (WER)
    wer = jiwer.wer(original, corrected_sentence)
    total_wer += wer

    # Calculate Character Error Rate (CER)
    cer = jiwer.cer(original, corrected_sentence)
    total_cer += cer

avg_wer = total_wer / total_sentences
avg_cer = total_cer / total_sentences


print(f"Average Word Error Rate (WER): {avg_wer * 100:.2f}%")
print(f"Average Character Error Rate (CER): {avg_cer * 100:.2f}%")

Average Word Error Rate (WER): 34.13%
Average Character Error Rate (CER): 32.19%
