In [2]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import birkbeck_parser
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

# compare two seqs of phon
def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    # d = table of actual scores
    # e = keeps track of operations
    return d, e

# individual phonemes
def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

# looks to find the smallest edit distance backwards through the matrix
def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

# uses back trace to align phonemes 
# mapping the two diff strings
def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations

# pretty print of alignment table
def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

# end of days# ----

#Decalaration and importation of the structures required for this script
birkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = birkbeck_parser.reverse_dict

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('cmu_results&misspellings.csv') # results from g2p-seq2seq and the birkbeck data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.3:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('cmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("cmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('cmu_no_match.csv', 'w+')
cmu_good_correction = open('cmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('cmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('cmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.4387146091200425
Correct word found : 2190
Guess word not in frequency or CMU dict : 1
Bad correction : 31821
34012
34012
Guess word was in close words 6431 times, ie in 18.907476553082645 percent of the case
New correction percentage 6.439093234541766


In [None]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import birkbeck_parser
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate


#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv("acoustic_similarity.csv", index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this script
birkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = birkbeck_parser.reverse_dict

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('cmu_results&misspellings.csv') # results from g2p-seq2seq and the birkbeck data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes_dist_list=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.7 :
                    close_seq_of_phonemes.append(i)
                    phonemes_dist_list.append([i,dist])
    #Sort close seq of phonemes and keep the 20 best
    if (len(phonemes_dist_list)>20):
        close=[] #future liste avec uniquement les 10 phonemes les plus proches
        sort=sorted(phonemes_dist_list, key=lambda tup: tup[1])
        close_phonemes_list= sort[:20] # keep only the best first elements
        for k in range (0, len(close_phonemes_list)):
            close.append(close_phonemes_list[0]) # only add the phonemes but not the distance
        close_seq_of_phonemes_dict[misspelling]=close
    else :
        close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

print("Retrieved all the close phonemes within an edit distance of 0.7")
# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in list(cmu_dict):
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
print("Retrieved the corresponding words")  
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('cmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("cmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('cmu_no_match.csv', 'w+')
cmu_good_correction = open('cmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('cmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('cmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

In [2]:
nbwords_in_cmu = 0
words_in_freq_dic = 0
frequency = 0
guess_dict = {} # dictionary with only the misspelled word and the guess
close_dict = {} # dictionary with all the close words
not_in_freq_dict = {}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list = [] #store all the close words 
    phonemes_list = close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot = "mot"
    for seq in phonemes_list:
        if seq in list(cmu_dict):
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot = guess
                    max_frequency = frequency 
            else:
                with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling] = words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
print("Retrieved the corresponding words")  
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('cmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck = 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words = close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("cmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('cmu_no_match.csv', 'w+')
cmu_good_correction = open('cmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('cmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words = open('cmu_corrected_words.csv','r')

i = 0
match = 0
no_match = 0
bad_corr = 0

for line in cmu_words:
    i += 1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match += 1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr += 1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match += 1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

Retrieved the corresponding words
correction percentage :  0.5439096815923323
Correct word found : 185
Guess word not in frequency or CMU dict : 1
Bad correction : 33826
34012
34012
Guess word was in close words 263 times, ie in 0.7732337635609914 percent of the case
New correction percentage 0.5439416659316103


In [2]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import birkbeck_parser
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this script
birkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = birkbeck_parser.reverse_dict

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('cmu_results&misspellings.csv') # results from g2p-seq2seq and the birkbeck data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.5:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('cmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("cmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('cmu_no_match.csv', 'w+')
cmu_good_correction = open('cmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('cmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('cmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  7.738217740275777
Correct word found : 5422
Guess word not in frequency or CMU dict : 4
Bad correction : 130622
136048
136048
Guess word was in close words 8363 times, ie in 24.587657660306352 percent of the case
New correction percentage 3.9854752874070156


In [3]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import birkbeck_parser
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this script
birkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = birkbeck_parser.reverse_dict

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('cmu_results&misspellings.csv') # results from g2p-seq2seq and the birkbeck data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.7:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('cmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("cmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('cmu_no_match.csv', 'w+')
cmu_good_correction = open('cmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('cmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('cmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  5.615499955899215
Correct word found : 1910
Guess word not in frequency or CMU dict : 1
Bad correction : 32101
34012
34012
Guess word was in close words 12027 times, ie in 35.360009408167464 percent of the case
New correction percentage 5.61583017259122


In [5]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import birkbeck_parser
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this script
birkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = birkbeck_parser.reverse_dict

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('cmu_results&misspellings.csv') # results from g2p-seq2seq and the birkbeck data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 1:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("cmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('cmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("cmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('cmu_no_match.csv', 'w+')
cmu_good_correction = open('cmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('cmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('cmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  5.697821421221297
Correct word found : 1938
Guess word not in frequency or CMU dict : 1
Bad correction : 32073
34012
34012
Guess word was in close words 12893 times, ie in 37.90609472848617 percent of the case
New correction percentage 5.6981564787862755


In [1]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.5:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  9.70873786407767
Correct word found : 70
Guess word not in frequency or CMU dict : 0
Bad correction : 578
648
648
Guess word was in close words 224 times, ie in 31.06796116504854 percent of the case
New correction percentage 10.802469135802468


In [2]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.7:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.38002773925104
Correct word found : 46
Guess word not in frequency or CMU dict : 0
Bad correction : 602
648
648
Guess word was in close words 258 times, ie in 35.78363384188627 percent of the case
New correction percentage 7.098765432098765


In [4]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.3:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  9.57004160887656
Correct word found : 69
Guess word not in frequency or CMU dict : 0
Bad correction : 579
648
648
Guess word was in close words 177 times, ie in 24.549237170596395 percent of the case
New correction percentage 10.648148148148149


In [5]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.4:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  9.57004160887656
Correct word found : 69
Guess word not in frequency or CMU dict : 0
Bad correction : 579
648
648
Guess word was in close words 179 times, ie in 24.826629680998614 percent of the case
New correction percentage 10.648148148148149


In [6]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.6:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.51872399445215
Correct word found : 116
Guess word not in frequency or CMU dict : 0
Bad correction : 1180
1296
1296
Guess word was in close words 258 times, ie in 35.78363384188627 percent of the case
New correction percentage 8.950617283950617


In [7]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.55:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.657420249653259
Correct word found : 48
Guess word not in frequency or CMU dict : 0
Bad correction : 600
648
648
Guess word was in close words 257 times, ie in 35.64493758668516 percent of the case
New correction percentage 7.4074074074074066


In [8]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.45:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  9.57004160887656
Correct word found : 69
Guess word not in frequency or CMU dict : 0
Bad correction : 579
648
648
Guess word was in close words 179 times, ie in 24.826629680998614 percent of the case
New correction percentage 10.648148148148149


In [10]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.52:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.796116504854369
Correct word found : 118
Guess word not in frequency or CMU dict : 0
Bad correction : 1178
1296
1296
Guess word was in close words 255 times, ie in 35.367545076282944 percent of the case
New correction percentage 9.104938271604938


In [11]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 0.9:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.38002773925104
Correct word found : 46
Guess word not in frequency or CMU dict : 0
Bad correction : 602
648
648
Guess word was in close words 260 times, ie in 36.061026352288486 percent of the case
New correction percentage 7.098765432098765


In [12]:
#CMU with phoneme edit distance

import sys
import stringdist
from string import digits
import hollbrook_structures
import csv
import data_structures
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate

#EMMA'S EDIT DISTANCE FOR PHONEMES
#similarity matrix 
similarities = pd.read_csv(input('matrix name: '), index_col=0)

#cost of insertion/deletion
idc=0.5

def compare(string1, string2):
    r = string1
    h = string2


    d = np.zeros((len(r)+1)*(len(h)+1), dtype=float)
    d = d.reshape((len(r)+1, len(h)+1))

    e = np.zeros((len(r)+1)*(len(h)+1), dtype=object)
    e = e.reshape((len(r)+1, len(h)+1))


    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                if j == 0:
                    d[0][0] = 0
                    e[0][j] = (0,0,0)
                else:
                    d[i][j] = d[i][j-1] + idc
                    e[0][j] = (0,0,1)
            elif j == 0:
                if i == 0:
                    e[i][0] = (0,0,0)
                else:
                    d[i][j] = d[i-1][j] + idc
                    e[i][0] = (1,0,0)

    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
                e[i][j] = (0, 1, 0)                
            else:
                #some random extra stuff in here that doesn't get used - could clean up but works as is
                sub = ((d[i-1][j-1] + (score(r[i-1], h[j-1]))) , (str(e[i-1][j-1]) + 'substitution ' + str(j-1) + ', '))
                dell = (d[i-1][j] +idc, (str(e[i][j-1]) + 'deletion ' + str(j-1) + ' ,'))
                if j >= len(h):
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                else:
                    ins = (d[i][j-1] +idc, (str(e[i-1][j]) + 'insertion ' + str(j) + ' ,'))
                d[i][j] = min(sub, ins, dell)[0]
                e[i][j] = (dell[0]==d[i][j], sub[0]==d[i][j], ins[0]==d[i][j]) * 1    
    return d, e

def score(l1, l2):
    return similarities[str(l1)][str(l2)]

#naive_backtrace and align code taken from the internet - uses the backtrace to find 
#the optimum path and alignment
#https://giov.dev/2016/01/minimum-edit-distance-in-python.html

def naive_backtrace(B_matrix):

    i, j = B_matrix.shape[0]-1, B_matrix.shape[1]-1
    backtrace_idxs = [(i, j)]
    while (i, j) != (0, 0):
        if B_matrix[i,j][1]:
            i, j = i-1, j-1
        elif B_matrix[i,j][0]:
            i, j = i-1, j
        elif B_matrix[i,j][2]:
            i, j = i, j-1
        backtrace_idxs.append((i,j))

    return backtrace_idxs

def align(word_1, word_2, bt):
    
    aligned_word_1 = []
    aligned_word_2 = []
    operations = []

    backtrace = bt[::-1]  # make it a forward trace

    for k in range(len(backtrace) - 1): 
        i_0, j_0 = backtrace[k]
        i_1, j_1 = backtrace[k+1]

        w_1_letter = None
        w_2_letter = None
        op = None

        if i_1 > i_0 and j_1 > j_0:  # either substitution or no-op
            if word_1[i_0] == word_2[j_0]:  # no-op, same symbol
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = " "
            else:  # cost increased: substitution
                w_1_letter = word_1[i_0]
                w_2_letter = word_2[j_0]
                op = "s"
        elif i_0 == i_1:  # insertion
            w_1_letter = " "
            w_2_letter = word_2[j_0]
            op = "i"
        else: #  j_0 == j_1,  deletion
            w_1_letter = word_1[i_0]
            w_2_letter = " "
            op = "d"

        aligned_word_1.append(w_1_letter)
        aligned_word_2.append(w_2_letter)
        operations.append(op)
    
    

    return aligned_word_1, aligned_word_2, operations


def make_table(alignment):
    row1=[]
    row2=[]
    row3=[]
    for i,j,k in alignment:
        row1.append(i)
        row2.append(j)
        row3.append(k)
    table=[row1, row2, row3]
    return table
    
def align_and_score(word1, word2):
    d, e = compare(word1, word2)
    bt = naive_backtrace(e)
    a, b, c =  (align(word1, word2, bt))
    alignment = (list(zip(a,b,c)))
    x,y = (d.shape)
    score = (d[x-1][y-1]) 
    table = make_table(alignment)
    return score

    

#Decalaration and importation of the structures required for this scriptbirkbeck_dict = birkbeck_parser.birkbeck_dict
reverse_dict = hollbrook_structures.hollbrook_reverse

cmu_dict = data_structures.cmu_dict   # cmu_dict[phonemes]=word
cmu_dict2 = data_structures.cmu_dict2  # cmu_dict2[word]= phonemes
cmu_phones = data_structures.cmu_phones # list of sequence of phonemes
frequency_dict = data_structures.frequency_dict #frequency_dict[word]=frequency (which is a STRING not an INT)

f = open('hollbrook_miss_phi.csv') # results from g2p-seq2seq and the hollbrook data set
csv_f = csv.reader(f)
results_dict = {}

#creating dictionary of misspellings and cmu dict phonemes of misspellings
for row in csv_f:
    results_dict[row[0]] = row[1] # results_dict[misspelled word]=sequence of phonemes
    
# 1 - Getting all closes phonemes of the misspelled word

#For each misspelled word, we get a list of sequences of phonemes within an edit distance of 2
#Each sequence of phonemes must start and end with the same phoneme as the missepelled word's sequence of phonemes
close_seq_of_phonemes_dict={}
for misspelling in list(results_dict)[0:]: #results_dict len = 34013
    close_seq_of_phonemes=[]
    phonemes = results_dict.get(misspelling) #phonemes seq associated to the misspelling
    phonemes_list=phonemes.split(" ")
    close_seq_of_phonemes.append(phonemes) #make sure the list of close phonemes also contains the misspelled word's phonemes
    for i in cmu_phones:
        j=i.split(" ") #to compare the phonemes and not each caracter  
        if j[:1]==phonemes_list[:1]:
            if j[-1]==phonemes_list[-1]:
                dist = align_and_score(phonemes_list, j) # Emma's distance for phonemes
                if dist <= 1:
                    close_seq_of_phonemes.append(i)
    close_seq_of_phonemes_dict[misspelling]=close_seq_of_phonemes

# 2 - Getting the corresponding word for the most common sequence of phonemes
nbwords_in_cmu = 0
words_in_freq_dic =0
frequency=0
guess_dict={} # dictionary with only the misspelled word and the guess
close_dict={} # dictionary with all the close words
not_in_freq_dict={}
for misspelling in list(close_seq_of_phonemes_dict):
    words_list=[] #store all the close words 
    phonemes_list=close_seq_of_phonemes_dict.get(misspelling)
    popular_seq = phonemes_list[0] # the first et ie the misspelled word  sequence of phonemes
    max_frequency = 0 #frequency of the word
    popular_mot="mot"
    for seq in phonemes_list:
        if seq in cmu_dict:
            guess = cmu_dict.get(seq)
            words_list.append(guess.lower())
            if guess in frequency_dict:
                frequency = int(frequency_dict.get(guess)) # get the frequency associated to the guessed word
                if (frequency > max_frequency):
                    popular_mot= guess
                    max_frequency = frequency 
            else:
                with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                    Fwriter =csv.writer(csvfile, delimiter=',')
                    Fwriter.writerow([misspelling] + ['frequency'])
        else:
            with open("HBKcmu_no_correction.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([misspelling] + ['cmu'])
                
    close_dict[misspelling]=words_list
    guess_dict[misspelling] = popular_mot
""" if(popular_mot != "mot"): #otherwise words that are either not in CMU nor frequency are added
        guess_dict[misspelling] = popular_mot"""
    
    
    
#dictionary with correct spelling from birkbeck dataset
#get the real spelling from birkbeck data set
cmu_luck_words = open('HBKcmu_luck_words.csv', 'w+')
nb_misspelling = len(guess_dict)
nb_good_correction = 0
nb_luck= 0 # ie nb times real word is in the list of close phonemes

for misspelling in list(guess_dict):
    close_words= close_dict.get(misspelling)
    if misspelling in reverse_dict :
        realword = reverse_dict.get(misspelling).lower()

        if realword in close_words :
            nb_luck += 1
            cmu_luck_words.write(realword +","+ misspelling +"\n")

        guessword = guess_dict.get(misspelling).lower()
        with open("HBKcmu_corrected_words.csv", 'a', newline='') as csvfile:
                writer =csv.writer(csvfile, delimiter=',')
                writer.writerow([realword] + [misspelling] + [guessword])
        #print("The misspelled word was {}, this solution corrected to {}. The correct word was {}".format(misspelling, guessword, realword))
        if(realword==guessword):
            nb_good_correction += 1

percentage_correct = nb_good_correction / nb_misspelling * 100
print("correction percentage : ", percentage_correct)


#GET THE PERCENTAGE OF CORRECTION OVER THE WORDS THAT ARE IN FRQUENCY AND CMU
cmu_no_match = open('HBKcmu_no_match.csv', 'w+')
cmu_good_correction = open('HBKcmu_good_correction.csv', 'w+')
cmu_wrong_correction = open('HBKcmu_wrong_correction.csv', 'w+')

#cmu_words1=open('cmu_corrected_words1.csv','r')
cmu_words=open('HBKcmu_corrected_words.csv','r')

i=0
match=0
no_match=0
bad_corr =0

for line in cmu_words:
    i+=1
    line = line.strip()
    word, misspelling, guess = line.split(',')
    if (misspelling != 'mot') :
        if(word == guess):
            match+=1
            cmu_good_correction.write(word +","+ misspelling +","+ guess +"\n")
        else :
            bad_corr +=1
            cmu_wrong_correction.write(word +","+ misspelling+ ","+ guess +"\n")
    else:
        no_match+=1
        cmu_no_match.write(word +","+ misspelling+ ","+ guess +"\n")
        
print("Correct word found : {}".format(match))
print("Guess word not in frequency or CMU dict : {}".format(no_match))
print("Bad correction : {}".format(bad_corr))
print(match +no_match+bad_corr)
print(i) #lines in cmu_words

luck_percentage = nb_luck/len(close_dict)*100
print("Guess word was in close words {} times, ie in {} percent of the case".format(nb_luck,luck_percentage))

percentage = match / (match + bad_corr)*100
print ("New correction percentage {}".format(percentage))

cmu_words.close()    
cmu_good_correction.close()
cmu_luck_words.close()
cmu_wrong_correction.close()
cmu_no_match.close()

matrix name:  acoustic_similarity.csv


correction percentage :  6.38002773925104
Correct word found : 46
Guess word not in frequency or CMU dict : 0
Bad correction : 602
648
648
Guess word was in close words 264 times, ie in 36.615811373092924 percent of the case
New correction percentage 7.098765432098765
