# S-capade Single Phoneme Sequence Correction Demo
This file is a quick implementation of the S-capade method, which uses acoustic edit distance and phoneme sequence representations of misspellings to attempt corrections. To generate the phoneme sequence representation of the misspellings, the CMU dictionary is used. To generate candidate lists using symmetric deletes (for speed), an adaptation of SymSpell/SymSpellPy is used. This is a demo only, adapted from the core code located in text_processing.dataset_processing.symspell_scapade. Usually, you would pass entire word lists to the tool, in the format:  

\\$accommodate  
accomodate  
acommadate  
\\$accord  
acord  
\\$acquaintance  
aquantance  

Here, \\$ marks a target correction, and every line below it, up to the next \\$, is a misspelling of that target word. 

In [1]:
from text_processing.dataset_processing import symspell_scapade
from pathlib import Path
from symspellpy_scapade import symspellscapade
import pkg_resources
import pandas as pd

In [2]:
# Build the S-capade lookup object a single time up front: this is the slowest
# step of the demo, and the resulting `scapade` instance is reused by every
# later cell.
SymSpell, Verbosity = symspellscapade.SymSpell, symspellscapade.Verbosity

# Locate the frequency-annotated CMU dictionary shipped inside the package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy_scapade",
    "cmu_frequency_added.csv",
)

scapade = SymSpell(max_dictionary_edit_distance=2, prefix_length=15)
# Terms are read from column index 1 and their counts from column index 2.
scapade.load_dictionary(dictionary_path, term_index=1, count_index=2)

True

In [3]:
# Rough implementation of the function in text_processing.dataset_processing.symspell_scapade.
# The function in that module is used for processing word lists from datasets (the five used in the paper).
# This is a quick implementation of it to demonstrate how it works with single phoneme-sequence misspellings.
# You can generate phoneme sequences to pass to it using http://www.speech.cs.cmu.edu/cgi-bin/cmudict, or
# you can write a shell script to work beside this example so that you only need to pass word misspellings in
# and get corrections out.
def correction(scapade, phoneme_sequence, frequency_df=None, verbosity=None):
    """Suggest a correction and candidate words for one phoneme sequence.

    Args:
        scapade: Object exposing ``lookup(phrase, verbosity)`` and returning
            an iterable of suggestions. ``str(suggestion)`` is expected to
            begin with the suggested phoneme sequence followed by a comma
            (NOTE(review): confirm against the suggestion class in use).
        phoneme_sequence: Phoneme sequence of the misspelling; it is also
            the single key of the returned dictionary.
        frequency_df: Optional DataFrame with ``word``, ``seq`` and ``count``
            columns. When omitted, the table is read from
            ``input_files/cmu_frequency_added.csv`` on every call, so callers
            issuing many lookups should load it once and pass it in.
        verbosity: Optional verbosity forwarded to ``scapade.lookup``.
            Defaults to the module-level ``Verbosity.ALL``.

    Returns:
        ``{phoneme_sequence: {"suggested_correction": str, "candidates": list}}``.
        ``suggested_correction`` stays ``""`` and ``candidates`` stays empty
        when nothing usable is found.
    """
    misspelling = phoneme_sequence
    if verbosity is None:
        verbosity = Verbosity.ALL
    suggestions = scapade.lookup(misspelling, verbosity)

    if frequency_df is None:
        input_path_csv = Path("input_files/") / "cmu_frequency_added.csv"
        frequency_df = pd.read_csv(input_path_csv, names=['word', 'seq', 'count'])

    entry = {"suggested_correction": "", "candidates": []}

    for suggestion in suggestions:
        # Stop consulting further suggestions once 10 candidates are held.
        # Each suggestion may add up to 5 words, so the final list can end
        # up slightly longer than 10.
        if len(entry["candidates"]) >= 10:
            break

        current_seq = str(suggestion).split(',')[0]
        matches = frequency_df[frequency_df['seq'] == current_seq].sort_values(
            by=['count'], ascending=False
        )
        # A suggested sequence with no row in the frequency table has nothing
        # to contribute; skipping it avoids an IndexError on ``iloc[0]``.
        if matches.empty:
            continue

        # The most frequent word of the first usable sequence becomes the
        # suggested correction.
        if entry["suggested_correction"] == "" and matches.iloc[0]['count'] >= 1:
            entry["suggested_correction"] = matches.iloc[0]['word']

        # Candidates are only collected once a correction has been chosen,
        # and only from words whose count is greater than 1.
        if entry["suggested_correction"] != "":
            frequent = matches[matches['count'] > 1]
            entry["candidates"] += list(frequent[:5]['word'])

    return {misspelling: entry}

In [4]:
# Target "anybody", misspelt "enybody"; phoneme sequence: EH N IY B AH D IY
enybody_result = correction(scapade, "EH N IY B AH D IY")
print(enybody_result)

{'EH N IY B AH D IY': {'suggested_correction': 'anybody', 'candidates': ['anybody', 'nobody', 'enabled']}}


In [5]:
# Target "example", misspelt "egsample"; phoneme sequence: EH G S AE M P AH L
egsample_result = correction(scapade, "EH G S AE M P AH L")
print(egsample_result)

{'EH G S AE M P AH L': {'suggested_correction': 'example', 'candidates': ['example', 'sample']}}


In [6]:
# Target "necessarily"; input phoneme sequence: N EH S AH K EH R IY AH L IY
necessarily_result = correction(scapade, "N EH S AH K EH R IY AH L IY")
print(necessarily_result)

{'N EH S AH K EH R IY AH L IY': {'suggested_correction': 'necessarily', 'candidates': ['necessarily']}}
