# Spell check old congressional data

The core spell-checking code for doing this is in `spelling.py`.

In [None]:
from collections import defaultdict
import time, re
import pandas as pd

# This imports spelling.py, a local file that contains the code for running the spell checkers
import spelling   

SOURCE_DATA_FILE = '/mnt/data/created_data/merged-im-tvdD.csv' # Source speeches file that we will spell check
TARGET_DATA_FILE = 'merged-im-tvdD-spellchecked.csv'           # Dataframe with corrected speech will be here
REPORT_FILE      = 'spellcheck-report.csv'                     # Report of most common typos will be here

# Create a personal dictionary of terms that we should not correct
personal_dictionary = ['moslem',
                       'unamerican',
                       'cooly',
                       'coolie',
                       'linotype',
                       'denaturized',
                       'bracero',
                       'braceros',
                       'deportable',
                       'revisory',
                      ]
# Replacements handed to the checker as `personal_subs` (misspelling -> replacement)
personal_substitutions = {'lawabiding': 'law-abiding',}

# Regular expression that splits a word into (leading punctuation, core word, trailing punctuation).
# It only matches when the core is one unbroken run of word characters, so words with
# interior punctuation (e.g. "don't", "well-known") do not match at all.
# Written as a raw string so that `\w` is a regex class rather than an invalid string escape.
match_punct = re.compile(r"^([^\w]*)(\w*)([^\w]*)$")

# We define some useful regular expressions for cleaning the data.
# Each entry is (compiled pattern, replacement text); they are applied in this order.
replacement_re = [
  (re.compile(r'[^\x00-\x7F]'), ' '),                  # non-ascii characters replaced by spaces
  (re.compile(r'(?<=[a-z])\.(?=[a-z])'), ' '),         # periods in the middle of words replaced by spaces
  (re.compile(r'(?<=[a-z])\."? (?=[^A-Z])'), ', '),    # period (+ optional quote) and space not followed by a capital replaced by ', '
  (re.compile(r'(?<=[0-9])\.(?=[0-9]{3}[^0-9])'), ','),# periods in numbers, like 232.000.000, replaced by commas
  # The following regular expressions find and replace long numbers and dollar figures with special flags.
  # The look-behind / look-ahead each require a real delimiting character, so a figure at the
  # very start or very end of the text is not flagged.
  (re.compile(r'(?<=[^A-Za-z0-9,])\$[0-9,]{4,}(?=[^A-Za-z0-9,])'), 'XXXBIGDOLLARNUMBERXXX'),
  (re.compile(r'(?<=[^A-Za-z0-9,])\$[0-9,]{1,}\.[0-9]{2}(?=[^A-Za-z0-9,])'), 'XXXSMALLDOLLARNUMBERXXX'),
  (re.compile(r'(?<=[^A-Za-z0-9,])[0-9,]{4,}(?=[^A-Za-z0-9,])'), 'XXXNUMBERXXX'),
]

def clean_speech(speech):
    """Normalise a raw speech string before it is split into words.

    Steps, in order:
      1. underscores become spaces and runs of dashes collapse to a single dash;
      2. honorifics followed by a period ('mr.', 'miss.', 'mrs.', 'ms.', 'dr.',
         any letter case) become flags such as 'XXXNAMEMRXXX';
      3. every (pattern, replacement) pair in the module-level `replacement_re`
         list is applied in list order.
    Runs of spaces are collapsed and the text is stripped after steps 2 and
    after each substitution in step 3.

    Parameters:
        speech: the speech text (str).

    Returns:
        The cleaned text, with single spaces between words and no leading or
        trailing whitespace.
    """
    def remove_multiple_spaces(s):
        # Collapse any run of spaces to one space, then trim both ends
        while "  " in s:
            s = s.replace("  ", " ")
        return s.strip()

    speech = speech.replace('_', ' ')

    while "--" in speech:
        speech = speech.replace("--", "-")

    # Detect things like 'mr.' and replace them with special flags.
    # The patterns are applied one after another, in this order.
    for w in ('mr', 'miss', 'mrs', 'ms', 'dr'):
        # Both pattern pieces are raw strings: '\.' in a normal string is an invalid escape
        speech = re.sub(r'\b' + w + r'\.', 'XXXNAME' + w.upper() + 'XXX', speech, flags=re.IGNORECASE)

    speech = remove_multiple_spaces(speech)

    for current_re, replacement in replacement_re:
        # A substitution can leave doubled spaces behind, so re-normalise every time
        speech = remove_multiple_spaces(current_re.sub(replacement, speech))

    return speech

def correct_speech(speech):
    """Clean a speech and replace misspelled words with the checker's correction.

    Relies on module-level names: `clean_speech`, `match_punct`, the spell
    checker object `checker` (its `check` and `correct` methods are called)
    and the `typo_counter` mapping, whose count is incremented for every word
    the checker rejects.

    Words are left untouched when they are:
      * capitalized and not preceded by a word ending in a period,
      * special flags (starting with 'XXX'),
      * directly preceded by an honorific flag ('XXXNAME...'), i.e. a person's name,
      * entirely upper case,
      * made only of punctuation, or accepted by `checker.check`.

    Parameters:
        speech: the speech text. Anything that is not a str (for example a
            missing value) is returned unchanged.

    Returns:
        The cleaned speech with corrections applied, words joined by single spaces.
    """
    if not isinstance(speech, str):
        # Nothing to spell check; pass the value through instead of crashing the whole run
        return speech

    words = clean_speech(speech).split(" ")
    new_words = []

    for ndx, word in enumerate(words):
        if not word:
            # Only happens when the cleaned speech is empty; avoids indexing word[0]
            new_words.append(word)
            continue

        # '' for the first word, so neither check below can wrap around to words[-1]
        prev_word = words[ndx - 1] if ndx > 0 else ''

        # capitalized indicates if word is capitalized and is not preceded by end of sentence
        capitalized = word[0].isupper() and not prev_word.endswith('.')
        if (capitalized or                          # Do not correct capitalized words
            word.startswith('XXX') or               #  or special flag marker
            prev_word.startswith('XXXNAME') or      #  or name of a person (also when the honorific is the first word)
            word.isupper()):                        #  or all upper case words
            new_words.append(word)
            continue

        # Strip out initial and final punctuation around word.
        # When the word has interior punctuation the pattern does not match and the
        # whole word is checked as-is.
        parts = match_punct.match(word)
        cleaned_word = parts.group(2) if parts else word

        if len(cleaned_word) == 0 or checker.check(cleaned_word):
            # word is all punctuation, or it is spelled correctly
            new_words.append(word)
            continue

        # OK, this really looks like a typo. Call checker.correct to get a corrected version
        typo_counter[cleaned_word] += 1
        corrected_word = checker.correct(cleaned_word)
        if parts and corrected_word != cleaned_word:
            # Re-attach the original surrounding punctuation. Plain concatenation
            # means backslashes in the correction need no escaping.
            new_words.append(parts.group(1) + corrected_word + parts.group(3))
        else:
            # Either no different correction was returned, or the word could not be
            # split into punctuation/core/punctuation: keep it unchanged
            new_words.append(word)

    return " ".join(new_words)

In [None]:
# Load speeches data: SOURCE_DATA_FILE is read as a tab-separated file (despite the .csv name).
# Later cells read and update its 'speech' column.
df=pd.read_csv(SOURCE_DATA_FILE, sep='\t')

In [None]:
# Create spell checking object
checker = spelling.HunspellChecker(personal_dict=personal_dictionary,
                                   personal_subs=personal_substitutions)

typo_counter = defaultdict(int) # This keeps track of # of each typo
report_every = 100              # how often to print statistics
lasttime     = time.time()

# Collect the corrected speeches in order and assign the whole column at the end.
# Writing to the `row` object yielded by df.iterrows() is not reliable: pandas may hand
# back a copy, in which case the corrections never reach `df`.
corrected_speeches = []
for position, speech in enumerate(df['speech']):
    # Use the positional counter (not the index label) so progress reporting works
    # whatever the dataframe index looks like
    if position % report_every == report_every-1:
        print('On row %6d, average time=%0.2f' % (position+1, (time.time() - lasttime)/report_every))
        lasttime = time.time()
    corrected_speeches.append(correct_speech(speech))

df['speech'] = corrected_speeches


In [None]:
# Write the typo report to REPORT_FILE (tab-separated) and echo each line to the output.
# One line per typo, most frequent first: the typo, the entry stored for it in the
# checker's cache ('-' when there is none), and how many times the typo occurred.
with open(REPORT_FILE, 'w') as report:
    report.write('Typoword\tCorrectedWord\tCount\n')
    by_frequency = sorted(typo_counter.items(), key=lambda item: -item[1])
    for typo, occurrences in by_frequency:
        fields = [typo, checker._cache.get(typo, '-'), str(occurrences)]
        line = '\t'.join(fields)
        report.write(line + '\n')
        print(line)


In [None]:
# Save speeches dataframe with updated (spell check corrected) speeches as a
# tab-separated file at TARGET_DATA_FILE.
# NOTE(review): `index` is left at its default, so the dataframe index is presumably
# written as an extra leading column that the source file did not have — confirm that
# downstream readers expect it.
df.to_csv(TARGET_DATA_FILE, sep='\t')
