# Misspelling detection and correction

**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
import sys

sys.path.append('../..')
from collections import defaultdict

import json
import numpy
import pandas
from aips import get_engine
from aips.spark import create_view_from_collection
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("AIPS").getOrCreate()
engine = get_engine()

## Listing 6.13

In [2]:
products_collection = engine.get_collection("products")
query = "moden"
results = products_collection.spell_check(query)
print(results)

{'modes': 421, 'model': 159, 'modern': 139, 'modem': 56, 'mode6': 9}


## Listing 6.14

In [3]:
signals_collection = engine.get_collection("signals")
create_view_from_collection(signals_collection, "signals")

In [10]:
def get_search_queries():
    query = """SELECT searches.user AS user,
               LOWER(TRIM(searches.target)) As query
               FROM signals AS searches WHERE searches.type = 'query'
               GROUP BY searches.target, user"""
    return spark.sql(query).collect()

In [11]:
query_signals = get_search_queries()

## Listing 6.15
### Step 1: Tokenize queries and count word frequencies. 
Check word frequency distribution quantiles. The quantile will help decide cut off point for potential misspellings and corrections. 

In [5]:
from nltk import tokenize, corpus, download
download('stopwords')
stop_words = set(corpus.stopwords.words("english"))

def is_term_valid(term, minimum_length=4):
    return (term not in stop_words and #drop stopwords
            len(term) >= minimum_length and #only consider token length > 3, since hard to judge whether a very short token is misspelled or not
            not term.isdigit())  # drop digit only tokens

def tokenize_query(query):
    return tokenize.RegexpTokenizer(r'\w+').tokenize(query)

def valid_keyword_occurrences(searches, tokenize=True):
    word_list = defaultdict(int)
    for search in searches:
        query = search["query"]
        terms = tokenize_query(query) if tokenize else [query]
        for term in terms:
            if is_term_valid(term):
                word_list[term] += 1
    return word_list

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Listing 6.16

In [6]:
def calculate_quantiles(word_list):
    quantiles_to_check = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    quantile_values = numpy.quantile(numpy.array(list(word_list.values())),
                                     quantiles_to_check)
    return dict(zip(quantiles_to_check, quantile_values))

In [15]:
query_signals = get_search_queries()
word_list = valid_keyword_occurrences(query_signals, tokenize=True)
quantiles = calculate_quantiles(word_list)
display(quantiles)

{0.1: 5.0,
 0.2: 6.0,
 0.3: 8.0,
 0.4: 12.0,
 0.5: 16.0,
 0.6: 25.0,
 0.7: 47.0,
 0.8: 142.20000000000027,
 0.9: 333.2000000000007}

## Listing 6.17
### Step 2: compute metadata needed for word matching. 
consider word with low count as misspelling condidates, with high count as correctly spelled candidates. 

In [8]:
def create_spelling_candidates(word_list):
    quantiles = calculate_quantiles(word_list)
    misspellings = {"misspelling": [], 
                    "misspell_counts": [], 
                    "misspell_length": [],
                    "initial": []}
    corrections = {"correction": [], 
                   "correction_counts": [], 
                   "correction_length": [],
                   "initial": []}
    for key, value in word_list.items():
        if value <= quantiles[0.2]: #if value == 1:  # this number based on quantile analysis and the data set, more-likely with user-behvaiour data set to be 1
            misspellings["misspelling"].append(key)
            misspellings["misspell_counts"].append(value)
            misspellings["misspell_length"].append(len(key))
            misspellings["initial"].append(key[0])
        if value >= quantiles[0.8]:
            corrections["correction"].append(key)
            corrections["correction_counts"].append(value)
            corrections["correction_length"].append(len(key))
            corrections["initial"].append(key[0])
    return (pandas.DataFrame(misspellings), pandas.DataFrame(corrections))

## Lsting 6.18
### Step 3: Find potential matches 
based on edit distance and whether word initial is the same or not. 

In [16]:
def good_match(word_length_1, word_length_2, edit_dist): #allow longer words have more edit distance
    min_length = min(word_length_1, word_length_2)
    return ((min_length < 8 and edit_dist == 1) or
            (min_length >= 8 and min_length < 11 and edit_dist <= 2) or
            (min_length >= 11 and edit_dist == 3))

## Listing 6.19

### Step 4: rank potential matched corrections 
based on edit distance and correction word frequency. shorter edit distance and higher word count will be prefered. only the top one correction is selected for final matching. 

In [17]:
from nltk import edit_distance

def calculate_spelling_corrections(word_list):
    (misspellings, corrections) = create_spelling_candidates(word_list)
    #Optomization: join each list based on whether they share the same initials
    matches_candidates = pandas.merge(misspellings,
                         corrections, on="initial")    
    matches_candidates["edit_dist"] = matches_candidates.apply(
        lambda row: edit_distance(row.misspelling,
                                  row.correction), axis=1)
    matches_candidates["good_match"] = matches_candidates.apply(
        lambda row: good_match(row.misspell_length,
                               row.correction_length,
                               row.edit_dist),axis=1)
    
    cols = ["misspelling", "correction", "misspell_counts",
            "correction_counts", "edit_dist"]
    matches = matches_candidates[matches_candidates["good_match"]] \
                  .drop(["initial", "good_match"],axis=1) \
                  .groupby("misspelling").first().reset_index() \
                  .sort_values(by=["correction_counts", "misspelling"],
                               ascending=[False, True])[cols]
    return matches

In [42]:
query_signals = get_search_queries()
word_list = valid_keyword_occurrences(query_signals, tokenize=False)
corrections = calculate_spelling_corrections(word_list)
display(corrections.head(20))

Unnamed: 0,misspelling,correction,misspell_counts,correction_counts,edit_dist
181,ipad.,ipad,6,7749,1
153,hp tochpad,hp touchpad,6,7144,1
154,hp touchpad 32,hp touchpad,5,7144,3
155,hp toucpad,hp touchpad,6,7144,1
190,iphone s4,iphone 4s,5,4642,2
193,iphone4 s,iphone 4s,5,4642,2
194,iphones 4s,iphone 4s,5,4642,1
406,tochpad,touchpad,6,4019,1
407,toichpad,touchpad,6,4019,1
412,touchpaf,touchpad,5,4019,1


## Listing 6.20

In [43]:
query_signals = get_search_queries()
word_list = valid_keyword_occurrences(query_signals, tokenize=False)
corrections = calculate_spelling_corrections(word_list)
display(corrections.head(20))

Unnamed: 0,misspelling,correction,misspell_counts,correction_counts,edit_dist
181,ipad.,ipad,6,7749,1
153,hp tochpad,hp touchpad,6,7144,1
154,hp touchpad 32,hp touchpad,5,7144,3
155,hp toucpad,hp touchpad,6,7144,1
190,iphone s4,iphone 4s,5,4642,2
193,iphone4 s,iphone 4s,5,4642,2
194,iphones 4s,iphone 4s,5,4642,1
406,tochpad,touchpad,6,4019,1
407,toichpad,touchpad,6,4019,1
412,touchpaf,touchpad,5,4019,1



Up next: Chapter 7 - [Interpreting Query Intent through Semantic Search](../ch07/1.index-datasets.ipynb)