# Misspelling detection and correction

**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook.

In [15]:
import sys

sys.path.append('..')
from collections import defaultdict

import numpy
import pandas
from aips import get_engine
from aips.spark import create_view_from_collection
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("AIPS").getOrCreate()
engine = get_engine()

## Listing 6.13

In [2]:
products_collection = engine.get_collection("products")
query = "moden"
products_collection.spell_check(query, log=True)

Solr spellcheck basic request syntax: 
{
  "query": "moden",
  "params": {
    "q.op": "and",
    "indent": "on"
  }
}


{'modes': 421, 'model': 159, 'modern': 139, 'modem': 56, 'mode6': 9}

In [3]:
#use the real signals
signals_collection = engine.get_collection("signals")
create_view_from_collection(signals_collection, "signals")

## Listing 6.14

In [4]:
### Create user-searchs table each raw represent one search query.
def get_search_queries():
    query = """SELECT searches.user AS user,
               LOWER(TRIM(searches.target)) As keyword
               FROM signals AS searches WHERE searches.type = 'query'
               GROUP BY keyword, user"""
    return spark.sql(query).collect()

In [5]:
query_signals = get_search_queries()

## Listing 6.15
### Step 1: Tokenize queries and count word frequencies. 
Check word frequency distribution quantiles. The quantile will help decide cut off point for potential misspellings and corrections. 

In [6]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def is_word_valid(word):
    return (word not in stop_words and #drop stopwords
            len(word) > 3 and #only consider token length > 3, since hard to judge whether a very short token is misspelled or not
            not word.isdigit())  # drop digit only tokens
        
def filter_valid_words(queries, tokenize=True):    
    word_list = defaultdict(int)
    for row in queries:
        query = row["keyword"]
        if tokenize:
            tokenizer = RegexpTokenizer(r'\w+')
            tokens = tokenizer.tokenize(query)
        else:
            tokens = [query]
        for token in tokens:
            if is_word_valid(token):
                word_list[token] += 1
    return word_list


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Listing 6.16

In [7]:
def calculate_quantiles(word_list):
    quantiles_to_check = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    quantile_values = numpy.quantile(numpy.array(list(word_list.values())),
                                  quantiles_to_check)
    return dict(zip(quantiles_to_check, quantile_values))

In [8]:
word_list = filter_valid_words(query_signals, tokenize=True)
calculate_quantiles(word_list)

{0.1: 5.0,
 0.2: 6.0,
 0.3: 8.0,
 0.4: 12.0,
 0.5: 16.0,
 0.6: 25.0,
 0.7: 47.0,
 0.8: 142.20000000000027,
 0.9: 333.2000000000007}

## Listing 6.17
### Step 2: compute metadata needed for word matching. 
consider word with low count as misspelling condidates, with high count as correctly spelled candidates. 

In [9]:
def create_spelling_candidates(word_list):
    quantiles = calculate_quantiles(word_list)
    misspellings = {"misspell": [], 
                   "misspell_counts": [], 
                   "misspell_length": [],
                   "initial": []}
    corrections = {"correction": [], 
                   "correction_counts": [], 
                   "correction_length": [],
                   "initial": []}
    for key, value in word_list.items():
        if value <= quantiles[0.2]: #if value == 1:  # this number based on quantile analysis and the data set, more-likely with user-behvaiour data set to be 1
            misspellings["misspell"].append(key)
            misspellings["misspell_counts"].append(value)
            misspellings["misspell_length"].append(len(key))
            misspellings["initial"].append(key[0])
        if value >= quantiles[0.8]:
            corrections["correction"].append(key)
            corrections["correction_counts"].append(value)
            corrections["correction_length"].append(len(key))
            corrections["initial"].append(key[0])
    return (pandas.DataFrame(misspellings), pandas.DataFrame(corrections))

## Lsting 6.18

In [10]:
def display_spellcheck_data(misspell_candidates_df, correction_candidates_df):
    misspellings = misspell_candidates_df.sort_values(by=["misspell_counts", "misspell"], ascending=[False, False]).head(10)
    print(misspellings.head(5), "\n")
    corrections = correction_candidates_df.sort_values(by=["correction_counts", "correction"], ascending=[False, True]).head(10)
    print(corrections.head(5))

## Lsting 6.19
### Step 3: Find potential matches 
based on edit distance and whether word initial is the same or not. 

In [11]:
def good_match(len1, len2, edit_dist): #allow longer words have more edit distance
    match = 0
    min_length = min(len1, len2)
    if min_length < 8:
        if edit_dist == 1: match = 1
    elif min_length < 11:
        if edit_dist <= 2: match = 1
    else:
        if edit_dist == 3: match = 1
    return match

## Listing 6.20

### Step 4: rank potential matched corrections 
based on edit distance and correction word frequency. shorter edit distance and higher word count will be prefered. only the top one correction is selected for final matching. 

In [12]:
def calculate_spelling_corrections(word_list):
    (mispellings, corrections) = create_spelling_candidates(word_list)
    #Optomization: join each list based on whether they share the same initials
    matches_candidates = pandas.merge(mispellings, corrections, on="initial")    
    matches_candidates["edit_dist"] = matches_candidates.apply(
        lambda row: nltk.edit_distance(row.misspell,
                                       row.correction), axis=1)
    matches_candidates["good_match"] = matches_candidates.apply(
        lambda row: good_match(row.misspell_length, row.correction_length,
                               row.edit_dist),axis=1)
    
    cols = ["misspell", "correction", "misspell_counts", "correction_counts", "edit_dist"]
    matches = matches_candidates[matches_candidates["good_match"] == 1] \
                  .drop(["initial", "good_match"],axis=1) \
                  .groupby("misspell").first().reset_index() \
                  .sort_values(by=["correction_counts", "misspell"],
                               ascending=[False, True])[cols]
    return matches

In [16]:
word_list = filter_valid_words(query_signals, tokenize=True)
calculate_spelling_corrections(word_list).head(20)

Unnamed: 0,misspell,correction,misspell_counts,correction_counts,edit_dist
50,iphone3,iphone,6,16854,1
62,latop,laptop,5,14119,1
136,toucpad,touchpad,6,11550,1
137,touxhpad,touchpad,5,11550,1
148,wirless,wireless,6,10060,1
127,tableta,tablet,6,8260,1
8,cage,case,6,7541,1
10,cape,case,5,7541,1
30,gallaxy,galaxy,6,5839,1
61,laptopa,laptops,6,5565,1


## Listing 6.20

In [17]:
word_list = filter_valid_words(query_signals, tokenize=False)
calculate_spelling_corrections(word_list).head(20)

Unnamed: 0,misspell,correction,misspell_counts,correction_counts,edit_dist
181,ipad.,ipad,6,7749,1
153,hp tochpad,hp touchpad,6,7144,1
154,hp touchpad 32,hp touchpad,5,7144,3
155,hp toucpad,hp touchpad,6,7144,1
190,iphone s4,iphone 4s,5,4642,2
193,iphone4 s,iphone 4s,5,4642,2
194,iphones 4s,iphone 4s,5,4642,1
406,tochpad,touchpad,6,4019,1
407,toichpad,touchpad,6,4019,1
412,touchpaf,touchpad,5,4019,1



Up next: Chapter 7 - [Interpreting Query Intent through Semantic Search](../ch07/1.index-datasets.ipynb)