# Misspelling detection and correction

**NOTE**: This notebook depends upon the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb) notebook.

In [None]:
import sys
sys.path.append('..')
import json
from aips import *
import pandas as pd
import nltk
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re
# Fetch the NLTK stopword corpus; it is used later to filter query tokens.
nltk.download('stopwords')
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start one named for this chapter.
spark = SparkSession.builder.appName("aips-ch6").getOrCreate()

## Listing 6.12

In [2]:
# Post a deliberately misspelled query to the collection's /spell endpoint
# and print the collations returned in the spellcheck section of the response.
collection = "products"
query = "moden"

spellcheck_params = {"q.op": "and", "rows": 0, "indent": "on"}
request = {"params": spellcheck_params, "query": query}

response = requests.post(f"{SOLR_URL}/{collection}/spell", json=request)
search_results = response.json()
print(json.dumps(search_results["spellcheck"]["collations"], indent=4))

[]


In [3]:
# Load the real signals collection into Spark and register it as the "signals" SQL view.
signals_collection = "signals"
signals_opts = dict(zkhost="aips-zk", collection=signals_collection)
signals_reader = spark.read.format("solr").options(**signals_opts)
df = signals_reader.load()
df.createOrReplaceTempView("signals")

## Listing 6.13

In [4]:
### Build the user-searches table: each row represents one distinct (keyword, user) search query.
# Keywords are lower-cased and trimmed so that case/whitespace variants of a query collapse together;
# collect() brings the resulting rows back to the driver as a Python list.
query_signals = spark.sql("""
  select lower(trim(searches.target)) as keyword, searches.user as user 
  from signals as searches where searches.type='query'
  group by keyword, user"""
).collect()

### Step 1: Tokenize queries and count word frequencies. 
Check the quantiles of the word-frequency distribution. The quantiles help decide the cut-off points for potential misspellings and corrections. 

## Listing 6.14

In [5]:
# Count how often each meaningful token occurs across all (keyword, user) query rows.
stop_words = set(stopwords.words('english'))
word_list = defaultdict(int)
# Build the tokenizer once: it is loop-invariant, so re-creating it for every query is wasted work.
tokenizer = RegexpTokenizer(r'\w+')

for row in query_signals:
    query = row["keyword"]
    tokens = tokenizer.tokenize(query)

    for token in tokens:
        # Drop stopwords and digit-only tokens, and only consider tokens longer than 3 characters,
        # since it is hard to judge whether a very short token is misspelled or not.
        if token not in stop_words and len(token) > 3 and not token.isdigit():
            word_list[token] += 1

## Listing 6.15

In [6]:
# Deciles (10%..90%) of the word-count distribution, keyed by quantile, for choosing count cut-offs.
quantiles_to_check = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
word_counts = np.array(list(word_list.values()))
quantile_values = np.quantile(word_counts, quantiles_to_check)
quantiles = {level: value for level, value in zip(quantiles_to_check, quantile_values)}
quantiles

{0.1: 5.0,
 0.2: 6.0,
 0.3: 8.0,
 0.4: 12.0,
 0.5: 16.0,
 0.6: 25.0,
 0.7: 47.0,
 0.8: 142.20000000000027,
 0.9: 333.2000000000007}

### Step 2: compute metadata needed for word matching. 
Consider words with a low count as misspelling candidates, and words with a high count as correctly spelled candidates. 

## Listing 6.16

In [7]:
# Split the vocabulary into rare words (misspelling candidates) and frequent words
# (correction candidates). The cut-offs come from the quantile analysis and depend on the
# data set; with a real user-behaviour data set the low cut-off is more likely to be 1.
low_count_cutoff = quantiles[0.2]
high_count_cutoff = quantiles[0.8]

rare_words = [(word, count) for word, count in word_list.items() if count <= low_count_cutoff]
frequent_words = [(word, count) for word, count in word_list.items() if count >= high_count_cutoff]

# Parallel lists (word, count, length, first letter) for the misspelling candidates.
misspell_candidates = [word for word, _ in rare_words]
misspell_counts = [count for _, count in rare_words]
misspell_length = [len(word) for word, _ in rare_words]
misspell_initial = [word[0] for word, _ in rare_words]

# The same parallel lists for the correction candidates.
correction_candidates = [word for word, _ in frequent_words]
correction_counts = [count for _, count in frequent_words]
correction_length = [len(word) for word, _ in frequent_words]
correction_initial = [word[0] for word, _ in frequent_words]

## Listing 6.17

In [8]:
# Assemble the parallel candidate lists into two DataFrames that share an "initial" column,
# which is the key used to join misspellings with corrections.
misspell_columns = {
    "misspell": misspell_candidates,
    "misspell_counts": misspell_counts,
    "misspell_length": misspell_length,
    "initial": misspell_initial,
}
correction_columns = {
    "correction": correction_candidates,
    "correction_counts": correction_counts,
    "correction_length": correction_length,
    "initial": correction_initial,
}

misspell_candidates_df = pd.DataFrame(misspell_columns)
correction_candidates_df = pd.DataFrame(correction_columns)

In [9]:
# Show results: preview the first ten misspelling candidates.
misspell_candidates_df.head(10)

Unnamed: 0,misspell,misspell_counts,misspell_length,initial
0,carolina,5,8,c
1,luther,6,6,l
2,vandross,6,8,v
3,ldf6920st,6,9,l
4,nintendogs,6,10,n
5,gangs,5,5,g
6,york,5,4,y
7,mute,6,4,m
8,math,6,4,m
9,multimedia,5,10,m


### Step 3: Find potential matches 
based on edit distance and whether word initial is the same or not. 

## Listing 6.18

In [10]:
def good_match(len1, len2, edit_dist):
    """Decide whether two words are close enough to be a misspelling/correction pair.

    Longer words are allowed a larger edit distance; the shorter of the two
    lengths selects the tolerance.

    Args:
        len1: Length of the first word.
        len2: Length of the second word.
        edit_dist: Edit distance between the two words.

    Returns:
        1 if the pair qualifies as a match, otherwise 0.
    """
    min_length = min(len1, len2)
    if min_length < 8:
        is_match = edit_dist == 1
    elif min_length < 11:
        is_match = edit_dist <= 2
    else:
        # Previously only an edit distance of exactly 3 matched here, so long words
        # that were off by just one or two characters were rejected.
        is_match = edit_dist <= 3
    return int(is_match)

## Listing 6.19

In [11]:
# Join the misspelling list with the correction list on their shared initial letter,
# which reduces the number of pairs to compare and therefore the matching time.
matches_candidates = pd.merge(misspell_candidates_df, correction_candidates_df, on="initial")

# Iterate over the needed columns directly instead of using a row-wise apply: this avoids
# building a Series per row and still works when the join produces no rows.
matches_candidates["edit_dist"] = [
    nltk.edit_distance(misspelled, corrected)
    for misspelled, corrected in zip(matches_candidates["misspell"], matches_candidates["correction"])
]
matches_candidates["good_match"] = [
    good_match(m_len, c_len, dist)
    for m_len, c_len, dist in zip(
        matches_candidates["misspell_length"],
        matches_candidates["correction_length"],
        matches_candidates["edit_dist"],
    )
]

In [12]:
# Keep only the pairs flagged as good matches and drop the helper columns used for joining/filtering.
is_good_match = matches_candidates["good_match"] == 1
matches = matches_candidates[is_good_match].drop(columns=["initial", "good_match"])

### Step 4: rank potential matched corrections 
Rank candidates by edit distance and correction word frequency: a shorter edit distance and a higher word count are preferred. Only the top-ranked correction is selected for the final match. 

In [13]:
# Rank each misspelling's candidate corrections so that the smallest edit distance, and then the
# highest correction frequency, comes first. groupby keeps that row order within each group,
# so first() selects the top-ranked correction for every misspelling.
ranked_matches = matches.sort_values(by=['misspell', 'edit_dist', 'correction_counts'], ascending=[True, True, False])
matches_final = ranked_matches.groupby('misspell').first().reset_index()

In [14]:
# Show results: the 20 matches whose corrections are the most frequent words.
display_columns = ["misspell", "correction", "misspell_counts", "correction_counts", "edit_dist"]
matches_final.sort_values(by="correction_counts", ascending=False)[display_columns].head(20)

Unnamed: 0,misspell,correction,misspell_counts,correction_counts,edit_dist
50,iphone3,iphone,6,16854,1
62,latop,laptop,5,14119,1
61,laptopa,laptop,6,14119,1
137,touxhpad,touchpad,5,11550,1
136,toucpad,touchpad,6,11550,1
148,wirless,wireless,6,10060,1
127,tableta,tablet,6,8260,1
10,cape,case,5,7541,1
8,cage,case,6,7541,1
30,gallaxy,galaxy,6,5839,1


## Listing 6.20

In [15]:
## Alternative - count whole queries instead of tokenizing them into individual keywords
stop_words = set(stopwords.words('english'))
word_list = defaultdict(int)

for row in query_signals:
    query = row["keyword"]

    # Skip queries that are a stopword, are 3 characters or shorter, or consist only of digits.
    if query in stop_words or len(query) <= 3 or query.isdigit():
        continue
    word_list[query] += 1

# Re-run Listings 6.15-6.19 (and the ranking cells that follow) to repeat the analysis on whole queries.