In [1]:
import pandas as pd
import re
from collections import Counter

import numpy as np

# Render every row when a DataFrame/Series is displayed (None lifts the row cap).
pd.options.display.max_rows = None

# File name of the deals database; presumably SQLite given the .db suffix — TODO confirm.
db_path = "deals_db.db"

In [2]:
def clean_text(text):
    """Normalize a query string for matching.

    Steps, in order:
      1. Drop stand-alone single ASCII letters and stand-alone digit runs
         (tokens delimited by word boundaries on both sides).
      2. Collapse every run of whitespace into a single space.
      3. Trim leading/trailing whitespace and lowercase the result.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized, lowercased string.
    """
    # Digits glued to letters (e.g. "abc123") are kept: there is no word
    # boundary between the letter and the digit.
    without_noise = re.sub(r'\b[a-zA-Z]\b|\b\d+\b', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_noise)
    trimmed = single_spaced.strip()
    return trimmed.lower()

In [3]:
# dict_uniq -> filtered, corrected words taken from the top queries
df = pd.read_csv("models/dict_uniq.csv")

# One candidate phrase per row of the "query" column.
word_list = df["query"].tolist()

# Preview the first 20 candidates. Only the last expression of a cell is
# displayed, so this is the single inspection statement kept here.
word_list[:20]

['amc',
 'atv',
 'atlantic city',
 'bowlero',
 'brazilian',
 'brazilian wax',
 'cancun',
 'cinema',
 'christmas',
 'christmas cards',
 'chuck e. cheese',
 'coolsculpting',
 "save and buster's",
 'disney',
 'disneyland',
 'emsculpt',
 'halloween',
 'ipl',
 'jpcenney',
 'juvederm']

In [4]:
def calculate_similarity(word1, word2):
    """Score how alike two strings are; higher means more similar.

    The score is ``shared * length_factor + prefix`` where:
      * ``shared`` is the number of characters the two strings have in
        common, counted with multiplicity and ignoring position;
      * ``length_factor`` is ``shorter_length / longer_length`` (1.0 for
        equal lengths), which penalizes candidates of a very different size;
      * ``prefix`` is the length of the common leading run of characters.

    Args:
        word1: First string (e.g. the user's input).
        word2: Second string (e.g. a dictionary candidate).

    Returns:
        The similarity as a float. 0.0 when either string is empty.
    """
    # An empty string shares nothing with anything. Returning early also
    # avoids dividing by zero when both strings are empty.
    if len(word1) == 0 or len(word2) == 0:
        return 0.0

    # Multiset intersection keeps, per character, the smaller of the two counts.
    shared_letters = sum((Counter(word1) & Counter(word2)).values())

    shorter, longer = sorted((len(word1), len(word2)))
    length_factor = shorter / longer

    # Count matching characters from the start, stopping at the first mismatch.
    initial_match_bonus = 0
    for char1, char2 in zip(word1, word2):
        if char1 != char2:
            break
        initial_match_bonus += 1

    return (shared_letters * length_factor) + initial_match_bonus


# Test: misspelled / scrambled sample inputs, each matched against word_list.
input_words = [
    'acm',
    'oli',
    'oli chance',
    'ifone',
    'cuple',
    'message',
    'valentyn',
    'pakr',
]

for input_word in input_words:
    # Score every dictionary entry against the current input.
    similarities = dict(
        (word, calculate_similarity(input_word, word)) for word in word_list
    )

    # Take the highest-scoring entry; on ties the earliest entry wins.
    most_similar_word = max(similarities.items(), key=lambda pair: pair[1])[0]

    print(f'Most similar wird to "{input_word}" is "{most_similar_word}" with score {similarities[most_similar_word]}.')


Most similar wird to "acm" is "amc" with score 4.0.
Most similar wird to "oli" is "oil" with score 4.0.
Most similar wird to "oli chance" is "change oil" with score 9.0.
Most similar wird to "ifone" is "iphone" with score 4.333333333333334.
Most similar wird to "cuple" is "couple" with score 5.166666666666667.
Most similar wird to "message" is "massage" with score 7.0.
Most similar wird to "valentyn" is "valentine's day" with score 10.266666666666666.
Most similar wird to "pakr" is "parking" with score 4.285714285714286.
