In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import re
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)

db_path = '/Users/zphilipp/git/research/dealsdb/deals_db1.db'

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before", "the", "a"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

# Get Words for embeddings
def get_words_for_embeddings(row):
    words = []
    for item in row:
        words.append(item[0])
    return words

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

sql_query = """
    SELECT
        d.customer_category_id,
        c.name,
        GROUP_CONCAT(d.title_general, ',') || ' ' || GROUP_CONCAT(o.title, ',') AS text
    FROM deals d
        JOIN customer_taxonomy c ON (d.customer_category_id=c.id)
        LEFT JOIN options o ON (d.deal_id=o.deal_id)
    GROUP BY customer_category_id
    -- LIMIT 100
"""

# Execute the query and load the data into a DataFrame
df = pd.read_sql_query(sql_query, conn)
conn.close()

In [3]:
def get_important_words(document):
    document = remove_prepositions_and_conjunctions(document)
    document = [document]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(document)

    feature_names = vectorizer.get_feature_names_out()
    first_document_vector = tfidf_matrix[0]
    important_words = sorted(
        [(feature_names[i], first_document_vector[0, i]) for i in first_document_vector.nonzero()[1]],
        key=lambda x: x[1], reverse=True
    )
    return important_words

In [4]:
df['important_words'] = df['text'].apply(get_important_words)
df = df[['name', 'important_words']]
df['words'] = df['important_words'].apply(get_words_for_embeddings)
df['embedings'] = df['words'].apply(model.encode)
df.head()

Unnamed: 0,name,important_words,words,embedings
0,Hair Extensions & Wigs,"[(women, 0.5013460125133989), (men, 0.33508421...","[women, men, xl, pack, sleeve, black, long, fo...","[[0.007278031, -0.020804347, -0.012120812, 0.0..."
1,Eyebrow Embroidery,"[(extensions, 0.3950043026695996), (eyebrow, 0...","[extensions, eyebrow, set, full, lash, one, ey...","[[-0.029260742, 0.00788737, 0.0001703182, -0.0..."
2,Escape Games,"[(escape, 0.6554168439500331), (room, 0.510030...","[escape, room, people, murder, private, valid,...","[[0.018107658, 0.023809472, -0.0030498987, 0.0..."
3,Wallets,"[(wallet, 0.533966078168827), (women, 0.342544...","[wallet, women, card, rfid, blocking, holder, ...","[[0.002588826, 0.09205439, -0.060293734, -0.00..."
4,Underwear & Undershirts,"[(piece, 0.39193833369309394), (men, 0.3919383...","[piece, men, thermal, set, winter, top, bottom...","[[-0.05713177, 0.109446764, 0.00094258774, -0...."


In [5]:
def count_euclidean_distance(df, query):
    metrics = []

    for index, row in df.iterrows():
        category = np.array(row['embedings'])  
        # embeding AVG 
        category_mean = np.mean(category, axis=0)

        # cosine similarity
        dot_product = np.dot(category_mean, query.T)
        norm_category = np.linalg.norm(category_mean)
        norm_query = np.linalg.norm(query)

        if norm_category == 0 or norm_query == 0:
            cosine_similarity = 0
        else:
            cosine_similarity = dot_product / (norm_category * norm_query)

        cosine_distance = 1 - cosine_similarity
        euclidean_distance = np.linalg.norm(category_mean - query)

        metrics.append({
            'name': row['name'], 
            'cosine_distance': cosine_distance,
            'euclidean_distance': euclidean_distance
        })
    
    # to dataframe
    return pd.DataFrame(metrics)

In [6]:
# here is the main issue
# Query embeding online
query = np.array(model.encode(['hair', 'cut']))

distance_df = count_euclidean_distance(df, query)
# top 10 closest embedings
closest_embeddings = distance_df.nsmallest(10, 'euclidean_distance')
for index, row in closest_embeddings.sort_values(['euclidean_distance'], ascending = True).iterrows():
    print(f"Category: {row['name']}, Euclidean distance: {row['euclidean_distance']}")

Category: Shave, Euclidean distance: 1.050406813621521
Category: Shaving & Grooming, Euclidean distance: 1.0827836990356445
Category: Color & Highlights, Euclidean distance: 1.0841929912567139
Category: Scalp & Hair Treatments, Euclidean distance: 1.0847097635269165
Category: Hair & Styling, Euclidean distance: 1.0879900455474854
Category: Hair Care, Euclidean distance: 1.1002392768859863
Category: Textured Hair Care, Euclidean distance: 1.1008548736572266
Category: Upper Lip Wax, Euclidean distance: 1.1096110343933105
Category: Pet Supplies, Euclidean distance: 1.1236134767532349
Category: Skin Care Tools, Euclidean distance: 1.128180980682373


In [8]:
# here is the main issue
# Query embeding online
query = np.array(model.encode(['massage', 'oil']))

distance_df = count_euclidean_distance(df, query)
# top 10 closest embedings
closest_embeddings = distance_df.nsmallest(10, 'euclidean_distance')
for index, row in closest_embeddings.sort_values(['euclidean_distance'], ascending = True).iterrows():
    print(f"Category: {row['name']}, Euclidean distance: {row['euclidean_distance']}")

Category: Massage Accessories, Euclidean distance: 1.1642603874206543
Category: Shiatsu Massage, Euclidean distance: 1.1751201152801514
Category: Classic Massage, Euclidean distance: 1.17940092086792
Category: Foot & Leg Massagers, Euclidean distance: 1.179640293121338
Category: Thai Massage, Euclidean distance: 1.1806066036224365
Category: Pulse Massagers, Euclidean distance: 1.1809428930282593
Category: Aromatherapy, Euclidean distance: 1.1850820779800415
Category: Feminine Care, Euclidean distance: 1.1880848407745361
Category: Pain Relief, Euclidean distance: 1.1881052255630493
Category: Reflexology, Euclidean distance: 1.1884011030197144


In [9]:
# here is the main issue
# Query embeding online
query = np.array(model.encode(['water', 'park']))

distance_df = count_euclidean_distance(df, query)
# top 10 closest embedings
closest_embeddings = distance_df.nsmallest(10, 'euclidean_distance')
for index, row in closest_embeddings.sort_values(['euclidean_distance'], ascending = True).iterrows():
    print(f"Category: {row['name']}, Euclidean distance: {row['euclidean_distance']}")

Category: Boats & Water Sports, Euclidean distance: 1.1208205223083496
Category: Inflatables, Euclidean distance: 1.1229444742202759
Category: Pools & Water Fun, Euclidean distance: 1.1339777708053589
Category: Kayaking, Euclidean distance: 1.1547462940216064
Category: Kayaking, Euclidean distance: 1.1649143695831299
Category: Athletic, Euclidean distance: 1.1677696704864502
Category: Unique Lodging, Euclidean distance: 1.1684733629226685
Category: Outdoor Lighting, Euclidean distance: 1.1694552898406982
Category: Pool & Water Fun, Euclidean distance: 1.170409083366394
Category: Hotel Spas, Euclidean distance: 1.1708201169967651


In [10]:
# here is the main issue
# Query embeding online
query = np.array(model.encode(['valvoline', 'syntetic', 'oil']))

distance_df = count_euclidean_distance(df, query)
# top 10 closest embedings
closest_embeddings = distance_df.nsmallest(10, 'euclidean_distance')
for index, row in closest_embeddings.sort_values(['euclidean_distance'], ascending = True).iterrows():
    print(f"Category: {row['name']}, Euclidean distance: {row['euclidean_distance']}")

Category: Facial, Euclidean distance: 1.505852222442627
Category: Face & Skin Care, Euclidean distance: 1.5062943696975708
Category: Weight Loss, Euclidean distance: 1.5065338611602783
Category: Cosmetic Procedures, Euclidean distance: 1.506898045539856
Category: Salon Packages, Euclidean distance: 1.5071290731430054
Category: Shampoo & Conditioners, Euclidean distance: 1.5073918104171753
Category: Facial Peel, Euclidean distance: 1.50751531124115
Category: Injectables & Fillers, Euclidean distance: 1.507830023765564
Category: Medical, Euclidean distance: 1.5078840255737305
Category: Spas, Euclidean distance: 1.5085259675979614
