In [6]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import re
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

db_path = '/Users/zphilipp/git/research/dealsdb/deals_db1.db'

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before", "the", "a"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

# Get Words for embeddings
def get_words_for_embeddings(row):
    words = []
    for item in row:
        words.append(item[0])
    return words



In [2]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

sql_query = """
    SELECT
        d.customer_category_id,
        c.name,
        GROUP_CONCAT(d.title_general, ',') || ' ' || GROUP_CONCAT(o.title, ',') AS text
    FROM deals d
        JOIN customer_taxonomy c ON (d.customer_category_id=c.id)
        LEFT JOIN options o ON (d.deal_id=o.deal_id)
    GROUP BY customer_category_id
    -- LIMIT 100
"""

# Execute the query and load the data into a DataFrame
df = pd.read_sql_query(sql_query, conn)
conn.close()

In [3]:
def get_important_words(document):
    document = remove_prepositions_and_conjunctions(document)
    document = [document]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(document)

    feature_names = vectorizer.get_feature_names_out()
    first_document_vector = tfidf_matrix[0]
    important_words = sorted(
        [(feature_names[i], first_document_vector[0, i]) for i in first_document_vector.nonzero()[1]],
        key=lambda x: x[1], reverse=True
    )
    return important_words

In [4]:
df['important_words'] = df['text'].apply(get_important_words)
df = df[['name', 'important_words']]
df['words'] = df['important_words'].apply(get_words_for_embeddings)
df['embedings'] = df['words'].apply(model.encode)

In [8]:
# here is the main issue
# Query embeding online
query = np.array(model.encode(['hair', 'cut']))

metrics = []

for index, row in df.iterrows():
    category = np.array(row['embedings'])  
    # embeding AVG 
    category_mean = np.mean(category, axis=0)

    # cosine similarity
    dot_product = np.dot(category_mean, query.T)
    norm_category = np.linalg.norm(category_mean)
    norm_query = np.linalg.norm(query)

    if norm_category == 0 or norm_query == 0:
        cosine_similarity = 0
    else:
        cosine_similarity = dot_product / (norm_category * norm_query)

    cosine_distance = 1 - cosine_similarity
    euclidean_distance = np.linalg.norm(category_mean - query)

    metrics.append({
        'name': row['name'], 
        'cosine_distance': cosine_distance,
        'euclidean_distance': euclidean_distance
    })
    
# to dataframe
metrics_df = pd.DataFrame(metrics)

# top 10 closest embedings
closest_embeddings = metrics_df.nsmallest(10, 'euclidean_distance')
for index, row in closest_embeddings.sort_values(['euclidean_distance'], ascending = True).iterrows():
    print(f"Category: {row['name']}, Euclidean distance: {row['euclidean_distance']}")

Category: Shave, Euclidean distance: 1.050406813621521
Category: Shaving & Grooming, Euclidean distance: 1.0827836990356445
Category: Color & Highlights, Euclidean distance: 1.0841929912567139
Category: Scalp & Hair Treatments, Euclidean distance: 1.0847097635269165
Category: Hair & Styling, Euclidean distance: 1.0879900455474854
Category: Hair Care, Euclidean distance: 1.1002392768859863
Category: Textured Hair Care, Euclidean distance: 1.1008548736572266
Category: Upper Lip Wax, Euclidean distance: 1.1096110343933105
Category: Pet Supplies, Euclidean distance: 1.1236134767532349
Category: Skin Care Tools, Euclidean distance: 1.128180980682373
