## Category important words & similarity search

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import re
import nltk
#from nltk import bigrams, trigrams
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity


model = SentenceTransformer('all-MiniLM-L6-v2') 
nltk.download('punkt')

#pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

db_path = '/Users/zphilipp/git/research/dealsdb/deals_db1.db'

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before", "the", "a"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

# Get Words for embeddings
#def get_words_for_embeddings(row):
#    words = []
#    for item in row:
#        words.append(item[0])
#    return words

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/zphilipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Get all titles from Deals and Options text

In [2]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

sql_query = """
    SELECT
        d.customer_category_id,
        c.name,
        GROUP_CONCAT(d.title_general, ',') || ' ' || GROUP_CONCAT(o.title, ',') AS text
        
    FROM deals d
        JOIN customer_taxonomy c ON (d.customer_category_id=c.id)
        LEFT JOIN options o ON (d.deal_id=o.deal_id)
        
WHERE c.name LIKE 'Couples Massage'
   OR c.name LIKE 'Laser Hair Removal'
   OR c.name LIKE 'Electronics'
   OR c.name LIKE 'Botox'
   OR c.name LIKE 'Injectables & Fillers'
   OR c.name LIKE 'Face & Skin Care'
   OR c.name LIKE 'Facial'
   OR c.name LIKE 'Weight Loss'
   OR c.name LIKE 'Oil Change'
   OR c.name LIKE 'Massage'
   OR c.name LIKE 'Day Spas'
   OR c.name LIKE 'Kids Sports'
   OR c.name LIKE 'Hair Salons'
   OR c.name LIKE 'Water Parks'
   OR c.name LIKE 'Full Body Massage'
   OR c.name LIKE 'Deep Tissue Massage'
   OR c.name LIKE 'Non-Surgical Facelifts'
   OR c.name LIKE 'Retail'
   OR c.name LIKE 'Swedish Massage'
   OR c.name LIKE 'IPL'
   OR c.name LIKE 'Color & Highlights'
   OR c.name LIKE 'Flowers'
   OR c.name LIKE 'Amusement Parks'
   OR c.name LIKE 'Eyelash Extensions'
   OR c.name LIKE 'Bowling'
   OR c.name LIKE 'Escape Games'
   OR c.name LIKE 'Fun & Leisure'
   OR c.name LIKE 'Sightseeing & Tours'
   OR c.name LIKE 'Waxing'
   OR c.name LIKE 'Plastic Surgery'
   OR c.name LIKE 'Repair Services'
   OR c.name LIKE 'Brows & Lashes'
   OR c.name LIKE 'Hot Stone Massage'
   OR c.name LIKE 'Medical'
   OR c.name LIKE 'Natural Medicine'
   OR c.name LIKE 'Auto Detailing'
   OR c.name LIKE 'Print Shop'
   OR c.name LIKE 'Business Training'
   OR c.name LIKE 'Boat Tours'
   OR c.name LIKE 'Permanent Makeup'
   OR c.name LIKE 'Microdermabrasion'
   OR c.name LIKE 'Teeth Whitening'
   OR c.name LIKE 'Dolphin & Whale Watching'
   OR c.name LIKE 'Facial Peel'
   OR c.name LIKE 'Chiropractor'
   OR c.name LIKE 'Windshield & Windows'
   OR c.name LIKE 'Museums'
   OR c.name LIKE 'Teeth Cleaning'
   OR c.name LIKE 'Tires & Wheels'
   OR c.name LIKE 'Art Classes'
   OR c.name LIKE 'Photographers'
   OR c.name LIKE 'Mani Pedi'
   OR c.name LIKE 'Water Sports'
   OR c.name LIKE 'Airport Parking'
   OR c.name LIKE 'Spas'
   OR c.name LIKE 'Golf'
   OR c.name LIKE 'Air Duct Cleaning'
   OR c.name LIKE 'Pottery Classes'
   OR c.name LIKE 'Dance Classes'
   OR c.name LIKE 'Music'
   OR c.name LIKE 'Wine Tours'
   OR c.name LIKE 'Salon Packages'
   OR c.name LIKE 'Yoga'
   OR c.name LIKE 'Hotels & Accommodations'
   OR c.name LIKE 'Bus Tours & Rentals'
   OR c.name LIKE 'Saunas'
   OR c.name LIKE 'Zoo'
   OR c.name LIKE 'Hotel Spas'
   OR c.name LIKE 'Cinema'
   OR c.name LIKE 'Carpet Cleaning'
   OR c.name LIKE 'Colonic Hydrotherapy'
   OR c.name LIKE 'Dinner & Entertainment'
   OR c.name LIKE 'Flying Lessons'
   OR c.name LIKE 'Wine'
   OR c.name LIKE 'Helicopter Ride'
   OR c.name LIKE 'Family Vacations'
   OR c.name LIKE 'Trampoline & Bounce Houses'
   OR c.name LIKE 'Nightlife'
   OR c.name LIKE 'Custom Massage'
   OR c.name LIKE 'Shooting Range'
   OR c.name LIKE 'Outdoor & Camping Trips'
   OR c.name LIKE 'Sports & Outdoors'
   OR c.name LIKE 'Dental Implants'
   OR c.name LIKE 'Bath Houses'
   OR c.name LIKE 'Eyelash Tinting'
   OR c.name LIKE 'Rock Climbing'
   OR c.name LIKE 'House Cleaning'
   OR c.name LIKE 'Skills & Hobbies'
   OR c.name LIKE 'Snorkeling'
   OR c.name LIKE 'Hair Restoration'
   OR c.name LIKE 'Cooking Classes'
   OR c.name LIKE 'Driving Lessons'
   OR c.name LIKE 'Reflexology'
   OR c.name LIKE 'Acupuncture'
   OR c.name LIKE 'Asian Restaurants'
   OR c.name LIKE 'Pilates'
   OR c.name LIKE 'Museums & Attractions'
   OR c.name LIKE 'Paintball'
   OR c.name LIKE 'Big City Vacations'
   OR c.name LIKE 'Casinos'
    
    GROUP BY customer_category_id

"""

# Execute the query and load the data into a DataFrame
df = pd.read_sql_query(sql_query, conn)
conn.close()

In [3]:
df.head()

Unnamed: 0,customer_category_id,name,text
0,00dcb3b8-8176-4d67-85a5-4a64c56e2955,Escape Games,"Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering up to 45% off,Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering ..."
1,024d4f6c-d1d6-4ecd-905e-0f91c9b4ab54,Pilates,"At Anastasio, Discover a Boutique Wellness Haven Offering Private Pilates Sessions with Up to 25% Off,Achieve Your Fitness Goals with 5 or 10 Pilates Group Reformer Classes and Expert Instructors ..."
2,05c41219-e7c2-4e72-afea-b9ec7dad0a83,Face & Skin Care,"Experience Style By Jamilah's rejuvenating microneedling sessions with hyaluronic acid for up to 50% off,Experience Style By Jamilah's rejuvenating microneedling sessions with hyaluronic acid for ..."
3,09b02335-2891-4ab5-9950-0ae27b8b6c53,Golf,"1 Swing Golf offers private 50-minute lessons with certified instructors, focusing on biomechanics and personalized plans up to 22%,Elevate your game with 360 Degree Golf's tailored lessons, inclu..."
4,0af090df-faac-4e32-a388-5d49bb33a797,Color & Highlights,"T & H Packages - Hair cut, style, coloring or more at The Tortoise & The Hair Up to 50% Off,T & H Packages - Hair cut, style, coloring or more at The Tortoise & The Hair Up to 50% Off,T & H Pack..."


#### Count important words for each category

In [4]:
prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before", "the", "a", "b", "c", "d", "e", "f", "g", "h",
    "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w",
    "x", "y", "z"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()

In [5]:
def to_trigrams(text):
    text = remove_prepositions_and_conjunctions(text)
    if isinstance(text, list):
        text = " ".join(text)
    elif not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    words = nltk.word_tokenize(text)
    bigrams_list = list(nltk.trigrams(words))
    bigram_counts = Counter(bigrams_list)
    sorted_bigrams = bigram_counts.most_common()

    return sorted_bigrams[:10]
    
def to_bigrams(text):
    text = remove_prepositions_and_conjunctions(text)
    if isinstance(text, list):
        text = " ".join(text)
    elif not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    words = nltk.word_tokenize(text)
    bigrams_list = list(nltk.bigrams(words))
    bigram_counts = Counter(bigrams_list)
    sorted_bigrams = bigram_counts.most_common()

    return sorted_bigrams[:10]
    
def get_important_words(document):
    document = remove_prepositions_and_conjunctions(document)
    document = [document]

    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    tfidf_matrix = vectorizer.fit_transform(document)

    feature_names = vectorizer.get_feature_names_out()
    first_document_vector = tfidf_matrix[0]
    important_words = sorted(
        [(feature_names[i], first_document_vector[0, i]) for i in first_document_vector.nonzero()[1]],
        key=lambda x: x[1], reverse=True
    )
    return important_words[:10]

#### Create word embedings and transform data

In [6]:
df_ = df
df_['important_words'] = df['text'].apply(get_important_words)
df_["bigrams"] = df["text"].apply(to_bigrams)
df_["trigrams"] = df["text"].apply(to_trigrams)

In [7]:
df_.head(10)

Unnamed: 0,customer_category_id,name,text,important_words,bigrams,trigrams
0,00dcb3b8-8176-4d67-85a5-4a64c56e2955,Escape Games,"Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering up to 45% off,Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering ...","[(escape, 0.6554168439500331), (room, 0.5100300494198391), (people, 0.2206679826364433), (murder, 0.19455482051208808), (private, 0.18208637373199052), (valid, 0.16561861383374848), (experience, 0...","[((escape, room), 1272), ((murder, mystery), 351), ((private, escape), 308), ((escaperoomstyle, murder), 304), ((mystery, people), 262), ((escape, game), 262), ((murder, us), 254), ((room, experie...","[((escaperoomstyle, murder, mystery), 274), ((private, escape, room), 271), ((murder, mystery, people), 262), ((escape, room, experience), 234), ((escape, room, four), 143), ((four, six, eight), 1..."
1,024d4f6c-d1d6-4ecd-905e-0f91c9b4ab54,Pilates,"At Anastasio, Discover a Boutique Wellness Haven Offering Private Pilates Sessions with Up to 25% Off,Achieve Your Fitness Goals with 5 or 10 Pilates Group Reformer Classes and Expert Instructors ...","[(pilates, 0.7513298273881077), (classes, 0.4507978964328646), (reformer, 0.2335214328368443), (private, 0.13808223854700358), (one, 0.13605161739190058), (sessions, 0.1319903750816946), (fitness,...","[((pilates, reformer), 52), ((pilates, classes), 42), ((reformer, classes), 38), ((private, pilates), 25), ((im, pilates), 22), ((mat, pilates), 20), ((power, plate), 18), ((new, clients), 16), ((...","[((pilates, reformer, classes), 15), ((private, pilates, gyrotonic), 10), ((your, fitness, journey), 10), ((reformer, pilates, classes), 8), ((eight, pilates, reformer), 8), ((three, pilates, equi..."
2,05c41219-e7c2-4e72-afea-b9ec7dad0a83,Face & Skin Care,"Experience Style By Jamilah's rejuvenating microneedling sessions with hyaluronic acid for up to 50% off,Experience Style By Jamilah's rejuvenating microneedling sessions with hyaluronic acid for ...","[(skin, 0.40210511195151943), (one, 0.3844645329891161), (treatment, 0.3254123821362483), (micro, 0.2744924130745593), (needling, 0.2697382144889471), (microneedling, 0.2689875515543767), (laser, ...","[((microneedling, treatment), 524), ((laser, skin), 496), ((your, skin), 479), ((skin, resurfacing), 443), ((microneedling, treatments), 430), ((face, neck), 328), ((radiant, skin), 318), ((micron...","[((laser, skin, resurfacing), 364), ((transform, your, skin), 169), ((microneedling, treatmentone, microneedling), 127), ((microneedling, treatment, prp), 99), ((treatmentone, microneedling, treat..."
3,09b02335-2891-4ab5-9950-0ae27b8b6c53,Golf,"1 Swing Golf offers private 50-minute lessons with certified instructors, focusing on biomechanics and personalized plans up to 22%,Elevate your game with 360 Degree Golf's tailored lessons, inclu...","[(golf, 0.6825953224167746), (round, 0.32348562915232787), (tee, 0.30914036843603837), (times, 0.3043586148639419), (online, 0.3038804395067322), (booking, 0.3038804395067322), (club, 0.1692740764...","[((round, golf), 2670), ((booking, round), 2542), ((timestee, timestee), 2540), ((clubonline, booking), 1085), ((courseonline, booking), 819), ((golf, clubonline), 755), ((golf, courseonline), 710...","[((booking, round, golf), 2542), ((timestee, timestee, timestee), 2539), ((clubonline, booking, round), 1085), ((courseonline, booking, round), 819), ((golf, clubonline, booking), 755), ((golf, co..."
4,0af090df-faac-4e32-a388-5d49bb33a797,Color & Highlights,"T & H Packages - Hair cut, style, coloring or more at The Tortoise & The Hair Up to 50% Off,T & H Packages - Hair cut, style, coloring or more at The Tortoise & The Hair Up to 50% Off,T & H Pack...","[(hair, 0.47140452079103173), (cut, 0.47140452079103173), (package, 0.47140452079103173), (packages, 0.23570226039551587), (style, 0.23570226039551587), (coloring, 0.23570226039551587), (more, 0.2...","[((packages, hair), 4), ((hair, cut), 4), ((cut, style), 4), ((style, coloring), 4), ((coloring, more), 4), ((more, tortoise), 4), ((tortoise, hair), 4), ((package, cut), 4), ((hair, packages), 3)...","[((packages, hair, cut), 4), ((hair, cut, style), 4), ((cut, style, coloring), 4), ((style, coloring, more), 4), ((coloring, more, tortoise), 4), ((more, tortoise, hair), 4), ((tortoise, hair, pac..."
5,0bc79434-fc4c-482a-b19d-e8af6cc0ac5b,Music,"Movements with Citizen on April 6 at 7 p.m.,Movements with Citizen on April 3 at 7:30 p.m. 04/06/2025 at 7 PM: One Ticket,04/03/2024 at 7:30 PM: One Ticket","[(movements, 0.4082482904638631), (citizen, 0.4082482904638631), (april, 0.4082482904638631), (pm, 0.4082482904638631), (one, 0.4082482904638631), (ticket, 0.4082482904638631)]","[((movements, citizen), 2), ((citizen, april), 2), ((pm, one), 2), ((one, ticket), 2), ((april, movements), 1), ((april, pm), 1), ((ticket, pm), 1)]","[((movements, citizen, april), 2), ((pm, one, ticket), 2), ((citizen, april, movements), 1), ((april, movements, citizen), 1), ((citizen, april, pm), 1), ((april, pm, one), 1), ((one, ticket, pm),..."
6,0d5c5cc6-a79f-4b0c-ba25-90b14cec8d81,Asian Restaurants,"Watch Master Chefs Create Magic with Flair: Hibachi Dinner for 2 or 4 at A-Aki Sushi & Steakhouse- Up to 36% Off,Watch Master Chefs Create Magic with Flair: Hibachi Dinner for 2 or 4 at A-Aki Sush...","[(cash, 0.5046015618625171), (back, 0.5046015618625171), (sushi, 0.3382135468518164), (food, 0.20118812272535705), (cuisine, 0.18052555876977983), (japanese, 0.1663880150107007), (thai, 0.16638801...","[((cash, back), 464), ((food, drink), 74), ((japanese, cuisine), 63), ((any, day), 58), ((valid, any), 57), ((towards, food), 40), ((cuisine, worth), 37), ((food, drinks), 36), ((restaurant, cash)...","[((valid, any, day), 53), ((restaurant, cash, back), 34), ((sushi, cash, back), 32), ((cash, back, thai), 30), ((cash, back, noodles), 28), ((back, noodles, company), 26), ((cash, back, sushi), 24..."
7,10143ec5-1615-4b01-abf0-5dfad3b46247,Dinner & Entertainment,"Set Sail with City Cruises! $100 Dining Cruise Voucher valid for Chicago Dining Cruises in 2025! (Up to 40% off),Set sail with City Cruises! Grab $100 dining cruise vouchers for 2025! (Up to 40% o...","[(cruise, 0.6167909240139574), (sunset, 0.35167903562199326), (tour, 0.2759327817957178), (dinner, 0.2191230914260112), (catamaran, 0.18260257618834266), (sail, 0.1623134010563046), (hour, 0.14202...","[((dinner, cruise), 69), ((isla, mujeres), 65), ((sunset, cruise), 56), ((cruise, one), 36), ((sunset, sail), 30), ((boat, tour), 30), ((open, bar), 29), ((lunch, dinner), 21), ((voucher, valid), ...","[((dinner, cruise, one), 18), ((voucher, valid, any), 17), ((valid, any, brunch), 17), ((any, brunch, lunch), 17), ((brunch, lunch, dinner), 17), ((lunch, dinner, cruises), 17), ((tour, isla, muje..."
8,14798f0c-1e25-49c7-b4f5-0504fd79bb34,Business Training,"Up to 28% Off on Tax Training Course at 5 Starr Tax Professionals,Prepare for real estate exams with A Plus Real Estate School's Colorado pre licensing classes, up to 40% off,Enhance your career w...","[(course, 0.5085863326677958), (real, 0.3739184023275907), (estate, 0.3732020835491854), (license, 0.3466982887481876), (online, 0.2686195419020048), (hour, 0.18337760727176863), (pre, 0.171200188...","[((real, estate), 431), ((mlo, license), 212), ((license, course), 202), ((estate, license), 121), ((colibri, real), 95), ((course, realestateu), 90), ((packages, colibri), 77), ((license, courses...","[((mlo, license, course), 147), ((real, estate, license), 119), ((colibri, real, estate), 95), ((license, course, realestateu), 90), ((packages, colibri, real), 77), ((hour, real, estate), 68), ((..."
9,14de3fc4-461c-4c90-a9f8-a353859ef871,Hotel Spas,"Enjoy a full day beach rental with Adventure Water Sports Inc including chairs, umbrellas, tables and up to 33% off,Enjoy a full day beach rental with Adventure Water Sports Inc including chairs, ...","[(massage, 0.514107081867301), (spa, 0.49941830809966387), (facial, 0.3427380545782007), (minute, 0.3084642491203806), (one, 0.23502038028219477), (day, 0.19585031690182897), (min, 0.1517839955989...","[((massage, facial), 20), ((spa, day), 16), ((minute, massage), 13), ((day, one), 11), ((swedish, massage), 10), ((aromatherapy, massage), 8), ((facial, massage), 8), ((hot, towels), 8), ((minute,...","[((minute, massage, facial), 7), ((spa, spa, day), 6), ((ultimate, relaxation, kimpton), 6), ((relaxation, kimpton, gray), 6), ((kimpton, gray, spa), 6), ((gray, spa, rooms), 6), ((spa, rooms, var..."


In [23]:
def combine_words(row):
    important_words_str = ' '.join(word[0] for word in row['important_words'])
    bigrams_str = ' '.join(' '.join(bigram[0]) for bigram in row['bigrams'])
    trigrams_str = ' '.join(' '.join(trigram[0]) for trigram in row['trigrams'])
    
    return pd.Series([important_words_str, bigrams_str, trigrams_str])
    
df_[['important_words_combined', 'bigrams_combined', 'trigrams_combined']] = df_.apply(combine_words, axis=1)
df_['combined_text'] = df_[['important_words_combined', 'bigrams_combined', 'trigrams_combined']].agg(' '.join, axis=1)
df_.head(5)

Unnamed: 0,customer_category_id,name,text,important_words,bigrams,trigrams,important_words_combined,bigrams_combined,trigrams_combined,combined_text,combined_text_embeddings
0,00dcb3b8-8176-4d67-85a5-4a64c56e2955,Escape Games,"Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering up to 45% off,Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering ...","[(escape, 0.6554168439500331), (room, 0.5100300494198391), (people, 0.2206679826364433), (murder, 0.19455482051208808), (private, 0.18208637373199052), (valid, 0.16561861383374848), (experience, 0...","[((escape, room), 1272), ((murder, mystery), 351), ((private, escape), 308), ((escaperoomstyle, murder), 304), ((mystery, people), 262), ((escape, game), 262), ((murder, us), 254), ((room, experie...","[((escaperoomstyle, murder, mystery), 274), ((private, escape, room), 271), ((murder, mystery, people), 262), ((escape, room, experience), 234), ((escape, room, four), 143), ((four, six, eight), 1...",escape room people murder private valid experience game mystery home,escape room murder mystery private escape escaperoomstyle murder mystery people escape game murder us room experience valid any four six,escaperoomstyle murder mystery private escape room murder mystery people escape room experience escape room four four six eight flavor packhome escaperoomstyle packhome escaperoomstyle murder murd...,escape room people murder private valid experience game mystery home escape room murder mystery private escape escaperoomstyle murder mystery people escape game murder us room experience valid any...,"[0.051755723, -0.05070524, -0.037248805, 0.027916633, 0.014942261, 0.13387631, 0.046581652, -0.066211924, 0.014248056, -0.004914929, 0.03894971, -0.050729387, 0.01604902, -0.03739693, 0.0681715, -..."
1,024d4f6c-d1d6-4ecd-905e-0f91c9b4ab54,Pilates,"At Anastasio, Discover a Boutique Wellness Haven Offering Private Pilates Sessions with Up to 25% Off,Achieve Your Fitness Goals with 5 or 10 Pilates Group Reformer Classes and Expert Instructors ...","[(pilates, 0.7513298273881077), (classes, 0.4507978964328646), (reformer, 0.2335214328368443), (private, 0.13808223854700358), (one, 0.13605161739190058), (sessions, 0.1319903750816946), (fitness,...","[((pilates, reformer), 52), ((pilates, classes), 42), ((reformer, classes), 38), ((private, pilates), 25), ((im, pilates), 22), ((mat, pilates), 20), ((power, plate), 18), ((new, clients), 16), ((...","[((pilates, reformer, classes), 15), ((private, pilates, gyrotonic), 10), ((your, fitness, journey), 10), ((reformer, pilates, classes), 8), ((eight, pilates, reformer), 8), ((three, pilates, equi...",pilates classes reformer private one sessions fitness five group experience,pilates reformer pilates classes reformer classes private pilates im pilates mat pilates power plate new clients reformer pilates your fitness,pilates reformer classes private pilates gyrotonic your fitness journey reformer pilates classes eight pilates reformer three pilates equipment pilates equipment group classes restrictions apply i...,pilates classes reformer private one sessions fitness five group experience pilates reformer pilates classes reformer classes private pilates im pilates mat pilates power plate new clients reforme...,"[-0.029530413, -0.1045996, -0.005067386, -0.011923895, -0.03918428, 0.046191826, -0.04072516, -0.108365506, -0.01909609, -0.041376706, 0.033687733, 0.042577907, 0.01442494, -0.014881942, 0.0808527..."
2,05c41219-e7c2-4e72-afea-b9ec7dad0a83,Face & Skin Care,"Experience Style By Jamilah's rejuvenating microneedling sessions with hyaluronic acid for up to 50% off,Experience Style By Jamilah's rejuvenating microneedling sessions with hyaluronic acid for ...","[(skin, 0.40210511195151943), (one, 0.3844645329891161), (treatment, 0.3254123821362483), (micro, 0.2744924130745593), (needling, 0.2697382144889471), (microneedling, 0.2689875515543767), (laser, ...","[((microneedling, treatment), 524), ((laser, skin), 496), ((your, skin), 479), ((skin, resurfacing), 443), ((microneedling, treatments), 430), ((face, neck), 328), ((radiant, skin), 318), ((micron...","[((laser, skin, resurfacing), 364), ((transform, your, skin), 169), ((microneedling, treatmentone, microneedling), 127), ((microneedling, treatment, prp), 99), ((treatmentone, microneedling, treat...",skin one treatment micro needling microneedling laser treatments experience face,microneedling treatment laser skin your skin skin resurfacing microneedling treatments face neck radiant skin microneedling sessions one two acne treatment,laser skin resurfacing transform your skin microneedling treatmentone microneedling microneedling treatment prp treatmentone microneedling treatment one two three acne treatment sessions med spa m...,skin one treatment micro needling microneedling laser treatments experience face microneedling treatment laser skin your skin skin resurfacing microneedling treatments face neck radiant skin micro...,"[-0.018825267, -0.060907662, 0.09785038, 0.0072675515, -0.0030766504, 0.015615422, 0.09545651, 0.014158326, -0.05079385, -0.061235927, 0.055771723, -0.030539235, 0.013474434, 0.03650427, 0.0375866..."
3,09b02335-2891-4ab5-9950-0ae27b8b6c53,Golf,"1 Swing Golf offers private 50-minute lessons with certified instructors, focusing on biomechanics and personalized plans up to 22%,Elevate your game with 360 Degree Golf's tailored lessons, inclu...","[(golf, 0.6825953224167746), (round, 0.32348562915232787), (tee, 0.30914036843603837), (times, 0.3043586148639419), (online, 0.3038804395067322), (booking, 0.3038804395067322), (club, 0.1692740764...","[((round, golf), 2670), ((booking, round), 2542), ((timestee, timestee), 2540), ((clubonline, booking), 1085), ((courseonline, booking), 819), ((golf, clubonline), 755), ((golf, courseonline), 710...","[((booking, round, golf), 2542), ((timestee, timestee, timestee), 2539), ((clubonline, booking, round), 1085), ((courseonline, booking, round), 819), ((golf, clubonline, booking), 755), ((golf, co...",golf round tee times online booking club course country one,round golf booking round timestee timestee clubonline booking courseonline booking golf clubonline golf courseonline country clubonline golf club golf course,booking round golf timestee timestee timestee clubonline booking round courseonline booking round golf clubonline booking golf courseonline booking country clubonline booking golf country clubonli...,golf round tee times online booking club course country one round golf booking round timestee timestee clubonline booking courseonline booking golf clubonline golf courseonline country clubonline ...,"[0.06525999, -0.047371905, -0.034269102, 0.055809103, -0.0013817415, 0.050158758, -0.013868448, -0.030312382, 0.011317571, -0.01107864, -0.012815334, -0.0701439, -0.115662895, 0.0738143, 0.0640934..."
4,0af090df-faac-4e32-a388-5d49bb33a797,Color & Highlights,"T & H Packages - Hair cut, style, coloring or more at The Tortoise & The Hair Up to 50% Off,T & H Packages - Hair cut, style, coloring or more at The Tortoise & The Hair Up to 50% Off,T & H Pack...","[(hair, 0.47140452079103173), (cut, 0.47140452079103173), (package, 0.47140452079103173), (packages, 0.23570226039551587), (style, 0.23570226039551587), (coloring, 0.23570226039551587), (more, 0.2...","[((packages, hair), 4), ((hair, cut), 4), ((cut, style), 4), ((style, coloring), 4), ((coloring, more), 4), ((more, tortoise), 4), ((tortoise, hair), 4), ((package, cut), 4), ((hair, packages), 3)...","[((packages, hair, cut), 4), ((hair, cut, style), 4), ((cut, style, coloring), 4), ((style, coloring, more), 4), ((coloring, more, tortoise), 4), ((more, tortoise, hair), 4), ((tortoise, hair, pac...",hair cut package packages style coloring more tortoise color foil,packages hair hair cut cut style style coloring coloring more more tortoise tortoise hair package cut hair packages package package,packages hair cut hair cut style cut style coloring style coloring more coloring more tortoise more tortoise hair tortoise hair packages hair packages hair package package cut package cut color,hair cut package packages style coloring more tortoise color foil packages hair hair cut cut style style coloring coloring more more tortoise tortoise hair package cut hair packages package packag...,"[-0.029509587, 0.03216335, 0.08401561, -0.023248905, -0.015129266, 0.034098353, 0.049376488, -0.099545114, -0.10573029, 0.0732651, 0.05076101, -0.031119073, -0.006858014, -0.017476752, 0.05590441,..."


In [9]:
df_['combined_text'] = df_['combined_text'].tolist()
df_['combined_text_embeddings'] = df_['combined_text'].apply(lambda x: model.encode(x))

combined_embeddings = np.array(df_['combined_text_embeddings'].tolist())

In [25]:
def query_embedding_reduce(query_embedding):
    if query_embedding.shape[1] > 384:
        query_embedding_reduced = np.mean(query_embedding.reshape(-1, 2, 384), axis=1)
    else:
        query_embedding_reduced = query_embedding
    return query_embedding_reduced

df_[['name', 'customer_category_id', 'combined_text', 'combined_text_embeddings']].to_csv('models/category_embeding.csv')

df_.head(1)

Unnamed: 0,customer_category_id,name,text,important_words,bigrams,trigrams,important_words_combined,bigrams_combined,trigrams_combined,combined_text,combined_text_embeddings
0,00dcb3b8-8176-4d67-85a5-4a64c56e2955,Escape Games,"Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering up to 45% off,Dive into A Murder Among Us with exciting at-home murder mysteries for 4+ people, offering ...","[(escape, 0.6554168439500331), (room, 0.5100300494198391), (people, 0.2206679826364433), (murder, 0.19455482051208808), (private, 0.18208637373199052), (valid, 0.16561861383374848), (experience, 0...","[((escape, room), 1272), ((murder, mystery), 351), ((private, escape), 308), ((escaperoomstyle, murder), 304), ((mystery, people), 262), ((escape, game), 262), ((murder, us), 254), ((room, experie...","[((escaperoomstyle, murder, mystery), 274), ((private, escape, room), 271), ((murder, mystery, people), 262), ((escape, room, experience), 234), ((escape, room, four), 143), ((four, six, eight), 1...",escape room people murder private valid experience game mystery home,escape room murder mystery private escape escaperoomstyle murder mystery people escape game murder us room experience valid any four six,escaperoomstyle murder mystery private escape room murder mystery people escape room experience escape room four four six eight flavor packhome escaperoomstyle packhome escaperoomstyle murder murd...,escape room people murder private valid experience game mystery home escape room murder mystery private escape escaperoomstyle murder mystery people escape game murder us room experience valid any...,"[0.051755723, -0.05070524, -0.037248805, 0.027916633, 0.014942261, 0.13387631, 0.046581652, -0.066211924, 0.014248056, -0.004914929, 0.03894971, -0.050729387, 0.01604902, -0.03739693, 0.0681715, -..."


In [11]:
def get_top_similarity(query_embedding_reduced, combined_embeddings):
    similarities = cosine_similarity(query_embedding_reduced, combined_embeddings).flatten()
    closest_indices = np.argsort(similarities)[-10:]

    closest_rows = []
    for index in reversed(closest_indices):
        closest_rows.append([df_.iloc[index], similarities[index]])

    return closest_rows

In [12]:
query_embedding_reduced = query_embedding_reduce(model.encode(['hair', 'cut']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Hair Salons> -> score 0.39640313386917114
Closest Category: <Color & Highlights> -> score 0.3741774260997772
Closest Category: <Laser Hair Removal> -> score 0.3102554976940155
Closest Category: <Waxing> -> score 0.25154078006744385
Closest Category: <Brows & Lashes> -> score 0.2513980269432068
Closest Category: <Nightlife> -> score 0.2472742795944214
Closest Category: <Bath Houses> -> score 0.24375507235527039
Closest Category: <Eyelash Extensions> -> score 0.24254164099693298
Closest Category: <Salon Packages> -> score 0.24106483161449432
Closest Category: <Teeth Whitening> -> score 0.23872435092926025


In [13]:
query_embedding_reduced = query_embedding_reduce(model.encode(['oil']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Hot Stone Massage> -> score 0.2689662575721741
Closest Category: <Oil Change> -> score 0.262820303440094
Closest Category: <Bath Houses> -> score 0.2262043058872223
Closest Category: <Couples Massage> -> score 0.22290490567684174
Closest Category: <Art Classes> -> score 0.21885952353477478
Closest Category: <Massage> -> score 0.1692541241645813
Closest Category: <Injectables & Fillers> -> score 0.16872821748256683
Closest Category: <Deep Tissue Massage> -> score 0.1677567958831787
Closest Category: <Swedish Massage> -> score 0.16382598876953125
Closest Category: <Day Spas> -> score 0.1541799008846283


In [14]:
query_embedding_reduced = query_embedding_reduce(model.encode(['change']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Oil Change> -> score 0.29163557291030884
Closest Category: <Repair Services> -> score 0.220582515001297
Closest Category: <Bath Houses> -> score 0.21422189474105835
Closest Category: <Salon Packages> -> score 0.2020999789237976
Closest Category: <Teeth Whitening> -> score 0.19480732083320618
Closest Category: <Music> -> score 0.18857789039611816
Closest Category: <Chiropractor> -> score 0.1838555932044983
Closest Category: <Plastic Surgery> -> score 0.17581340670585632
Closest Category: <Permanent Makeup> -> score 0.1720854938030243
Closest Category: <Brows & Lashes> -> score 0.16813892126083374


In [15]:
query_embedding_reduced = query_embedding_reduce(model.encode(['oil', 'change']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Oil Change> -> score 0.3433082699775696
Closest Category: <Bath Houses> -> score 0.2727033197879791
Closest Category: <Hot Stone Massage> -> score 0.22818416357040405
Closest Category: <Art Classes> -> score 0.22615733742713928
Closest Category: <Salon Packages> -> score 0.2153906524181366
Closest Category: <Repair Services> -> score 0.2090488225221634
Closest Category: <Couples Massage> -> score 0.20791716873645782
Closest Category: <Teeth Whitening> -> score 0.20052413642406464
Closest Category: <Massage> -> score 0.1870923787355423
Closest Category: <Permanent Makeup> -> score 0.18602722883224487


In [16]:
query_embedding_reduced = query_embedding_reduce(model.encode(['sauna', 'massage']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Couples Massage> -> score 0.5151861906051636
Closest Category: <Hot Stone Massage> -> score 0.5096988677978516
Closest Category: <Saunas> -> score 0.5096933841705322
Closest Category: <Deep Tissue Massage> -> score 0.49157509207725525
Closest Category: <Swedish Massage> -> score 0.49139243364334106
Closest Category: <Custom Massage> -> score 0.4852566719055176
Closest Category: <Day Spas> -> score 0.47442150115966797
Closest Category: <Full Body Massage> -> score 0.4712368845939636
Closest Category: <Hotel Spas> -> score 0.4662875533103943
Closest Category: <Bath Houses> -> score 0.4532391130924225


In [17]:
query_embedding_reduced = query_embedding_reduce(model.encode(['massage', 'oil']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Hot Stone Massage> -> score 0.4863801598548889
Closest Category: <Couples Massage> -> score 0.46065789461135864
Closest Category: <Massage> -> score 0.4169067442417145
Closest Category: <Deep Tissue Massage> -> score 0.4164223074913025
Closest Category: <Swedish Massage> -> score 0.4113725423812866
Closest Category: <Day Spas> -> score 0.40700072050094604
Closest Category: <Bath Houses> -> score 0.38684314489364624
Closest Category: <Full Body Massage> -> score 0.3867459297180176
Closest Category: <Custom Massage> -> score 0.379855215549469
Closest Category: <Massage> -> score 0.36719459295272827


In [18]:
query_embedding_reduced = query_embedding_reduce(model.encode(['valvoline', 'oil']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Oil Change> -> score 0.36929693818092346
Closest Category: <Bath Houses> -> score 0.3062613010406494
Closest Category: <Hot Stone Massage> -> score 0.2871299386024475
Closest Category: <Couples Massage> -> score 0.2614961266517639
Closest Category: <Massage> -> score 0.25337427854537964
Closest Category: <Injectables & Fillers> -> score 0.24133934080600739
Closest Category: <Swedish Massage> -> score 0.23901599645614624
Closest Category: <Art Classes> -> score 0.23290862143039703
Closest Category: <Deep Tissue Massage> -> score 0.22767290472984314
Closest Category: <Full Body Massage> -> score 0.20511898398399353


In [19]:
query_embedding_reduced = query_embedding_reduce(model.encode(['water']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Art Classes> -> score 0.26415902376174927
Closest Category: <Bath Houses> -> score 0.23926080763339996
Closest Category: <Nightlife> -> score 0.23492451012134552
Closest Category: <Colonic Hydrotherapy> -> score 0.21381857991218567
Closest Category: <Wine> -> score 0.20334063470363617
Closest Category: <Hot Stone Massage> -> score 0.20151033997535706
Closest Category: <Snorkeling> -> score 0.19873970746994019
Closest Category: <Couples Massage> -> score 0.18990017473697662
Closest Category: <Boat Tours> -> score 0.17164389789104462
Closest Category: <Teeth Whitening> -> score 0.1714426726102829


In [20]:
query_embedding_reduced = query_embedding_reduce(model.encode(['water', 'parks']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Amusement Parks> -> score 0.3305146396160126
Closest Category: <Nightlife> -> score 0.3002397418022156
Closest Category: <Bus Tours & Rentals> -> score 0.2867066562175751
Closest Category: <Water Sports> -> score 0.2805006504058838
Closest Category: <Escape Games> -> score 0.2739951014518738
Closest Category: <Art Classes> -> score 0.2707948386669159
Closest Category: <Boat Tours> -> score 0.26897138357162476
Closest Category: <Electronics> -> score 0.26156988739967346
Closest Category: <Photographers> -> score 0.2608483135700226
Closest Category: <Zoo> -> score 0.25932130217552185


In [21]:
query_embedding_reduced = query_embedding_reduce(model.encode(['amc']).reshape(1, -1))
result = get_top_similarity(query_embedding_reduced, combined_embeddings)
for row in result:
    print(f"Closest Category: <{row[0]['name']}> -> score {row[1]}")

Closest Category: <Amusement Parks> -> score 0.2897689938545227
Closest Category: <Dance Classes> -> score 0.24482381343841553
Closest Category: <Escape Games> -> score 0.23929134011268616
Closest Category: <Boat Tours> -> score 0.23041410744190216
Closest Category: <Sightseeing & Tours> -> score 0.22909992933273315
Closest Category: <Zoo> -> score 0.22805669903755188
Closest Category: <Casinos> -> score 0.2251991331577301
Closest Category: <Nightlife> -> score 0.21825027465820312
Closest Category: <Helicopter Ride> -> score 0.21226397156715393
Closest Category: <Dinner & Entertainment> -> score 0.18781016767024994
