In [2]:
import sqlite3
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import re

db_path = 'deals_db.db'

In [3]:
def clean_text(text):
    if text is None:
        return ''
    # HTML
    text = re.sub(r'<[^>]+>', '', text)
    # clean all except letters
    text = re.sub(r'[^a-zA-Zá-žÁ-Ž0-9\s]', '', text)
    # remove more spaces
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

In [4]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

cursor.execute("""SELECT d.title || ' ' || d.title_general AS document
    FROM deals_clean c
        JOIN deals d ON (c.deal_id=d.deal_id)
        JOIN categories c ON (d.customer_category_id=c.id)
    -- LIMIT 100 OFFSET 12000
    -- WHERE document like '%oil%'
""")
rows = cursor.fetchall()

In [5]:
rows[:10]

[('Sunset Tour by Kayak on Sebago Lake Maine Sunset Tour by Kayak on Sebago Lake Maine',),
 ('Quarter Rotisserie Chicken Meal Savor the Flavor: Quarter Rotisserie Chicken Meal at Texas Rotisserie And Grill-2581 Broadway(Up to 28% Off)',),
 ("A 30-minute foot reflexology massage, 30-minute detoxifying footbath and 20-minute head, neck and shoulder massage Experience Whole Health Network's Reflexology and Detox Foot Baths with up to 37% off",),
 ('Two 60-Minute Anti-Aging RF Frequency Facial with LED Up to 55% Off on Anti-Aging Facial at All Things Skin Organic Skincare',),
 ('Two 30 Minute Sauna Sessions Up to 42% Off on Spa - Sauna - Infrared at Lash and Bronze',),
 ("Chimney Sweep Set Experience Carolina HVAC's chimney sweep set for clean fireplaces with up to 89% off",),
 ('Parking for Tennis: Pollen 2023 Tour (03/31/23) Parking at 820 N. Front St. – Lot',),
 ('One 60 min Non Invasive Butt Lift One 60 min Non Invasive Butt Lift with a Wood Therapy Massage (Up to 60% Off)',),
 ('Anti-

In [137]:
sentences = []
count = 0
for deal in rows:
    #if deal[0] is not None:
    #    sents = deal[0].split('. ')
    #for sen in sents:
    sentences.append(clean_text(deal[0]))
sentencesList = [word_tokenize(sen) for sen in sentences]

In [None]:
sentencesList[:100]

In [151]:
model = Word2Vec(sentencesList, vector_size=100, window=4, min_count=3, workers=4)
model.train(sentencesList, total_examples=len(sentencesList), epochs=10)

(230517, 446480)

In [152]:
model.save("models/word2vec.model")

In [153]:
print(model.wv.index_to_key[:100])

['oil', 'with', 'to', 'up', 'change', 'off', 'massage', 'and', 'at', 'synthetic', 'hot', 'or', 'full', 'one', 'essential', 'body', 'on', 'stone', 'for', 'couples', 'blend', '60minute', 'stones', 'treatment', 'spa', 'experience', 'a', 'foot', 'oils', 'tire', 'auto', 'deep', 'filter', 'inspection', 'tissue', 'swedish', '50', 'of', 'service', 'options', '60', 'rotation', 'valvoline', 'conventional', 'replacement', 'massages', '25', 'instant', 'w', 'deluxe', 'services', 'two', 'care', '55', 'car', 'package', 'minute', 'aromatherapy', 'cbd', 'brake', 'relaxation', 'therapeutic', 'reflexology', 'free', 'repair', 'your', '90minute', 'lube', 'custom', '50minute', 'center', 'changes', 'offering', '30', 'signature', 'enjoy', '40', 'haircut', 'combo', '1', 'toilet', 'discover', 'scrub', 'more', 'choice', 'the', 'therapy', 'semisynthetic', 'style', 'valid', 'hair', 'including', 'relaxing', 'coconut', 'automotive', 'in', '35', 'deeptissue', 'foil', 'aroma']


In [155]:
model.wv.most_similar('synthetic', topn=10)

[('conventional', 0.9843521118164062),
 ('blend', 0.9780659675598145),
 ('fullsynthetic', 0.9672417640686035),
 ('syntheticblend', 0.9616682529449463),
 ('semisynthetic', 0.9604526162147522),
 ('mileage', 0.9506848454475403),
 ('semi', 0.9435716867446899),
 ('street', 0.9399734735488892),
 ('avenue', 0.9394060969352722),
 ('road', 0.9362101554870605)]

In [6]:
corpus = []
for row in rows:
    line = clean_text(row[0])
    corpus.append(line)

In [None]:
corpus[:10]

In [None]:
del rows
# TfidfVectorizer init for bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

bigrams = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=bigrams)

# Count AVG TF-IDF for every bigram
mean_tfidf = tfidf_df.mean(axis=0)

# to dataframe
mean_tfidf = pd.DataFrame(mean_tfidf, columns=['tfidf']).reset_index()
mean_tfidf.columns = ['bigram', 'tfidf']
mean_tfidf = mean_tfidf[mean_tfidf['tfidf'] > 0.0001]

output_file = 'models/mean_tfidf_bigram.csv'
mean_tfidf.to_csv(output_file, index=False)

In [236]:
search_word = 'las'

mean_tfidf[mean_tfidf["bigram"].str.startswith(search_word)].sort_values(by='tfidf', ascending=False).head(10)

Unnamed: 0,bigram,tfidf
5800,laser genesis,0.00031
5802,laser lipo,0.000241
5801,laser hair,0.000195
5805,lasting natural,0.000148
5807,lasting shellac,0.000136
5799,laser alignments,0.000132
5798,laser aesthetics,0.00013
5806,lasting relief,0.000126
5797,las vegas,0.00012
5803,laser luxury,0.000116
