In [120]:
import nltk
from nltk import bigrams, trigrams
from collections import Counter
import re
import sqlite3
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

db_path = '/Users/zphilipp/git/research/dealsdb/deals_db1.db'

prepositions_and_conjunctions = [
    "about", "above", "across", "after", "against", "along", "among", "around", "at",
    "before", "behind", "below", "beneath", "beside", "between", "beyond", "by",
    "during", "for", "from", "in", "inside", "into", "near", "of", "off", "on",
    "out", "outside", "over", "through", "throughout", "to", "toward", "under",
    "until", "up", "with", "within", "without", "and", "but", "or", "for", "nor",
    "so", "yet", "although", "because", "as", "since", "unless", "while", "when",
    "where", "after", "before", "the", "a", "b", "c", "d", "e", "f", "g", "h",
    "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w",
    "x", "y", "z"
]
pattern = r'\b(?:' + '|'.join(prepositions_and_conjunctions) + r')\b'

def remove_prepositions_and_conjunctions(text):
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    return re.sub(r'\s+', ' ', cleaned_text).strip()


# Stáhněte potřebné zdroje NLTK (jen poprvé)
nltk.download('punkt')

conn = sqlite3.connect(db_path)
cursor = conn.cursor()

sql_query = """
    SELECT
        d.customer_category_id,
        d.title_general || ' ' || GROUP_CONCAT(d.title_general, ',') || ' ' || GROUP_CONCAT(o.title, ',') AS text
    FROM deals d
        JOIN customer_taxonomy c ON (d.customer_category_id=c.id)
        LEFT JOIN options o ON (d.deal_id=o.deal_id)
    GROUP BY customer_category_id
    -- LIMIT 100
"""

# Execute the query and load the data into a DataFrame
df = pd.read_sql_query(sql_query, conn)
conn.close()

[nltk_data] Downloading package punkt to /Users/zphilipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [121]:
def get_most_common_bigrams(df, target_word):
    # Vyfiltrujte DataFrame, abyste získali pouze bigramy obsahující cílové slovo
    filtered_bigrams = df[df['text'].str.contains(target_word, case=False, na=False)]
    
    # Seřadíme podle frekvence 'value' v sestupném pořadí
    sorted_bigrams = filtered_bigrams[filtered_bigrams["value"] > 20].sort_values(by='value', ascending=False)

    return sorted_bigrams
        
def to_bigrams(text):
    text = remove_prepositions_and_conjunctions(text)
    if isinstance(text, list):
        text = " ".join(text)
    elif not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    words = nltk.word_tokenize(text)
    bigrams_list = list(bigrams(words))
    bigram_counts = Counter(bigrams_list)
    sorted_bigrams = bigram_counts.most_common()

    return sorted_bigrams

In [122]:
df["bigrams"] = df["text"].apply(to_bigrams)
df["bigrams"].count()

982

In [94]:
bigram_rows = []
for bigrams in df["bigrams"]:
    #print (bigrams)
    for (bigram, count) in bigrams:
        bigram_rows.append({"text": " ".join(bigram), "value": count})

bigram_df = pd.DataFrame(bigram_rows)
bigram_df = bigram_df.groupby('text', as_index=False)['value'].sum()
bigram_df.to_csv("models/top_bigrams.csv", index=False)

In [95]:
get_most_common_bigrams(bigram_df, "oil")[:5]

Unnamed: 0,text,value
436822,oil change,3005
646232,synthetic oil,1091
202335,essential oil,986
162416,de toilette,830
436996,oil hot,486


In [96]:
get_most_common_bigrams(bigram_df, "massage")[:5]

Unnamed: 0,text,value
69706,body massage,2620
381684,massage hot,2404
645162,swedish massage,1759
384003,massageone minute,1577
668917,tissue massage,1536


In [97]:
get_most_common_bigrams(bigram_df, "valvoline")[:5]

Unnamed: 0,text,value
715671,valvoline instant,280
105643,change valvoline,182
569258,services valvoline,87


In [98]:
get_most_common_bigrams(bigram_df, "amc")[:5]

Unnamed: 0,text,value
16213,amc classic,85
16238,amc dine,47
622497,square amc,23


In [99]:
get_most_common_bigrams(bigram_df, "park")[:5]

Unnamed: 0,text,value
468882,parking dailyhourly,2024
34316,availablereserve parking,2013
713576,valetevent parking,503
469026,parking four,503
675949,tour parking,282


In [100]:
get_most_common_bigrams(bigram_df, "valentine")[:5]

Unnamed: 0,text,value
713461,valentine day,740
292021,heart valentine,160
369427,loves valentine,160
454214,oversized valentine,160
645453,sweetest valentine,100


In [101]:
get_most_common_bigrams(bigram_df, "iphone")[:5]

Unnamed: 0,text,value
322963,iphone pro,1265
21765,apple iphone,593
97979,case iphone,511
509293,protector iphone,291
322882,iphone ipad,279


In [102]:
get_most_common_bigrams(bigram_df, "oli")[:5]

Unnamed: 0,text,value
745990,women solid,1420
98787,casual solid,1313
612564,solid sterling,888
612489,solid long,859
612416,solid casual,858


In [103]:
get_most_common_bigrams(bigram_df, "mani")[:5]

Unnamed: 0,text,value
260502,gel manicure,511
376807,manicure pedicure,337
552542,salon manipedi,202
377395,manipedi options,158
552541,salon manicure,133


In [104]:
get_most_common_bigrams(bigram_df, "mac")[:5]

Unnamed: 0,text,value
372554,macbook pro,345
21775,apple macbook,267
372541,macbook air,165
525705,refurbishedapple macbook,156
525596,refurbapple macbook,53


In [105]:
get_most_common_bigrams(bigram_df, "banana")[:5]

Unnamed: 0,text,value
41907,banana boat,30


In [106]:
get_most_common_bigrams(bigram_df, "boat")[:5]

Unnamed: 0,text,value
68748,boat tour,477
68696,boat rental,194
68661,boat neck,132
604737,sleeve boat,132
68713,boat ride,119


In [107]:
get_most_common_bigrams(bigram_df, "chocolate")[:5]

Unnamed: 0,text,value
111628,chocolate brazilian,57
111662,chocolate crayons,36
309270,ice chocolate,36
111711,chocolate jello,34
685686,tray chocolate,34


In [108]:
get_most_common_bigrams(bigram_df, "brazilian")[:5]

Unnamed: 0,text,value
78633,brazilian bikini,1667
78910,brazilian wax,1279
78986,brazilian waxing,1204
733310,waxone brazilian,511
55865,bikini brazilian,452


In [109]:
get_most_common_bigrams(bigram_df, "facial")[:5]

Unnamed: 0,text,value
225066,facial treatments,1186
224396,facial options,987
226537,facialone minute,899
403831,minute facial,827
19669,antiaging facial,684


In [110]:
get_most_common_bigrams(bigram_df, "earrings")[:5]

Unnamed: 0,text,value
633412,stud earrings,1868
300630,hoop earrings,583
190770,earrings ct,418
190823,earrings k,333
191048,earringsparis jewelry,302


In [111]:
get_most_common_bigrams(bigram_df, "jewelry")[:5]

Unnamed: 0,text,value
328125,jewelry k,754
191048,earringsparis jewelry,302
269709,goldparis jewelry,171
328200,jewelry women,146
328199,jewelry white,99


In [112]:
get_most_common_bigrams(bigram_df, "coffee")[:5]

Unnamed: 0,text,value
127641,coffee table,135
127574,coffee mug,67
127607,coffee pods,63
256461,funny coffee,60
127654,coffee themed,60


In [113]:
get_most_common_bigrams(bigram_df, "Wellness")[:5]

Unnamed: 0,text,value
736301,wellness center,1416
291397,health wellness,512
737018,wellness spa,407
50173,beauty wellness,402
10027,aesthetics wellness,307


In [115]:
get_most_common_bigrams(result_df, "Wellness")[:5]

Unnamed: 0,text,value
736301,wellness center,1416
291397,health wellness,512
737018,wellness spa,407
50173,beauty wellness,402
10027,aesthetics wellness,307


In [123]:
def to_trigrams(text):
    text = remove_prepositions_and_conjunctions(text)
    if isinstance(text, list):
        text = " ".join(text)
    elif not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)

    words = nltk.word_tokenize(text)
    bigrams_list = list(trigrams(words))
    bigram_counts = Counter(bigrams_list)
    sorted_bigrams = bigram_counts.most_common()

    return sorted_bigrams

In [124]:
df["trigrams"] = df["text"].apply(to_trigrams)

In [127]:
trigram_rows = []
for trigram in df["trigrams"]:
    #print (bigrams)
    for (trigram, count) in trigram:
        trigram_rows.append({"text": " ".join(trigram), "value": count})

trigram_df = pd.DataFrame(trigram_rows)
trigram_df = trigram_df.groupby('text', as_index=False)['value'].sum()
trigram_df.to_csv("models/top_trigram.csv", index=False)

In [129]:
get_most_common_bigrams(trigram_df, "Wellness")[:5]

Unnamed: 0,text,value
650062,liquivida wellness center,142
1352558,wellness center offers,92
824322,oregon laser wellness,82
617789,laser wellness center,79
88604,beauty wellness center,76


In [130]:
get_most_common_bigrams(trigram_df, "oil")[:5]

Unnamed: 0,text,value
1184929,synthetic oil change,830
338469,eau de toilette,822
459074,full synthetic oil,558
576081,instant oil change,282
357732,essential oil hot,281


In [132]:
get_most_common_bigrams(trigram_df, "massage")[:5]

Unnamed: 0,text,value
292331,deep tissue massage,1490
684757,massage hot stone,971
455407,full body massage,960
684775,massage hot stones,785
121867,body massage hot,758


In [133]:
get_most_common_bigrams(trigram_df, "valvoline")[:5]

Unnamed: 0,text,value
1316258,valvoline instant oil,280
793265,oil change valvoline,179
188133,change valvoline instant,166
188038,change services valvoline,87
1032248,services valvoline instant,87
