In [1]:
import os
import re
import sqlite3
from collections import Counter

import nltk
import pandas as pd
from nltk import bigrams, trigrams
# Display complete frames and cell contents (no row or column-width truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Location of the deals SQLite database. Set the DEALS_DB_PATH environment
# variable to point at another copy; the default is the original local path.
db_path = os.environ.get('DEALS_DB_PATH', '/Users/zphilipp/git/research/dealsdb/deals_db1.db')

# Words stripped from deal titles before n-gram counting, kept in three groups.
# The groups are concatenated in this exact order, and the repeated entries
# ("for", "after", "before") are kept, so the resulting alternation is the
# same sequence of alternatives as a single flat list would give.
_prepositions = [
    "about", "above", "across", "after", "against", "along", "among",
    "around", "at", "before", "behind", "below", "beneath", "beside",
    "between", "beyond", "by", "during", "for", "from", "in", "inside",
    "into", "near", "of", "off", "on", "out", "outside", "over", "through",
    "throughout", "to", "toward", "under", "until", "up", "with", "within",
    "without",
]
_conjunctions_and_article = [
    "and", "but", "or", "for", "nor", "so", "yet", "although", "because",
    "as", "since", "unless", "while", "when", "where", "after", "before",
    "the",
]
_single_letters = [
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
]

prepositions_and_conjunctions = _prepositions + _conjunctions_and_article + _single_letters

# Whole-word alternation over every entry above.
pattern = ''.join([r'\b(?:', '|'.join(prepositions_and_conjunctions), r')\b'])

def remove_prepositions_and_conjunctions(text):
    """Strip stop words and digits from ``text`` and tidy the whitespace.

    Every whole-word, case-insensitive match of the module-level ``pattern``
    is deleted, then every run of digits. Remaining whitespace runs collapse
    to a single space and the result is trimmed at both ends.
    """
    stop_word_re = re.compile(pattern, re.IGNORECASE)
    without_stop_words = stop_word_re.sub('', text)
    without_digits = re.sub(r'\d+', '', without_stop_words)
    collapsed = re.sub(r'\s+', ' ', without_digits)
    return collapsed.strip()


# Download the required NLTK resources (only needed on the first run).
# NOTE(review): newer NLTK releases may expect the 'punkt_tab' resource for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/zphilipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# One row per customer category: the category id plus a single text blob made
# of the deal titles and option titles of that category.
# COALESCE guards the option titles: options are LEFT JOINed, so a category
# without any option yields NULL from GROUP_CONCAT, and `||` with NULL would
# turn the whole text (deal titles included) into NULL.
sql_query = """
    SELECT
        d.customer_category_id,
        d.title_general || ' ' || GROUP_CONCAT(d.title_general, ',') || ' ' || COALESCE(GROUP_CONCAT(o.title, ','), '') AS text
    FROM deals d
        JOIN customer_taxonomy c ON (d.customer_category_id=c.id)
        LEFT JOIN options o ON (d.deal_id=o.deal_id)
    GROUP BY customer_category_id
    -- LIMIT 100
"""

# Execute the query and load the data into a DataFrame. The connection is
# closed even when the query fails, so a failing cell does not leak a handle.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql_query, conn)
finally:
    conn.close()

In [3]:
def get_most_common_occurent(df, target_word, min_count=20):
    """Return the frequent n-grams whose text contains ``target_word``.

    Args:
        df: DataFrame with a string column ``text`` and a numeric column
            ``value`` (the n-gram frequency).
        target_word: Text to look for. It is matched literally (not as a
            regular expression), case-insensitively, anywhere in ``text``.
        min_count: Only rows whose ``value`` is strictly greater than this
            threshold are kept. Defaults to 20.

    Returns:
        The matching rows of ``df`` sorted by ``value`` in descending order.
    """
    # regex=False: characters such as "." or "(" in the query must be matched
    # literally rather than interpreted as regex syntax, which can raise.
    contains_target = df['text'].str.contains(target_word, case=False, na=False, regex=False)
    is_frequent = df['value'] > min_count

    # Sort by frequency, most common first.
    return df[contains_target & is_frequent].sort_values(by='value', ascending=False)
        
def to_trigrams(text):
    """Count the word trigrams of ``text`` after stop-word removal.

    Args:
        text: A string, or a list of strings (joined with single spaces).
            Any other value, e.g. ``None`` for a NULL database field,
            produces an empty result.

    Returns:
        A list of ``((word1, word2, word3), count)`` tuples ordered from most
        to least frequent, as produced by ``Counter.most_common()``.
    """
    # Normalise the input type first: the cleaning step below runs regexes
    # and raises TypeError when handed None or a list.
    if isinstance(text, list):
        text = " ".join(text)
    elif not isinstance(text, str):
        return []

    text = remove_prepositions_and_conjunctions(text)
    text = text.lower()
    # Drop punctuation; only word characters and whitespace remain.
    text = re.sub(r'[^\w\s]', '', text)

    words = nltk.word_tokenize(text)
    trigram_counts = Counter(trigrams(words))
    return trigram_counts.most_common()
    
def to_bigrams(text):
    """Count the word bigrams of ``text`` after stop-word removal.

    Args:
        text: A string, or a list of strings (joined with single spaces).
            Any other value, e.g. ``None`` for a NULL database field,
            produces an empty result.

    Returns:
        A list of ``((word1, word2), count)`` tuples ordered from most to
        least frequent, as produced by ``Counter.most_common()``.
    """
    # Normalise the input type first: the cleaning step below runs regexes
    # and raises TypeError when handed None or a list.
    if isinstance(text, list):
        text = " ".join(text)
    elif not isinstance(text, str):
        return []

    text = remove_prepositions_and_conjunctions(text)
    text = text.lower()
    # Drop punctuation; only word characters and whitespace remain.
    text = re.sub(r'[^\w\s]', '', text)

    words = nltk.word_tokenize(text)
    bigram_counts = Counter(bigrams(words))
    return bigram_counts.most_common()

In [4]:
# Attach the per-row n-gram counts to df, one new column per n-gram size.
for ngram_column, ngram_counter in (("bigrams", to_bigrams), ("trigrams", to_trigrams)):
    df[ngram_column] = df["text"].apply(ngram_counter)

# Number of non-null entries in the bigram column (shown as the cell output).
df["bigrams"].count()

982

In [5]:
def _aggregate_ngram_counts(per_row_counts):
    """Sum n-gram counts over all rows into a DataFrame of (text, value).

    ``per_row_counts`` is an iterable whose items are lists of
    ``(ngram_tuple, count)`` pairs. The n-gram words are joined with a single
    space to form ``text``; counts of identical texts are added up.
    """
    rows = [
        {"text": " ".join(ngram), "value": count}
        for ngram_counts in per_row_counts
        for ngram, count in ngram_counts
    ]
    # Explicit columns keep the groupby below working when there are no rows.
    frame = pd.DataFrame(rows, columns=["text", "value"])
    return frame.groupby('text', as_index=False)['value'].sum()


# The loop variables here must not be called `bigrams` / `trigrams`: those
# names are the nltk functions used by to_bigrams / to_trigrams, and rebinding
# them at notebook level breaks any later re-run of those helpers.

# Make sure the output directory exists before writing the CSV files.
os.makedirs("models", exist_ok=True)

bigram_df = _aggregate_ngram_counts(df["bigrams"])
bigram_df.to_csv("models/top_bigrams.csv", index=False)

trigram_df = _aggregate_ngram_counts(df["trigrams"])
trigram_df.to_csv("models/top_trigram.csv", index=False)

In [20]:
# Spot-check the lookup: print the five most frequent matching bigrams and
# trigrams for each sample query.
sample_queries = ['oli', 'oil', 'amc', 'valentine', 'park', 'mani', 'mac', 'banana', 'boaut']
for test_query in sample_queries:
    print (f'------------------ Bigrams {test_query} -----------------------')
    top_bigram_matches = get_most_common_occurent(bigram_df, test_query).iloc[:5]
    print(top_bigram_matches)
    print (f'------------------ Trigrams {test_query} -----------------------')
    top_trigram_matches = get_most_common_occurent(trigram_df, test_query).iloc[:5]
    print(top_trigram_matches)

------------------ Bigrams oli -----------------------
                  text  value
745990     women solid   1420
98787     casual solid   1313
612564  solid sterling    888
612489      solid long    859
612416    solid casual    858
------------------ Trigrams oli -----------------------
                          text  value
1123464  solid sterling silver    758
812900        open front solid    502
1123349      solid long sleeve    499
1123317       solid high waist    448
1123446     solid short sleeve    420
------------------ Bigrams oil -----------------------
                 text  value
436822     oil change   3005
646232  synthetic oil   1091
202335  essential oil    986
162416    de toilette    830
436996        oil hot    486
------------------ Trigrams oil -----------------------
                         text  value
1184929  synthetic oil change    830
338469        eau de toilette    822
459074     full synthetic oil    558
576081     instant oil change    282
357732     