In [22]:
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd

#import nltk
import re

# Path of the SQLite database file; passed to sqlite3.connect() in the cells below.
db_path = 'deals_db.db'
# NOTE(review): not referenced by any other cell visible in this notebook —
# presumably the Elasticsearch index name; confirm before removing.
index_name = 'deals' 

In [23]:
def clean_text(text):
    """Normalize a raw text snippet before n-gram extraction.

    HTML tags are replaced by a space, every character that is not an ASCII
    letter, a character in the accented ranges á-ž / Á-Ž, a digit or
    whitespace is removed, whitespace runs are collapsed to a single space,
    and the result is trimmed and lower-cased.

    Args:
        text: The string to clean, or None.

    Returns:
        The cleaned, lower-cased string; '' when ``text`` is None.
    """
    if text is None:
        return ''
    # Substitute a space (not an empty string) for each tag so that words
    # separated only by markup, e.g. "oil<br>change", do not fuse together.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Drop punctuation and any other symbol outside the allowed classes.
    text = re.sub(r'[^a-zA-Zá-žÁ-Ž0-9\s]', '', text)
    # Collapse whitespace (including the spaces introduced above), trim, lower-case.
    return re.sub(r'\s+', ' ', text).strip().lower()

#### Source: Radecke, Joan – data/query_by_category.csv: get all queries with more than 0 conversions

In [24]:
# Total conversions per search query, restricted to rows with conversion > 0
# and ordered from the most to the least converting query.
conversions_raw = pd.read_csv("data/query_by_category.csv", delimiter=",")
converting_rows = conversions_raw.loc[
    conversions_raw["conversion"] > 0, ["query", "conversion"]
]
querywitchconversion = (
    converting_rows
    .groupby("query")["conversion"]
    .sum()
    .reset_index()
    .sort_values("conversion", ascending=False)
)
querywitchconversion.head(5)

Unnamed: 0,query,conversion
645,oil change,108
528,massage,96
206,costco,47
808,shutterfly,32
211,couples massage,30


### Unigrams

In [25]:
# Load the Google Analytics search-query export.
df = pd.read_csv("ga.search.query.clean.csv", delimiter=",")

# read_csv parses empty cells and tokens such as "NA" or "null" as NaN; drop
# them so the length check below never sees a float.
queries = df["query"].dropna()

# Keep only queries shorter than 80 characters and fold them into ONE
# space-joined document, which becomes the first entry of the corpus.
corpus = [' '.join(queries[queries.str.len() < 80])]

In [26]:
# Fetch the general title of every deal from the SQLite database.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute("""SELECT d.title_general  AS document
    FROM deals d
""")
    rows = cursor.fetchall()
finally:
    # Release the database handle even if the query fails; the original cell
    # left the connection open, so every re-run leaked another one.
    conn.close()

In [27]:
# Add each fetched value (first column of every row) to the corpus as its own
# document. Values are added raw, without clean_text(). NULL values arrive as
# None, which is not a valid text document for the vectorizer, so skip them.
corpus.extend(row[0] for row in rows if row[0] is not None)

In [28]:
# Fit a TF-IDF model over single words (unigrams); each corpus entry is one document.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary terms, one per matrix column.
unigram = tfidf_vectorizer.get_feature_names_out()

# Mean TF-IDF score of every term across all documents.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# Keep only unigrams whose mean TF-IDF is greater than 0.00001.
filtered_mean_tfidf = [(term, score) for term, score in zip(unigram, mean_tfidf) if score > 0.00001]

# Write to CSV without pandas (header: text,tfidf).
# The loop variables are deliberately NOT called `unigram`: the original loop
# rebound that name, leaving `unigram` as the last term instead of the
# feature-name array once the cell finished.
output_file = 'models/mean_tfidf_unigram.csv'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('text,tfidf\n')
    for term, score in filtered_mean_tfidf:
        f.write(f'{term},{score}\n')

In [29]:
# Pair every saved unigram score with the GA search count of the identical
# query (inner join: unigrams that were never searched on their own drop out).
df = pd.read_csv("models/mean_tfidf_unigram.csv", sep=",")
df_query = pd.read_csv("ga.search.query.clean.csv", sep=",")
result = df.merge(df_query, left_on='text', right_on='query')
result.sort_values(by="count", ascending=False).head()

Unnamed: 0,text,tfidf,query,count
4241,massage,0.012992,massage,157767
3251,hotel,0.001681,hotel,56627
3252,hotels,0.000193,hotels,49758
6383,spa,0.015029,spa,48544
897,botox,0.001972,botox,37938


In [30]:
# Weight each unigram's TF-IDF by its search count (tfidf * count / 100) and save.
# The "tfidf" column is overwritten in place, so running this cell again
# applies the weighting a second time.
weighted_tfidf = result["tfidf"].mul(result["count"]).div(100)
result["tfidf"] = weighted_tfidf

sorted_by_text = result.sort_values(by="text", ascending=False)
res = sorted_by_text.dropna(subset=["tfidf"])
res.loc[:, ["text", "tfidf"]].to_csv("models/mean_tfidf_unigram_normalize.csv", index=False)

# Preview the highest weighted scores.
result.sort_values(by="tfidf", ascending=False).head()

Unnamed: 0,text,tfidf,query,count
4241,massage,20.496701,massage,157767
6383,spa,7.295785,spa,48544
2446,facial,2.985337,facial,28776
2894,golf,0.986341,golf,8522
3813,laser,0.964078,laser,10539


In [93]:
# Reload the count-weighted unigram scores (columns: text, tfidf) from disk.
df = pd.read_csv("models/mean_tfidf_unigram_normalize.csv", sep=",")

text     7686
tfidf    7686
dtype: int64

In [88]:
# Preview the five top rows of the per-query conversion totals.
querywitchconversion.head(5)

Unnamed: 0,query,conversion
645,oil change,108
528,massage,96
206,costco,47
808,shutterfly,32
211,couples massage,30


In [135]:
# Collect one record per (unigram, converting query) pair where the query
# starts with the unigram; unigrams without any such query get a single
# record with empty query/conversion fields.
new_data = []
converting_queries = querywitchconversion['query']

for _, unigram_row in df.iterrows():
    prefix = unigram_row['text']
    score = unigram_row['tfidf']
    hits = querywitchconversion[converting_queries.str.startswith(prefix)]

    if hits.empty:
        # No converting query begins with this unigram: keep the unigram alone.
        new_data.append({
            'text': prefix,
            'tfidf': score,
            'query': None,
            'conversion': None
        })
        continue

    # One record for every converting query that begins with this unigram.
    new_data.extend(
        {
            'text': prefix,
            'tfidf': score,
            'query': hit['query'],
            'conversion': hit['conversion']
        }
        for _, hit in hits.iterrows()
    )

# Build the frame from the collected records in one go.
new_df = pd.DataFrame(new_data)

# Give every matched query 99% of its unigram's score; when several unigrams
# are prefixes of the same query, keep only the highest resulting score.
new_df['result'] = new_df['tfidf'] * 0.99
new_df1 = (
    new_df.loc[new_df['query'].notnull(), ["query", "result"]]
    .sort_values(by=['query', 'result'], ascending=[True, False])
    .drop_duplicates(subset='query', keep='first')
    .rename(columns={"result" : "tfidf", "query": "text"})
)
new_df1

Unnamed: 0,text,tfidf
8920,1800 flowers,0.000093
8913,200 custom photo cards,0.000177
8912,2010 microsoft,0.000177
8861,aa batteries,0.000005
8860,aaa,0.000713
...,...,...
78,yag laser,0.000005
56,yoga,0.061910
43,youthful trends,0.000205
15,zipline,0.003199


In [137]:
# Stack the unigram rows on top of the query-level rows; where a text occurs
# more than once, the first occurrence wins (unigram rows come first).
combined = pd.concat([new_df, new_df1], ignore_index=True)
result = (
    combined
    .drop_duplicates(subset='text', keep='first')
    .sort_values(by=['text', 'tfidf'], ascending=[True, False])
)
scores_out = result[["text", "tfidf"]]
scores_out.to_csv("models/mean_tfidf_unigram_normalize1.csv", index=False)
scores_out.head(20)

Unnamed: 0,text,tfidf
8942,00,6.35866e-06
8941,10,0.0003843529
8940,100,0.000527471
8939,1000,2.359776e-05
8938,105,1.556321e-06
8937,1080,4.010097e-07
8936,1080p,4.554787e-06
8935,10d,4.985358e-07
8934,10k,1.399002e-05
8933,11,0.0005662639


### trigram from DB

In [14]:
# Fetch "title + ' ' + title_general" for every deal that has both a
# deals_clean row and a matching category.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    # deals_clean and categories used to share the alias "c", which made
    # c.deal_id / c.id resolvable only by luck of the column names; each
    # table now has its own alias (dc, cat).
    cursor.execute("""SELECT d.title || ' ' || d.title_general  AS document
    FROM deals_clean dc
        JOIN deals d ON (dc.deal_id=d.deal_id)
        JOIN categories cat ON (d.customer_category_id=cat.id)
    -- LIMIT 100 OFFSET 12000
    -- WHERE document like '%oil%'
""")
    rows = cursor.fetchall()
finally:
    # Release the database handle even if the query fails; the original cell
    # never closed the connection.
    conn.close()

In [15]:
# Start a fresh corpus: one cleaned document per fetched row (first column).
corpus = [clean_text(row[0]) for row in rows]

In [16]:
# Fetch every merchant name from the SQLite database.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute("""SELECT m.name  AS document
    FROM merchant m
""")
    rows = cursor.fetchall()
finally:
    # Release the database handle even if the query fails; the original cell
    # never closed the connection.
    conn.close()

In [87]:
# Add each fetched value (first column of every row) to the corpus as its own
# document. Values are added raw, without clean_text(). NULL values arrive as
# None, which is not a valid text document for the vectorizer, so skip them.
corpus.extend(row[0] for row in rows if row[0] is not None)

In [19]:
# Add every GA search query to the corpus as its own document.
df = pd.read_csv("ga.search.query.clean.csv", delimiter=",")
# read_csv parses empty cells and tokens such as "NA" or "null" as NaN; a NaN
# entry is not a valid text document for the vectorizer, so drop those first.
corpus.extend(df["query"].dropna())

In [24]:
# Fit a TF-IDF model over three-word sequences (trigrams).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(3, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary terms, one per matrix column.
trigrams = tfidf_vectorizer.get_feature_names_out()

# Average every column over all documents to get one score per trigram.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# Keep only trigrams whose mean TF-IDF is greater than 0.00001.
filtered_mean_tfidf = [
    (term, score)
    for term, score in zip(trigrams, mean_tfidf)
    if score > 0.00001
]

# Plain-text CSV export (header: trigram,tfidf), no pandas involved.
output_file = 'models/mean_tfidf_trigram.csv'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('trigram,tfidf\n')
    f.writelines(f'{term},{score}\n' for term, score in filtered_mean_tfidf)

In [25]:
# Show the ten highest-scoring trigrams that begin with the search phrase.
search_word = 'oil change'
df = pd.read_csv("models/mean_tfidf_trigram.csv")
starts_with_phrase = df["trigram"].str.startswith(search_word)
df.loc[starts_with_phrase].sort_values(by='tfidf', ascending=False).head(10)

Unnamed: 0,trigram,tfidf
6664,oil change at,0.000215
6681,oil change up,0.000167
6684,oil change with,0.000158
6663,oil change and,7.8e-05
6678,oil change services,6e-05
6679,oil change synthetic,5.2e-05
6674,oil change oil,5e-05
6671,oil change in,4.5e-05
6670,oil change full,4.5e-05
6680,oil change tire,3.8e-05


In [None]:
# Fit a TF-IDF model over two-word sequences (bigrams).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary terms, one per matrix column.
bigrams = tfidf_vectorizer.get_feature_names_out()

# Average every column over all documents to get one score per bigram.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# Keep only bigrams whose mean TF-IDF is greater than 0.00001.
filtered_mean_tfidf = [
    (term, score)
    for term, score in zip(bigrams, mean_tfidf)
    if score > 0.00001
]

# Plain-text CSV export (header: bigram,tfidf).
output_file = 'models/mean_tfidf_bigram.csv'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('bigram,tfidf\n')
    f.writelines(f'{term},{score}\n' for term, score in filtered_mean_tfidf)

In [12]:
# Show the ten highest-scoring bigrams that begin with the search word.
search_word = 'oil'
df = pd.read_csv("models/mean_tfidf_bigram.csv")
starts_with_word = df["bigram"].str.startswith(search_word)
df.loc[starts_with_word].sort_values(by='tfidf', ascending=False).head(10)

Unnamed: 0,bigram,tfidf
5981,oil change,0.001537
5979,oil and,0.000143
5982,oil changes,9e-05
5986,oil treatment,5.5e-05
5984,oil hot,5.3e-05
5985,oil massage,4.6e-05
5987,oil up,4.6e-05
5983,oil filter,3.4e-05
5988,oils and,3.1e-05
5989,oils up,2.3e-05


In [26]:
# Fit a TF-IDF model over four-word sequences (four-grams).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(4, 4))
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Vocabulary terms, one per matrix column.
fourgram = tfidf_vectorizer.get_feature_names_out()

# Mean TF-IDF score of every term across all documents.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# Keep only four-grams whose mean TF-IDF is greater than 0.00001.
filtered_mean_tfidf = [(term, score) for term, score in zip(fourgram, mean_tfidf) if score > 0.00001]

# Save to CSV (header: fourgram,tfidf).
# The loop variables are deliberately NOT called `fourgram`: the original loop
# rebound that name, leaving `fourgram` as the last term instead of the
# feature-name array once the cell finished.
output_file = 'models/mean_tfidf_fourgram.csv'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('fourgram,tfidf\n')
    for term, score in filtered_mean_tfidf:
        f.write(f'{term},{score}\n')

In [27]:
# Show the ten highest-scoring four-grams that begin with the search phrase.
search_word = 'oil change'
df = pd.read_csv("models/mean_tfidf_fourgram.csv")
starts_with_phrase = df["fourgram"].str.startswith(search_word)
df.loc[starts_with_phrase].sort_values(by='tfidf', ascending=False).head(10)

Unnamed: 0,fourgram,tfidf
3184,oil change up to,0.000161
3174,oil change at valvoline,4.9e-05
3181,oil change services at,4.9e-05
3187,oil change with filter,4.8e-05
3179,oil change oil change,4.6e-05
3190,oil change with tire,4.6e-05
3182,oil change synthetic blend,3.1e-05
3173,oil change at midas,2.9e-05
3171,oil change and tire,2.9e-05
3183,oil change tire rotation,2.6e-05


### Normalize matrix by number of search counts

In [4]:
# Load the saved bigram scores and the GA search-query counts.
df = pd.read_csv("models/mean_tfidf_bigram.csv", sep=",")
df_query = pd.read_csv("ga.search.query.clean.csv", sep=",")

In [5]:
# Print the first rows of both inputs, bigram scores first.
for frame in (df, df_query):
    print(frame.head())

       bigram     tfidf
0    10 class  0.000028
1  10 classes  0.000024
2      10 for  0.000365
3     10 inch  0.000024
4   10 minute  0.000030
                    query  count
0   enterprise car rental      4
1        !arajuama doctor      4
2      #21 transportation      2
3          #long sentance      5
4                #roofing      6


In [6]:
# Left join: every bigram is kept; bigrams that never appear as a whole
# search query get no query/count values.
result = df.merge(df_query, how='left', left_on='bigram', right_on='query')

In [7]:
# Bigrams with the highest search counts.
result.sort_values(by="count", ascending=False).head(5)

Unnamed: 0,bigram,tfidf,query,count
5981,oil change,0.001537,oil change,46065.0
2371,couples massage,0.000697,couples massage,26537.0
2914,escape room,0.000651,escape room,16579.0
4481,indoor waterpark,8.3e-05,indoor waterpark,11277.0
3909,great wolf,0.000558,great wolf,6156.0


In [8]:
# Weight each bigram's TF-IDF by its search count (tfidf * count / 500).
# The column is overwritten in place, so running this cell again applies the
# weighting a second time.
result["tfidf"] = result["tfidf"].mul(result["count"]).div(500)

In [9]:
# Bigrams with the lowest search counts.
result.sort_values(by="count").head(5)

Unnamed: 0,bigram,tfidf,query,count
3770,genuine leather,7.887825e-08,genuine leather,1.0
7984,spring beauty,5.419004e-08,spring beauty,1.0
8035,steam carpet,7.068453e-08,steam carpet,1.0
2219,club recharge,6.58664e-08,club recharge,1.0
3793,get your,2.089087e-07,get your,1.0


In [10]:
# Save the count-weighted bigram scores.
# NOTE(review): unlike the unigram exports, this call does not pass
# index=False, so the row index is written as an extra first column — confirm
# whether the consumer of this file expects that.
# NOTE(review): `result` comes from a left merge, so bigrams without a search
# count presumably carry NaN in "tfidf" here; the unigram export drops such
# rows before saving — confirm which behaviour is intended.
bigram_scores = result[["bigram", "tfidf"]]
bigram_scores.to_csv("models/mean_tfidf_bigram_normalize.csv")