In [10]:
from elasticsearch import Elasticsearch
import pandas as pd
import sqlite3
from datetime import datetime, timezone
from IPython.display import display, HTML
import time
import re

pd.set_option('display.max_rows', None)

db_path = 'deals_db.db'
index_name = 'deals' 

In [11]:
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', 'iEzoDQe8OWWl133BTsyq'),
    ca_certs='/Users/zphilipp/http_ca.crt'
)

def clean_text(text):
    if text is None:
        return ''
    # HTML
    text = re.sub(r'<[^>]+>', '', text)
    # clean all except letters
    #text = re.sub(r'[^a-zA-Zá-žÁ-Ž0-9\s]', '', text)
    # remove more spaces
    text = re.sub(r'\s+', ' ', text).strip().lower()
    #text = text.replace(["the", "on", ])
    return text

In [3]:
es.info()

ObjectApiResponse({'name': 'bf934ce3043f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'c7G7Vpm4SOe8Qz5G_y0Otg', 'version': {'number': '8.17.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'd4b391d925c31d262eb767b8b2db8f398103f909', 'build_date': '2025-01-10T10:08:26.972230187Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
else:
    pass
    
index_settings = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1 ,
        "max_docvalue_fields_search": 200000
    },
    'mappings': {
        'properties': {
            'id': {'type': 'text'},
            'title': {'type': 'text'},
            'title_general': {'type': 'text'},
            'description': {'type': 'text'},
            'category' : {'type' : 'text'},
            'timestamp': {'type': 'date'}
        }
    }
}
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'deals'})

In [5]:
index_info = es.indices.get(index=index_name)
index_mapping = index_info[index_name]['mappings']
mapping_df = pd.json_normalize(index_mapping['properties'])
mapping_df

search_query = {
  "size": 0
}
response = es.search(index=index_name, body=search_query)
response

ObjectApiResponse({'took': 9, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}})

In [12]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

def fetch_tree(item_id):
    """Funkce pro načtení stromu pro daný item_id."""
    # Načtěte vlastnosti aktuální položky
    cursor.execute("SELECT parent_id, name FROM category WHERE id = ?", (item_id,))
    item = cursor.fetchone()
    #print(item)
    if not item:
        return None
    
    # Vytvoření uzlu
    return {
        'value': item[1],
        'parent_id': item[0]
    }

def get_taxonomy_path(id):

    data = fetch_tree(id)
    category_string = ""
    while data is not None:    
        data = fetch_tree(data["parent_id"])
        try:
            category_string = data["value"] + " / " + category_string
        except TypeError as e:
            #print (data)
            break
    
    return category_string[:-2]


cursor.execute("""SELECT d.deal_id, d.title, d.title_general, d.highlights, d.customer_category_id
    FROM deals d
""")
rows = cursor.fetchall()

counter = 0
for row in rows:
    
    if row[4]:
        category = get_taxonomy_path(row[4])
        
        document = {
            'deal_id': row[0],
            'title': row[1],
            'title_general': row[2],
            'highlights': clean_text(row[3]),
            'category' : category,
            'timestamp': datetime.now(timezone.utc).isoformat()
        }    
        es.index(index=index_name, document=document)
        counter = counter + 1

conn.close()
print (counter)

93044


In [15]:
#search_term = 'oil change'
search_term = 'Oil changes change Engine replacement services Lubr ication filter Routine oil change Motor oil change Oil maintenance Oil service intervals'
search_term = 'val'
start_time = time.time()
search_query = {
    'query': {
        'bool': {
            'should': [
                {'match': {'title': {'query': search_term, 'boost': 1}}},
                {'match': {'title_general': {'query': search_term, 'boost': 1}}},
                {'match': {'highlights': {'query': search_term, 'boost': 1}}}
            ]
        }
    },
    'size': 10000
}

response = es.search(index=index_name, body=search_query)
end_time = time.time()
elapsed_time = end_time - start_time

documents = []
for hit in response['hits']['hits']:
    document = hit['_source']
    score = hit['_score']

    document['score'] = score
    documents.append(document)
#print (response)
df = pd.DataFrame(documents)
df['title'] = df.apply(lambda x: f'<a href="https://www.groupon.com/deals/{x["deal_id"]}" target="_blank">{x["title"]}</a>', axis=1)
count = df["deal_id"].count()


display(HTML("<strong>Retrieval: " + str(df['deal_id'].count()) + f" deals. In {elapsed_time:.6f}" + "s.</strong>"))
display(HTML(df[["deal_id", "title", "category", "score"]].head(1000).to_html(escape=False)))

Unnamed: 0,deal_id,title,category,score
0,viator-mexitours-71,Val ‘Quirico - People’s Elves Show,Nearby / Things To Do / Sightseeing & Tours,38.51154
1,val-ds-signature,"Shampoo, Conditioning Smoothing Treatment",Nearby / Beauty & Spas / Salons,10.312409
2,beauty-spa-by-val-6,Six Laser Hair-Removal Sessions on One Small Area,Nearby / Beauty & Spas / Hair Removal,10.009469
3,skin-haus-by-val,One Brazilian Waxing Session,Nearby / Beauty & Spas / Hair Removal,8.956974
4,skin-haus-by-val-1,3 session of Procell Micro-Channeling with Stem Cell Infusion,Nearby / Beauty & Spas,8.509583
5,hair-by-val-1,Three Men's Haircuts with Shampoo and Style,Nearby / Beauty & Spas / Hair & Styling,8.302238


### search in categories

In [38]:
query = 'Mattresses'

search_query = {
    'query': {
        'multi_match': {
            'query': query,
            'fields': ['taxonomy']
        }
    },
    'size': 10000
}

response = es.search(index=index_name, body=search_query)
documents = []
for hit in response['hits']['hits']:
    document = hit['_source']
    score = hit['_score']
    document['score'] = score
    documents.append(document)
    
df = pd.DataFrame(documents)
df['title'] = df.apply(lambda x: f'<a href="https://www.groupon.com/deals/{x["deal_id"]}" target="_blank">{x["title"]}</a>', axis=1)
print(df.count())
display(HTML(df[["title", "taxonomy", "score"]].head(100).to_html(escape=False)))

deal_id          1
title            1
title_general    1
description      1
taxonomy         1
timestamp        1
score            1
dtype: int64


Unnamed: 0,title,taxonomy,score
0,Costway 4 Sizes Jacquard 8 Foam Mattress Medium Firm BedinaBox King 8 Inches Medium Firm Mattress Only,Goods / For the Home / Mattresses & Accessories,9.718934
