In [9]:
import html
import math
import re
import sqlite3
from collections import Counter

import pandas as pd
#from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

# Path of the SQLite database file that the cells below connect to.
db_path = 'deals_db.db'

In [3]:
def clean_text(text):
    """Normalize a raw text field into plain, space-separated words.

    Steps, in order:
      1. HTML tags are replaced by a space, so words separated only by
         markup (e.g. ``Hello<br>World``) do not fuse into one token.
      2. HTML entities are decoded (``&amp;`` -> ``&``, ``&nbsp;`` -> NBSP),
         so entity names do not leak into the text as fake words.
      3. Every character that is not an ASCII letter, a letter in the
         accented ranges of the pattern, a digit or whitespace is dropped.
      4. Runs of whitespace are collapsed and the ends are stripped.

    Args:
        text: Raw value to clean. ``None`` and float NaN are treated as
            missing; any other non-string value is converted with ``str``.

    Returns:
        str: The cleaned text, or ``''`` for missing input.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    # Strip HTML tags; a space keeps neighbouring words apart.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities after tag removal so an escaped "&lt;b&gt;" is not
    # mistaken for real markup.
    text = html.unescape(text)
    # Keep only letters (ASCII + accented ranges), digits and whitespace.
    text = re.sub(r'[^a-zA-Zá-žÁ-Ž0-9\s]', '', text)
    # Collapse repeated whitespace and trim.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
# Load the raw deals, clean their text columns with `clean_text`, and store
# the result in the `deals_clean` table (replacing any previous version).
text_columns = ['title', 'title_general', 'description']
query = "SELECT deal_id, title, title_general, description FROM deals"

# Use the shared `db_path` instead of repeating the file name, and close the
# connection even if reading, cleaning or writing fails.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql(query, conn)

    for column in text_columns:
        df[column] = df[column].apply(clean_text)

    df_cleaned = df[['deal_id', *text_columns]]
    df_cleaned.to_sql('deals_clean', conn, if_exists='replace', index=False)
finally:
    conn.close()

df.head()

Unnamed: 0,deal_id,title,title_general,description
0,viator-sebago-trails-paddling-co,Sunset Tour by Kayak on Sebago Lake Maine,Sunset Tour by Kayak on Sebago Lake Maine,Best Price Guarantee If you find a better pric...
1,texas-rotisserie-and-grill-2581-broadway-2,Quarter Rotisserie Chicken Meal,Savor the Flavor Quarter Rotisserie Chicken Me...,Craving a mouthwatering meal that hits the spo...
2,whole-health-network-2,A 30minute foot reflexology massage 30minute d...,Experience Whole Health Networks Reflexology a...,Looking to relieve stress and find balance Ref...
3,all-things-skin-organic-skincare-12,Two 60Minute AntiAging RF Frequency Facial wit...,Up to 55 Off on AntiAging Facial at All Things...,One 60Minute AntiAging RF Frequency Facial wit...
4,lash-and-bronze,Two 30 Minute Sauna Sessions,Up to 42 Off on Spa Sauna Infrared at Lash and...,Enjoy 30 minute sessions in our infrared sauna...


In [10]:
# TF-IDF global
def load_documents_from_db(db_path):
    """Return the non-empty descriptions stored in the ``deals_clean`` table.

    Args:
        db_path: Path of the SQLite database file to open.

    Returns:
        list: One entry per row whose ``description`` is truthy; NULL and
        empty-string descriptions are skipped.

    Raises:
        sqlite3.Error: Propagated from the query (for example when the
            ``deals_clean`` table does not exist). The connection is closed
            before the exception leaves this function.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Only the description of each row is used, so fetch just that column.
        rows = conn.execute("SELECT description FROM deals_clean")
        return [description for (description,) in rows if description]
    finally:
        conn.close()

# Count TF
def compute_tf(text):
    """Compute the relative frequency of every word in ``text``.

    The text is lower-cased and split on whitespace; each word's count is
    divided by the total number of words.

    Args:
        text: The string to analyse.

    Returns:
        collections.Counter: Maps each word to ``count / total_words``.
        Empty for text that contains no words.
    """
    tokens = text.lower().split()
    total = len(tokens)
    raw_counts = Counter(tokens)
    # With no tokens the comprehension is empty, so `total` is never used
    # as a divisor when it is zero.
    return Counter({token: count / total for token, count in raw_counts.items()})

# Count IDF
def compute_idf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Each document is lower-cased and split on whitespace. A word counts once
    per document that contains it, and its IDF is
    ``log(number_of_documents / documents_containing_word)``.

    Args:
        documents: Sequence of document strings.

    Returns:
        dict: Maps each word to its IDF value. Empty for an empty corpus.
    """
    n_docs = len(documents)
    doc_frequency = Counter()
    for document in documents:
        # A set ensures a word is counted at most once per document.
        doc_frequency.update(set(document.lower().split()))
    return {
        term: math.log(n_docs / containing)
        for term, containing in doc_frequency.items()
    }

# TF-IDF 
def compute_tfidf_for_corpus(db_path):
    """Compute one corpus-level TF-IDF score per word.

    The documents are loaded with ``load_documents_from_db``. Term frequency
    is taken over all documents joined into a single text, while inverse
    document frequency is taken across the individual documents.

    Args:
        db_path: Path of the SQLite database file holding the documents.

    Returns:
        dict: Maps each word to ``tf * idf``. A word with no IDF entry
        scores 0.
    """
    docs = load_documents_from_db(db_path)

    doc_idf = compute_idf(docs)
    # Term frequency is measured over the corpus as one long text.
    corpus_tf = compute_tf(' '.join(docs))

    scores = {}
    for term, frequency in corpus_tf.items():
        scores[term] = frequency * doc_idf.get(term, 0)
    return scores

# Run the computation once against the database at `db_path`; the resulting
# word -> score mapping is reused by the cells below.
tfidf_results = compute_tfidf_for_corpus(db_path)

In [14]:
# Persist the scores in `tfidf_results`, rebuilding the `tfidf` table from
# scratch on every run.
conn = sqlite3.connect(db_path)
try:
    # `with conn` commits on success and rolls back if a statement fails; it
    # does not close the connection, hence the surrounding try/finally.
    with conn:
        conn.execute('''DROP TABLE IF EXISTS tfidf''')
        conn.execute('''
        CREATE TABLE IF NOT EXISTS tfidf (
            word TEXT PRIMARY KEY,
            value REAL
        )
        ''')
        # One batched call instead of one execute() per word.
        conn.executemany('''
        INSERT OR REPLACE INTO tfidf (word, value) 
        VALUES (?, ?)
        ''', tfidf_results.items())
finally:
    conn.close()

In [None]:
# Preview only the 20 highest-scoring words; echoing the whole mapping would
# flood the cell output with one entry per vocabulary word.
sorted(tfidf_results.items(), key=lambda item: item[1], reverse=True)[:20]