In [2]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import sp_matmul_topn

In [None]:
# NOTE: everything between the triple quotes below is a plain string literal,
# so this chunked JSON-lines loader is never defined or executed. The data is
# loaded from SQLite by the cells that follow.
"""
def load_dataset(file_lists, prefix_path, chunk_size=10000):
    df_dict = {}
    prefix_path += "sampled_"
    for file in file_lists:
        try:
            df_chunks = []
            total_records = 0

            for chunk in pd.read_json(prefix_path + file, lines=True, chunksize=chunk_size):
                df_chunks.append(chunk)
                total_records += chunk.shape[0]

            df = pd.concat(df_chunks, ignore_index=True)
            df_dict[file] = df
            print(f"Total records in {file}: {df.shape[0]}.")

        except Exception as e:
            print(f"Error: {e}")
            continue
    return df_dict
"""

In [None]:
# NOTE: this path configuration for the raw Yelp JSON files is wrapped in a
# string literal, so none of these names are actually assigned.
"""
folder_path = '../data/'
transit_bucket = 'raw_datasets/'
target_bucket = 'yelp/'
prefix_path = folder_path + transit_bucket + target_bucket
file_list = [
    "yelp_academic_dataset_business.json",
    "yelp_academic_dataset_review.json",
]
"""

In [1]:
# Folder holding the processed SQLite files, and the file names inside it
db_folder = '../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']

# Full path to each database, in the same order as db_files
db_paths = [f"{db_folder}{name}" for name in db_files]

In [5]:
# Connect to the databases and load data
def load_data_from_db():
    """Read the business, category and review tables into DataFrames.

    One SQLite connection is opened per entry of the module-level ``db_paths``
    list. The first connection is queried for ``business_details`` and
    ``business_categories``, the second for ``review_data``.

    Returns:
        dict: DataFrames keyed by ``'business'``, ``'categories'`` and
        ``'review'``.

    Raises:
        sqlite3.Error: if a database cannot be opened.
        IndexError: if ``db_paths`` holds fewer than two paths.
    """
    data = {}
    conns = []
    try:
        # Open the connections one by one *inside* the try block: if a later
        # connect() fails, the connections opened so far are still closed.
        for db_path in db_paths:
            conns.append(sqlite3.connect(db_path))

        business_conn, review_conn = conns[0], conns[1]

        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", business_conn)
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", business_conn)
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", review_conn)
    finally:
        # Close every connection that was successfully opened
        for conn in conns:
            conn.close()
    return data

In [6]:
# Read every table once and keep the frames in a dict keyed by table label
yelp_data = load_data_from_db()

# Report the size of each loaded table as a quick sanity check
for table in yelp_data:
    df = yelp_data[table]
    print(f"Loaded {df.shape[0]} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [7]:
# Pull the two tables needed for the interaction data out of the loaded dict
df_business, df_review = yelp_data["business"], yelp_data["review"]

# Outer join on business_id; the suffixes distinguish business-level columns
# from review-level ones where the two tables share a column name
df_concat = pd.merge(
    df_business,
    df_review,
    how='outer',
    on='business_id',
    suffixes=('_business', '_review'),
)

# Keep only the columns describing one (user, business, rating) interaction
user_business = df_concat[["user_id", "business_id", "stars_review"]]

In [8]:
# Function to calculate sparse cosine similarity with top N items
def sparse_cosine_similarity_topn(A, top_n, threshold=0, n_threads=4):
    """Compute the top-N item-item similarity scores of a user-item matrix.

    Args:
        A: sparse user-item matrix (rows are users, columns are items).
        top_n: maximum number of entries kept per item (per result row).
        threshold: only scores above this value are kept.
        n_threads: number of threads handed to ``sp_matmul_topn``.

    Returns:
        Sparse items x items matrix as returned by ``sp_matmul_topn``, with
        each row's entries sorted (``sort=True``).
    """
    # sp_matmul_topn multiplies its two operands as A @ B, so the item-item
    # product needs (items x users) @ (users x items), i.e. A.T and A.
    # Passing A.T twice only has compatible shapes for a square matrix.
    #
    # NOTE(review): no normalisation is applied here, so the scores are raw
    # dot products unless the columns of A are already unit length — confirm
    # whether true cosine similarity is intended.
    C = sp_matmul_topn(A.T, A, top_n=top_n, threshold=threshold, n_threads=n_threads, sort=True)

    return C

In [9]:
# Give every distinct user and business a dense integer index, numbered in
# the order unique() returns them
user_mapping = dict(
    (user, idx) for idx, user in enumerate(user_business['user_id'].unique())
)
business_mapping = dict(
    (biz, idx) for idx, biz in enumerate(user_business['business_id'].unique())
)

# assign() returns a new frame, so the slice taken from the merged frame is
# not modified in place; the new columns hold the numerical indices
user_business = user_business.assign(
    user_idx=user_business['user_id'].map(user_mapping),
    business_idx=user_business['business_id'].map(business_mapping),
)

# Sparse user-item interaction matrix: rows are users, columns are businesses,
# values are the review stars
user_item_sparse = csr_matrix(
    (
        user_business['stars_review'],
        (user_business['user_idx'], user_business['business_idx']),
    ),
    shape=(len(user_mapping), len(business_mapping)),
)

# Stored NaN ratings become 0
user_item_sparse.data = np.nan_to_num(user_item_sparse.data)

# Item-item similarity, keeping at most 50 entries per item above 0.01
item_similarity_sparse = sparse_cosine_similarity_topn(user_item_sparse, top_n=50, threshold=0.01)

In [17]:
# Connect to SQLite (this will create a file-based database)
db_path = '../data/processed_data/yelp_ItemCF.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Create tables for user-item and item-item indexes.
# Every statement uses IF NOT EXISTS so this cell can be re-run against an
# existing database file without raising "already exists" errors.
cursor.execute('''CREATE TABLE IF NOT EXISTS user_item_index (
    user_id TEXT,
    business_id TEXT,
    stars_review REAL,
    PRIMARY KEY (user_id, business_id)
)''')

# NOTE(review): the primary keys already cover the same columns as the two
# explicit indexes in this cell, so they look redundant; kept so the schema
# stays unchanged.
cursor.execute('''CREATE INDEX IF NOT EXISTS idx_user_item ON user_item_index(user_id, business_id)''')

cursor.execute('''CREATE TABLE IF NOT EXISTS item_item_similarity (
    item_id TEXT PRIMARY KEY,
    similarity_vector BLOB
)''')

cursor.execute('''CREATE INDEX IF NOT EXISTS idx_item_similarity ON item_item_similarity(item_id)''')

# cursor.execute('''CREATE TABLE IF NOT EXISTS user_mapping (
#     user_id TEXT PRIMARY KEY,
#     user_idx INTEGER
# )''')

cursor.execute('''CREATE TABLE IF NOT EXISTS business_mapping (
    business_id TEXT PRIMARY KEY,
    business_idx INTEGER
)''')


# Commit the changes
conn.commit()

In [18]:
def insert_user_item(user_business, conn, batch_size=10000):
    """Bulk-insert user/business ratings into the ``user_item_index`` table.

    All batches are written inside one explicit transaction. Rows whose
    primary key already exists are skipped (``INSERT OR IGNORE``). If any
    batch fails, the whole transaction is rolled back so the connection is
    left usable for a retry.

    Args:
        user_business: DataFrame with ``user_id``, ``business_id`` and
            ``stars_review`` columns.
        conn: open ``sqlite3`` connection that already has the
            ``user_item_index`` table.
        batch_size: number of rows passed to each ``executemany`` call.

    Raises:
        ValueError: if ``batch_size`` is not positive.
        sqlite3.Error: if an insert fails (after rolling back).
    """
    # A negative step would make range() yield nothing and silently skip
    # every row, so reject it up front, before a transaction is opened.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    insert_sql = '''INSERT OR IGNORE INTO user_item_index (user_id, business_id, stars_review)
                              VALUES (?, ?, ?)'''

    # Select the inserted columns once instead of once per batch
    records = user_business[['user_id', 'business_id', 'stars_review']]
    total_records = len(records)

    cursor = conn.cursor()

    # Start a transaction
    cursor.execute('BEGIN TRANSACTION')
    try:
        # Insert user-item interactions in batches
        for i in range(0, total_records, batch_size):
            batch = records.iloc[i:i + batch_size]
            cursor.executemany(insert_sql, batch.values.tolist())

            # Show progress
            print(f"{i + len(batch)} / {total_records} records stored in user_item_index")
    except Exception:
        # Without this the transaction stays open and the next
        # 'BEGIN TRANSACTION' on this connection fails.
        conn.rollback()
        raise

    # Commit once at the end of the transaction
    conn.commit()

In [19]:
def insert_item_vectors(item_similarity_sparse, business_mapping, conn, batch_size=1000, progress_interval=100000):
    """Store each row of the item-item similarity matrix as a pickled blob.

    For every row, the tuple ``(column indices, values)`` of its stored
    entries is pickled and written to ``item_item_similarity`` under the
    business id whose index equals the row number. Existing rows with the same
    id are replaced. All writes happen in one transaction, which is rolled
    back if anything fails.

    Args:
        item_similarity_sparse: sparse matrix supporting ``getrow``; row ``i``
            is the similarity vector of the item with index ``i``.
        business_mapping: dict mapping business id -> row index.
        conn: open ``sqlite3`` connection that already has the
            ``item_item_similarity`` table.
        batch_size: number of rows buffered before each ``executemany`` call.
        progress_interval: print a progress line roughly every this many
            inserted rows; a falsy or non-positive value disables progress
            output.

    Raises:
        KeyError: if a row index has no entry in ``business_mapping``.
        sqlite3.Error: if an insert fails (after rolling back).
    """
    insert_sql = '''INSERT OR REPLACE INTO item_item_similarity (item_id, similarity_vector)
                                  VALUES (?, ?)'''

    # Invert the mapping once. Rebuilding list(business_mapping.keys()) for
    # every row made the loop quadratic, and depended on dict insertion order
    # happening to match the stored index.
    idx_to_item = {idx: item_id for item_id, idx in business_mapping.items()}

    report_progress = bool(progress_interval) and progress_interval > 0
    next_report = progress_interval

    cursor = conn.cursor()

    # Start a transaction
    cursor.execute('BEGIN TRANSACTION')

    total_inserted = 0
    batch = []
    try:
        # Iterate over each row (item) in the sparse matrix
        for row_idx in range(item_similarity_sparse.shape[0]):
            row_vector = item_similarity_sparse.getrow(row_idx)

            # Serialize only indices and data (not the full matrix).
            # NOTE: these blobs are pickles; only unpickle them when reading
            # from a database file you trust.
            serialized_row = pickle.dumps((row_vector.indices, row_vector.data))

            batch.append((idx_to_item[row_idx], serialized_row))

            # Flush in batches to bound the size of the in-memory buffer
            if len(batch) >= batch_size:
                cursor.executemany(insert_sql, batch)
                total_inserted += len(batch)
                batch = []

                # Threshold-based so progress is still reported when the
                # interval is not an exact multiple of the batch size.
                if report_progress and total_inserted >= next_report:
                    print(f"Inserted {total_inserted} item vectors so far...")
                    next_report = (total_inserted // progress_interval + 1) * progress_interval

        # Insert any remaining records after the loop
        if batch:
            cursor.executemany(insert_sql, batch)
            total_inserted += len(batch)
    except Exception:
        # Leave the connection without a dangling open transaction
        conn.rollback()
        raise

    # Commit once at the end of the transaction
    conn.commit()

    # Final progress message
    print(f"Total {total_inserted} item vectors inserted.")

In [20]:
def insert_mappings(business_mapping, conn):
    """Write the business id -> index mapping to the ``business_mapping`` table.

    Existing rows with the same business id are replaced. The insert runs in
    one explicit transaction, which is rolled back if it fails so the
    connection stays usable.

    Args:
        business_mapping: dict mapping business id -> integer index.
        conn: open ``sqlite3`` connection that already has the
            ``business_mapping`` table.

    Raises:
        sqlite3.Error: if the insert fails (after rolling back).
    """
    cursor = conn.cursor()

    # Start a transaction
    cursor.execute('BEGIN TRANSACTION')
    try:
        # User mappings are intentionally not stored (kept for reference):
        # cursor.executemany('''INSERT OR REPLACE INTO user_mapping (user_id, user_idx) VALUES (?, ?)''',
        #                    [(user_id, idx) for user_id, idx in user_mapping.items()])

        # Insert business mappings; items() already yields (business_id, idx) pairs
        cursor.executemany('''INSERT OR REPLACE INTO business_mapping (business_id, business_idx) VALUES (?, ?)''',
                           list(business_mapping.items()))
    except Exception:
        # Without this the transaction stays open and the next
        # 'BEGIN TRANSACTION' on this connection fails.
        conn.rollback()
        raise

    # Commit once at the end of the transaction
    conn.commit()

    print(f"Inserted {len(business_mapping)} business mappings.")

In [21]:
# Run all three inserts, and close the connection even if one of them raises
# so a failed run does not leave the database file held open.
try:
    # Insert user-item index with progress
    insert_user_item(user_business, conn)

    # Insert item vectors into the database
    insert_item_vectors(item_similarity_sparse, business_mapping, conn)

    # Insert business mappings
    insert_mappings(business_mapping, conn)
finally:
    # Close the connection when done
    conn.close()

10000 / 985732 records stored in user_item_index
20000 / 985732 records stored in user_item_index
30000 / 985732 records stored in user_item_index
40000 / 985732 records stored in user_item_index
50000 / 985732 records stored in user_item_index
60000 / 985732 records stored in user_item_index
70000 / 985732 records stored in user_item_index
80000 / 985732 records stored in user_item_index
90000 / 985732 records stored in user_item_index
100000 / 985732 records stored in user_item_index
110000 / 985732 records stored in user_item_index
120000 / 985732 records stored in user_item_index
130000 / 985732 records stored in user_item_index
140000 / 985732 records stored in user_item_index
150000 / 985732 records stored in user_item_index
160000 / 985732 records stored in user_item_index
170000 / 985732 records stored in user_item_index
180000 / 985732 records stored in user_item_index
190000 / 985732 records stored in user_item_index
200000 / 985732 records stored in user_item_index
210000 / 