### Item-base Collaborative Filtering - Model & Index
This notebook demonstrates how to build a Item-based collaborative filtering model using Yelp dataset. You can adjust the model to add more features or change the hyperparameters to improve the model performance. The index is built and stored in the `yelp_ItemCF.db` file.

#### Pre-requisites
1. Have the processed Yelp dataset in the `../../data/processed_data/yelp_data` folder.
2. Have the virtual environment setup and used for the notebook.

#### Move to Production
1. Copy the `yelp_ItemCF.db` file to the `../../data/processed_data` folder.
2. Update the `ItemCF.py` file in the `../backend/models` folder if there is changes in retrieval process.


In [42]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import sp_matmul_topn
from geopy.distance import geodesic 

In [2]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']
db_paths = [db_folder + db_file for db_file in db_files]

In [3]:
# Connect to the databases and load data
def load_data_from_db():
    data = {}
    
    # Open connections and read tables
    conns = [sqlite3.connect(db_path) for db_path in db_paths]
    try:
        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        # Close all database connections
        for conn in conns:
            conn.close()
    return data

In [4]:
# Load data into a dictionary
yelp_data = load_data_from_db()

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [43]:
df_business = yelp_data["business"]
df_review = yelp_data["review"]

df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))
df_concat["timestamp"] = pd.to_datetime(df_concat["date"]).astype(int) // 10**9

user_business = df_concat[["user_id", "business_id", "stars_review", "timestamp", "latitude", "longitude", "city", "state"]]


In [44]:
user_business

Unnamed: 0,user_id,business_id,stars_review,timestamp,latitude,longitude,city,state
0,razUB7ciYZluvxWM6shmtw,--30_8IhuyMHbSOcNWd6DQ,5.0,1342650622,40.255362,-75.088399,Jamison,PA
1,3YhG4h4Ok654iVfqdmkuRg,--7PUidqRWpRSpXebiyxTg,2.0,1443362006,53.554659,-113.493040,Edmonton,AB
2,VyC2fG4dcMG07nrxh4jLnw,--7PUidqRWpRSpXebiyxTg,1.0,1505578181,53.554659,-113.493040,Edmonton,AB
3,Q5jOFJYhIsN8ouJ1rnsLQQ,--7PUidqRWpRSpXebiyxTg,1.0,1407445516,53.554659,-113.493040,Edmonton,AB
4,gdcRlubKDmslUYFPHUp1Cg,--8IbOsAAxjKRoYsBFL-PA,2.0,1434420633,30.006341,-90.074523,Gentilly,LA
...,...,...,...,...,...,...,...,...
985727,TkwnhxZfy7AFW1cEIn5u1A,zznJox6-nmXlGYNWgTDwQQ,4.0,1366231576,27.990058,-82.730226,Clearwater,FL
985728,weuxfeOxeGs8InkBS1ivbQ,zznJox6-nmXlGYNWgTDwQQ,3.0,1573929847,27.990058,-82.730226,Clearwater,FL
985729,Gix3hMYtxiiQd4Pg626GfQ,zznJox6-nmXlGYNWgTDwQQ,1.0,1525717667,27.990058,-82.730226,Clearwater,FL
985730,rB1vREB0x_uynI0ADMs2iA,zztOG2cKm87I6Iw_tleZsQ,5.0,1622678754,40.092606,-75.393004,King of Prussia,PA


In [46]:
import time
# Define decay factor (tune this)
LAMBDA = 0.000000001  # Adjust this based on how fast old reviews should decay

# Get the current timestamp
current_timestamp = int(time.time())

# Compute time-based weight
user_business["time_weight"] = np.exp(-LAMBDA * (current_timestamp - user_business["timestamp"]))

# Apply weight to ratings
user_business["weighted_stars"] = user_business["stars_review"] * user_business["time_weight"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_business["time_weight"] = np.exp(-LAMBDA * (current_timestamp - user_business["timestamp"]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_business["weighted_stars"] = user_business["stars_review"] * user_business["time_weight"]


In [47]:
user_business["weighted_stars"]

0         3.363498
1         1.487954
2         0.791735
3         0.717730
4         1.474709
            ...   
985727    2.755004
985728    2.543231
985729    0.807842
985730    4.450470
985731    4.283590
Name: weighted_stars, Length: 985732, dtype: float64

In [45]:
### ----- REGIONAL SIMILARITY ADJUSTMENT -----
def region_similarity(biz1, biz2):
    city1, state1 = df_concat.loc[df_concat["business_id"] == biz1, ["city", "state"]].values[0]
    city2, state2 = df_concat.loc[df_concat["business_id"] == biz2, ["city", "state"]].values[0]
    
    if city1 == city2:
        return 1.2  # Boost for same city
    elif state1 == state2:
        return 1.1  # Small boost for same state
    return 1  # No boost otherwise


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_business["time_weight"] = np.exp(-LAMBDA * (current_timestamp - user_business["timestamp"]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_business["weighted_stars"] = user_business["stars_review"] * user_business["time_weight"]


In [None]:
# Create user and business index mappings
user_mapping = {user: idx for idx, user in enumerate(user_business['user_id'].unique())}
business_mapping = {biz: idx for idx, biz in enumerate(user_business['business_id'].unique())}

# Map user_id and business_id to numerical indices
user_business['user_idx'] = user_business['user_id'].map(user_mapping)
user_business['business_idx'] = user_business['business_id'].map(business_mapping)

# Creating the sparse user-item interaction matrix using weighted_stars
user_item_sparse = csr_matrix(
    (user_business['weighted_stars'], (user_business['user_idx'], user_business['business_idx'])),
    shape=(len(user_mapping), len(business_mapping))
)

# Replace NaN values in the sparse matrix
user_item_sparse.data = np.nan_to_num(user_item_sparse.data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_business['user_idx'] = user_business['user_id'].map(user_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_business['business_idx'] = user_business['business_id'].map(business_mapping)


In [53]:
# Precompute inverse mapping to avoid slow lookups
inv_business_mapping = {v: k for k, v in business_mapping.items()}

# Convert business IDs to arrays for fast lookups
business_ids = np.array(list(business_mapping.keys()))

def apply_region_weights(C, business_ids, region_similarity):
    """Applies only regional similarity weighting to a sparse similarity matrix."""
    
    row_indices = np.repeat(np.arange(C.shape[0]), np.diff(C.indptr))  # Row indices
    col_indices = C.indices  # Column indices (similar businesses)

    # Get actual business IDs for the matching pairs
    biz1_ids = business_ids[row_indices]  
    biz2_ids = business_ids[col_indices]  

    # Compute region weights for all stored values
    reg_weights = np.array([region_similarity(b1, b2) for b1, b2 in zip(biz1_ids, biz2_ids)])

    # Apply weights directly to the similarity matrix
    C.data *= reg_weights  # Only applying regional similarity

    return C

# Optimized sparse cosine similarity function with regional weights only
def sparse_cosine_similarity_topn(A, top_n, threshold=0):
    """Computes top-N sparse cosine similarity with regional similarity adjustment."""
    
    C = sp_matmul_topn(A.T, A.T, top_n=top_n, threshold=threshold, n_threads=4, sort=True)

    # Apply only regional similarity first
    C = apply_region_weights(C, business_ids, region_similarity)

    return C


In [54]:
# Compute optimized sparse cosine similarity matrix
item_similarity_sparse = sparse_cosine_similarity_topn(user_item_sparse, top_n=50, threshold=0.01)

KeyboardInterrupt: 

In [8]:
'''
# function without distance and region adjustments
# Function to calculate sparse cosine similarity with top N items
def sparse_cosine_similarity_topn(A, top_n, threshold=0):
    # A is the sparse matrix (user-item matrix)
    # ntop is the number of top similar items you want
    # lower_bound is the minimum similarity score to consider

    # # Compute the top N cosine similarities in a sparse format
    
    C = sp_matmul_topn(A.T, A.T, top_n=top_n, threshold=threshold, n_threads=4, sort=True)

    return C
    
'''

In [21]:
def optimize_db(conn):
    """Apply SQLite performance optimizations."""
    cursor = conn.cursor()
    cursor.executescript('''
        PRAGMA synchronous = OFF;
        PRAGMA journal_mode = MEMORY;
        PRAGMA temp_store = MEMORY;
        PRAGMA cache_size = 1000000;
    ''')
    conn.commit()


def insert_user_item(user_business, conn, batch_size=50000):
    """Optimized batch insert for user-item interactions."""
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    total_records = len(user_business)
    data = user_business[['user_id', 'business_id', 'stars_review']].values.tolist()

    for i in range(0, total_records, batch_size):
        batch = data[i:i + batch_size]
        cursor.executemany('''INSERT OR IGNORE INTO user_item_index (user_id, business_id, stars_review)
                              VALUES (?, ?, ?)''', batch)

        if i % (batch_size * 5) == 0:  # Commit every 5 batches
            conn.commit()
            print(f"Inserted {i + len(batch)} / {total_records} user-item records.")

    conn.commit()  # Final commit
    print(f"Total {total_records} user-item records inserted.")


def insert_item_vectors(item_similarity_sparse, business_mapping, conn, batch_size=5000, progress_interval=50000):
    """Optimized batch insert for item similarity vectors."""
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    total_inserted = 0
    batch = []
    business_keys = list(business_mapping.keys())  # Convert keys to list for faster indexing

    for row_idx in range(item_similarity_sparse.shape[0]):
        row_vector = item_similarity_sparse.getrow(row_idx)
        row_indices = row_vector.indices
        row_data = row_vector.data

        serialized_row = pickle.dumps((row_indices, row_data))
        item_id = business_keys[row_idx]  # Faster lookup

        batch.append((item_id, serialized_row))

        if len(batch) >= batch_size:
            cursor.executemany('''INSERT OR REPLACE INTO item_item_similarity (item_id, similarity_vector)
                                  VALUES (?, ?)''', batch)
            total_inserted += len(batch)

            if total_inserted % progress_interval == 0:
                print(f"Inserted {total_inserted} item vectors...")

            batch = []

    if batch:  # Insert remaining records
        cursor.executemany('''INSERT OR REPLACE INTO item_item_similarity (item_id, similarity_vector)
                              VALUES (?, ?)''', batch)
        total_inserted += len(batch)

    conn.commit()
    print(f"Total {total_inserted} item vectors inserted.")


def insert_mappings(business_mapping, conn, batch_size=50000):
    """Optimized batch insert for business mappings."""
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')

    data = list(business_mapping.items())
    total_records = len(data)

    for i in range(0, total_records, batch_size):
        batch = data[i:i + batch_size]
        cursor.executemany('''INSERT OR REPLACE INTO business_mapping (business_id, business_idx)
                              VALUES (?, ?)''', batch)

        if i % (batch_size * 5) == 0:  # Commit every 5 batches
            conn.commit()
            print(f"Inserted {i + len(batch)} / {total_records} business mappings.")

    conn.commit()
    print(f"Total {total_records} business mappings inserted.")


In [25]:
# Connect to SQLite (this will create a file-based database)
db_path = './yelp_ItemCF.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
optimize_db(conn)

# Create tables for user-item and item-item indexes
cursor.execute('''CREATE TABLE IF NOT EXISTS user_item_index (
    user_id TEXT,
    business_id TEXT,
    stars_review REAL,
    PRIMARY KEY (user_id, business_id)
)''')

cursor.execute('''CREATE INDEX idx_user_item ON user_item_index(user_id, business_id)''')

cursor.execute('''CREATE TABLE IF NOT EXISTS item_item_similarity (
    item_id TEXT PRIMARY KEY,
    similarity_vector BLOB
)''')

cursor.execute('''CREATE INDEX idx_item_similarity ON item_item_similarity(item_id)''')

# cursor.execute('''CREATE TABLE IF NOT EXISTS user_mapping (
#     user_id TEXT PRIMARY KEY,
#     user_idx INTEGER
# )''')

cursor.execute('''CREATE TABLE IF NOT EXISTS business_mapping (
    business_id TEXT PRIMARY KEY,
    business_idx INTEGER
)''')


# Commit the changes
conn.commit()

In [26]:
insert_user_item(user_business, conn)
insert_item_vectors(item_similarity_sparse, business_mapping, conn)
insert_mappings(business_mapping, conn)

Inserted 50000 / 985732 user-item records.
Inserted 300000 / 985732 user-item records.
Inserted 550000 / 985732 user-item records.
Inserted 800000 / 985732 user-item records.
Total 985732 user-item records inserted.
Inserted 50000 item vectors...
Total 78059 item vectors inserted.
Inserted 50000 / 78059 business mappings.
Total 78059 business mappings inserted.


In [29]:
# Close the connection when done
conn.close()