### User-base Collaborative Filtering - Model & Index
This notebook demonstrates how to build a User-based collaborative filtering model using Yelp dataset. You can adjust the model to add more features or change the hyperparameters to improve the model performance. The index is built and stored in the `yelp_UserCF.db` file.

Objective: Build a basic UserCF model for retrieval and prediction.  
Strategy: Use cosine similarity on training data with time-decay function; store in yelp_UserCF.db.  
Note: Test data balancing (50% +ve) handled in evaluation notebook.  

#### Pre-requisites
1. Have the processed Yelp dataset in the `../../data/processed_data/yelp_data` folder.
2. Have the virtual environment setup and used for the notebook.

#### Move to Production
1. Copy the `yelp_UserCF.db` file to the `../../data/processed_data` folder.
2. Update the `UserCF.py` file in the `../backend/models` folder if there is changes in retrieval process.

In [1]:
# Import utilities and dependencies
import sys
sys.path.append('../')
from utilities import *

from scipy.sparse import csr_matrix
from sparse_dot_topn import sp_matmul_topn
from sklearn.model_selection import train_test_split
import time

In [2]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary
yelp_data = load_data_from_db(db_folder, data_files)

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
# Assign dataframes
df_business = yelp_data["business"]
df_review = yelp_data["review"]

In [4]:
# Apply time decay to ratings
LAMBDA = 0.0000000005
current_timestamp = int(time.time())
df_review['timestamp'] = pd.to_datetime(df_review['date']).astype(int) // 10**9
df_review['timestamp'] = np.exp(-LAMBDA * (current_timestamp - df_review["timestamp"]))
df_review['stars'] = df_review['timestamp'] * df_review['stars']

In [5]:
# Function to prepare user-business interaction data
def get_user_business_with_time(df_business, df_review):
    df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))
    user_business = df_concat[["user_id", "business_id", "stars_review"]]
    user_mapping = {user: idx for idx, user in enumerate(user_business['user_id'].unique())}
    business_mapping = {biz: idx for idx, biz in enumerate(user_business['business_id'].unique())}    
    return user_mapping, business_mapping, user_business

In [6]:
# Get mappings and interaction data
user_mapping, business_mapping, user_business = get_user_business_with_time(df_business, df_review)

In [7]:
# Split into train (80%) and test (20%)
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)
user_business = train_data.copy()

In [8]:
# Map IDs to indices and create sparse matrix
user_business['user_idx'] = user_business['user_id'].map(user_mapping)
user_business['business_idx'] = user_business['business_id'].map(business_mapping)
user_item_sparse = csr_matrix(
    (user_business['stars_review'], (user_business['user_idx'], user_business['business_idx'])),
    shape=(len(user_mapping), len(business_mapping))
)
user_item_sparse.data = np.nan_to_num(user_item_sparse.data)

In [9]:
# Function to compute top-N cosine similarities
def sparse_cosine_similarity_topn(A, top_n, threshold=0):
    C = sp_matmul_topn(A, A.T, top_n=top_n, threshold=threshold, n_threads=4, sort=True)
    return C

In [10]:
# Compute user-user similarity
user_similarity_sparse = sparse_cosine_similarity_topn(user_item_sparse, top_n=50, threshold=0.01)

In [11]:
# Database optimization function
def optimize_db(conn):
    cursor = conn.cursor()
    cursor.executescript('''
        PRAGMA synchronous = OFF;
        PRAGMA journal_mode = MEMORY;
        PRAGMA temp_store = MEMORY;
        PRAGMA cache_size = 1000000;
    ''')
    conn.commit()

# Batch insert for user-item interactions
def insert_user_item(user_business, conn, batch_size=50000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')
    total_records = len(user_business)
    data = user_business[['user_id', 'business_id', 'stars_review']].values.tolist()
    for i in range(0, total_records, batch_size):
        batch = data[i:i + batch_size]
        cursor.executemany('''INSERT OR IGNORE INTO user_item_index (user_id, business_id, stars_review)
                              VALUES (?, ?, ?)''', batch)
        if i % (batch_size * 5) == 0:
            conn.commit()
            print(f"Inserted {i + len(batch)} / {total_records} user-item records.")
    conn.commit()
    print(f"Total {total_records} user-item records inserted.")

# Batch insert for user similarity vectors
def insert_user_vectors(user_similarity_sparse, user_mapping, conn, batch_size=5000, progress_interval=50000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')
    total_inserted = 0
    batch = []
    user_keys = list(user_mapping.keys())
    for row_idx in range(user_similarity_sparse.shape[0]):
        row_vector = user_similarity_sparse.getrow(row_idx)
        row_indices = row_vector.indices
        row_data = row_vector.data
        serialized_row = pickle.dumps((row_indices, row_data))
        user_id = user_keys[row_idx]
        batch.append((user_id, serialized_row))
        if len(batch) >= batch_size:
            cursor.executemany('''INSERT OR REPLACE INTO user_user_similarity (user_id, similarity_vector)
                                  VALUES (?, ?)''', batch)
            total_inserted += len(batch)
            if total_inserted % progress_interval == 0:
                print(f"Inserted {total_inserted} user vectors...")
            batch = []
    if batch:
        cursor.executemany('''INSERT OR REPLACE INTO user_user_similarity (user_id, similarity_vector)
                              VALUES (?, ?)''', batch)
        total_inserted += len(batch)
    conn.commit()
    print(f"Total {total_inserted} user vectors inserted.")

# Batch insert for user mappings
def insert_mappings(user_mapping, conn, batch_size=50000):
    cursor = conn.cursor()
    cursor.execute('BEGIN TRANSACTION')
    data = list(user_mapping.items())
    total_records = len(data)
    for i in range(0, total_records, batch_size):
        batch = data[i:i + batch_size]
        cursor.executemany('''INSERT OR REPLACE INTO user_mapping (user_id, user_idx)
                              VALUES (?, ?)''', batch)
        if i % (batch_size * 5) == 0:
            conn.commit()
            print(f"Inserted {i + len(batch)} / {total_records} user mappings.")
    conn.commit()
    print(f"Total {total_records} user mappings inserted.")

In [12]:
# Connect to SQLite and create tables
db_path = './yelp_UserCF.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
optimize_db(conn)

cursor.execute('''CREATE TABLE IF NOT EXISTS user_item_index (
    user_id TEXT,
    business_id TEXT,
    stars_review REAL,
    PRIMARY KEY (user_id, business_id)
)''')
cursor.execute('''CREATE INDEX IF NOT EXISTS idx_user_item ON user_item_index(user_id, business_id)''')

cursor.execute('''CREATE TABLE IF NOT EXISTS user_user_similarity (
    user_id TEXT PRIMARY KEY,
    similarity_vector BLOB
)''')
cursor.execute('''CREATE INDEX IF NOT EXISTS idx_user_similarity ON user_user_similarity(user_id)''')

cursor.execute('''CREATE TABLE IF NOT EXISTS user_mapping (
    user_id TEXT PRIMARY KEY,
    user_idx INTEGER
)''')

conn.commit()

In [13]:
# Insert data into database
insert_user_item(user_business, conn)
insert_user_vectors(user_similarity_sparse, user_mapping, conn)
insert_mappings(user_mapping, conn)

Inserted 50000 / 788585 user-item records.
Inserted 300000 / 788585 user-item records.
Inserted 550000 / 788585 user-item records.
Inserted 788585 / 788585 user-item records.
Total 788585 user-item records inserted.
Inserted 50000 user vectors...
Inserted 100000 user vectors...
Inserted 150000 user vectors...
Total 162080 user vectors inserted.
Inserted 50000 / 162080 user mappings.
Total 162080 user mappings inserted.


In [14]:
# Close the connection
conn.close()