In the Retrieval stage, we try to use the following methods/models to retrieve the relevant documents:
1. Item-based Collaborative Filtering
   - We only consider the `star` information that users give to the business in the `yelp_academic_dataset_review.json` file.
   - We use `csr_matrix` from `scipy` to store the user-item matrix (for sparse matrix).
   - We use `sp_matmul_topn` from `sparse_dot_topn` to calculate the cosine similarity between two businesses.
     - Compared to `cosine_similarity` from `sklearn`, `sp_matmul_topn` is much faster, as it only keeps the top-n most similar items.
     - Compared to the `approximate_nearest_neighbors` from `annoy`, `sp_matmul_topn` is much faster when building the index, but slower when querying. However, Item-based Collaborative Filtering 
2. User-based Collaborative Filtering
3. Deep Structured Semantic Models

In [1]:
import sqlite3
import faiss
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import sp_matmul_topn

ModuleNotFoundError: No module named 'faiss'

In [2]:
def load_dataset(file_lists, prefix_path, chunk_size=10000):
    """Load the sampled, line-delimited JSON files into DataFrames.

    Each name in ``file_lists`` is read from
    ``prefix_path + "sampled_" + name`` in chunks of ``chunk_size`` records,
    and the chunks are concatenated into a single DataFrame.

    Args:
        file_lists: Iterable of file names (without the ``sampled_`` prefix).
        prefix_path: Directory prefix, including its trailing separator.
        chunk_size: Number of JSON lines parsed per chunk.

    Returns:
        dict mapping each successfully loaded file name to its DataFrame.
        A file that fails to load is reported on stdout and left out of the
        dict, so callers should check for missing keys.
    """
    sampled_prefix = prefix_path + "sampled_"
    df_dict = {}
    for file in file_lists:
        try:
            reader = pd.read_json(
                sampled_prefix + file, lines=True, chunksize=chunk_size
            )
            df = pd.concat(list(reader), ignore_index=True)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not prevent
            # the remaining files from being loaded.
            print(f"Error: {e}")
        else:
            df_dict[file] = df
            print(f"Total records in {file}: {df.shape[0]}.")
    return df_dict

In [3]:
# Location of the raw Yelp dump: <data root>/<transit bucket>/<target bucket>/
folder_path = '../data/'
transit_bucket = 'raw_datasets/'
target_bucket = 'yelp/'
prefix_path = "".join((folder_path, transit_bucket, target_bucket))

# Files to load. Only the business and review dumps are enabled; uncomment
# an entry below to load that dataset as well.
file_list = [
    "yelp_academic_dataset_business.json",
    "yelp_academic_dataset_review.json",
    # "yelp_academic_dataset_tip.json",
    # "yelp_academic_dataset_checkin.json",
    # "yelp_academic_dataset_user.json",
]

In [4]:
# Load every file named in file_list; the result is a dict keyed by file name.
df = load_dataset(file_list, prefix_path)
df_business = df["yelp_academic_dataset_business.json"]
df_review = df["yelp_academic_dataset_review.json"]

# Join businesses with their reviews on business_id. Column names present in
# both frames receive the _business / _review suffix, which is where the
# stars_review column selected below comes from.
df_concat = pd.merge(
    df_business,
    df_review,
    how='outer',
    on='business_id',
    suffixes=('_business', '_review'),
)

# Keep only what collaborative filtering needs: who rated what, and how.
# NOTE(review): because the join is 'outer', a business without any review
# still yields a row here, with NaN in user_id and stars_review.
user_business = df_concat.loc[:, ["user_id", "business_id", "stars_review"]]

Total records in yelp_academic_dataset_business.json: 78059.
Total records in yelp_academic_dataset_review.json: 980418.


In [5]:
# Function to calculate sparse cosine similarity with top N items
def sparse_cosine_similarity_topn(A, top_n, threshold=0, n_threads=4):
    """Return the top-N item-item cosine similarities of a user-item matrix.

    Every item (column of ``A``) is scaled to unit L2 norm, so the dot
    product of two item vectors equals their cosine similarity. The
    item-by-item product is then computed with ``sp_matmul_topn``, which
    only stores the ``top_n`` largest entries of each row.

    Args:
        A: Sparse user-item matrix of shape (n_users, n_items). It is not
            modified.
        top_n: Number of most similar items to keep for each item.
        threshold: Minimum similarity score, forwarded to ``sp_matmul_topn``.
        n_threads: Number of threads, forwarded to ``sp_matmul_topn``.

    Returns:
        Sparse (n_items, n_items) matrix of similarity scores. An item that
        has at least one rating is its own best match (score 1.0), since the
        diagonal is not removed.
    """
    # Items as rows (n_items x n_users). Work on a float copy so the rows can
    # be rescaled in place without touching the caller's matrix.
    items = csr_matrix(A.T, dtype=np.float64, copy=True)
    items.sum_duplicates()

    # L2 norm of every item vector; items without ratings keep a divisor of 1
    # so their all-zero rows stay zero instead of becoming NaN.
    norms = np.sqrt(np.asarray(items.multiply(items).sum(axis=1)).ravel())
    norms[norms == 0] = 1.0

    # CSR keeps each row's entries contiguous in `data`, so repeating a row's
    # norm once per stored entry lines the divisors up with `data`.
    items.data /= np.repeat(norms, np.diff(items.indptr))

    # (n_items x n_users) @ (n_users x n_items) -> (n_items x n_items)
    return sp_matmul_topn(
        items,
        items.T.tocsr(),
        top_n=top_n,
        threshold=threshold,
        n_threads=n_threads,
        sort=True,
    )

In [6]:
# Create a copy of the user_business DataFrame to avoid issues with slicing
user_business = user_business.copy()

# Create user and business index mappings (id -> contiguous matrix index).
# Rows whose user_id is NaN (e.g. businesses without any review after an
# outer join) do not represent a user, so they must not get a matrix row.
user_mapping = {
    user: idx
    for idx, user in enumerate(user_business['user_id'].dropna().unique())
}
business_mapping = {
    biz: idx
    for idx, biz in enumerate(user_business['business_id'].unique())
}

# Map user_id and business_id to numerical indices (NaN where unmapped)
user_business['user_idx'] = user_business['user_id'].map(user_mapping)
user_business['business_idx'] = user_business['business_id'].map(business_mapping)

# Keep real ratings only and collapse repeated reviews of the same business
# by the same user into their mean. csr_matrix would otherwise SUM duplicate
# (row, col) entries and produce ratings outside the 1-5 star scale.
ratings = (
    user_business
    .dropna(subset=['user_idx', 'business_idx', 'stars_review'])
    .groupby(['user_idx', 'business_idx'], as_index=False)['stars_review']
    .mean()
)

# Creating the sparse user-item interaction matrix (csr_matrix).
# Businesses without ratings keep their (all-zero) column.
user_item_sparse = csr_matrix(
    (
        ratings['stars_review'].to_numpy(dtype=np.float64),
        (
            ratings['user_idx'].to_numpy(dtype=np.int64),
            ratings['business_idx'].to_numpy(dtype=np.int64),
        ),
    ),
    shape=(len(user_mapping), len(business_mapping)),
)

# Compute sparse similarity matrix with the top 50 most similar items,
# discarding scores that do not exceed 0.01
item_similarity_sparse = sparse_cosine_similarity_topn(
    user_item_sparse, top_n=50, threshold=0.01
)

# Convert sparse similarity matrix to a DataFrame (optional, for viewing)
business_labels = list(business_mapping)
item_sim_sparse_df = pd.DataFrame.sparse.from_spmatrix(
    item_similarity_sparse,
    index=business_labels,
    columns=business_labels,
)

In [7]:
# Open the file-backed SQLite database (the file is created if it is missing)
conn = sqlite3.connect('../data/processed_data/yelp_data.db')
cursor = conn.cursor()

# DDL for the two ItemCF tables; each is created only when it does not exist.
#   ItemCF_user_item: user_id, business_id, stars_review
#   ItemCF_item_item: business_id, similar_business_id, similarity_score
_ITEMCF_TABLE_DDL = (
    '''
CREATE TABLE IF NOT EXISTS ItemCF_user_item (
    user_id TEXT,
    business_id TEXT,
    stars_review REAL
)
''',
    '''
CREATE TABLE IF NOT EXISTS ItemCF_item_item (
    business_id TEXT,
    similar_business_id TEXT,
    similarity_score REAL
)
''',
)
for _ddl in _ITEMCF_TABLE_DDL:
    cursor.execute(_ddl)

<sqlite3.Cursor at 0x24409d71140>

In [None]:
# Persist the ItemCF artefacts into the tables created above:
#   ItemCF_user_item (user_id, business_id, stars_review)
#   ItemCF_item_item (business_id, similar_business_id, similarity_score)
# Only the non-zero entries of the sparse matrices are written, one row per
# entry. Turning every sparse row into a dense list would need
# n_users x n_items values in memory at once.
def _iter_nonzero(matrix, row_labels, col_labels):
    """Yield (row_label, col_label, value) for each non-zero stored entry."""
    coo = matrix.tocoo()
    # .tolist() converts numpy scalars to plain Python values, which is what
    # sqlite3 can bind as parameters.
    for r, c, value in zip(coo.row.tolist(), coo.col.tolist(), coo.data.tolist()):
        if value != 0:
            yield row_labels[r], col_labels[c], value


# Both mappings were built with enumerate(), so the position of a key in the
# dict equals its matrix index: list(mapping)[idx] is the original id.
user_ids = list(user_mapping)
business_ids = list(business_mapping)

# NOTE: rows are appended; the tables have no uniqueness constraint, so
# running this cell again stores the same rows a second time.
try:
    # `with conn` commits when the block succeeds and rolls back on error.
    with conn:
        print("Inserting user-item interactions:")
        cursor.executemany(
            '''
            INSERT INTO ItemCF_user_item (user_id, business_id, stars_review)
            VALUES (?, ?, ?)
            ''',
            _iter_nonzero(user_item_sparse, user_ids, business_ids),
        )

        print("Inserting item-item similarities:")
        cursor.executemany(
            '''
            INSERT INTO ItemCF_item_item (business_id, similar_business_id, similarity_score)
            VALUES (?, ?, ?)
            ''',
            _iter_nonzero(item_similarity_sparse, business_ids, business_ids),
        )
finally:
    # Always release the database file, even if an insert failed.
    conn.close()