In the Retrieval stage, we try to use the following methods/models to retrieve the relevant documents:
1. Item-based Collaborative Filtering
   - We only consider the `star` information that users give to the business in the `yelp_academic_dataset_review.json` file.
   - We use `csr_matrix` from `scipy` to store the user-item matrix (for sparse matrix).
   - We use `sp_matmul_topn` from `sparse_dot_topn` to calculate the cosine similarity to between two businesses. 
     - Compare to the `cosine_similarity` from `sklearn`, `sp_matmul_topn` is much faster as it only calculates the top-n similar items.
     - Compare to the `approximate_nearest_neighbors` from `annoy`, `sp_matmul_topn` is much faster when finding building the index, but slower when querying. However, Item-based Collaborative Filtering 
2. User-based Collaborative Filtering
3. Deep Structured Semantic Models

In [2]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import sp_matmul_topn

In [3]:
def load_dataset(file_lists, prefix_path, chunk_size=10000):
    df_dict = {}
    prefix_path += "sampled_"
    for file in file_lists:
        try:
            df_chunks = []
            total_records = 0

            for chunk in pd.read_json(prefix_path + file, lines=True, chunksize=chunk_size):
                df_chunks.append(chunk)
                total_records += chunk.shape[0]

            df = pd.concat(df_chunks, ignore_index=True)
            df_dict[file] = df
            print(f"Total records in {file}: {df.shape[0]}.")

        except Exception as e:
            print(f"Error: {e}")
            continue
    return df_dict

In [4]:
folder_path = '../data/'
transit_bucket = 'raw_datasets/'
target_bucket = 'yelp/'
prefix_path = folder_path + transit_bucket + target_bucket
file_list = [
    "yelp_academic_dataset_business.json",
    "yelp_academic_dataset_review.json",
    # "yelp_academic_dataset_tip.json",
    # "yelp_academic_dataset_checkin.json",
    # "yelp_academic_dataset_user.json",
]

In [5]:
df = load_dataset(file_list, prefix_path)
df_business = df["yelp_academic_dataset_business.json"]
df_review = df["yelp_academic_dataset_review.json"]

df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))

user_business = df_concat[["user_id", "business_id", "stars_review"]]

Total records in yelp_academic_dataset_business.json: 78059.
Total records in yelp_academic_dataset_review.json: 980418.


In [6]:
# Function to calculate sparse cosine similarity with top N items
def sparse_cosine_similarity_topn(A, top_n, threshold=0):
    # A is the sparse matrix (user-item matrix)
    # ntop is the number of top similar items you want
    # lower_bound is the minimum similarity score to consider

    # # Compute the top N cosine similarities in a sparse format
    
    C = sp_matmul_topn(A.T, A.T, top_n=top_n, threshold=threshold, n_threads=4, sort=True)

    return C

In [7]:
# Create a copy of the user_business DataFrame to avoid issues with slicing
user_business = user_business.copy()

# Create user and business index mappings
user_mapping = {user: idx for idx, user in enumerate(user_business['user_id'].unique())}
business_mapping = {biz: idx for idx, biz in enumerate(user_business['business_id'].unique())}

# Map user_id and business_id to numerical indices
user_business['user_idx'] = user_business['user_id'].map(user_mapping)
user_business['business_idx'] = user_business['business_id'].map(business_mapping)

# Creating the sparse user-item interaction matrix (csr_matrix)
user_item_sparse = csr_matrix(
    (user_business['stars_review'], (user_business['user_idx'], user_business['business_idx'])),
    shape=(len(user_mapping), len(business_mapping))
)

# Replace any NaN values with 0 in the sparse matrix
user_item_sparse.data = np.nan_to_num(user_item_sparse.data)

# Compute sparse cosine similarity matrix with top 10 most similar items
item_similarity_sparse = sparse_cosine_similarity_topn(user_item_sparse, top_n=50, threshold=0.01,)

# Convert sparse similarity matrix to a DataFrame (optional, for viewing)
# item_sim_sparse_df = pd.DataFrame.sparse.from_spmatrix(
#     item_similarity_sparse, 
#     index=business_mapping.keys(), 
#     columns=business_mapping.keys()
# )

In [23]:
# Connect to SQLite (this will create a file-based database)
db_path = '../data/processed_data/yelp_ItemCF.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Create tables for user-item and item-item indexes
cursor.execute('''CREATE TABLE IF NOT EXISTS user_item_index (
    user_id TEXT,
    business_id TEXT,
    stars_review REAL
)''')

cursor.execute('''CREATE TABLE IF NOT EXISTS item_item_similarity (
    item_id TEXT PRIMARY KEY,
    similarity_vector BLOB
)''')

# Commit the changes
conn.commit()

In [24]:
# Insert user-item interaction matrix into SQLite
def insert_user_item(user_business, conn, batch_size=10000):
    cursor = conn.cursor()
    
    # Insert user-item interactions in batches
    total_records = len(user_business)
    for i in range(0, total_records, batch_size):
        batch = user_business.iloc[i:i + batch_size]
        
        cursor.executemany('''INSERT INTO user_item_index (user_id, business_id, stars_review)
                              VALUES (?, ?, ?)''',
                           batch[['user_id', 'business_id', 'stars_review']].values.tolist())
        
        conn.commit()  # Commit the batch
        
        # Show progress
        print(f"{i + len(batch)} / {total_records} records stored in user_item_index")



In [25]:
# Function to insert item vectors into SQLite
def insert_item_vectors(item_similarity_sparse, business_mapping, conn, batch_size=1000, progress_interval=100000):
    cursor = conn.cursor()

    total_inserted = 0
    batch = []

    # Iterate over each row (item) in the sparse matrix
    for row_idx in range(item_similarity_sparse.shape[0]):
        # Get the row as a sparse vector (csr_matrix row)
        row_vector = item_similarity_sparse.getrow(row_idx)

        # Serialize the row vector using pickle
        serialized_row = pickle.dumps(row_vector)

        # Get the item id (business_id)
        item_id = list(business_mapping.keys())[row_idx]

        # Add the item and its vector to the batch
        batch.append((item_id, serialized_row))

        # Insert in batches to reduce the number of commits
        if len(batch) >= batch_size:
            cursor.executemany('''INSERT INTO item_item_similarity (item_id, similarity_vector)
                                  VALUES (?, ?)''', batch)
            conn.commit()  # Commit the batch
            total_inserted += len(batch)

            # Print progress every 100,000 records
            if total_inserted % progress_interval == 0:
                print(f"Inserted {total_inserted} item vectors so far...")

            batch = []  # Clear the batch after committing

    # Insert any remaining records after the loop
    if batch:
        cursor.executemany('''INSERT INTO item_item_similarity (item_id, similarity_vector)
                              VALUES (?, ?)''', batch)
        conn.commit()
        total_inserted += len(batch)

    # Final progress message
    print(f"Total {total_inserted} item vectors inserted.")

In [26]:
# Insert user-item index with progress
insert_user_item(user_business, conn)

# Insert item vectors into the database
insert_item_vectors(item_similarity_sparse, business_mapping, conn)


# Close the connection when done

conn.close()

10000 / 985732 records stored in user_item_index
20000 / 985732 records stored in user_item_index
30000 / 985732 records stored in user_item_index
40000 / 985732 records stored in user_item_index
50000 / 985732 records stored in user_item_index
60000 / 985732 records stored in user_item_index
70000 / 985732 records stored in user_item_index
80000 / 985732 records stored in user_item_index
90000 / 985732 records stored in user_item_index
100000 / 985732 records stored in user_item_index
110000 / 985732 records stored in user_item_index
120000 / 985732 records stored in user_item_index
130000 / 985732 records stored in user_item_index
140000 / 985732 records stored in user_item_index
150000 / 985732 records stored in user_item_index
160000 / 985732 records stored in user_item_index
170000 / 985732 records stored in user_item_index
180000 / 985732 records stored in user_item_index
190000 / 985732 records stored in user_item_index
200000 / 985732 records stored in user_item_index
210000 / 

In [32]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Retrieve the first one hundred item vectors
cursor.execute('SELECT similarity_vector FROM item_item_similarity LIMIT 100')
serialized_vectors = cursor.fetchall()

# Deserialize all vectors
item_vectors = [pickle.loads(row[0]) for row in serialized_vectors]

# Close the connection
conn.close()


In [36]:
# display the stored elements in the first item vector
item_vectors[0].data

array([25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
       25., 25., 25., 25., 25., 25., 25., 20., 20., 10.,  5.])