In [1]:
import sqlite3
import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sparse_dot_topn import sp_matmul_topn

In [2]:
"""
def load_dataset(file_lists, prefix_path, chunk_size=10000):
    df_dict = {}
    prefix_path += "sampled_"
    for file in file_lists:
        try:
            df_chunks = []
            total_records = 0

            for chunk in pd.read_json(prefix_path + file, lines=True, chunksize=chunk_size):
                df_chunks.append(chunk)
                total_records += chunk.shape[0]

            df = pd.concat(df_chunks, ignore_index=True)
            df_dict[file] = df
            print(f"Total records in {file}: {df.shape[0]}.")

        except Exception as e:
            print(f"Error: {e}")
            continue
    return df_dict
"""

'\ndef load_dataset(file_lists, prefix_path, chunk_size=10000):\n    df_dict = {}\n    prefix_path += "sampled_"\n    for file in file_lists:\n        try:\n            df_chunks = []\n            total_records = 0\n\n            for chunk in pd.read_json(prefix_path + file, lines=True, chunksize=chunk_size):\n                df_chunks.append(chunk)\n                total_records += chunk.shape[0]\n\n            df = pd.concat(df_chunks, ignore_index=True)\n            df_dict[file] = df\n            print(f"Total records in {file}: {df.shape[0]}.")\n\n        except Exception as e:\n            print(f"Error: {e}")\n            continue\n    return df_dict\n'

In [3]:
"""
folder_path = '../data/'
transit_bucket = 'raw_datasets/'
target_bucket = 'yelp/'
prefix_path = folder_path + transit_bucket + target_bucket
file_list = [
    "yelp_academic_dataset_business.json",
    "yelp_academic_dataset_review.json",
]
"""

'\nfolder_path = \'../data/\'\ntransit_bucket = \'raw_datasets/\'\ntarget_bucket = \'yelp/\'\nprefix_path = folder_path + transit_bucket + target_bucket\nfile_list = [\n    "yelp_academic_dataset_business.json",\n    "yelp_academic_dataset_review.json",\n]\n'

In [4]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
db_files = ['yelp_business_data.db', 'yelp_review_data.db']
db_paths = [db_folder + db_file for db_file in db_files]

In [5]:
# Connect to the databases and load data
def load_data_from_db():
    data = {}
    
    # Open connections and read tables
    conns = [sqlite3.connect(db_path) for db_path in db_paths]
    try:
        # Load tables from the databases
        data['business'] = pd.read_sql_query("SELECT * FROM business_details", conns[0])
        data['categories'] = pd.read_sql_query("SELECT * FROM business_categories", conns[0])
        data['review'] = pd.read_sql_query("SELECT * FROM review_data", conns[1])
    finally:
        # Close all database connections
        for conn in conns:
            conn.close()
    return data

In [6]:
# Load data into a dictionary
yelp_data = load_data_from_db()

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [7]:
df_business = yelp_data["business"]
df_review = yelp_data["review"]

df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))

user_business = df_concat[["user_id", "business_id", "stars_review"]]

In [8]:
# split the data into training and test sets
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [9]:
# Function to calculate sparse cosine similarity with top N items
def sparse_cosine_similarity_topn(A, top_n, threshold=0):
    # A is the sparse matrix (user-item matrix)
    # ntop is the number of top similar items you want
    # lower_bound is the minimum similarity score to consider

    # # Compute the top N cosine similarities in a sparse format
    
    C = sp_matmul_topn(A.T, A.T, top_n=top_n, threshold=threshold, n_threads=4, sort=True)

    return C

In [10]:
# Create a copy of the user_business DataFrame to avoid issues with slicing
user_business = train_data.copy()

# Create user and business index mappings
user_mapping = {user: idx for idx, user in enumerate(user_business['user_id'].unique())}
business_mapping = {biz: idx for idx, biz in enumerate(user_business['business_id'].unique())}

# Map user_id and business_id to numerical indices
user_business['user_idx'] = user_business['user_id'].map(user_mapping)
user_business['business_idx'] = user_business['business_id'].map(business_mapping)

# Creating the sparse user-item interaction matrix (csr_matrix)
user_item_sparse = csr_matrix(
    (user_business['stars_review'], (user_business['user_idx'], user_business['business_idx'])),
    shape=(len(user_mapping), len(business_mapping))
)

# Replace any NaN values with 0 in the sparse matrix
user_item_sparse.data = np.nan_to_num(user_item_sparse.data)

# Compute sparse cosine similarity matrix with top 10 most similar items
item_similarity_sparse = sparse_cosine_similarity_topn(user_item_sparse, top_n=50, threshold=0.01,)

In [11]:
# Connect to SQLite (this will create a file-based database)
db_path = './yelp_ItemCF.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Create tables for user-item and item-item indexes
cursor.execute('''CREATE TABLE IF NOT EXISTS user_item_index (
    user_id TEXT,
    business_id TEXT,
    stars_review REAL,
    PRIMARY KEY (user_id, business_id)
)''')

cursor.execute('''CREATE INDEX idx_user_item ON user_item_index(user_id, business_id)''')

cursor.execute('''CREATE TABLE IF NOT EXISTS item_item_similarity (
    item_id TEXT PRIMARY KEY,
    similarity_vector BLOB
)''')

cursor.execute('''CREATE INDEX idx_item_similarity ON item_item_similarity(item_id)''')

# cursor.execute('''CREATE TABLE IF NOT EXISTS user_mapping (
#     user_id TEXT PRIMARY KEY,
#     user_idx INTEGER
# )''')

cursor.execute('''CREATE TABLE IF NOT EXISTS business_mapping (
    business_id TEXT PRIMARY KEY,
    business_idx INTEGER
)''')


# Commit the changes
conn.commit()

In [12]:
def insert_user_item(user_business, conn, batch_size=10000):
    cursor = conn.cursor()

    # Start a transaction
    cursor.execute('BEGIN TRANSACTION')

    # Insert user-item interactions in batches
    total_records = len(user_business)
    for i in range(0, total_records, batch_size):
        batch = user_business.iloc[i:i + batch_size]

        cursor.executemany('''INSERT OR IGNORE INTO user_item_index (user_id, business_id, stars_review)
                              VALUES (?, ?, ?)''',
                           batch[['user_id', 'business_id', 'stars_review']].values.tolist())

        # Show progress
        print(f"{i + len(batch)} / {total_records} records stored in user_item_index")

    # Commit once at the end of the transaction
    conn.commit()

In [13]:
def insert_item_vectors(item_similarity_sparse, business_mapping, conn, batch_size=1000, progress_interval=100000):
    cursor = conn.cursor()

    # Start a transaction
    cursor.execute('BEGIN TRANSACTION')

    total_inserted = 0
    batch = []

    # Iterate over each row (item) in the sparse matrix
    for row_idx in range(item_similarity_sparse.shape[0]):
        # Get the row as a sparse vector (csr_matrix row)
        row_vector = item_similarity_sparse.getrow(row_idx)

        # Extract indices and data from the sparse vector
        row_indices = row_vector.indices
        row_data = row_vector.data

        # Serialize only indices and data (not the full matrix)
        serialized_row = pickle.dumps((row_indices, row_data))

        # Get the item id (business_id)
        item_id = list(business_mapping.keys())[row_idx]

        # Add the item and its vector to the batch
        batch.append((item_id, serialized_row))

        # Insert in batches to reduce the number of commits
        if len(batch) >= batch_size:
            cursor.executemany('''INSERT OR REPLACE INTO item_item_similarity (item_id, similarity_vector)
                                  VALUES (?, ?)''', batch)
            total_inserted += len(batch)

            # Print progress every progress_interval records
            if total_inserted % progress_interval == 0:
                print(f"Inserted {total_inserted} item vectors so far...")

            batch = []  # Clear the batch after committing

    total_inserted += len(batch)  # Add any remaining records
    # Insert any remaining records after the loop
    if batch:
        cursor.executemany('''INSERT OR REPLACE INTO item_item_similarity (item_id, similarity_vector)
                                  VALUES (?, ?)''', batch)

    # Commit once at the end of the transaction
    conn.commit()

    # Final progress message
    print(f"Total {total_inserted} item vectors inserted.")

In [14]:
def insert_mappings(business_mapping, conn):
    cursor = conn.cursor()

    # Start a transaction
    cursor.execute('BEGIN TRANSACTION')

    # Insert user mappings
    # cursor.executemany('''INSERT OR REPLACE INTO user_mapping (user_id, user_idx) VALUES (?, ?)''',
    #                    [(user_id, idx) for user_id, idx in user_mapping.items()])

    # Insert business mappings
    cursor.executemany('''INSERT OR REPLACE INTO business_mapping (business_id, business_idx) VALUES (?, ?)''',
                       [(business_id, idx) for business_id, idx in business_mapping.items()])

    # Commit once at the end of the transaction
    conn.commit()

    print(f"Inserted {len(business_mapping)} business mappings.")

In [15]:
# Insert user-item index with progress
insert_user_item(user_business, conn)

# Insert item vectors into the database
insert_item_vectors(item_similarity_sparse, business_mapping, conn)

# Insert user and business mappings
insert_mappings(business_mapping, conn)

# Close the connection when done
conn.close()

10000 / 788585 records stored in user_item_index
20000 / 788585 records stored in user_item_index
30000 / 788585 records stored in user_item_index
40000 / 788585 records stored in user_item_index
50000 / 788585 records stored in user_item_index
60000 / 788585 records stored in user_item_index
70000 / 788585 records stored in user_item_index
80000 / 788585 records stored in user_item_index
90000 / 788585 records stored in user_item_index
100000 / 788585 records stored in user_item_index
110000 / 788585 records stored in user_item_index
120000 / 788585 records stored in user_item_index
130000 / 788585 records stored in user_item_index
140000 / 788585 records stored in user_item_index
150000 / 788585 records stored in user_item_index
160000 / 788585 records stored in user_item_index
170000 / 788585 records stored in user_item_index
180000 / 788585 records stored in user_item_index
190000 / 788585 records stored in user_item_index
200000 / 788585 records stored in user_item_index
210000 / 

In [16]:
# Connect to the SQLite database
db_path = './yelp_ItemCF.db'
conn = sqlite3.connect(db_path)

In [17]:
# Function to get businesses a user interacted with
def get_user_businesses(user_id, conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?''', (user_id,))
    return cursor.fetchall()

In [18]:
# Function to get top-k similar businesses for a given business
def get_top_k_similar_businesses(business_id, k, conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT similarity_vector FROM item_item_similarity WHERE item_id = ?''', (business_id,))
    result = cursor.fetchone()

    if result is None:
        return []

    similarity_vector = pickle.loads(result[0])
    indices, data = similarity_vector

    # Get top-k similar businesses
    top_k = sorted(zip(indices, data), key=lambda x: -x[1])[:k]

    # Map indices to business ids
    similar_businesses = [(list(business_mapping.keys())[idx], score) for idx, score in top_k]

    return similar_businesses

In [19]:
# Function to predict user interests based on similar businesses
def predict_user_interests(user_id, k=10, conn=conn):
    user_businesses = get_user_businesses(user_id, conn)

    recommended_businesses = {}
    for business_id, _ in user_businesses:
        similar_businesses = get_top_k_similar_businesses(business_id, k, conn)

        for similar_business_id, score in similar_businesses:
            if similar_business_id in recommended_businesses:
                recommended_businesses[similar_business_id] += score
            else:
                recommended_businesses[similar_business_id] = score

    # Sort recommendations by score
    recommended_businesses = sorted(recommended_businesses.items(), key=lambda x: -x[1])

    return recommended_businesses[:k]

In [20]:
# get the number of positive and negative reviews in the test data
positive_reviews = test_data[test_data['stars_review'] >= 4]
negative_reviews = test_data[test_data['stars_review'] <= 2]

print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

# down-sample the positive reviews to balance the dataset
positive_reviews_downsampled = positive_reviews.sample(n=len(negative_reviews), random_state=42)

# combine the down-sampled positive reviews with the negative reviews
balanced_test_data = pd.concat([positive_reviews_downsampled, negative_reviews], ignore_index=True)

# shuffle the balanced test data
balanced_test_data = balanced_test_data.sample(frac=1, random_state=42).reset_index(drop=True)

# new statistics for the balanced test data
positive_reviews = balanced_test_data[balanced_test_data['stars_review'] >= 4]
negative_reviews = balanced_test_data[balanced_test_data['stars_review'] <= 2]

print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(balanced_test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 136473
Number of negative reviews: 32929
Total number of reviews: 197147
Ratio of positive to negative reviews: 4.14
Number of positive reviews: 32929
Number of negative reviews: 32929
Total number of reviews: 65858
Ratio of positive to negative reviews: 1.00


In [22]:



# group the test data by user_id and get the business_id
test_data_grouped = balanced_test_data.groupby('user_id')['business_id'].apply(list).reset_index()

# get the recommendations for each user in the test data
recommendations = {}

i = 0
for user_id in test_data_grouped['user_id']:
    recommendation = predict_user_interests(user_id, k=300, conn=conn)
    business_ids, scores = [], []
    for business_id, score in recommendation:
        business_ids.append(business_id)
        scores.append(score)
    recommendations[user_id] = (business_ids, scores) 
    i += 1
    if i == 1000:
        break



In [23]:
def check_recommendations(recommendations, test_data_grouped, pos=4):
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for i, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            for business_id in business_ids:
                star_rating = test_data[(test_data['user_id'] == user_id) & (test_data['business_id'] == business_id)]['stars_review'].values[0]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    # get the rank of the business_id in the recommendations
                    rank = recommended_businesses.index(business_id) + 1
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [24]:
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(recommendations, test_data_grouped)


In [25]:
# calculate the evaluation metrics
accuracy = (true_positive + true_negative) / total
precision = true_positive / (true_positive + false_positive) 
recall = true_positive / total_positive
f1_score = 2 * precision * recall / (precision + recall)
# mean_reciprocal_rank = np.mean([1 / rank for rank in ranks])
beta = 1.5
f_beta = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)


total_negative = total - total_positive
background_stats = pd.DataFrame({
    'Total Positive': [total_positive],
    'Total Negative': [total_negative],
    'Total': [total],
    'Ratio': [total_positive / total],
})

print("Testing Data Statistics")
display(background_stats)


evaluation_metric = pd.DataFrame({
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1_score],
    # 'Mean Reciprocal Rank': [mean_reciprocal_rank],
    'F-beta Score': [f_beta]
})

print("Evaluation Metrics")
display(evaluation_metric)


confusion_matrix = pd.DataFrame({
    'True Positive': [true_positive],
    'True Negative': [true_negative],
    'False Positive': [false_positive],
    'False Negative': [false_negative]
})

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,833,839,1672,0.498206


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score
0,0.557416,0.633238,0.265306,0.373942,0.323063


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,221,711,128,612
