### Random Recommendation Baseline,
    This notebook implements a Random Recommendation baseline model for the Yelp dataset. It recommends a random set of businesses to each user and evaluates the performance using retrieval metrics.
    
#### Pre-requisites,
- The Yelp dataset is loaded from the processed data folder (`../../data/processed_data/yelp_data/`).

In [1]:
import sys
sys.path.append('../')
from utilities import *

In [2]:
# Define the database folder path and file names
db_folder = '../../data/processed_data/yelp_data/'
data_files = ['business', 'categories', 'review']

# Load data into a dictionary
yelp_data = load_data_from_db(db_folder, data_files)

# Check loaded data
for table, df in yelp_data.items():
    print(f"Loaded {len(df)} rows from {table} table.")

Loaded 78059 rows from business table.
Loaded 360656 rows from categories table.
Loaded 980418 rows from review table.


In [3]:
df_business = yelp_data["business"]
df_review = yelp_data["review"]

user_mapping, business_mapping, user_business = get_user_business(df_business, df_review)

In [4]:
df_concat = df_business.merge(df_review, on='business_id', how='outer', suffixes=('_business', '_review'))
user_business = df_concat[["user_id", "business_id", "stars_review"]]

# Get all unique business IDs from the dataset
all_business_ids = user_business['business_id'].unique()

In [5]:
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)

In [6]:
# Function to generate random recommendations for a user
def predict_random_interests(user_id, k=10, all_business_ids=all_business_ids):
    # Randomly sample k business IDs without replacement
    recommended_businesses = np.random.choice(all_business_ids, size=k, replace=False)
    # Generate random scores between 0 and 1
    scores = np.random.uniform(0, 1, size=k)
    return list(zip(recommended_businesses, scores))

In [7]:
positive_reviews = test_data[test_data['stars_review'] >= 4]
negative_reviews = test_data[test_data['stars_review'] < 4]

In [8]:
print(f"Number of positive reviews: {len(positive_reviews)}")
print(f"Number of negative reviews: {len(negative_reviews)}")
print(f"Total number of reviews: {len(test_data)}")
print(f"Ratio of positive to negative reviews: {len(positive_reviews) / len(negative_reviews):.2f}")

Number of positive reviews: 136473
Number of negative reviews: 59624
Total number of reviews: 197147
Ratio of positive to negative reviews: 2.29


In [9]:
test_data_grouped = test_data.groupby('user_id')['business_id'].apply(list).reset_index()

In [10]:
recommendations = {}
k_recommendations = 300
max_users = min(1000, len(test_data_grouped))

# Loop through users and generate recommendations
for i in range(max_users):
    user_id = test_data_grouped['user_id'].iloc[i]
    recommendation = predict_random_interests(user_id, k=k_recommendations)
    business_ids, scores = zip(*recommendation)
    recommendations[user_id] = (list(business_ids), list(scores))

In [11]:
def check_recommendations(recommendations, test_data_grouped, pos=4):
    total = 0
    total_positive = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    ranks = []
    for i, row in test_data_grouped.iterrows():
        user_id = row['user_id']
        business_ids = row['business_id']
        rank = 0
        if user_id in recommendations:
            recommended_businesses = recommendations[user_id][0]
            for business_id in business_ids:
                star_rating = test_data[(test_data['user_id'] == user_id) & 
                                       (test_data['business_id'] == business_id)]['stars_review'].values[0]
                if star_rating >= pos:
                    total_positive += 1
                if business_id in recommended_businesses:
                    if star_rating >= pos:
                        true_positive += 1
                    else:
                        false_positive += 1
                    rank = recommended_businesses.index(business_id) + 1
                else:
                    if star_rating < pos:
                        true_negative += 1
                    else:
                        false_negative += 1
            total += len(business_ids)
        ranks.append(rank)
    return true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks

In [12]:
true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks = check_recommendations(recommendations, test_data_grouped)

In [13]:
evaluation_metric, confusion_matrix, background_stats = compute_evaluation_metric(true_positive, true_negative, false_positive, false_negative, total, total_positive, ranks)

In [14]:
print("Testing Data Statistics")
display(background_stats)

print("Evaluation Metrics")
display(evaluation_metric)

print("Confusion Matrix")
display(confusion_matrix)

Testing Data Statistics


Unnamed: 0,Total Positive,Total Negative,Total,Ratio
0,1821,813,2634,0.691344


Evaluation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,F-beta Score,Mean Reciprocal Rank
0,0.3087,0.5,0.0022,0.0044,0.0027,0.0204


Confusion Matrix


Unnamed: 0,True Positive,True Negative,False Positive,False Negative
0,4,809,4,1817
