### User-base Collaborative Filtering - Retrieval
This notebook demonstrates how to retrieve the recommendations for a user using the Item-based Collaborative Filtering model in production. 

#### Pre-requisites
1. Have the index trained and saved in the `yelp_UserCF.db` file.

In [1]:
import sqlite3
import pickle
import sys
sys.path.append('../')
from utilities import *
from sklearn.model_selection import train_test_split

In [2]:
# Connect to the SQLite database
db_path = './yelp_UserCF.db'
conn = sqlite3.connect(db_path)

In [3]:
# Database consistency check
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM user_mapping")
print(f"User mappings: {cursor.fetchone()[0]}")
cursor.execute("SELECT COUNT(*) FROM user_user_similarity")
print(f"Similarity vectors: {cursor.fetchone()[0]}")
cursor.execute("SELECT COUNT(*) FROM user_item_index")
print(f"User-item records: {cursor.fetchone()[0]}")

User mappings: 162083
Similarity vectors: 162083
User-item records: 785955


In [4]:
# Load test data for overlap check
yelp_data = load_data_from_db('../../data/processed_data/yelp_data/', ['business', 'review'])
_, _, user_business = get_user_business(yelp_data['business'], yelp_data['review'])
train_data, test_data = train_test_split(user_business, test_size=0.2, random_state=42)
test_businesses = set(test_data['business_id'])

In [5]:
# Function to retrieve user-business mappings from the database
def retrieve_user_user_mapping(conn):
    cursor = conn.cursor()

    # Fetch user mappings
    cursor.execute('''SELECT user_id, user_idx FROM user_mapping''')
    user_mapping = {row[0]: row[1] for row in cursor.fetchall()}

    return user_mapping

In [6]:
# Function to get businesses a user interacted with
def get_user_businesses(user_id, conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT business_id, stars_review FROM user_item_index WHERE user_id = ?''', (user_id,))
    return cursor.fetchall()

In [7]:
# Function to get top-k similar users for a given user
def get_top_k_similar_users(user_id, user_mapping, k, conn):
    cursor = conn.cursor()
    cursor.execute('''SELECT similarity_vector FROM user_user_similarity WHERE user_id = ?''', (user_id,))
    result = cursor.fetchone()

    if result is None:
        return []

    similarity_vector = pickle.loads(result[0])
    indices, data = similarity_vector

    # Get top-k similar users
    top_k = sorted(zip(indices, data), key=lambda x: -x[1])[:k]
    
    # # Map indices to user ids
    # similar_users = [(list(user_mapping.keys())[idx], score) for idx, score in top_k]

    idx_to_user = {v: k for k, v in user_mapping.items()}  # Reverse mapping
    similar_users = [(idx_to_user.get(idx, "Unknown"), score) for idx, score in top_k]

    return similar_users

In [8]:
# Function to predict user interests based on similar users
def predict_user_interests_usercf(user_id, user_mapping, conn, k=300, test_businesses=None):
    # Get top-k similar users
    similar_users = get_top_k_similar_users(user_id, user_mapping, k, conn)  # More similar users

    recommended_businesses = {}

    # For each similar user, get their business interactions
    for similar_user_id, similarity_score in similar_users:

        similar_user_businesses = get_user_businesses(similar_user_id, conn)
        for business_id, score in similar_user_businesses:
            if test_businesses and business_id not in test_businesses:  # Debug: force overlap
                continue
            if business_id in recommended_businesses:
                recommended_businesses[business_id] += score
            else:
                recommended_businesses[business_id] = score

    # Sort recommendations by score
    recommended_businesses = sorted(recommended_businesses.items(), key=lambda x: -x[1])   
    
    return recommended_businesses[:k]

In [9]:
# Retrieve the user-business mappings
user_mapping = retrieve_user_user_mapping(conn)

In [10]:
# Test similarity retrieval
user_id = 'razUB7ciYZluvxWM6shmtw'
similar_users = get_top_k_similar_users(user_id, user_mapping, 5, conn)
print(f"Top 5 similar users for {user_id}: {similar_users}")

Top 5 similar users for razUB7ciYZluvxWM6shmtw: [('razUB7ciYZluvxWM6shmtw', np.float64(433.0)), ('8EneFRH-5HEmfQ9AOJwc-Q', np.float64(52.0)), ('h8L_oYRfB_fhpwPM9-pgpQ', np.float64(45.0)), ('_BcWyKQL16ndpBdggh2kNA', np.float64(35.0)), ('nDEC-7GY0jSG2PlAnMCeZg', np.float64(25.0))]


In [11]:
# Get the top 10 users in the database
# user_ids = list(user_mapping.keys())[:10]

user_ids = ['razUB7ciYZluvxWM6shmtw', '3YhG4h4Ok654iVfqdmkuRg', 'VyC2fG4dcMG07nrxh4jLnw', 'Q5jOFJYhIsN8ouJ1rnsLQQ', 'gdcRlubKDmslUYFPHUp1Cg', 'CNeaJDD_ZihiafOvSGSpPw', 'TItDMAEw7_6Nge38jJPspA', 'zxyKewY6p0CjnFprEutaog', 'IsdzdZEH9uHTnwX3acqhtg', 'AC-oO5luq3enSNFDYAVKJA']

for user_id in user_ids:
    recommendations = predict_user_interests_usercf(user_id, user_mapping, k=10, conn=conn)
    print(f"Recommendations for user {user_id}: {recommendations}")

# Close the database connection when done
conn.close()

Recommendations for user razUB7ciYZluvxWM6shmtw: [('KIjOP0a4gTe-zRdnVCgsGA', 15.0), ('297f-8AdTbu-wwVedvac6w', 14.0), ('qlt0aeVnh4B3X5mUeEUtnQ', 13.0), ('Mt1FB8hM4XTi3Tk4i0q5gQ', 10.0), ('U6pzPIczfAYNrEygELdUdw', 10.0), ('VgkRK_mHhy15M3bZk0isTw', 10.0), ('xVFNGIsWBTcYmKB85HH6Qg', 10.0), ('neiMxOk7V4Zgq5zsvuWTaQ', 10.0), ('uKK_8fcwaV5JuCv3Nj-OIw', 10.0), ('tYqIKRQjDCywDVkbfUGSug', 10.0)]
Recommendations for user 3YhG4h4Ok654iVfqdmkuRg: [('VXPpzhD7mA262gIv1T0WPQ', 24.0), ('f9_TLVlUHBv0869CygEbZg', 17.0), ('2fTfpN5SggLgW4LlzptMPg', 14.0), ('mMnkbIMG9MUW-rb5a5Q8Og', 14.0), ('HK8VCqWyo1Rxw_2KTCn2Og', 14.0), ('26EPJeHypRPvF4UgJWi2Zw', 13.0), ('6FPr7wbkqnRlRmFVniwmHg', 13.0), ('eOWBvwP1MgJx6PrrWBoUuA', 13.0), ('Eq5w0ZAW0PV30nNkJxJY_A', 13.0), ('2Y7h1oxMRfVVGvc02aTn5g', 12.0)]
Recommendations for user VyC2fG4dcMG07nrxh4jLnw: [('NnlXgLZ93KPj1wYSiqjyIg', 20.0), ('l4IeGBHXV2E8S8COd9rx9A', 17.0), ('j4KA-BujEH7xmG6bD5E-gQ', 14.0), ('-B6XL-ZWsVHlAQyYcd3eEg', 13.0), ('x45_iDu3qheXFJrWNobmHQ', 10.0), 