In [1]:
import numpy as np
import pandas as pd
import pickle
import sqlite3
from tensorflow.keras.models import load_model

In [2]:
save_folder_path = "./Saved_DeepFM/"

# Load the DeepFM model
DeepFM_model = load_model(save_folder_path + 'DeepFM.keras')

# Load the saved label encoders
with open(save_folder_path + 'user_id_encoder.pkl', 'rb') as f:
    DeepFM_user_id_encoder = pickle.load(f)

with open(save_folder_path + 'business_id_encoder.pkl', 'rb') as f:
    DeepFM_business_id_encoder = pickle.load(f)

# Load the saved scalers
with open(save_folder_path + 'user_scaler.pkl', 'rb') as f:
    DeepFM_user_scaler = pickle.load(f)

with open(save_folder_path + 'business_scaler.pkl', 'rb') as f:
    DeepFM_business_scaler = pickle.load(f)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
def retrieve_user_info(user_ids):
    # Path to the user database
    db_path_user = '../../data/processed_data/yelp_data/yelp_user_data.db'

    # Connect to the database
    conn_user = sqlite3.connect(db_path_user)
    user_info = {}

    # Convert user_ids to a list if it's a single string
    if type(user_ids) == str:
        user_ids = [user_ids]

    try:
        # Fetch user details including the new fields
        user_details_query = f"""
        SELECT user_id, name, review_count, yelping_since, useful, funny, cool, fans,
               average_stars, friends, elite, compliment_hot, compliment_more,
               compliment_profile, compliment_cute, compliment_list, compliment_note,
               compliment_plain, compliment_cool, compliment_funny,
               compliment_writer, compliment_photos
        FROM user_data
        WHERE user_id IN ({','.join(['?' for _ in user_ids])})
        """
        cursor = conn_user.execute(user_details_query, user_ids)
        user_details = cursor.fetchall()

        # Add user details to result
        for user in user_details:
            user_id = user[0]
            user_info[user_id] = {
                "user_id": user_id,
                "name": user[1],
                "review_count": user[2],
                "yelping_since": user[3],
                "useful": user[4],
                "funny": user[5],
                "cool": user[6],
                "fans": user[7],
                "average_stars": user[8],
                "friends": [],
                "elite": [],
                "compliments": {
                    "hot": user[11],
                    "more": user[12],
                    "profile": user[13],
                    "cute": user[14],
                    "list": user[15],
                    "note": user[16],
                    "plain": user[17],
                    "cool": user[18],
                    "funny": user[19],
                    "writer": user[20],
                    "photos": user[21],
                }
            }
            if user[9]:
                user_info[user_id]["friends"] = user[9].split(',')
            if user[10]:
                user_info[user_id]["elite"] = user[10].split(',')

    except Exception as e:
        print(f"Error retrieving user info: {str(e)}")

    finally:
        # Close database connection
        conn_user.close()

    return user_info

In [4]:
def retrieve_business_info(business_ids):
    # Paths to database files
    db_path_business = '../../data/processed_data/yelp_data/yelp_business_data.db'
    db_path_review = '../../data/processed_data/yelp_data/yelp_review_data.db'
    db_path_user = '../../data/processed_data/yelp_data/yelp_user_data.db'
    db_path_tip = '../../data/processed_data/yelp_data/yelp_tip_data.db'

    # Connect to databases
    conn_business = sqlite3.connect(db_path_business)
    conn_review = sqlite3.connect(db_path_review)
    conn_user = sqlite3.connect(db_path_user)
    conn_tip = sqlite3.connect(db_path_tip)

    business_info = {}

    try:
        # Fetch business details including the new fields: categories, attributes, hours
        business_details_query = f"""
        SELECT b.business_id, b.name, b.address, b.city, b.state, b.postal_code, b.latitude, b.longitude, 
               b.stars, b.review_count, b.is_open, b.attributes, b.hours
        FROM business_details b
        WHERE b.business_id IN ({','.join(['?' for _ in business_ids])})
        """
        cursor = conn_business.execute(business_details_query, business_ids)
        business_details = cursor.fetchall()

        # Add business details to result, including attributes and hours
        for business in business_details:
            business_id = business[0]
            business_info[business_id] = {
                "business_id": business_id,
                "name": business[1],
                "address": business[2],
                "city": business[3],
                "state": business[4],
                "postal_code": business[5],
                "latitude": business[6],
                "longitude": business[7],
                "stars": business[8],
                "review_count": business[9],
                "is_open": business[10],
                "attributes": business[11],
                "hours": business[12],
                "categories": [],  # Placeholder for categories
                "reviews": [],
                "tips": [],
                "checkins": []
            }

        # Fetch categories for each business
        category_query = f"""
        SELECT business_id, category
        FROM business_categories
        WHERE business_id IN ({','.join(['?' for _ in business_ids])})
        """
        cursor = conn_business.execute(category_query, business_ids)
        categories = cursor.fetchall()

        # Add categories to corresponding businesses
        for category in categories:
            business_id = category[0]
            if business_id in business_info:
                business_info[business_id]['categories'].append(category[1])

        # Fetch reviews
        review_query = f"SELECT * FROM review_data WHERE business_id IN ({','.join(['?' for _ in business_ids])})"
        cursor = conn_review.execute(review_query, business_ids)
        reviews = cursor.fetchall()

        # Add reviews to corresponding businesses
        for review in reviews:
            business_id = review[2]
            review_data = {
                "review_id": review[0],
                "user_id": review[1],
                "stars": review[3],
                "date": review[4],
                "text": review[5],
                "useful": review[6],
                "funny": review[7],
                "cool": review[8]
            }
            if business_id in business_info:
                business_info[business_id]['reviews'].append(review_data)

        # Fetch tips
        tip_query = f"SELECT * FROM tip_data WHERE business_id IN ({','.join(['?' for _ in business_ids])})"
        cursor = conn_tip.execute(tip_query, business_ids)
        tips = cursor.fetchall()

        # Add tips to corresponding businesses
        for tip in tips:
            business_id = tip[1]
            tip_data = {
                "user_id": tip[0],
                "text": tip[2],
                "date": tip[3],
                "compliment_count": tip[4]
            }
            if business_id in business_info:
                business_info[business_id]['tips'].append(tip_data)

        # Fetch check-ins
        checkin_query = f"SELECT * FROM checkin_data WHERE business_id IN ({','.join(['?' for _ in business_ids])})"
        cursor = conn_business.execute(checkin_query, business_ids)
        checkins = cursor.fetchall()

        # Add check-in data to corresponding businesses
        for checkin in checkins:
            business_id = checkin[0]
            checkin_data = {
                "checkin_date": checkin[1]
            }
            if business_id in business_info:
                business_info[business_id]['checkins'].append(checkin_data)

    except Exception as e:
        print(f"Error retrieving business info: {str(e)}")

    finally:
        # Close database connections
        conn_business.close()
        conn_review.close()
        conn_user.close()
        conn_tip.close()

    return business_info

In [5]:
user_id = "9HQLEChkam3GMBQn0SmvVw"
user_info = retrieve_user_info(user_id)
encoded_user_id = DeepFM_user_id_encoder.transform([user_id])[0]

user_continuous_features = ['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']
user_compliments = ['hot', 'more', 'profile', 'cute', 'list', 'note', 'plain', 'cool', 'funny', 'writer', 'photos']

# Ensure user_info exists
if not user_info:
    raise ValueError(f"No user information found for user_id: {user_id}")

user_data_list = []

for user_id in user_info:
    # Extract continuous features
    user_features = [user_info[user_id].get(feat, 0) for feat in user_continuous_features]
    
    # One-hot encode compliments
    user_compliments_vector = [1 if c in user_info[user_id].get('compliments', []) else 0 for c in user_compliments]
    
    # Combine into a single row
    user_data_list.append(user_features + user_compliments_vector)

# Convert to DataFrame
user_continuous_features = pd.DataFrame(user_data_list, 
                                        columns=user_continuous_features + [f'compliment_{c}' for c in user_compliments])

# Scale continuous features
user_continuous_features = pd.DataFrame(DeepFM_user_scaler.transform(user_continuous_features), 
                                        columns=user_continuous_features.columns)


In [6]:
business_ids = ["Rv8bW3pkzpi5dZu5ckbgtA", "3StNEgKAwpCFR1q0urmJrw", "tkY5BJVXxUmP4jsKavkf2g"]
business_info = retrieve_business_info(business_ids)
encoded_business_ids = DeepFM_business_id_encoder.transform(business_ids)

later_features = ["name", "address", "city", "state", "postal_code",]
business_continuous_features = ["latitude", "longitude", "stars", 
"review_count"]

# if not business_info:
    # raise ValueError(f"No business information found for business_id: {business_id}")

business_data_list = []
for business_id in business_info:
    # Extract continuous features
    business_features = [business_info[business_id].get(feat, 0) for feat in business_continuous_features]

    total_reviews = 0
    for review in business_info[business_id]['reviews']:
        total_reviews += review['stars']

    # Calculate average review
    if len(business_info[business_id]['reviews']) > 0:
        avg_review = total_reviews / len(business_info[business_id]['reviews'])
    else:
        avg_review = 0
    # Combine into a single row
    business_data_list.append(business_features + [avg_review])

business_continuous_features += ["avg_review"]

# Convert to DataFrame
business_continuous_features = pd.DataFrame(business_data_list, 
                                            columns=business_continuous_features)

ordered_business_continuous_features = ["stars", "review_count", "avg_review", "latitude", "longitude",] 

business_continuous_features = business_continuous_features[ordered_business_continuous_features]

# Scale continuous features and reorder columns
business_continuous_features = pd.DataFrame(DeepFM_business_scaler.transform(business_continuous_features), 
                                            columns=ordered_business_continuous_features)


In [7]:
# combine user and business features into one dataframe (there is only one user and maybe multiple businesses)
user_features = np.repeat(user_continuous_features.values, len(business_continuous_features), axis=0)
business_features = np.tile(business_continuous_features.values, (len(user_continuous_features), 1))
all_features = np.concatenate([user_features, business_features], axis=1)

# get the user_ids and business_ids in the right format (i.e. -1, 1)
user_ids = np.repeat(encoded_user_id, len(encoded_business_ids)).reshape(-1, 1)
business_ids = np.array(encoded_business_ids).reshape(-1, 1)

# make the prediction
predictions = DeepFM_model.predict([all_features, user_ids, business_ids])

# get the predictions
predictions = predictions.flatten()

# decode the business_ids
decoded_business_ids = DeepFM_business_id_encoder.inverse_transform(encoded_business_ids)

# combine the business_ids and predictions and sort by predictions
recommendations = list(zip(decoded_business_ids, predictions))
recommendations.sort(key=lambda x: x[1], reverse=True)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step


In [8]:
predictions

array([ 0.3200516 , -0.43189168,  0.39021745], dtype=float32)

In [9]:
predictions = [float(prediction) for prediction in predictions]
predictions

[0.3200516104698181, -0.43189167976379395, 0.39021745324134827]

In [10]:
recommendations

[('tkY5BJVXxUmP4jsKavkf2g', np.float32(0.39021745)),
 ('Rv8bW3pkzpi5dZu5ckbgtA', np.float32(0.3200516)),
 ('3StNEgKAwpCFR1q0urmJrw', np.float32(-0.43189168))]