In [None]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, KNNWithMeans, SVD, accuracy
from surprise.model_selection import train_test_split

# Base weight for each user-activity type; a larger value means a stronger
# signal of interest. load_data maps the `activity_type` column through this
# dict, so the keys must match those values exactly (bytes — presumably because
# scipy's ARFF loader returns nominal attributes as bytes; confirm).
interaction_values = {
    b'browse': 3,
    b'view': 4,  # NOTE(review): equal to add_to_cart and above browse — confirm a plain view should rank this high
    b'add_to_cart': 4,
    b'remove_from_cart': 0,  # Removals carry zero weight
    b'add_to_favorite': 5,
    b'remove_from_favorite': 0,  # Removals carry zero weight
    b'completed_checkout': 7  # Highest weight; load_data also assigns it to every order-item row
}

# Function to print debugging information about a DataFrame
def debug_data(data, name="DataFrame"):
    """Print a short diagnostic report for an interactions DataFrame.

    The report contains, in order: a header naming the frame, the count of
    missing values per column, the first rows, and the variance of the
    ``interaction_value`` column.

    Args:
        data: DataFrame to inspect; must contain an ``interaction_value`` column.
        name: Label shown in the report header.

    Raises:
        KeyError: If ``interaction_value`` is missing (raised after the
            earlier sections have already been printed).
    """
    header = f"--- Debugging {name} ---"
    print(header)

    # Each section is computed right before it is printed so that a failure in
    # a later section does not suppress the earlier output.
    missing_per_column = data.isna().sum()
    print("NaN Values:\n", missing_per_column)

    preview = data.head()
    print("Sample Data:\n", preview)

    value_variance = data['interaction_value'].var()
    print("Variance of interaction_value:\n", value_variance)

# Load and preprocess data from multiple files
def load_data(user_activity_path, orders_path, order_items_path, products_path, users_path):
    """Load the raw files and build one weighted user/product interaction table.

    Activity events are weighted by interaction type, recency and (when the
    column exists) session duration. They are then stacked with one
    fixed-weight row per order item, and the combined values are min-max
    scaled into ``interaction_value_normalized``.

    Args:
        user_activity_path: ARFF file of activity events. The code reads the
            columns ``user_id``, ``product_id``, ``activity_type``,
            ``activity_timestamp`` and, optionally, ``activity_duration``.
        orders_path: Orders CSV; its ``id`` column is renamed to ``order_id``
            and it must provide ``user_id``.
        order_items_path: Order-items CSV with ``order_id`` and ``product_id``.
        products_path: Products CSV; its ``id`` column is renamed to
            ``product_id``.
        users_path: Users CSV, returned unchanged.

    Returns:
        Tuple ``(all_interactions, products_df, user_data)``.
        ``all_interactions`` has the columns ``user_id``, ``product_id``,
        ``interaction_value`` and ``interaction_value_normalized`` and a
        fresh 0..n-1 index.
    """
    scaler = MinMaxScaler()  # Re-fitted by every fit_transform call below

    # Load datasets
    records, _meta = arff.loadarff(user_activity_path)
    user_activity_df = pd.DataFrame(records)
    orders_df = pd.read_csv(orders_path)
    order_items_df = pd.read_csv(order_items_path)
    products_df = pd.read_csv(products_path)
    user_data = pd.read_csv(users_path)

    # Give the primary keys the same names the other tables use for them
    products_df.rename(columns={'id': 'product_id'}, inplace=True)
    orders_df.rename(columns={'id': 'order_id'}, inplace=True)

    # Recency weighting: age is measured in days relative to the newest event
    # in the file, then decayed exponentially. exp(-age / 30) has a 30-day
    # time constant (weight falls to 1/e after 30 days, i.e. a half-life of
    # roughly 20.8 days).
    user_activity_df['timestamp'] = pd.to_datetime(user_activity_df['activity_timestamp'])
    max_timestamp = user_activity_df['timestamp'].max()
    age_in_days = (max_timestamp - user_activity_df['timestamp']).dt.total_seconds() / (24 * 60 * 60)
    user_activity_df['time_decay'] = np.exp(-age_in_days / 30)

    # Base weight per activity type, scaled by recency. Activity types that
    # are not keys of `interaction_values` map to NaN and are filled below.
    user_activity_df['interaction_value'] = (
        user_activity_df['activity_type'].map(interaction_values) * user_activity_df['time_decay']
    )

    # Session-length boost in [0, 1]; missing durations count as 0
    if 'activity_duration' in user_activity_df.columns:
        durations = user_activity_df[['activity_duration']].fillna(0)
        # fit_transform returns shape (n, 1); flatten before storing as a column
        user_activity_df['session_weight'] = scaler.fit_transform(durations).ravel()
    else:
        user_activity_df['session_weight'] = 0  # No duration column: no boost

    # Longest sessions at most double the interaction value
    user_activity_df['interaction_value'] *= (1 + user_activity_df['session_weight'])

    # Fill unmapped activity types with the mean weighted value. Assign the
    # result back instead of calling fillna(inplace=True) on the selected
    # column: that chained form is deprecated and has no effect on the frame
    # when pandas Copy-on-Write is active.
    mean_value = user_activity_df['interaction_value'].mean()
    user_activity_df['interaction_value'] = user_activity_df['interaction_value'].fillna(mean_value)

    # Every order item counts as a checkout at full weight (no decay applied)
    orders_with_items = pd.merge(order_items_df, orders_df, on='order_id')
    orders_with_items['interaction_value'] = interaction_values[b'completed_checkout']

    # Stack activity rows and order rows; ignore_index avoids duplicate labels
    interaction_columns = ['user_id', 'product_id', 'interaction_value']
    all_interactions = pd.concat(
        [user_activity_df[interaction_columns], orders_with_items[interaction_columns]],
        ignore_index=True,
    )

    # Min-max scale the combined values into [0, 1] for model training
    all_interactions['interaction_value_normalized'] = scaler.fit_transform(
        all_interactions[['interaction_value']]
    ).ravel()

    # Print debugging information
    debug_data(all_interactions, "Combined Interactions")

    return all_interactions, products_df, user_data

# Build the combined interaction table and the product/user lookup frames
all_interactions, products_df, user_data = load_data(
    user_activity_path='cleaned__user_activity1.arff',
    orders_path='orders.csv',
    order_items_path='order_items.csv',
    products_path='products.csv',
    users_path='users.csv',
)

# Train KNN and SVD models, and calculate RMSE for both training and test sets
def train_models(interactions):
    """Fit a user-based KNN model and an SVD model, printing RMSE for each.

    The interactions are split 80/20 into train and test sets with a fixed
    seed. Each model is fitted on the train set and scored on both the train
    set and the held-out test set.

    Args:
        interactions: DataFrame with ``user_id``, ``product_id`` and
            ``interaction_value_normalized`` columns. The reader is declared
            with a rating scale of (0, 1).

    Returns:
        Tuple ``(knn_model, svd_model)`` of the fitted models.
    """
    rating_columns = ['user_id', 'product_id', 'interaction_value_normalized']
    rating_reader = Reader(rating_scale=(0, 1))
    dataset = Dataset.load_from_df(interactions[rating_columns], rating_reader)
    trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42)

    def evaluate(fitted_model):
        """Return (train RMSE, test RMSE) for an already fitted model."""
        seen_predictions = fitted_model.test(trainset.build_testset())
        train_rmse = accuracy.rmse(seen_predictions, verbose=False)
        held_out_predictions = fitted_model.test(testset)
        test_rmse = accuracy.rmse(held_out_predictions, verbose=False)
        return train_rmse, test_rmse

    # User-based KNN with mean-centred ratings and Pearson-baseline similarity
    knn_model = KNNWithMeans(
        k=40,
        min_k=2,
        sim_options={'name': 'pearson_baseline', 'user_based': True, 'min_support': 3},
    )
    knn_model.fit(trainset)
    knn_train_rmse, knn_test_rmse = evaluate(knn_model)
    print(f"KNN Training RMSE: {knn_train_rmse:.4f}, KNN Testing RMSE: {knn_test_rmse:.4f}")

    # Matrix-factorisation model
    svd_model = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
    svd_model.fit(trainset)
    svd_train_rmse, svd_test_rmse = evaluate(svd_model)
    print(f"SVD Training RMSE: {svd_train_rmse:.4f}, SVD Testing RMSE: {svd_test_rmse:.4f}")

    return knn_model, svd_model

# Fit both recommenders on the combined interactions (train_models prints the RMSE figures)
knn_model, svd_model = train_models(interactions=all_interactions)

# Hybrid recommendation function to generate personalized recommendations for a user
def hybrid_recommend(user_id, n=5, weights=(0.6, 0.4)):
    """Recommend the top ``n`` products for a user from a KNN/SVD blend.

    Every product seen in ``all_interactions`` that the user has not already
    purchased and that has a row in ``products_df`` is scored as
    ``(weights[0] * knn + weights[1] * svd) * boost``. ``boost`` is 1.2 when
    the product's category is one of the user's top categories and 1.0
    otherwise.

    Reads the module-level ``all_interactions``, ``products_df``,
    ``user_data``, ``interaction_values``, ``knn_model`` and ``svd_model``.

    Args:
        user_id: Raw user id, as used in ``all_interactions`` and ``user_data``.
        n: Number of recommendations to return.
        weights: ``(knn_weight, svd_weight)`` applied to the two estimates.

    Returns:
        DataFrame with ``product_id``, ``predicted_score``, ``name``,
        ``price`` and ``category_id``, ordered by descending score.
    """
    # NOTE(review): a purchase is recognised by exact equality with the raw
    # checkout weight. Rows built from orders carry exactly that value, but
    # activity rows are scaled by time decay and session weight in load_data,
    # so checkout events recorded only in the activity log are generally not
    # excluded here. An explicit "purchased" flag would be more reliable.
    checkout_weight = interaction_values[b'completed_checkout']
    is_user = all_interactions['user_id'] == user_id
    is_purchase = all_interactions['interaction_value'] == checkout_weight
    purchased_items = set(all_interactions.loc[is_user & is_purchase, 'product_id'])

    # Candidates: every known product the user has not purchased
    items_to_predict = set(all_interactions['product_id']) - purchased_items

    # The user's preferred categories (empty array if the user is unknown)
    user_top_categories = user_data.loc[
        user_data['id'] == user_id,
        ['top_category1', 'top_category2', 'top_category3'],
    ].values.flatten()

    # Build the product -> category lookup once instead of scanning
    # products_df for every candidate. drop_duplicates keeps the first row per
    # product_id, matching the previous "first matching row" behaviour.
    category_by_product = (
        products_df.drop_duplicates(subset='product_id')
        .set_index('product_id')['category_id']
        .to_dict()
    )

    knn_weight, svd_weight = weights[0], weights[1]

    predictions = []
    for item_id in items_to_predict:
        if item_id not in category_by_product:
            continue  # Skip items with no row in products_df

        item_category = category_by_product[item_id]

        # Apply category boost if item belongs to user's top categories
        category_boost = 1.2 if item_category in user_top_categories else 1.0

        # Blend the two model estimates, then apply the boost
        knn_pred = knn_model.predict(user_id, item_id).est
        svd_pred = svd_model.predict(user_id, item_id).est
        hybrid_score = (knn_weight * knn_pred + svd_weight * svd_pred) * category_boost
        predictions.append((item_id, hybrid_score))

    # Keep the n best-scoring items and attach their product details
    top_n = sorted(predictions, key=lambda pair: pair[1], reverse=True)[:n]
    recommendations = pd.DataFrame(top_n, columns=['product_id', 'predicted_score'])
    recommendations = pd.merge(
        recommendations,
        products_df[['product_id', 'name', 'price', 'category_id']],
        on='product_id',
        how='left',
    )

    return recommendations

# Generate recommendations for a specific user
test_user_id = 5
recommendations = hybrid_recommend(test_user_id, n=10)
# Use the actual row count so the heading always matches the table printed below
print(f"\nTop {len(recommendations)} recommendations for user {test_user_id}:")
print(recommendations.to_string(index=False))
