In [73]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Score attached to each activity type when it is mapped onto the
# `interaction_value` column. Keys are bytes, matching how the activity
# labels appear in the loaded ARFF data (e.g. b'browse').
interaction_values = {
    # Passive signals
    b'browse': 3,
    b'view': 4,
    # Cart actions (a removal contributes nothing)
    b'add_to_cart': 4,
    b'remove_from_cart': 0,
    # Favourite actions (a removal contributes nothing)
    b'add_to_favorite': 5,
    b'remove_from_favorite': 0,
    # Purchase
    b'completed_checkout': 7,
}

# Activity types treated as ground truth ("the user is interested in this
# product") when precision and recall are computed.
relevant_interactions = {
    b'completed_checkout',
    b'add_to_favorite',
}

# Debug function
def debug_data(data, name="DataFrame"):
    """Print a quick diagnostic summary of *data*.

    Prints the per-column NaN counts, the first rows, and the variance of
    the ``interaction_value`` column.

    Args:
        data: The pandas DataFrame to inspect.
        name: Label used in the header line of the printed report.
    """
    print(f"--- Debugging {name} ---")
    print("NaN Values:\n", data.isna().sum())
    print("Sample Data:\n", data.head())
    # Not every frame carries the score column; report that instead of
    # failing with a KeyError in the middle of a debug dump.
    if 'interaction_value' in data.columns:
        print("Variance of interaction_value:\n", data['interaction_value'].var())
    else:
        print("Variance of interaction_value:\n", "n/a (no 'interaction_value' column)")

# Load and preprocess data
def load_data(user_activity_path, products_path, users_path, orders_path, order_items_path):
    """Load the raw files and build the DataFrames used by the recommenders.

    Args:
        user_activity_path: Path to the ARFF file with user activity rows.
        products_path: Path to the products CSV (its ``id`` column is
            renamed to ``product_id``).
        users_path: Path to the users CSV.
        orders_path: Path to the orders CSV (its ``id`` column is renamed to
            ``order_id``).
        order_items_path: Path to the order-items CSV.

    Returns:
        A tuple ``(user_activity_df, products_df, user_data,
        ordered_products_df, order_items_df)`` where ``ordered_products_df``
        is the inner join of orders and order items on ``order_id``, reduced
        to the columns ``order_id``, ``user_id`` and ``product_id``.
    """
    data, _meta = arff.loadarff(user_activity_path)
    user_activity_df = pd.DataFrame(data)
    products_df = pd.read_csv(products_path)
    user_data = pd.read_csv(users_path)
    orders_df = pd.read_csv(orders_path)
    order_items_df = pd.read_csv(order_items_path)

    # Rename columns for consistency
    products_df = products_df.rename(columns={'id': 'product_id'})
    orders_df = orders_df.rename(columns={'id': 'order_id'})

    # Map interaction values; activity types missing from the mapping fall
    # back to the mean score. The result is assigned back to the column
    # rather than using `df[col].fillna(..., inplace=True)`: that chained
    # in-place form operates on a temporary Series, is deprecated in
    # pandas 2.x and does nothing under Copy-on-Write.
    scores = user_activity_df['activity_type'].map(interaction_values)
    user_activity_df['interaction_value'] = scores.fillna(scores.mean())

    # Ensure product_id consistency across dataframes
    user_activity_df['product_id'] = user_activity_df['product_id'].astype(int)
    products_df['product_id'] = products_df['product_id'].astype(int)
    order_items_df['product_id'] = order_items_df['product_id'].astype(int)
    orders_df['order_id'] = orders_df['order_id'].astype(int)

    # Merge orders and order items to link user_id with product_id
    ordered_products_df = pd.merge(
        orders_df[['order_id', 'user_id']],
        order_items_df[['order_id', 'product_id']],
        on='order_id',
        how='inner',
    )

    debug_data(user_activity_df, "User Activity")
    # order_items_df is returned as well so callers can persist it.
    return user_activity_df, products_df, user_data, ordered_products_df, order_items_df

# Build the per-user ground-truth product sets from the activity log
def get_user_relevant_items(user_activity_df, relevant_interactions):
    """Map each user to the set of products they showed strong interest in.

    Args:
        user_activity_df: DataFrame with ``user_id``, ``product_id`` and
            ``activity_type`` columns.
        relevant_interactions: Collection of activity types that count as
            relevant.

    Returns:
        A dict ``{user_id: set_of_product_ids}``; users with no relevant
        activity do not appear in it.
    """
    is_relevant = user_activity_df['activity_type'].isin(relevant_interactions)
    products_by_user = user_activity_df[is_relevant].groupby('user_id')['product_id']
    per_user_sets = products_by_user.apply(set)
    return per_user_sets.to_dict()

# Look up what a user has already ordered in the merged orders/order_items data
def get_ordered_products(user_id, ordered_products_df):
    """Return the set of product ids that *user_id* has ordered.

    Args:
        user_id: Identifier compared against the ``user_id`` column.
        ordered_products_df: DataFrame with ``user_id`` and ``product_id``
            columns.

    Returns:
        A set of product ids; empty when the user has no matching rows.
    """
    belongs_to_user = ordered_products_df['user_id'] == user_id
    return set(ordered_products_df.loc[belongs_to_user, 'product_id'])

# Calculate precision and recall
def calculate_precision_recall(recommended_products, relevant_products):
    """Compute precision and recall of a recommendation list.

    Both arguments are reduced to sets of distinct items first, so repeated
    entries in either one do not skew the ratios and any iterable (list,
    tuple, set, pandas Series, ...) is accepted.

    Args:
        recommended_products: Iterable of recommended product ids.
        relevant_products: Iterable of product ids considered relevant.

    Returns:
        ``{"precision": p, "recall": r}``. ``p`` is the fraction of distinct
        recommended items that are relevant, ``r`` the fraction of distinct
        relevant items that were recommended. A ratio whose denominator
        would be zero is reported as 0.0.
    """
    recommended_set = set(recommended_products)
    # Deduplicate the ground truth too: dividing by the raw length of a list
    # containing repeats would understate recall.
    relevant_set = set(relevant_products)
    true_positives = recommended_set & relevant_set

    precision = len(true_positives) / len(recommended_set) if recommended_set else 0.0
    recall = len(true_positives) / len(relevant_set) if relevant_set else 0.0
    return {"precision": precision, "recall": recall}

# Read every input file once and keep the resulting frames at module level;
# the recommender functions below read them as globals.
(
    user_activity_df,
    products_df,
    user_data,
    ordered_products_df,
    order_items_df,
) = load_data(
    user_activity_path='cleaned__user_activity1.arff',
    products_path='products.csv',
    users_path='users.csv',
    orders_path='orders.csv',
    order_items_path='order_items.csv',
)

# Per-user sets of relevant products, derived from the activity log and
# used as ground truth for the precision/recall printouts.
user_relevant_items = get_user_relevant_items(
    user_activity_df=user_activity_df,
    relevant_interactions=relevant_interactions,
)

# Approach 1: Popular Products Recommendation for New Users
def recommend_popular_products(n=5, user_id=None):
    """Recommend the products with the highest total interaction score.

    Popularity is the sum of ``interaction_value`` per product over the
    module-level ``user_activity_df``.

    Args:
        n: Maximum number of products to return.
        user_id: Optional user. When given, products that user has already
            ordered are excluded, and precision/recall are printed if the
            user has an entry in ``user_relevant_items``.

    Returns:
        A DataFrame with columns ``product_id``, ``name``, ``price`` and
        ``category_id``, ordered from most to least popular.
    """
    product_columns = ['product_id', 'name', 'price', 'category_id']

    popular_products = (user_activity_df.groupby('product_id')['interaction_value']
                        .sum()
                        .reset_index()
                        .sort_values(by='interaction_value', ascending=False))

    # Exclude products the user has already ordered. Compare against None
    # explicitly so a user id of 0 is not mistaken for "no user given".
    if user_id is not None:
        ordered_products = get_ordered_products(user_id, ordered_products_df)
        popular_products = popular_products[~popular_products['product_id'].isin(ordered_products)]

    # Attach metadata and drop products lacking it BEFORE truncating to n;
    # truncating first could return fewer than n rows even though further
    # valid candidates exist. A left merge keeps the popularity ordering.
    recommendations = (pd.merge(popular_products, products_df[product_columns],
                                on='product_id', how='left')
                       .dropna(subset=['name', 'price', 'category_id'])
                       .head(n))

    # Report precision and recall when ground truth exists for this user
    if user_id in user_relevant_items:
        relevant_items = user_relevant_items[user_id]
        precision_recall = calculate_precision_recall(recommendations['product_id'], relevant_items)
        print(f"Popular Products - Precision: {precision_recall['precision']:.2f}, Recall: {precision_recall['recall']:.2f}")

    return recommendations[product_columns]

# Approach 2: Category-Based Recommendation for New Users
def recommend_by_selected_category(category_ids, n=10, user_id=None, random_state=None):
    """Recommend a random sample of products from the chosen categories.

    Args:
        category_ids: Collection of category ids to draw products from.
        n: Maximum number of products to return; fewer are returned when
            not enough candidates exist.
        user_id: Optional user whose already-ordered products are excluded.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to
            make the selection reproducible. ``None`` (the default) keeps
            the sampling non-deterministic.

    Returns:
        A DataFrame with columns ``product_id``, ``name``, ``price`` and
        ``category_id``.
    """
    filtered_products = products_df[products_df['category_id'].isin(category_ids)]

    # Exclude products the user has already ordered. Compare against None
    # explicitly so a user id of 0 is not mistaken for "no user given".
    if user_id is not None:
        ordered_products = get_ordered_products(user_id, ordered_products_df)
        filtered_products = filtered_products[~filtered_products['product_id'].isin(ordered_products)]

    # Cap the sample size: sampling more rows than exist (without
    # replacement) would raise.
    sample_size = min(n, len(filtered_products))
    recommendations = filtered_products.sample(n=sample_size, random_state=random_state)
    return recommendations[['product_id', 'name', 'price', 'category_id']]

# Approach 3: Metadata-Based Filtering (Category-Based Similarity)
def metadata_based_recommend(user_id, n=10):
    """Recommend products from the user's stored top categories.

    The categories come from the ``top_category1..3`` columns of the
    ``user_data`` row whose ``id`` equals *user_id*. Every matching product
    receives a constant ``category_similarity`` of 1.0.

    Args:
        user_id: The user to recommend for.
        n: Maximum number of products to return.

    Returns:
        A DataFrame with columns ``product_id``, ``category_similarity``,
        ``name``, ``price`` and ``category_id``; empty (same columns) when
        no candidate product exists.
    """
    output_columns = ['product_id', 'category_similarity', 'name', 'price', 'category_id']

    user_top_categories = user_data.loc[user_data['id'] == user_id,
                                        ['top_category1', 'top_category2', 'top_category3']].values.flatten()
    filtered_products = products_df[products_df['category_id'].isin(user_top_categories)]

    # Exclude products the user has already ordered
    ordered_products = get_ordered_products(user_id, ordered_products_df)
    filtered_products = filtered_products[~filtered_products['product_id'].isin(ordered_products)]

    if filtered_products.empty:
        print("No products found in user's preferred categories.")
        return pd.DataFrame(columns=output_columns)

    # Build the result with vectorised operations. The candidate rows
    # already carry the product metadata, so no merge back onto products_df
    # is needed, and avoiding iterrows() keeps product_id's dtype intact.
    recommendations_df = (filtered_products
                          .drop_duplicates('product_id')
                          [['product_id', 'name', 'price', 'category_id']]
                          .assign(category_similarity=1.0)
                          [output_columns]
                          .reset_index(drop=True)
                          .head(n))

    # Evaluate exactly the rows that are returned, not the full candidate
    # list, so the printed metrics describe the actual recommendation.
    if user_id in user_relevant_items:
        relevant_items = user_relevant_items[user_id]
        precision_recall = calculate_precision_recall(recommendations_df['product_id'], relevant_items)
        print(f"Metadata-Based - Precision: {precision_recall['precision']:.2f}, Recall: {precision_recall['recall']:.2f}")

    return recommendations_df

# Approach 4: Profile-Based Filtering with Weighted Features
def profile_based_recommend(user_id, n=10):
    """Rank the products a user interacted with by accumulated interest.

    Each activity row of the user contributes its ``interaction_value``
    (multiplied by ``time_decay`` when that column exists); contributions
    are summed per product. Products the user already ordered are removed.

    Args:
        user_id: The user to recommend for.
        n: Maximum number of products to return.

    Returns:
        A DataFrame with columns ``product_id``, ``interest_score``,
        ``name``, ``price`` and ``category_id``, sorted by descending
        ``interest_score``.
    """
    history = user_activity_df[user_activity_df['user_id'] == user_id].copy()

    # Weight = interaction score, scaled by time decay when available
    history['weight'] = (
        history['interaction_value'] * history['time_decay']
        if 'time_decay' in history.columns
        else history['interaction_value']
    )

    # Total weight per product expresses how interested the user is in it
    interest_per_product = history.groupby('product_id')['weight'].sum().reset_index()

    # Drop anything the user has already bought
    already_ordered = get_ordered_products(user_id, ordered_products_df)
    not_yet_ordered = ~interest_per_product['product_id'].isin(already_ordered)
    candidates = interest_per_product[not_yet_ordered]

    product_columns = ['product_id', 'name', 'price', 'category_id']
    recommendations_df = (
        pd.merge(candidates, products_df[product_columns], on='product_id')
        .sort_values(by='weight', ascending=False)
        .head(n)
        .rename(columns={'weight': 'interest_score'})
    )

    if user_id in user_relevant_items:
        metrics = calculate_precision_recall(
            recommendations_df['product_id'], user_relevant_items[user_id]
        )
        print(f"Profile-Based - Precision: {metrics['precision']:.2f}, Recall: {metrics['recall']:.2f}")

    return recommendations_df



# Example usage for new and existing users
print("Popular Recommendations for new users:")
print(recommend_popular_products(n=10, user_id=5).to_string(index=False))

selected_categories = [2, 6, 8]
# Derive the label from the variable so the text cannot drift from the
# categories actually queried.
print(f"\nCategory-Based Recommendations for new users with selected categories {selected_categories}:")
print(recommend_by_selected_category(selected_categories, n=10, user_id=5).to_string(index=False))

# Define test user ID
test_user_id = 3

# The headings previously hard-coded "user 8" while the calls used
# test_user_id; format the id in so the output names the real user.
print(f"\nMetadata-Based Recommendations for user {test_user_id}:")
metadata_recommendations = metadata_based_recommend(test_user_id, n=10)
print(metadata_recommendations.to_string(index=False))

print(f"\nProfile-Based Recommendations for user {test_user_id}:")
profile_recommendations = profile_based_recommend(test_user_id, n=10)
print(profile_recommendations.to_string(index=False))

--- Debugging User Activity ---
NaN Values:
 id                     0
user_id                0
product_id             0
activity_type          0
activity_timestamp     0
product_category_id    0
activity_duration      0
category_filter        0
top_category1          0
top_category2          0
top_category3          0
search_filter          0
min_price_filter       0
max_price_filter       0
order_id               0
interaction_value      0
dtype: int64
Sample Data:
     id  user_id  product_id activity_type  activity_timestamp  \
0  1.0      2.0          14     b'browse'        1.729776e+12   
1  2.0      2.0          14     b'browse'        1.729776e+12   
2  3.0      2.0          14     b'browse'        1.729776e+12   
3  4.0      2.0          14     b'browse'        1.729776e+12   
4  5.0      2.0          14     b'browse'        1.729776e+12   

   product_category_id  activity_duration  category_filter  top_category1  \
0             3.912791                3.0         4.689655  

In [75]:
import pickle

# Persist the preprocessed data so it need not be rebuilt on each API call
def save_preprocessed_data():
    """Write the module-level DataFrames and relevance map to disk.

    Each DataFrame is written as a CSV without its index, into the current
    working directory; ``user_relevant_items`` is pickled to
    ``user_relevant_items.pkl``.
    """
    # (file name, frame) pairs, written in this order
    csv_outputs = (
        ('user_activity.csv', user_activity_df),
        ('products.csv', products_df),
        ('user_data.csv', user_data),
        ('ordered_products.csv', ordered_products_df),
        ('order_items.csv', order_items_df),
    )
    for file_name, frame in csv_outputs:
        frame.to_csv(file_name, index=False)

    # The relevance map is a dict of sets, so it is stored as a pickle
    with open('user_relevant_items.pkl', 'wb') as pickle_file:
        pickle.dump(user_relevant_items, pickle_file)

# Write the CSV and pickle snapshot files now, using the frames loaded above.
save_preprocessed_data()
