# 🛍️ SmartCart Starter Notebook
This notebook will guide you through your group project on collaborative filtering and association rule mining for an e-commerce recommender system.

## 📥 Part 1: Data Preprocessing
Steps:
- Load `ecommerce_user_data.csv` and `product_details.csv`
- Merge data if necessary
- Create user-item matrix
- Fill missing ratings with 0
- Group user behavior by category

In [None]:
# Load data
import pandas as pd

# Read the two CSV inputs used throughout the notebook
user_data = pd.read_csv('data/ecommerce_user_data.csv')
product_data = pd.read_csv('data/product_details.csv')

# Preview the first rows of each table to sanity-check the load
for frame in (user_data, product_data):
    print(frame.head())

In [None]:
# Build the user x product rating matrix: one row per UserID, one column per
# ProductID. pivot_table's default aggregation averages repeated ratings.
user_item_matrix = pd.pivot_table(
    user_data,
    index='UserID',
    columns='ProductID',
    values='Rating',
)
# Replace missing (user, product) cells with 0
user_item_matrix_filled = user_item_matrix.fillna(value=0)
user_item_matrix_filled.head()

In [None]:
# For every (user, category) pair: how many ratings exist and their mean
user_category_agg = (
    user_data
    .groupby(['UserID', 'Category'])['Rating']
    .agg(['count', 'mean'])
    .rename(columns={'count': 'TotalInteractions', 'mean': 'AverageRating'})
    .reset_index()
)
user_category_agg.head()

## 🤝 Part 2: User-Based Collaborative Filtering
Steps:
- Use cosine similarity to compare users
- Recommend top-N products based on similar users
- Evaluate with Precision@K and Coverage

In [None]:
# Compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rating vectors of all users
user_similarity = cosine_similarity(user_item_matrix_filled)

# Label both axes with the user IDs so scores can be looked up by ID
user_ids = user_item_matrix_filled.index
user_similarity_df = pd.DataFrame(data=user_similarity, index=user_ids, columns=user_ids)
print(user_similarity_df.head())

In [None]:
# Get Top-N Similar Users for Each User
def get_top_n_similar_users(user_id, n=5, similarity_df=None):
    """Return the IDs of the ``n`` users most similar to ``user_id``.

    Args:
        user_id: Label of the user to find neighbours for.
        n: Maximum number of neighbours to return.
        similarity_df: Square user-by-user similarity frame. Defaults to the
            notebook-level ``user_similarity_df`` when omitted.

    Returns:
        A list of user IDs ordered from most to least similar, excluding
        ``user_id`` itself. Empty when ``user_id`` is not a column of the
        similarity frame.
    """
    if similarity_df is None:
        similarity_df = user_similarity_df
    if user_id not in similarity_df.columns:
        return []

    # Remove the user's own entry first; errors='ignore' keeps this safe if
    # the frame has the user as a column but not as a row label.
    scores = similarity_df[user_id].drop(user_id, errors='ignore')

    # A stable sort keeps tied scores in their original order, so results are
    # reproducible across runs.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    return ranked.head(n).index.tolist()

# Recommend Products Based on Similar Users
def recommend_products(user_id, n_similar=5, n_recommendations=5, ratings=None, similar_users=None):
    """Recommend products the user has not rated, scored by similar users.

    Args:
        user_id: Label of the user to recommend for.
        n_similar: Number of neighbours to consult (ignored when
            ``similar_users`` is given).
        n_recommendations: Maximum number of products to return.
        ratings: User-by-product rating frame in which 0 means "not rated".
            Defaults to the notebook-level ``user_item_matrix_filled``.
        similar_users: Optional pre-computed list of neighbour IDs. When
            omitted, ``get_top_n_similar_users`` is called.

    Returns:
        A list of at most ``n_recommendations`` product IDs, best first.
        Products that none of the neighbours rated are never returned, so the
        list may be shorter than requested (or empty).
    """
    if ratings is None:
        ratings = user_item_matrix_filled
    if user_id not in ratings.index:
        return []

    if similar_users is None:
        similar_users = get_top_n_similar_users(user_id, n_similar)

    # Only neighbours that actually have a row in the rating matrix can vote.
    neighbours = [uid for uid in similar_users if uid in ratings.index]
    if not neighbours:
        return []

    own_ratings = ratings.loc[user_id]
    unseen_products = own_ratings[own_ratings == 0].index

    # Mean neighbour rating per unseen product (unrated cells count as 0).
    mean_scores = ratings.loc[neighbours, unseen_products].mean(axis=0)

    # A mean of 0 means no neighbour rated the product: there is no evidence
    # to recommend it, so drop it instead of padding the list.
    mean_scores = mean_scores[mean_scores > 0]

    # Stable sort keeps tied products in column order for reproducibility.
    ranked = mean_scores.sort_values(ascending=False, kind='mergesort')
    return ranked.head(n_recommendations).index.tolist()

# Build a {user_id: [product_id, ...]} mapping for every user in the matrix
recommendations = {
    uid: recommend_products(uid, n_similar=5, n_recommendations=5)
    for uid in user_item_matrix_filled.index
}

# One row per user, one "Rec_<rank>" column per recommendation slot
recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index')
recommendations_df.columns = [
    f"Rec_{slot}" for slot in range(1, recommendations_df.shape[1] + 1)
]
print(recommendations_df.head())

In [None]:
# Evaluate with Precision@K
def precision_at_k(user_id, recommended_items, k=5, interactions=None):
    """Return the fraction of the top-``k`` recommendations the user interacted with.

    Args:
        user_id: Label of the user being evaluated.
        recommended_items: Ranked list of recommended product IDs.
        k: Cut-off rank; also the denominator of the score.
        interactions: Frame with ``UserID`` and ``ProductID`` columns holding
            the ground-truth interactions. Defaults to the notebook-level
            ``user_data``.

    Returns:
        A float in [0, 1]; 0.0 when the user has no recorded interactions.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    if interactions is None:
        interactions = user_data

    # NOTE(review): recommend_products only proposes products the user has a
    # 0 rating for, so scoring against the same interaction log it was built
    # from will be close to 0 -- consider passing a held-out set here.
    actual_items = set(interactions.loc[interactions['UserID'] == user_id, 'ProductID'])
    if not actual_items:
        return 0.0

    top_k = set(recommended_items[:k])
    return len(actual_items & top_k) / k

# Score every user's recommendation list. dropna() removes the empty slots of
# users that received fewer recommendations than there are columns.
precision_scores = [
    precision_at_k(user, recommendations_df.loc[user].dropna().tolist())
    for user in recommendations_df.index
]

# Guard against an empty recommendation table to avoid dividing by zero.
average_precision = sum(precision_scores) / len(precision_scores) if precision_scores else 0.0
print(f"\nAverage Precision@5: {average_precision:.2f}")

## 🔍 Part 3: Association Rule Mining (Apriori)
Steps:
- Convert user-product interactions to transaction format
- Apply Apriori algorithm to find frequent itemsets
- Generate association rules (support, confidence, lift)

In [None]:
# Convert to transaction format
from mlxtend.preprocessing import TransactionEncoder
# One transaction per user: the list of ProductIDs in that user's rows
transactions = [
    list(product_ids)
    for _, product_ids in user_data.groupby('UserID')['ProductID']
]

# One-hot encode the transactions: one row per user, one column per product
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_trans = pd.DataFrame(data=te_ary, columns=te.columns_)

In [None]:
# Apply Apriori and generate rules
from mlxtend.frequent_patterns import apriori, association_rules
# Itemsets present in at least 5% of the transactions (one transaction per user)
frequent_itemsets = apriori(df_trans, use_colnames=True, min_support=0.05)

# Keep only rules whose confidence is at least 0.5
rules = association_rules(frequent_itemsets, min_threshold=0.5, metric='confidence')

# Show the columns that matter for interpreting the rules
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules[rule_columns].head()

## 📊 Part 4: Visualization
Steps:
- Plot user similarity heatmap
- Plot top frequent itemsets
- Visualize top recommendations

In [None]:
# Heatmap of user similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the user-by-user cosine similarity frame built in Part 2.
# (The original referenced an undefined name, `similarity_df`.)
plt.figure(figsize=(10, 8))
sns.heatmap(user_similarity_df, cmap='YlGnBu')
plt.title('User Similarity Heatmap')
plt.show()

In [None]:
# Bar chart of the ten itemsets with the highest support
top_itemsets = frequent_itemsets.nlargest(10, 'support')
top_itemsets.plot(x='itemsets', y='support', kind='bar', legend=False)
plt.title('Top 10 Frequent Itemsets')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 🧠 Part 5: Conceptual Questions
Answer the following questions in your report:
1. How does data sparsity in the user-item matrix affect the quality of the recommendations?
2. What kinds of product bundles were discovered?
3. What improvements would you suggest for real-world deployment?