# Pre-Requisits

### Importing Libraries and Data Loading

In [None]:
import pandas as pd
import numpy as np
import datetime as dt

import implicit
import scipy.sparse as sparse

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)


df = pd.read_csv("C:/Users/tohid/OneDrive/Desktop/Data Analysis/Project/E-Commerce Project/data/raw/E-Commerce_data.csv")
print(df.shape)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Transforming Columns to DateTime
df["purchase_date"] = pd.to_datetime(df["purchase_date"])
df["return_date"] = pd.to_datetime(df["return_date"], errors="coerce")

In [None]:
# Including Related Columns
model_data = df[
    ['id','returning','product','category','purchase_amount','purchase_date','returned','return_date']
    ].copy()
# Filling missing values
model_data['return_date'] = model_data['return_date'].fillna("Not-Returned")
# model_data.head(10)

# Model Building


### User Based Product Recommendation

In [None]:
## Filtering Data for Returned Items
fd = model_data[(model_data['returned']!='refund') & (model_data['returned']!='exchange')]
fd.shape

(2230742, 8)

In [5]:
# Dataset Copy with filtered columns
cf_data = fd[['id','product', 'purchase_amount']].copy()

In [6]:
purchase_counts = cf_data.groupby('id').size()
high_purchase_ids = purchase_counts[purchase_counts>4].index
filtered_cf = cf_data[cf_data['id'].isin(high_purchase_ids)]
low_purchase_data = cf_data[~cf_data['id'].isin(high_purchase_ids)]
low_purchase_data = low_purchase_data.groupby(['id','product'])['purchase_amount'].sum().reset_index()

In [7]:
np.random.seed(42)
split_mask = filtered_cf.groupby('id')['purchase_amount'].transform(lambda x: np.random.rand(len(x))<0.8)
train_data = filtered_cf[split_mask]
test_data = filtered_cf[~split_mask]

In [9]:
test_data_2 = test_data.copy()
test_data_2 = pd.concat([test_data_2, low_purchase_data])
test_data_2 = test_data_2.sample(frac=1).reset_index(drop=True)

In [10]:
train_pivot = train_data.pivot_table(index='id',columns='product',values='purchase_amount',aggfunc='count',fill_value=0)
test_pivot = test_data.pivot_table(index='id',columns='product',values='purchase_amount',aggfunc='count',fill_value=0)
data = data.pivot_table(index='id',columns='product',values='purchase_amount',aggfunc='count',fill_value=0)

In [11]:
test_pivot = test_pivot.reindex(columns=train_pivot.columns,fill_value=0)

In [12]:
train_csr = sparse.csr_matrix(train_pivot.values)
test_csr = sparse.csr_matrix(test_pivot.values)

In [13]:
def simple_evaluation(test_csr, predictions, k):
    """
    Evaluate recommendations using average Precision@k and Recall@k.
    
    Parameters:
      test_csr: 2D array or CSR matrix of actual user-item interactions (n_users x n_items).
      predictions: 2D array where each row contains recommended item indices for the corresponding user.
      k: Number of top recommendations to consider.
    
    Returns:
      avg_precision: Average Precision@k over all users in the predictions array.
      avg_recall: Average Recall@k over all users in the predictions array.
    """
    # Iterate only over the number of rows present in predictions.
    n_users = predictions.shape[0]
    precisions = []
    recalls = []
    
    for user in range(n_users):
        # If using a CSR matrix, convert the row to a dense array.
        if hasattr(test_csr[user], "toarray"):
            user_actual = test_csr[user].toarray()[0]
        else:
            user_actual = test_csr[user] 
        
        # Get the indices of items with interactions.
        actual_items = set(np.where(user_actual > 0)[0])
        # Get the top-K predicted item indices for the user.
        recommended_items = set(predictions[user][:k])
        
        # Count how many recommended items are in the actual interactions.
        hits = len(actual_items & recommended_items)
        
        # Compute precision and recall.
        precision = hits / k
        recall = hits / len(actual_items) if actual_items else 0
        
        precisions.append(precision)
        recalls.append(recall)
    
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    
    return avg_precision, avg_recall


def evaluate_ranking_metrics(test_csr, predictions, k):
    """
    Evaluate ranking metrics: Mean Average Precision (MAP@k) and NDCG@k.
    
    Parameters:
      test_csr: 2D array or CSR matrix of actual user-item interactions 
                (shape: n_users x n_items).
      predictions: 2D array where each row contains recommended item indices 
                   for the corresponding user.
      k: Number of top recommendations to consider.
    
    Returns:
      avg_map: Mean Average Precision at k over all users.
      avg_ndcg: Mean Normalized Discounted Cumulative Gain at k over all users.
    """
    map_scores = []
    ndcg_scores = []
    n_users = predictions.shape[0]
    
    for user in range(n_users):
        # Convert sparse row to dense array if necessary.
        if hasattr(test_csr[user], "toarray"):
            user_actual = test_csr[user].toarray()[0]
        else:
            user_actual = test_csr[user]
            
        # Identify items with interactions.
        actual_items = set(np.where(user_actual > 0)[0])
        if not actual_items:
            # Skip users with no interactions in the test set.
            continue
        
        # ----- MAP@k Calculation -----
        num_hits = 0.0
        ap = 0.0
        for i, pred in enumerate(predictions[user][:k]):
            if pred in actual_items:
                num_hits += 1
                ap += num_hits / (i + 1)
        # Average precision: normalize by the number of actual relevant items.
        average_precision = ap / len(actual_items)
        map_scores.append(average_precision)
        
        # ----- NDCG@k Calculation -----
        dcg = 0.0
        for i, pred in enumerate(predictions[user][:k]):
            if pred in actual_items:
                dcg += 1.0 / np.log2(i + 2)  # i+2 because positions are 1-indexed in the log term.
        
        # Calculate the ideal DCG (IDCG): best possible ranking of the relevant items.
        ideal_hits = min(len(actual_items), k)
        idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_hits))
        ndcg = dcg / idcg if idcg > 0 else 0.0
        ndcg_scores.append(ndcg)
    
    avg_map = np.mean(map_scores) if map_scores else 0.0
    avg_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0.0
    return avg_map, avg_ndcg

In [23]:
from implicit.nearest_neighbours import bm25_weight
N = 50
user_ids = np.arange(len(train_pivot.index))
model = implicit.als.AlternatingLeastSquares(factors=5,calculate_training_loss=True)
train_csr = bm25_weight(train_csr)
model.fit(train_csr)

100%|██████████| 15/15 [00:08<00:00,  1.68it/s, loss=0.0513]


In [24]:
predictions = model.recommend_all(user_items=test_csr, N=N, filter_already_liked_items=True)

In [25]:
K = 50
avg_precision, avg_recall = simple_evaluation(test_csr, predictions, k=K)
avg_map, avg_ndcg = evaluate_ranking_metrics(test_csr, predictions, k=K)
print(f"MAP@{K}: {avg_map:.4f}")
print(f"NDCG@{K}: {avg_ndcg:.4f}")
print(f"Average Precision@{K}: {avg_precision:.4f}")
print(f"Average Recall@{K}: {avg_recall:.4f}")

MAP@50: 0.0000
NDCG@50: 0.0000
Average Precision@50: 0.0000
Average Recall@50: 0.0000


In [17]:
customer_id = np.random.choice(list(test_pivot.index))
product_indices, scores = model.recommend(test_pivot.index.get_loc(customer_id),
                                          user_items= test_csr[test_pivot.index.get_loc(customer_id)],
                                          N=20,
                                          filter_already_liked_items=False
                                          )

test_pivot.columns[product_indices]

Index(['Headlight Restoration Kit', 'Convection Oven', 'Brake Pads',
       'Dehumidifier', 'Popcorn', 'Dog Collar', 'Tire Shine', 'Kombucha',
       'Cat Shampoo', 'Candy', 'Spark Plugs', 'Bread Maker', 'RC Helicopters',
       'Acne Treatment', 'Bangle', 'Hamster Cage', 'Hand Cream',
       'Utility Knife', 'Car Drying Towel', 'Windshield Washer Fluid'],
      dtype='object', name='product')

In [18]:
data_csr = sparse.csc_matrix(data)
model = implicit.als.AlternatingLeastSquares(factors=50,calculate_training_loss=True)
model.fit(data_csr)

100%|██████████| 15/15 [00:31<00:00,  2.09s/it, loss=0.0109]


In [19]:
customer_id = np.random.choice(list(data.index))
product_indices, scores = model.recommend(data.index.get_loc(customer_id),
                                          user_items= data_csr[data.index.get_loc(customer_id)].tocsr(),
                                          N=10,
                                          filter_already_liked_items=True
                                          )

data.columns[product_indices]

Index(['Stand Mixer', 'Pet Scale', 'Facial Oil', 'Miter Saw', 'Pet Bowls',
       'Self-help Books', 'Pet Carrier', 'Water Dispenser', 'Tambourine',
       'Soundbar'],
      dtype='object', name='product')