# Capstone Project: Shopping Coupon Recommendation

**Business Objective**: In this captsone project, a customer ecommerce shooping transaction data from Kaggle is used to evaluate data and compare the performance of recommendation models using Surprise library. The business objective of this project is to recommend coupons for the brand where a user have most event_type - viewed, in-cart or purchased.

This notebook contains the recommendation modeling analysis of the selected dataset.

#### Libraries used in this application

In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import squarify
import os

from matplotlib import rcParams
from scipy.linalg import svd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from surprise import Dataset, Reader, SVD, NMF, KNNBasic, SlopeOne, CoClustering, accuracy, BaselineOnly
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [8]:
shopping = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/shopping_sample.csv')

In [9]:
shopping = shopping.drop('Unnamed: 0', axis=1)

In [10]:
shopping.drop(shopping[shopping.shopping_time == 0].index, inplace=True)

In [11]:
(shopping == 0).all()

user_id          False
brand            False
category_code    False
product_code     False
price            False
event_type       False
shopping_time    False
dtype: bool

In [12]:
shopping.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52722 entries, 0 to 53120
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        52722 non-null  int64  
 1   brand          52722 non-null  object 
 2   category_code  52722 non-null  object 
 3   product_code   52722 non-null  object 
 4   price          52722 non-null  float64
 5   event_type     52722 non-null  object 
 6   shopping_time  52722 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 3.2+ MB


#### Data Normalization

In [88]:
numeric_col = ['user_id', 'price', 'shopping_time']
categorical_col = ['brand', 'category_code', 'product_code' , 'event_type']

In [97]:
#Converting Categorical Columns to Numerical Columns using Target Encoder
shopping_encoder=shopping.copy()

encoder=preprocessing.LabelEncoder()

def target_encoder(data):
    impute_ordinal = encoder.fit_transform(np.array(data).reshape(-1,1))
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

for i in tqdm(range(len(categorical_col))):
    target_encoder(shopping_encoder[categorical_col[i]])

100%|██████████| 4/4 [00:00<00:00, 56.50it/s]


In [98]:
shopping_encoder.head()

Unnamed: 0,user_id,brand,category_code,product_code,price,event_type,shopping_time
0,513706639,818,3,3,420.09,2,15
1,518135646,49,7,86,949.47,2,20
2,513692818,49,7,86,1088.7,2,3
3,514596160,58,7,104,239.59,2,17
4,558153369,1,4,62,643.49,2,17


In [99]:
shopping_encoder.describe()

Unnamed: 0,user_id,price,shopping_time
count,52722.0,52722.0,52722.0
mean,533971900.0,354.408939,11.282406
std,18533300.0,385.276514,5.192909
min,321655800.0,0.88,1.0
25%,516032000.0,107.85,7.0
50%,530618900.0,218.26,11.0
75%,552121200.0,459.345,16.0
max,566258600.0,2574.07,23.0


In [100]:
# Removing outliers from the target variable 'price' using IQR
first_quartile = shopping_encoder['price'].quantile(.25)
third_quartile = shopping_encoder['price'].quantile(.75)
iqr = third_quartile - first_quartile 
lower = first_quartile - 1.5*iqr
upper = third_quartile + 1.5*iqr
shopping_no_outlier = shopping_encoder.loc[(shopping_encoder['price']>lower) & (shopping_encoder['price']<upper)]

In [101]:
shopping_no_outlier.describe()

Unnamed: 0,user_id,price,shopping_time
count,48497.0,48497.0,48497.0
mean,534058900.0,266.350895,11.255191
std,18562590.0,228.899468,5.186178
min,321655800.0,0.88,1.0
25%,516079400.0,99.72,7.0
50%,530858600.0,190.97,11.0
75%,552251500.0,366.8,16.0
max,566258600.0,986.09,23.0


##### Feature Importance - Work in Progress (WIP)

##### Standardizing features

In [94]:
# Before we model, we need to scale the numerical values. Since we are predicting the recommendation of coupons based on the brand purchased by the user, we will keep the 'user_brand' as target
shopping_no_outlier['user_id'] = StandardScaler().fit_transform(np.array(shopping_no_outlier['user_id']).reshape(-1,1))
shopping_no_outlier['brand'] = StandardScaler().fit_transform(np.array(shopping_no_outlier['brand']).reshape(-1,1))
shopping_no_outlier['product_code'] = StandardScaler().fit_transform(np.array(shopping_no_outlier['category_code']).reshape(-1,1))
shopping_no_outlier['price'] = StandardScaler().fit_transform(np.array(shopping_no_outlier['brand']).reshape(-1,1))

In [95]:
shopping_no_outlier.head()

Unnamed: 0,user_id,brand,category_code,product_code,price,event_type,shopping_time
0,-1.096423,0.929427,3,-1.00238,0.929427,2,15
1,-0.857822,-1.540302,7,0.670248,-1.540302,2,20
3,-1.048502,-1.511398,7,0.670248,-1.511398,2,17
4,1.298027,-1.69446,4,-0.584223,-1.69446,2,17
5,0.544104,-0.303832,2,-1.420537,-0.303832,2,17


In [None]:
# Logistic Regression
lgr = LogisticRegression()


#### Association Rules - Clustering **- TBD**

*   PCA - patterns in the columns
*   K-Means - patterns in the rows




#### Data Modeling

In [54]:
reader = Reader(rating_scale=(1,3))
data = Dataset.load_from_df(shopping_no_outlier[['user_id', 'brand', 'event_type']], reader)
trainset, testset = train_test_split(data, test_size=.30, random_state=42)

In [56]:
param_grid = {'bsl_options':{'method': ['als','sgd'],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}}
params = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}
params1 = {'n_epochs': [5, 10]}

In [23]:
models = ['Baseline', 'SVD', 'SlopeOne', 'CoClustering']

In [58]:
best_parameters = []
best_mae = []
best_rmse = []
mean_fit_time = []
mean_test_fit_time = []

##### Baseline Only

In [59]:
base = BaselineOnly()
base.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f175a69ced0>

In [60]:
# Cross Validate the Model
cross_val_results = cross_validate(base, data, measures=['MAE', 'RMSE'], cv=5, verbose=True)
cross_val_results

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating MAE, RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.1563  0.1564  0.1609  0.1606  0.1583  0.1585  0.0020  
RMSE (testset)    0.3642  0.3601  0.3782  0.3783  0.3644  0.3690  0.0077  
Fit time          0.30    0.31    0.35    0.32    0.32    0.32    0.02    
Test time         0.08    0.08    0.08    0.08    0.08    0.08    0.00    


{'test_mae': array([0.15626063, 0.15639055, 0.16091861, 0.16060951, 0.15828638]),
 'test_rmse': array([0.36420303, 0.36008602, 0.37819603, 0.37830369, 0.3643884 ]),
 'fit_time': (0.3003675937652588,
  0.3129093647003174,
  0.35263633728027344,
  0.3173205852508545,
  0.32468700408935547),
 'test_time': (0.08164763450622559,
  0.07927703857421875,
  0.07802200317382812,
  0.07595539093017578,
  0.07676911354064941)}

In [61]:
predicts = base.test(testset)
predicts[:5]

[Prediction(uid=1.2420687566653663, iid=1.2987623731977087, r_ui=2.0, est=1.9177225429136011, details={'was_impossible': False}),
 Prediction(uid=-0.6484151835398171, iid=-1.5403022941811848, r_ui=2.0, est=1.8533391358246647, details={'was_impossible': False}),
 Prediction(uid=-1.1418742086117013, iid=-1.5724184103280048, r_ui=2.0, est=1.9619179466254866, details={'was_impossible': False}),
 Prediction(uid=-0.24248671179864179, iid=-1.5403022941811848, r_ui=2.0, est=1.8533391358246647, details={'was_impossible': False}),
 Prediction(uid=-1.0887782141911377, iid=-0.3744872780516165, r_ui=2.0, est=1.9084957364314539, details={'was_impossible': False})]

In [62]:
accuracy.mae(predicts)
accuracy.rmse(predicts)

MAE:  0.1500
RMSE: 0.3504


0.3503921967762271

In [63]:
base_grid = GridSearchCV(BaselineOnly, param_grid, measures=['MAE', 'RMSE'], cv=5)
base_grid.fit(data)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [64]:
best_parameters.append(base_grid.best_params['rmse'])
best_mae.append(base_grid.best_score['mae'])
best_rmse.append(base_grid.best_score['rmse'])
mean_fit_time.append(base_grid.cv_results['mean_fit_time'].mean())
mean_test_fit_time.append(base_grid.cv_results['mean_test_time'].mean())

##### SVD

In [65]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f175a69c050>

In [66]:
# Cross Validate the Model
cross_val_results = cross_validate(svd, data, measures=['MAE', 'RMSE'], cv=5, verbose=True)
cross_val_results

Evaluating MAE, RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.1782  0.1733  0.1706  0.1698  0.1516  0.1687  0.0091  
RMSE (testset)    0.3817  0.3705  0.3618  0.3625  0.3770  0.3707  0.0078  
Fit time          2.67    2.64    2.65    2.62    2.66    2.65    0.02    
Test time         0.08    0.08    0.07    0.08    0.08    0.08    0.00    


{'test_mae': array([0.17822156, 0.17328514, 0.17059583, 0.16983354, 0.15155155]),
 'test_rmse': array([0.381715  , 0.37051616, 0.36184288, 0.3625056 , 0.37696481]),
 'fit_time': (2.674093723297119,
  2.6446878910064697,
  2.6499009132385254,
  2.623044967651367,
  2.6592512130737305),
 'test_time': (0.07867741584777832,
  0.08414459228515625,
  0.07491016387939453,
  0.07757043838500977,
  0.08164739608764648)}

In [67]:
predicts = svd.test(testset)
predicts[:5]

[Prediction(uid=1.2420687566653663, iid=1.2987623731977087, r_ui=2.0, est=1.9375420172053621, details={'was_impossible': False}),
 Prediction(uid=-0.6484151835398171, iid=-1.5403022941811848, r_ui=2.0, est=1.9247210534395738, details={'was_impossible': False}),
 Prediction(uid=-1.1418742086117013, iid=-1.5724184103280048, r_ui=2.0, est=1.889730853993163, details={'was_impossible': False}),
 Prediction(uid=-0.24248671179864179, iid=-1.5403022941811848, r_ui=2.0, est=1.8647285648381589, details={'was_impossible': False}),
 Prediction(uid=-1.0887782141911377, iid=-0.3744872780516165, r_ui=2.0, est=1.9232704019787057, details={'was_impossible': False})]

In [68]:
accuracy.mae(predicts)
accuracy.rmse(predicts)

MAE:  0.1382
RMSE: 0.3336


0.3335573941141729

In [69]:
svd_grid = GridSearchCV(SVD, param_grid=params, measures=['MAE', 'RMSE'], cv=5)
svd_grid.fit(data)

In [70]:
best_parameters.append(svd_grid.best_params['rmse'])
best_mae.append(svd_grid.best_score['mae'])
best_rmse.append(svd_grid.best_score['rmse'])
mean_fit_time.append(svd_grid.cv_results['mean_fit_time'].mean())
mean_test_fit_time.append(svd_grid.cv_results['mean_test_time'].mean())

##### SlopeOne

In [71]:
slope = SlopeOne()
slope.fit(trainset)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x7f175f075310>

In [72]:
# Cross Validate the Model
cross_val_results = cross_validate(slope, data, measures=['MAE', 'RMSE'], cv=5, verbose=True)
cross_val_results

Evaluating MAE, RMSE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.1514  0.1582  0.1562  0.1543  0.1556  0.1552  0.0023  
RMSE (testset)    0.3672  0.3877  0.3772  0.3715  0.3810  0.3769  0.0072  
Fit time          0.45    0.46    0.45    0.46    0.45    0.46    0.01    
Test time         0.11    0.11    0.10    0.11    0.11    0.11    0.00    


{'test_mae': array([0.15140868, 0.15821373, 0.15620595, 0.15430018, 0.15564479]),
 'test_rmse': array([0.36720644, 0.3876654 , 0.37723415, 0.37151949, 0.38099618]),
 'fit_time': (0.45217323303222656,
  0.4609370231628418,
  0.44827747344970703,
  0.4615166187286377,
  0.453336238861084),
 'test_time': (0.10975527763366699,
  0.1114797592163086,
  0.10226082801818848,
  0.10633301734924316,
  0.11178135871887207)}

In [73]:
predicts = slope.test(testset)
predicts[:5]

[Prediction(uid=1.2420687566653663, iid=1.2987623731977087, r_ui=2.0, est=2.0, details={'was_impossible': False}),
 Prediction(uid=-0.6484151835398171, iid=-1.5403022941811848, r_ui=2.0, est=2.0, details={'was_impossible': False}),
 Prediction(uid=-1.1418742086117013, iid=-1.5724184103280048, r_ui=2.0, est=2.0, details={'was_impossible': False}),
 Prediction(uid=-0.24248671179864179, iid=-1.5403022941811848, r_ui=2.0, est=2.0, details={'was_impossible': False}),
 Prediction(uid=-1.0887782141911377, iid=-0.3744872780516165, r_ui=2.0, est=1.9180370122171246, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})]

In [74]:
accuracy.mae(predicts)
accuracy.rmse(predicts)

MAE:  0.0592
RMSE: 0.2377


0.23767179259879564

In [75]:
slope_grid = GridSearchCV(SlopeOne, param_grid={}, measures=['MAE', 'RMSE'], cv=5)
slope_grid.fit(data)

In [76]:
best_parameters.append(slope_grid.best_params['rmse'])
best_mae.append(slope_grid.best_score['mae'])
best_rmse.append(slope_grid.best_score['rmse'])
mean_fit_time.append(slope_grid.cv_results['mean_fit_time'].mean())
mean_test_fit_time.append(slope_grid.cv_results['mean_test_time'].mean())

##### CoClustering

In [77]:
cocluster = CoClustering()
cocluster.fit(trainset)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x7f175969a750>

In [78]:
# Cross Validate the Model
cross_val_results = cross_validate(cocluster, data, measures=['MAE', 'RMSE'], cv=5, verbose=True)
cross_val_results

Evaluating MAE, RMSE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.1550  0.1605  0.1579  0.1591  0.1612  0.1588  0.0022  
RMSE (testset)    0.3663  0.3823  0.3710  0.3806  0.3850  0.3770  0.0071  
Fit time          4.09    4.05    4.07    4.02    4.12    4.07    0.03    
Test time         0.08    0.07    0.07    0.07    0.07    0.07    0.00    


{'test_mae': array([0.155016  , 0.1605373 , 0.15788728, 0.15914305, 0.16116803]),
 'test_rmse': array([0.36627502, 0.38227159, 0.37101123, 0.38060565, 0.38496119]),
 'fit_time': (4.085868835449219,
  4.054419755935669,
  4.06904411315918,
  4.02198052406311,
  4.1226043701171875),
 'test_time': (0.07525157928466797,
  0.06616806983947754,
  0.07065582275390625,
  0.0683903694152832,
  0.06694817543029785)}

In [79]:
predicts = cocluster.test(testset)
predicts[:5]

[Prediction(uid=1.2420687566653663, iid=1.2987623731977087, r_ui=2.0, est=2.0021636779339316, details={'was_impossible': False}),
 Prediction(uid=-0.6484151835398171, iid=-1.5403022941811848, r_ui=2.0, est=1.8337907952986645, details={'was_impossible': False}),
 Prediction(uid=-1.1418742086117013, iid=-1.5724184103280048, r_ui=2.0, est=2.113004307072533, details={'was_impossible': False}),
 Prediction(uid=-0.24248671179864179, iid=-1.5403022941811848, r_ui=2.0, est=1.9179596886437444, details={'was_impossible': False}),
 Prediction(uid=-1.0887782141911377, iid=-0.3744872780516165, r_ui=2.0, est=1.9179596886437444, details={'was_impossible': False})]

In [80]:
accuracy.mae(predicts)
accuracy.rmse(predicts)

MAE:  0.1283
RMSE: 0.2557


0.2556748199163615

In [81]:
cocluster_grid = GridSearchCV(CoClustering, param_grid=params1, measures=['MAE', 'RMSE'], cv=5)
cocluster_grid.fit(data)

In [82]:
best_parameters.append(cocluster_grid.best_params['rmse'])
best_mae.append(cocluster_grid.best_score['mae'])
best_rmse.append(cocluster_grid.best_score['rmse'])
mean_fit_time.append(cocluster_grid.cv_results['mean_fit_time'].mean())
mean_test_fit_time.append(cocluster_grid.cv_results['mean_test_time'].mean())

##### Results Summary DF

In [49]:
results_df =  ''

In [50]:
results_df = pd.DataFrame({'Models': models,
                         'Best Params': best_parameters,
                         'Best MAE': best_mae,
                         'Best RMSE': best_rmse,
                         'Mean Fit Time': mean_fit_time,
                         'Mean Test Time': mean_test_fit_time}).set_index('Models')
pd.set_option('display.max_colwidth', None)

In [51]:
results_df

Unnamed: 0_level_0,Best Params,Best MAE,Best RMSE,Mean Fit Time,Mean Test Time
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baseline,"{'bsl_options': {'method': 'als', 'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}}",0.158807,0.37083,0.171407,0.096361
SVD,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}",0.156456,0.371596,1.186085,0.100763
SlopeOne,{},0.156518,0.377987,0.472478,0.166293
CoClustering,{'n_epochs': 5},0.159542,0.378949,1.936132,0.102242
