# **Building and Testing Recommender Systems for E-commerce Products With Surprise - SVD Model and ALS Model**

In [1]:
!pip install scikit-surprise



In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from surprise import Reader, Dataset, SVD, BaselineOnly, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df_france = pd.read_excel('/content/drive/MyDrive/Datasets/df_france_rs.xlsx')

In [5]:
df_france

Unnamed: 0,InvoiceNo,StockCode,year_month,Description,Quantity,hour,InvoiceDate,UnitPrice,CustomerID,Country,...,day,dayofweek,dayofyear,weekofyear,quarter,birth_year,Age,age_category,Rating,TimeStamp
0,536370,22728,201012,Alarm Clock Bakelike Pink,24,8,2010-12-01 08:45:00,3.75,12583,France,...,1,3,335,48,4,1976,35,Millenials,3,1291161600
1,536370,22727,201012,Alarm Clock Bakelike Red,24,8,2010-12-01 08:45:00,3.75,12583,France,...,1,3,335,48,4,1976,35,Millenials,3,1291161600
2,536370,22726,201012,Alarm Clock Bakelike Green,12,8,2010-12-01 08:45:00,3.75,12583,France,...,1,3,335,48,4,1976,35,Millenials,5,1291161600
3,536370,21724,201012,Panda And Bunnies Sticker Sheet,12,8,2010-12-01 08:45:00,0.85,12583,France,...,1,3,335,48,4,1976,35,Millenials,5,1291161600
4,536370,21883,201012,Stars Gift Tape,24,8,2010-12-01 08:45:00,0.65,12583,France,...,1,3,335,48,4,1976,35,Millenials,5,1291161600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8013,579870,22636,201111,Childs Breakfast Set Circus Parade,2,16,2011-11-30 16:47:00,8.50,12437,France,...,30,3,334,48,4,1993,18,Generation Z,3,1322611200
8014,579870,84279P,201111,Cherry Blossom Decorative Flask,4,16,2011-11-30 16:47:00,3.75,12437,France,...,30,3,334,48,4,1993,18,Generation Z,5,1322611200
8015,579870,22551,201111,Plasters In Tin Spaceboy,12,16,2011-11-30 16:47:00,1.65,12437,France,...,30,3,334,48,4,1993,18,Generation Z,3,1322611200
8016,579870,22554,201111,Plasters In Tin Woodland Animals,12,16,2011-11-30 16:47:00,1.65,12437,France,...,30,3,334,48,4,1993,18,Generation Z,5,1322611200


In [6]:
df = df_france.copy()
df = df[['StockCode','CustomerID','Rating','TimeStamp']]
df

Unnamed: 0,StockCode,CustomerID,Rating,TimeStamp
0,22728,12583,3,1291161600
1,22727,12583,3,1291161600
2,22726,12583,5,1291161600
3,21724,12583,5,1291161600
4,21883,12583,5,1291161600
...,...,...,...,...
8013,22636,12437,3,1322611200
8014,84279P,12437,5,1322611200
8015,22551,12437,3,1322611200
8016,22554,12437,5,1322611200


In [7]:
user_item_rating_matrix = df.pivot_table(values = 'Rating', columns = 'StockCode', index = 'CustomerID')
user_item_rating_matrix # 'NaN' indicates products that have not been purchased by the customer

StockCode,10002,10120,10125,10135,11001,15036,15039,16012,16048,16218,...,85232D,90030B,90030C,90184B,90184C,90201B,90201C,C2,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12413,,,,,,,,,,,...,,,,,,,,,,4.0
12437,,,,,,,,3.0,,,...,,,,,,,,,4.0,4.0
12441,,,,,,,,,,,...,,,,,,,,,,4.0
12488,,,,4.0,,,,,,,...,,,,,,,,,,3.0
12489,,,,,,,,,,,...,,,,,,,,,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12734,,,,,,,,,,,...,,,,,,,,,,
12735,,,4.0,,,,,,,,...,,,,,,,,,,4.0
12736,,,,,,,,,,,...,,,,,,,,,,3.0
12740,,,,,,,,,,,...,,,,,,,,,,


In [8]:
reader = Reader(rating_scale = (0,5)) # Rating scale: rating scale from the customer on the product purchased
data = Dataset.load_from_df(df[['CustomerID', 'StockCode', 'Rating']], reader)

In [9]:
data.df.head()

Unnamed: 0,CustomerID,StockCode,Rating
0,12583,22728,3
1,12583,22727,3
2,12583,22726,5
3,12583,21724,5
4,12583,21883,5


## **Validation**

In [10]:
trainset, testset = train_test_split(data, test_size = 0.25, random_state = 101)

**SVD**

In [11]:
algo = SVD()

algo.fit(trainset)
prediction = algo.test(testset)

In [12]:
accuracy.rmse(prediction)

RMSE: 0.8422


0.8422435344119258

**ALS**

In [13]:
bsl_option = {
    'method': 'als', #als atau sgd, gradient distance
    'n_epoch': 5, #berapakali iterasi / ulang
    'reg_u': 12,
    'reg_i': 5
}

algo = BaselineOnly(bsl_options = bsl_option)
algo.fit(trainset)
prediction = algo.test(testset)

Estimating biases using als...


In [14]:
accuracy.rmse(prediction)

RMSE: 0.8317


0.8317414512869512

based on train-test validation, ALS is better than SVD (lower rmse is better)

## **Cross Validation**

**SVD**

In [15]:
algo = SVD()
cv_svd = cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8475  0.8402  0.8453  0.8322  0.8378  0.8406  0.0054  
MAE (testset)     0.7251  0.7209  0.7304  0.7098  0.7160  0.7204  0.0071  
Fit time          1.00    0.89    0.92    0.83    0.97    0.92    0.06    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


In [16]:
print('mae cv mean', cv_svd['test_mae'].mean())

mae cv mean 0.7204258377945749


In [17]:
print('rmse cv mean', cv_svd['test_rmse'].mean())

rmse cv mean 0.8405900039676364


**ALS**

In [18]:
bsl_option = {
    'method': 'als',
    'n_epoch': 5, 
    'reg_u': 12,
    'reg_i': 5
}

algo = BaselineOnly(bsl_options = bsl_option)

cv_als = cross_validate(algo, data, measures = ['RMSE', 'MAE'], cv = 5)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [19]:
print('rmse cv mean', cv_als['test_rmse'].mean())

rmse cv mean 0.8308436670028658


Based on cross validation, ALS is better

## **HyperParam Tuning**

In [20]:
# Still using SVD

In [21]:
param_grid = {
    'n_epochs': [5,10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6]
}

grid = GridSearchCV(SVD, param_grid, measures = ['rmse', 'mae'], cv = 3)
grid.fit(data)

In [22]:
# print('rmse cv mean', grid.best_score['rmse'])
print('rmse cv mean', grid.best_score['rmse'].mean())

rmse cv mean 0.820212662605189


In [23]:
grid.best_params['rmse']

{'lr_all': 0.002, 'n_epochs': 10, 'reg_all': 0.6}

## **Prediction Result**
- We will recommend Items 
  - '21731'	: Red Toadstool Led Night Light
  - '23084'	: Rabbit Night Light
  - '22556'	: Plasters In Tin Circus Parade
  - '22439'	: 6 Rocket Balloons
- to Customers 
  - '12681': Claudetta Guirau
  - '12683': Jaquelyn Eckh
  - '12509': Emmet Piggens

In [24]:
# display(df['CustomerID'].value_counts())
# display(df['StockCode'].value_counts())
# display(df.sample())

In [25]:
# product_list = df_france[['StockCode','Description']]
# display(product_list[product_list['StockCode'] == 21731].value_counts())
# display(product_list[product_list['StockCode'] == 23084].value_counts())
# display(product_list[product_list['StockCode'] == 22556].value_counts())
# display(product_list[product_list['StockCode'] == 22439].value_counts())

# customer_list = df_france[['CustomerID','CustomerName']]
# display(customer_list[customer_list['CustomerID'] == 12681].value_counts())
# display(customer_list[customer_list['CustomerID'] == 12683].value_counts())
# display(customer_list[customer_list['CustomerID'] == 12509].value_counts())

In [26]:
product_list = pd.DataFrame({
    'StockCode':[21731,23084,22556,22439],
    'Description':['Red Toadstool Led Night Light','Rabbit Night Light','Plasters In Tin Circus Parade','6 Rocket Balloons']
})

customer_list = pd.DataFrame({
    'CustomerID':[12681,12683,12509],
    'CustomerName':['Claudetta Guirau','Jaquelyn Eckh','Emmet Piggens']
})

display(product_list)
display(customer_list)

Unnamed: 0,StockCode,Description
0,21731,Red Toadstool Led Night Light
1,23084,Rabbit Night Light
2,22556,Plasters In Tin Circus Parade
3,22439,6 Rocket Balloons


Unnamed: 0,CustomerID,CustomerName
0,12681,Claudetta Guirau
1,12683,Jaquelyn Eckh
2,12509,Emmet Piggens


In [27]:
df_test = pd.DataFrame(columns = ['CustomerID', 'StockCode'])

for i in [12681, 12683, 12509]: # i for CustomerID
    for j in [21731, 23084, 22556, 22439]: # j for StockCode of Product
        df_test = df_test.append({'CustomerID': i, 'StockCode': j}, ignore_index = True)

In [28]:
df_test

Unnamed: 0,CustomerID,StockCode
0,12681,21731
1,12681,23084
2,12681,22556
3,12681,22439
4,12683,21731
5,12683,23084
6,12683,22556
7,12683,22439
8,12509,21731
9,12509,23084


In [29]:
algo = SVD()
algo.fit(trainset)
y = []

for _, row in df_test.iterrows():
    est = algo.predict(row.CustomerID, row.StockCode)
    y.append(est[3])

df_test['Rating'] = y

In [30]:
df_test.sort_values(['CustomerID', 'StockCode'], ascending = [True, False], inplace = True)
df_test

Unnamed: 0,CustomerID,StockCode,Rating
9,12509,23084,3.883017
10,12509,22556,4.086317
11,12509,22439,3.970995
8,12509,21731,3.891494
1,12681,23084,4.018838
2,12681,22556,3.839192
3,12681,22439,3.788879
0,12681,21731,4.067315
5,12683,23084,3.9645
6,12683,22556,4.041105


**Merge the data**

In [32]:
merge = pd.merge(df_test,customer_list,on='CustomerID',how='left')
df_merge = pd.merge(merge,product_list,on='StockCode',how='left')

In [33]:
df_merge

Unnamed: 0,CustomerID,StockCode,Rating,CustomerName,Description
0,12509,23084,3.883017,Emmet Piggens,Rabbit Night Light
1,12509,22556,4.086317,Emmet Piggens,Plasters In Tin Circus Parade
2,12509,22439,3.970995,Emmet Piggens,6 Rocket Balloons
3,12509,21731,3.891494,Emmet Piggens,Red Toadstool Led Night Light
4,12681,23084,4.018838,Claudetta Guirau,Rabbit Night Light
5,12681,22556,3.839192,Claudetta Guirau,Plasters In Tin Circus Parade
6,12681,22439,3.788879,Claudetta Guirau,6 Rocket Balloons
7,12681,21731,4.067315,Claudetta Guirau,Red Toadstool Led Night Light
8,12683,23084,3.9645,Jaquelyn Eckh,Rabbit Night Light
9,12683,22556,4.041105,Jaquelyn Eckh,Plasters In Tin Circus Parade


**Recommedation for every customers:**

In [34]:
customer_12509 = df_merge[df_merge['CustomerID'] == 12509]
customer_12509 = customer_12509[['CustomerID','CustomerName','StockCode','Description','Rating']]
customer_12509.sort_values('Rating', ascending=False)

Unnamed: 0,CustomerID,CustomerName,StockCode,Description,Rating
1,12509,Emmet Piggens,22556,Plasters In Tin Circus Parade,4.086317
2,12509,Emmet Piggens,22439,6 Rocket Balloons,3.970995
3,12509,Emmet Piggens,21731,Red Toadstool Led Night Light,3.891494
0,12509,Emmet Piggens,23084,Rabbit Night Light,3.883017


**Conclusion**: Plasters In Tin Circus Parade, 6 Rocket Balloons, Red Toadstool Led Night Light, and Rabbit Night Light are the products to recommend for Customer 12509 (Emmet Piggens).

In [35]:
customer_12681 = df_merge[df_merge['CustomerID'] == 12681]
customer_12681 = customer_12681[['CustomerID','CustomerName','StockCode','Description','Rating']]
customer_12681.sort_values('Rating', ascending=False)

Unnamed: 0,CustomerID,CustomerName,StockCode,Description,Rating
7,12681,Claudetta Guirau,21731,Red Toadstool Led Night Light,4.067315
4,12681,Claudetta Guirau,23084,Rabbit Night Light,4.018838
5,12681,Claudetta Guirau,22556,Plasters In Tin Circus Parade,3.839192
6,12681,Claudetta Guirau,22439,6 Rocket Balloons,3.788879


**Conclusion**: Red Toadstool Led Night Light, Rabbit Night Light, Plasters In Tin Circus Parade, and 6 Rocket Balloons are the products to recommend for Customer 12681 (Claudetta Guirau).

In [36]:
customer_12683 = df_merge[df_merge['CustomerID'] == 12683]
customer_12683 = customer_12683[['CustomerID','CustomerName','StockCode','Description','Rating']]
customer_12683.sort_values('Rating', ascending=False)

Unnamed: 0,CustomerID,CustomerName,StockCode,Description,Rating
9,12683,Jaquelyn Eckh,22556,Plasters In Tin Circus Parade,4.041105
11,12683,Jaquelyn Eckh,21731,Red Toadstool Led Night Light,4.025162
8,12683,Jaquelyn Eckh,23084,Rabbit Night Light,3.9645
10,12683,Jaquelyn Eckh,22439,6 Rocket Balloons,3.771898


**Conclusion**: Plasters In Tin Circus Parade, Red Toadstool Led Night Light, Rabbit Night Light, and 6 Rocket Balloons are the products to recommend for Customer 12683 (Jaquelyn Eckh).