In [1]:
!pip install surprise
import pandas as pd
!pip install openpyxl
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
pd.set_option('display.max_columns', None)

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630232 sha256=2a206566fc7a72c93ef3dad39ed8bce6388ab5d5191e268e17bde3dee0952a81
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [6]:
raw=pd.read_csv('/content/FinalCapistoneDataCSV_mydata.csv',usecols=['Category','Product Name','Product ID ','Ratings ','Customer ID','Order ID'],encoding = "ISO-8859-1", engine='python')
df = raw.set_axis(['Category', 'ProductName', 'ProdID', 'Ratings', 'CustomerID','OrderID'], axis=1, inplace=False)

In [7]:
product_ids = [202229578,202229553,202229514,202229490,202229415]
products = ["Pizza Pasta Seasoning","Piri Piri","Perfume Deluxe - Royale Fabric Conditioner","Cheese Slices - Made From Cow Milk","Chicken Seekh Kebab"]

In [8]:
sample_df = df[df.ProdID.isin(product_ids)]
sample_df.shape

(3653, 6)

In [9]:
sample_df.head()

Unnamed: 0,Category,ProductName,ProdID,Ratings,CustomerID,OrderID
17414,Snacks Branded Foods,Chicken Seekh Kebab,202229415,3.2,27415,3483
17489,Bakery Cakes Dairy,Cheese Slices - Made From Cow Milk,202229490,3.0,27490,3498
17513,Cleaning Household,Perfume Deluxe - Royale Fabric Conditioner,202229514,4.0,27514,3503
17552,Foodgrains Oil Masala,Piri Piri,202229553,2.8,27553,3511
17577,Foodgrains Oil Masala,Pizza Pasta Seasoning,202229578,4.7,27578,3516


In [None]:
#Creating customer_prod_df

In [10]:
customer_prod_df = sample_df.pivot_table(index=["CustomerID"], columns=["ProductName"], values="Ratings")
customer_prod_df.shape

(1073, 5)

In [11]:
customer_prod_df.head()

ProductName,Cheese Slices - Made From Cow Milk,Chicken Seekh Kebab,Perfume Deluxe - Royale Fabric Conditioner,Piri Piri,Pizza Pasta Seasoning
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10001,,4.0,4.0,,4.0
10002,,4.3,4.3,,4.3
10003,,3.3,3.3,,3.3
10004,,5.0,5.0,,5.0
10005,,4.9,4.9,,4.9


In [None]:
# User-Item matrix is created with users in rows and movies in columns.

# Here we scale the ratings to the 1-5 range with the Reader object.

In [12]:
reader = Reader(rating_scale=(1, 5))

In [14]:
data = Dataset.load_from_df(sample_df[['CustomerID', 'ProdID', 'Ratings']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7fcbce07aed0>

In [None]:
# In this library, Dataset.load is used to read data, similar to read.csv in pandas.

In [15]:
trainset, testset = train_test_split(data, test_size=.25)
svd_model = SVD()
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcbce679e90>

In [None]:
# Unknown p and q matrices were learned from the existing data. It pretends to have 100 latent factors by default

In [16]:
predictions = svd_model.test(testset)

accuracy.rmse(predictions)

RMSE: 0.5204


0.5203500148757839

In [None]:
# We tested the learned weights with test data and found the RMSE(Root Mean Square Error ) and MAE(Mean Absolute Error) values.

# Now we will apply cross validation method in order to validate th eresults.

In [17]:
cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4681  0.4953  0.5249  0.4690  0.4962  0.4907  0.0210  
MAE (testset)     0.3796  0.3993  0.4401  0.3870  0.4072  0.4026  0.0210  
Fit time          0.15    0.14    0.15    0.15    0.14    0.15    0.00    
Test time         0.00    0.00    0.01    0.00    0.00    0.01    0.00    


{'fit_time': (0.15370821952819824,
  0.14489340782165527,
  0.1485586166381836,
  0.14536356925964355,
  0.14358305931091309),
 'test_mae': array([0.37958604, 0.39928693, 0.44006847, 0.38696728, 0.40723491]),
 'test_rmse': array([0.4680823 , 0.49527352, 0.52487328, 0.46896829, 0.49620183]),
 'test_time': (0.004249095916748047,
  0.0041620731353759766,
  0.011564254760742188,
  0.0042231082916259766,
  0.004598140716552734)}

In [None]:
# Let's try the svd_model, that we built, for user Id=1 and movie ID=202229415 ("Chicken Seekh Kebab"). Let the model give us a suggestion.

In [20]:
svd_model.predict(uid=27416.0, iid=202229415, verbose=True)

user: 27416.0    item: 202229415  r_ui = None   est = 3.63   {'was_impossible': False}


Prediction(uid=27416.0, iid=202229415, r_ui=None, est=3.6265537755149415, details={'was_impossible': False})

In [None]:
# Normally, this user didn't rate this movie but we estimated it as 3.62

In [21]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}

gs = GridSearchCV(SVD,
                  param_grid,
                  measures=['rmse', 'mae'],
                  cv=3,
                  n_jobs=-1,
                  joblib_verbose=True)

gs.fit(data)
gs.best_score['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    2.1s finished


0.7014215678087495

In [None]:
# It was 0.70142 before tuning.

In [22]:
gs.best_params['rmse']

{'lr_all': 0.005, 'n_epochs': 10}

In [23]:
svd_model = SVD(**gs.best_params['rmse'])

data = data.build_full_trainset()
svd_model.fit(data)

svd_model.predict(uid=27416.0, iid=202229415, verbose=True)

user: 27416.0    item: 202229415  r_ui = None   est = 3.63   {'was_impossible': False}


Prediction(uid=27416.0, iid=202229415, r_ui=None, est=3.6256761624345453, details={'was_impossible': False})