In [5]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise import SVD, KNNBasic, NMF, SlopeOne, CoClustering, BaselineOnly
from surprise import accuracy
import matplotlib.pyplot as plt


In [7]:
data = pd.read_csv('ratings_small.csv')
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


In [8]:
df = df.drop(columns = 'timestamp')
df.head(3)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0


In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,100004.0,347.01131,195.163838,1.0,182.0,367.0,520.0,671.0
movieId,100004.0,12548.664363,26369.198969,1.0,1028.0,2406.5,5418.0,163949.0
rating,100004.0,3.543608,1.058064,0.5,3.0,4.0,4.0,5.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100004 non-null  int64  
 1   movieId  100004 non-null  int64  
 2   rating   100004 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [11]:
df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

## Reading the data with surprise Library

In [17]:
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(df , reader)

## Making a train and test data for evaluation

In [18]:
train_set , test_set = train_test_split(data ,test_size = 0.2 )

## I am going to use these algorithms on the data to see which works better

In [19]:
algorithms = [SVD(), KNNBasic(), NMF(), SlopeOne(), CoClustering(), BaselineOnly()]

## Making a loop to test the algorithms

In [20]:
rmse_vals = []
mae_vals = []


for algo in algorithms:

    algo.fit(train_set)


    preds = algo.test(test_set)


    rmse = accuracy.rmse(preds)
    mae = accuracy.mae(preds)


    rmse_vals.append(rmse)
    mae_vals.append(mae)

RMSE: 0.9069
MAE:  0.6947
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9760
MAE:  0.7493
RMSE: 0.9614
MAE:  0.7391
RMSE: 0.9424
MAE:  0.7178
RMSE: 0.9803
MAE:  0.7561
Estimating biases using als...
RMSE: 0.9022
MAE:  0.6952


In [21]:
combined_acc = [(rmse + mae) / 2 for rmse, mae in zip(rmse_vals, mae_vals)]

In [22]:
best_algo_index = combined_acc.index(min(combined_acc))
best_algo_name = algorithms[best_algo_index].__class__.__name__

In [23]:
print(f"Best Algorithm: {best_algo_name}")
print(f"RMSE: {rmse_vals[best_algo_index]}")
print(f"MAE: {mae_vals[best_algo_index]}")
print(f"Combined Score: {combined_acc[best_algo_index]}")

Best Algorithm: BaselineOnly
RMSE: 0.9022259959169588
MAE: 0.6952012543545946
Combined Score: 0.7987136251357767


In [33]:
print(f"Length of bar_positions: {len(bar_positions)}")
print(f"Length of rmse_vals: {len(rmse_vals)}")
print(f"Length of mae_vals: {len(mae_vals)}")


Length of bar_positions: 6
Length of rmse_vals: 12
Length of mae_vals: 12


In [24]:
bsl = BaselineOnly()
data_train = data.build_full_trainset()
bsl.fit(data_train)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x78271b965d50>

In [25]:
bsl.predict(3, 872)

Prediction(uid=3, iid=872, r_ui=None, est=3.494574995057245, details={'was_impossible': False})