In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.metrics import mean_squared_error

In [3]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [4]:
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import SVD

In [5]:
!pip install surprise



In [6]:
data = pd.read_csv('/content/jokes-data.csv')

In [7]:
data

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.750
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375
...,...,...,...,...
1092054,9517_132,9517,132,3.156
1092055,27767_118,27767,118,-1.594
1092056,10580_81,10580,81,2.000
1092057,31007_119,31007,119,8.906


In [8]:
data.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

In [9]:
#no null values

In [10]:
data.dtypes

id          object
user_id      int64
joke_id      int64
Rating     float64
dtype: object

In [11]:
data.nunique()

id         1092059
user_id      40863
joke_id        139
Rating         641
dtype: int64

In [12]:
#dropping id column(all unique values)

In [13]:
data.drop('id',axis =1 ,inplace=True)

In [14]:
data

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.750
1,16144,109,5.094
2,23098,6,-6.438
3,14273,86,4.406
4,18419,134,9.375
...,...,...,...
1092054,9517,132,3.156
1092055,27767,118,-1.594
1092056,10580,81,2.000
1092057,31007,119,8.906


In [15]:
#no need to encode, since all values are numerical

In [16]:
#no need to handle outliers either

In [17]:
data['Rating'].value_counts()

 9.938    4692
 9.969    4584
 9.906    4247
 9.875    3808
 9.844    3466
          ... 
-7.625     575
-7.656     562
-7.688     541
-9.969     453
-9.938     305
Name: Rating, Length: 641, dtype: int64

In [18]:
data.describe().round()

Unnamed: 0,user_id,joke_id,Rating
count,1092059.0,1092059.0,1092059.0
mean,20684.0,64.0,2.0
std,11830.0,44.0,5.0
min,1.0,1.0,-10.0
25%,10412.0,22.0,-2.0
50%,21308.0,62.0,2.0
75%,30784.0,104.0,6.0
max,40863.0,139.0,10.0


In [19]:
reader = Reader(rating_scale=(-10,10))

KNNBasic

In [20]:
#slicing dataset to 30000 entries to prevent the runtime crashing from exceeding 10GB RAM usage

In [21]:
#using the sliced dataset for running KNNBasic

In [22]:
data1 = data.iloc[:30000,:]

In [23]:
data1

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.750
1,16144,109,5.094
2,23098,6,-6.438
3,14273,86,4.406
4,18419,134,9.375
...,...,...,...
29995,9707,103,4.188
29996,33225,117,-2.250
29997,18596,16,9.031
29998,4156,73,3.938


In [24]:
df_sliced = Dataset.load_from_df(data1,reader)

In [25]:
algo_knn = KNNBasic(k=6)

In [26]:
cross_validate(algo_knn,df_sliced,measures = ['rmse','mae'],cv = 5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([5.74503396, 5.66314919, 5.74690046, 5.69336464, 5.63912658]),
 'test_mae': array([4.59737533, 4.51215236, 4.57134223, 4.53734462, 4.51452543]),
 'fit_time': (16.621215105056763,
  15.577291011810303,
  8.715062141418457,
  8.518911123275757,
  8.58379316329956),
 'test_time': (2.081223487854004,
  0.5860180854797363,
  0.624180793762207,
  0.6287875175476074,
  0.785123348236084)}

In [27]:
print('KNNBasic RMSE = ',np.array([5.65390346, 5.7107905 , 5.73783736, 5.7140158 , 5.71744828]).mean())

KNNBasic RMSE =  5.706799080000001


KNNwithMeans

In [28]:
#running KNNWithMeans with the sliced dataframe to prevent crashing

In [29]:
algo_knn_with_means = KNNWithMeans()

In [30]:
cross_validate(algo_knn_with_means,df_sliced,measures = ['rmse','mae'],cv = 5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([5.48342134, 5.50034468, 5.46597222, 5.4622693 , 5.56028254]),
 'test_mae': array([4.25481528, 4.26743853, 4.24086844, 4.24738748, 4.30046394]),
 'fit_time': (8.357481241226196,
  10.41555666923523,
  8.869817972183228,
  8.315437078475952,
  9.022872686386108),
 'test_time': (0.6682271957397461,
  0.670234203338623,
  0.6958184242248535,
  0.6699163913726807,
  0.7773284912109375)}

In [31]:
print('KNNWithMEans RMSE = ',np.array([5.6316776 , 5.48594245, 5.46865932, 5.40284434, 5.5260681 ]).mean())

KNNWithMEans RMSE =  5.503038362


SVD

In [32]:
#SVD ran without crashing when trained on entire dataset, thus running SVD with the full dataframe

In [33]:
df_full= Dataset.load_from_df(data,reader)

In [34]:
algo_svd = SVD()

In [35]:
cross_validate(algo_svd,df_full,measures = ['rmse','mae'],cv = 5)

{'test_rmse': array([4.28997319, 4.2976783 , 4.29195577, 4.27978912, 4.29631185]),
 'test_mae': array([3.23990013, 3.24559015, 3.24197511, 3.23368511, 3.24155705]),
 'fit_time': (19.29219079017639,
  17.965081691741943,
  18.109551906585693,
  18.145591974258423,
  18.149436712265015),
 'test_time': (3.476757764816284,
  3.225891590118408,
  3.0964314937591553,
  3.5822300910949707,
  3.0211281776428223)}

In [36]:
print('SVD RMSE = ',np.array([4.29744344, 4.28861514, 4.28550265, 4.29604701, 4.2880944 ]).mean())

SVD RMSE =  4.291140528


Building SVD model

In [37]:
#train data test data splitting

In [38]:
train_data = data.iloc[:873647,:]


In [39]:
train_data.tail()

Unnamed: 0,user_id,joke_id,Rating
873642,9625,37,4.406
873643,13360,5,-9.219
873644,15689,8,-4.25
873645,5630,79,2.969
873646,8614,26,6.344


In [40]:
test_data = data.iloc[873647:,:]

In [41]:
train_data.shape

(873647, 3)

In [42]:
test_data.shape

(218412, 3)

In [43]:
#training SVD model on training-data

In [44]:
algo_SVD = SVD(n_epochs=30)

In [45]:
train_df = Dataset.load_from_df(train_data,reader)

In [46]:
training_data = train_df.build_full_trainset()

In [47]:
algo_SVD.fit(training_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c8ed5430a90>

In [48]:
#predicting the 873647th entry

In [49]:
test_data['user_id'][873647]

837

In [50]:
test_data['joke_id'][873647]

9

In [51]:
test_data['Rating'][873647]

-0.188

In [52]:
prediction = algo_SVD.predict(837,9,-0.188)


In [53]:
prediction.est

0.3374842420529174

In [61]:
#predicting the entire test dataset and creating a new csv file

In [55]:
pred = []
for i in test_data.index:
  prediction = algo_SVD.predict(test_data['user_id'][i],test_data['joke_id'][i],test_data['Rating'][i])
  pred.append(prediction.est)

In [56]:
data_with_recommendation = test_data

In [57]:
data_with_recommendation['prediction_rating'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_recommendation['prediction_rating'] = pred


In [58]:
data_with_recommendation.head(10)

Unnamed: 0,user_id,joke_id,Rating,prediction_rating
873647,837,9,-0.188,0.337484
873648,23101,6,3.656,-0.168004
873649,5813,97,5.625,1.804151
873650,27947,75,1.781,-2.422895
873651,27895,96,0.219,-2.05887
873652,40552,77,1.688,0.679216
873653,24589,86,1.812,-3.761936
873654,21556,98,0.062,4.86321
873655,1992,50,0.812,-1.506814
873656,37727,22,1.875,7.385963


In [59]:
rmse_svd = mean_squared_error(data_with_recommendation['Rating'],data_with_recommendation['prediction_rating'],squared=False)

In [60]:
print('RMSE of the SVD model = ',rmse_svd)

RMSE of the SVD model =  4.282060499197553
