In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
from sklearn.metrics import mean_squared_error

In [6]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3156242 sha256=218cb533d6e8948205c119bfa56bd35af9629344ab6b7776f8876df53d518a58
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [7]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [8]:
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import SVD

In [9]:
data = pd.read_csv('/content/jokes-data.csv')

In [10]:
data

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.750
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375
...,...,...,...,...
1092054,9517_132,9517,132,3.156
1092055,27767_118,27767,118,-1.594
1092056,10580_81,10580,81,2.000
1092057,31007_119,31007,119,8.906


In [11]:
data.isna().sum()

id         0
user_id    0
joke_id    0
Rating     0
dtype: int64

In [12]:
#no null values

In [13]:
data.dtypes

id          object
user_id      int64
joke_id      int64
Rating     float64
dtype: object

In [14]:
data.nunique()

id         1092059
user_id      40863
joke_id        139
Rating         641
dtype: int64

In [15]:
#dropping id column(all unique values)

In [16]:
data.drop('id',axis =1 ,inplace=True)

In [17]:
data

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.750
1,16144,109,5.094
2,23098,6,-6.438
3,14273,86,4.406
4,18419,134,9.375
...,...,...,...
1092054,9517,132,3.156
1092055,27767,118,-1.594
1092056,10580,81,2.000
1092057,31007,119,8.906


In [18]:
#no need to encode, since all values are numerical

In [19]:
#no need to handle outliers either

In [20]:
data['Rating'].value_counts()

 9.938    4692
 9.969    4584
 9.906    4247
 9.875    3808
 9.844    3466
          ... 
-7.625     575
-7.656     562
-7.688     541
-9.969     453
-9.938     305
Name: Rating, Length: 641, dtype: int64

In [21]:
data.describe().round()

Unnamed: 0,user_id,joke_id,Rating
count,1092059.0,1092059.0,1092059.0
mean,20684.0,64.0,2.0
std,11830.0,44.0,5.0
min,1.0,1.0,-10.0
25%,10412.0,22.0,-2.0
50%,21308.0,62.0,2.0
75%,30784.0,104.0,6.0
max,40863.0,139.0,10.0


In [22]:
reader = Reader(rating_scale=(-10,10))

KNNBasic

In [23]:
#slicing dataset to 30000 entries to prevent the runtime crashing from exceeding 10GB RAM usage

In [24]:
#using the sliced dataset for running KNNBasic

In [25]:
data1 = data.iloc[:30000,:]

In [26]:
data1

Unnamed: 0,user_id,joke_id,Rating
0,31030,110,2.750
1,16144,109,5.094
2,23098,6,-6.438
3,14273,86,4.406
4,18419,134,9.375
...,...,...,...
29995,9707,103,4.188
29996,33225,117,-2.250
29997,18596,16,9.031
29998,4156,73,3.938


In [27]:
df_sliced = Dataset.load_from_df(data1,reader)

In [28]:
algo_knn = KNNBasic(k=6)

In [29]:
cross_validate(algo_knn,df_sliced,measures = ['rmse','mae'],cv = 5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([5.65704173, 5.73734085, 5.75553369, 5.62999227, 5.71211271]),
 'test_mae': array([4.50645226, 4.58062514, 4.60107035, 4.49612568, 4.55947848]),
 'fit_time': (9.073894500732422,
  13.799484491348267,
  8.220375061035156,
  12.300037384033203,
  8.041123390197754),
 'test_time': (2.4307050704956055,
  0.6661794185638428,
  1.317793846130371,
  1.1017603874206543,
  0.7977464199066162)}

In [64]:
print('KNNBasic RMSE = ',np.array([5.65704173, 5.73734085, 5.75553369, 5.62999227, 5.71211271]).mean())

KNNBasic RMSE =  5.698404249999999


KNNwithMeans

In [31]:
#running KNNWithMeans with the sliced dataframe to prevent crashing

In [32]:
algo_knn_with_means = KNNWithMeans()

In [33]:
cross_validate(algo_knn_with_means,df_sliced,measures = ['rmse','mae'],cv = 5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([5.47823527, 5.4772508 , 5.53062945, 5.52101405, 5.49520208]),
 'test_mae': array([4.25659315, 4.2576426 , 4.31760184, 4.29424616, 4.28300846]),
 'fit_time': (8.064087390899658,
  10.620236873626709,
  8.415210485458374,
  9.537786722183228,
  8.275581359863281),
 'test_time': (0.7350142002105713,
  0.7604458332061768,
  0.7212886810302734,
  1.1994438171386719,
  0.726325511932373)}

In [65]:
print('KNNWithMEans RMSE = ',np.array([5.47823527, 5.4772508 , 5.53062945, 5.52101405, 5.49520208]).mean())

KNNWithMEans RMSE =  5.500466329999999


SVD

In [35]:
#SVD ran without crashing when trained on entire dataset, thus running SVD with the full dataframe

In [36]:
df_full= Dataset.load_from_df(data,reader)

In [37]:
algo_svd = SVD()

In [38]:
cross_validate(algo_svd,df_full,measures = ['rmse','mae'],cv = 5)

{'test_rmse': array([4.30942393, 4.27956573, 4.28823539, 4.29478296, 4.2773332 ]),
 'test_mae': array([3.25300357, 3.23392999, 3.2350523 , 3.24140241, 3.23395879]),
 'fit_time': (21.755882740020752,
  20.175960779190063,
  20.79176926612854,
  20.839345693588257,
  21.14991283416748),
 'test_time': (3.868865728378296,
  2.8504934310913086,
  2.325472354888916,
  2.2819888591766357,
  2.3322043418884277)}

In [66]:
print('SVD RMSE = ',np.array([4.30942393, 4.27956573, 4.28823539, 4.29478296, 4.2773332 ]).mean())

SVD RMSE =  4.289868242


Building SVD model

In [40]:
#train data test data splitting

In [41]:
train_data = data.iloc[:873647,:]


In [42]:
train_data.tail()

Unnamed: 0,user_id,joke_id,Rating
873642,9625,37,4.406
873643,13360,5,-9.219
873644,15689,8,-4.25
873645,5630,79,2.969
873646,8614,26,6.344


In [43]:
test_data = data.iloc[873647:,:]

In [44]:
train_data.shape

(873647, 3)

In [45]:
test_data.shape

(218412, 3)

In [46]:
#training SVD model on training-data

In [47]:
algo_SVD = SVD(n_epochs=30)

In [48]:
train_df = Dataset.load_from_df(train_data,reader)

In [49]:
training_data = train_df.build_full_trainset()

In [50]:
algo_SVD.fit(training_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7dd628b86cb0>

In [51]:
#predicting the 873647th entry

In [52]:
test_data['user_id'][873647]

837

In [53]:
test_data['joke_id'][873647]

9

In [54]:
test_data['Rating'][873647]

-0.188

In [55]:
prediction = algo_SVD.predict(837,9,-0.188)


In [56]:
prediction.est

-0.7727938269681316

In [57]:
#predicting the entire test dataset and creating a new csv file

In [58]:
pred = []
for i in test_data.index:
  prediction = algo_SVD.predict(test_data['user_id'][i],test_data['joke_id'][i],test_data['Rating'][i])
  pred.append(prediction.est)

In [59]:
data_with_recommendation = test_data

In [60]:
data_with_recommendation['prediction_rating'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_recommendation['prediction_rating'] = pred


In [61]:
data_with_recommendation.head(10)

Unnamed: 0,user_id,joke_id,Rating,prediction_rating
873647,837,9,-0.188,-0.772794
873648,23101,6,3.656,-0.318486
873649,5813,97,5.625,1.898832
873650,27947,75,1.781,0.328696
873651,27895,96,0.219,-0.675316
873652,40552,77,1.688,0.261748
873653,24589,86,1.812,3.41216
873654,21556,98,0.062,4.433592
873655,1992,50,0.812,-2.641491
873656,37727,22,1.875,7.235264


In [62]:
rmse_svd = mean_squared_error(data_with_recommendation['Rating'],data_with_recommendation['prediction_rating'],squared=False)

In [63]:
print('RMSE of the SVD model = ',rmse_svd)

RMSE of the SVD model =  4.286174580347215
