In [1]:
import pandas as pd
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate

In [2]:
df = pd.read_csv("Amazon - Movies and TV Ratings.csv")

In [3]:
df.head()

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,


In [4]:
df.shape

(4848, 207)

In [5]:
# Top movies with maximum ratings 
df.drop('user_id', axis=1).max().sort_values(ascending=False).head()

Movie206    5.0
Movie75     5.0
Movie86     5.0
Movie85     5.0
Movie84     5.0
dtype: float64

In [6]:
# Average rating for each movie
df.mean()

Movie1      5.000000
Movie2      5.000000
Movie3      2.000000
Movie4      5.000000
Movie5      4.103448
              ...   
Movie202    4.333333
Movie203    3.000000
Movie204    4.375000
Movie205    4.628571
Movie206    4.923077
Length: 206, dtype: float64

In [7]:
# Top 5 movies with the least audience
df.isna().sum().sort_values(ascending=False).head()

Movie146    4847
Movie153    4847
Movie67     4847
Movie66     4847
Movie2      4847
dtype: int64

In [8]:
melt_df = df.melt(id_vars=df.columns[0], value_vars=df.columns[1:], var_name='movie', value_name='rating')
melt_df

Unnamed: 0,user_id,movie,rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [9]:
reader = Reader(rating_scale=(-1,10))
data = Dataset.load_from_df(melt_df.fillna(0), reader)

train, test = train_test_split(data)

algo = SVD()
algo.fit(train)
pred = algo.test(test)
accuracy.rmse(pred)

RMSE: 0.2797


0.27969694167769793

In [10]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2844  0.2773  0.2863  0.2827  0.0039  
MAE (testset)     0.0426  0.0427  0.0430  0.0428  0.0002  
Fit time          86.54   80.78   91.15   86.16   4.24    
Test time         7.22    7.98    5.89    7.03    0.86    


{'test_rmse': array([0.28436746, 0.27729083, 0.28634751]),
 'test_mae': array([0.04261828, 0.04269332, 0.04298409]),
 'fit_time': (86.53712844848633, 80.78038120269775, 91.14971208572388),
 'test_time': (7.223807334899902, 7.978830575942993, 5.890473365783691)}

In [11]:
pred

[Prediction(uid='A2ZPPV9EVM45UH', iid='Movie89', r_ui=0.0, est=0.20651787285463113, details={'was_impossible': False}),
 Prediction(uid='A14JTAXK8OEVPD', iid='Movie190', r_ui=0.0, est=-0.008073033572904587, details={'was_impossible': False}),
 Prediction(uid='A3NQU1649SH0Q4', iid='Movie86', r_ui=0.0, est=-0.0012709530296565984, details={'was_impossible': False}),
 Prediction(uid='A2G151QE16KTID', iid='Movie23', r_ui=0.0, est=-0.007699979805080918, details={'was_impossible': False}),
 Prediction(uid='A1V44USQTM2WX4', iid='Movie206', r_ui=0.0, est=-0.012809070191770414, details={'was_impossible': False}),
 Prediction(uid='AX8TY93A2U80I', iid='Movie96', r_ui=0.0, est=-0.011842886256064937, details={'was_impossible': False}),
 Prediction(uid='A3KMOJ9UPB5TH9', iid='Movie113', r_ui=0.0, est=-0.0014423548455943735, details={'was_impossible': False}),
 Prediction(uid='A32Z8ZV7E5N8AV', iid='Movie29', r_ui=0.0, est=-0.18380923348773975, details={'was_impossible': False}),
 Prediction(uid='A3LDEB