In [12]:
import pandas as pd
import numpy as np
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error
import matplotlib as plt

In [2]:
# Data pipeline
PATH = r"D:\Projects\moveilens"
df_rating = pd.DataFrame(pd.read_csv(fr"{PATH}\rating.csv"))
pivoted_df = df_rating.iloc[:50000].pivot(index="userId", columns="movieId", values='rating')

In [67]:
user_item = pivoted_df.values
#user_item[np.isnan(user_item)] = 0
print(user_item)

[[0.  3.5 0.  ... 0.  0.  0. ]
 [0.  0.  4.  ... 0.  0.  0. ]
 [4.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  4.  0.  ... 0.  0.  0. ]
 [4.5 4.  0.  ... 0.  0.  0. ]]


In [4]:
df_rating.rename(columns={'userId': 'u_id', 'movieId': 'i_id'}, inplace=True)
train = df_rating.sample(frac=0.8)
val = df_rating.drop(train.index.tolist()).sample(frac=0.6)
test = df_rating.drop(train.index.tolist()).drop(val.index.tolist())

In [5]:
test.head(10)

Unnamed: 0,u_id,i_id,rating,timestamp
5,1,112,3.5,2004-09-10 03:09:00
8,1,253,4.0,2005-04-02 23:35:40
10,1,293,4.0,2005-04-02 23:31:43
14,1,367,3.5,2005-04-02 23:53:00
28,1,1136,3.5,2005-04-02 23:30:09
37,1,1217,3.5,2005-04-02 23:33:30
44,1,1258,4.0,2004-09-10 03:13:14
56,1,1358,4.0,2005-04-02 23:43:39
59,1,1387,4.0,2005-04-02 23:35:13
69,1,2100,4.0,2005-04-02 23:52:35


In [7]:
# Constants
LEARNING_RATE = 0.007
EPOCH = 50
LATENT_FACTORS = 20
REG = 0.005

In [8]:
# Training
model_svd = SVD(lr=LEARNING_RATE, reg=REG, n_epochs=EPOCH, n_factors=LATENT_FACTORS, early_stopping=False, shuffle=False, min_rating=1, max_rating=5)
model_svd.fit(X=train, X_val=val)

Preprocessing data...

Preprocessing data...

Epoch 1/50  | val_loss: 0.76 - val_rmse: 0.87 - val_mae: 0.67 - took 3.9 sec
Epoch 2/50  | val_loss: 0.74 - val_rmse: 0.86 - val_mae: 0.66 - took 2.6 sec
Epoch 3/50  | val_loss: 0.71 - val_rmse: 0.84 - val_mae: 0.65 - took 2.6 sec
Epoch 4/50  | val_loss: 0.69 - val_rmse: 0.83 - val_mae: 0.64 - took 2.6 sec
Epoch 5/50  | val_loss: 0.67 - val_rmse: 0.82 - val_mae: 0.63 - took 2.6 sec
Epoch 6/50  | val_loss: 0.65 - val_rmse: 0.81 - val_mae: 0.62 - took 2.7 sec
Epoch 7/50  | val_loss: 0.64 - val_rmse: 0.80 - val_mae: 0.61 - took 3.1 sec
Epoch 8/50  | val_loss: 0.63 - val_rmse: 0.80 - val_mae: 0.61 - took 2.7 sec
Epoch 9/50  | val_loss: 0.63 - val_rmse: 0.79 - val_mae: 0.60 - took 2.7 sec
Epoch 10/50 | val_loss: 0.62 - val_rmse: 0.79 - val_mae: 0.60 - took 2.7 sec
Epoch 11/50 | val_loss: 0.62 - val_rmse: 0.79 - val_mae: 0.60 - took 2.7 sec
Epoch 12/50 | val_loss: 0.62 - val_rmse: 0.79 - val_mae: 0.60 - took 2.7 sec
Epoch 13/50 | val_loss: 0.62 -

<funk_svd.svd.SVD at 0x23c0034b688>

In [13]:
# Testing
predicted = model_svd.predict(test)
metric = mean_absolute_error(test['rating'], predicted)
print(f'Test MAE: {metric:.2f}')

Test MAE: 0.60


In [66]:
predicted = model_svd.predict()

TypeError: 'int' object is not subscriptable

In [None]:
df_movie = pd.DataFrame(pd.read_csv(fr"{PATH}\movie.csv"))

In [62]:
# Recommending
import random
number_of_recommends = 10
random_user = random.randint(1,50000)
user_access = df_rating.loc[df_rating['u_id'] == random_user]
predicted = model_svd.predict(user_access)
ind = 0
print("Recommendation is:")
for r in range(number_of_recommends):
    for index, row in user_access.iterrows():
        if predicted[ind] > 4.5:
            film_row = df_movie.loc[df_movie['movieId'] == row['i_id']]
            print(film_row['title'])
print(predicted)

Recommendation is:
[4.294850298279461, 2.1154673999224904, 5, 4.5126854632264335, 5, 4.755279968441896, 4.045976065079307, 1.978948783126258, 4.210673537891176, 2.3143863939478817, 5, 3.9813482936807687, 5, 4.83662146547618, 4.668597809126402, 3.917443429812702, 4.704381470259394, 4.572389222368271, 3.213604937076598, 5, 4.078226546040213]
