In [1]:
import polars as pl
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import math

In [2]:
df_movies = pl.read_csv('./movies.csv')
df_ratings = pl.read_csv('./ratings.csv')

In [3]:
df_ratings = df_ratings.drop('timestamp')

In [4]:
df_ratings = df_ratings.sort('userId')

In [5]:
df_ratings.schema

Schema([('userId', Int64), ('movieId', Int64), ('rating', Float64)])

In [6]:
df_ratings.head(5)

userId,movieId,rating
i64,i64,f64
1,1,4.0
1,3,4.0
1,6,4.0
1,47,5.0
1,50,5.0


In [7]:
df_ratings.tail()

userId,movieId,rating
i64,i64,f64
610,166534,4.0
610,168248,5.0
610,168250,5.0
610,168252,5.0
610,170875,3.0


In [8]:
df_ratings.describe()

statistic,userId,movieId,rating
str,f64,f64,f64
"""count""",100836.0,100836.0,100836.0
"""null_count""",0.0,0.0,0.0
"""mean""",326.127564,19435.295718,3.501557
"""std""",182.618491,35530.987199,1.042529
"""min""",1.0,1.0,0.5
"""25%""",177.0,1199.0,3.0
"""50%""",325.0,2991.0,3.5
"""75%""",477.0,8121.0,4.0
"""max""",610.0,193609.0,5.0


In [9]:
users = df_ratings.select('userId').sort('userId').unique()
itens = df_ratings.select('movieId').sort('movieId').unique()

In [16]:
class FunkSVDModelResult:
  def __init__(self, global_mean, users_bias, items_bias, P_matrix, Q_matrix, errors, iterations, sq_errors):
    self.global_mean = global_mean
    self.users_bias = users_bias
    self.items_bias = items_bias
    self.P_matrix = P_matrix
    self.Q_matrix = Q_matrix
    errors_array = np.array([errors, list(range(1, iterations+1))])
    self.sq_errors = sq_errors
    self.errors = pl.DataFrame(errors_array, schema=[("error", pl.Float64), ("iteration", pl.Int64)], orient="col")

  def predict(self, user_id, item_id):
    return round(self.global_mean + self.users_bias[user_id] + self.items_bias[item_id] + np.dot(self.P_matrix[user_id], self.Q_matrix[item_id]), 2)


In [17]:
def funk_svd(rating_matrix, k, learning_rate=0.05, regulation=0.02, iterations=10):
  rating_n_rows = rating_matrix.shape[0]

  global_mean = rating_matrix.select('rating').mean()['rating'][0]

  n_users = rating_matrix.select('userId').max().rows()[0][0]
  n_items = rating_matrix.select('movieId').max().rows()[0][0]

  users_bias = np.zeros(shape=n_users)
  items_bias = np.zeros(shape=n_items)

  P_matrix = np.full(shape=(n_users, k), fill_value=0.1, dtype=np.float64)
  Q_matrix = np.full(shape=(n_items, k), fill_value=0.1, dtype=np.float64)

  errors = list()
  sq_errors = list()

  for n_iteration in range(iterations):
    sq_error = 0
    for r_matrix_row in range(rating_n_rows):

      user = rating_matrix[r_matrix_row].select('userId').rows()[0][0] - 1
      item = rating_matrix[r_matrix_row].select('movieId').rows()[0][0] - 1
      real_r = rating_matrix[r_matrix_row].select('rating').rows()[0][0]

      pred_r = global_mean + users_bias[user] + items_bias[item] + np.dot(P_matrix[user], Q_matrix[item])
      error_ui = real_r - pred_r
      sq_error = sq_error + error_ui**2
      sq_errors.append(sq_error)

      users_bias[user] = users_bias[user] + learning_rate*(error_ui - regulation*users_bias[user])
      items_bias[item] = items_bias[item] + learning_rate*(error_ui - regulation*items_bias[item])

      for factor in range(k):
        temp_uf = P_matrix[user, factor]
        P_matrix[user, factor] = P_matrix[user, factor] + learning_rate*(error_ui*Q_matrix[item, factor] - regulation*P_matrix[user, factor])
        Q_matrix[item, factor] = Q_matrix[item, factor] + learning_rate*(error_ui*temp_uf - regulation*Q_matrix[item, factor])
    errors.append(math.sqrt(sq_error/rating_n_rows))  # RMSE

  return FunkSVDModelResult(global_mean, users_bias, items_bias, P_matrix, Q_matrix, errors, iterations, sq_errors)


In [14]:
# sample = df_ratings.group_by("userId").agg([
#     pl.col("movieId").first().alias("movieId"),
#     pl.col("rating").first().alias("rating")
# ])[:400]
X = df_ratings.select(['userId', 'movieId'])
y = df_ratings.select('rating')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
model = funk_svd(sample, iterations=20, k=40, learning_rate=0.001, regulation=0.02)

In [59]:
model.predict(user_id=1, item_id=3)

np.float64(4.0)