In [None]:
# Remove the `sample_data` directory from the working directory
# (presumably the sample datasets bundled with Colab — TODO confirm).
!rm -rf sample_data

In [None]:
!pip install polars numpy seaborn



In [None]:
!gdown 1nNrnLYdD6KljcsoNK16CjpWAvJHoNje7 \
&& gdown 1vj2EyN6W0lbA51FJ5a5xnLCoa5kmlRrh \
&& gdown 1E9LHoC11q_dXZeW71bwnXyY9hxLdMf1w \

Downloading...
From (original): https://drive.google.com/uc?id=1nNrnLYdD6KljcsoNK16CjpWAvJHoNje7
From (redirected): https://drive.google.com/uc?id=1nNrnLYdD6KljcsoNK16CjpWAvJHoNje7&confirm=t&uuid=5d6511ce-5584-46af-b472-840e65015867
To: /content/ratings.csv
100% 877M/877M [00:10<00:00, 80.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vj2EyN6W0lbA51FJ5a5xnLCoa5kmlRrh
To: /content/movies.csv
100% 494k/494k [00:00<00:00, 7.15MB/s]
usage: gdown [-h] [-V] [-O OUTPUT] [-q] [--fuzzy] [--id] [--proxy PROXY] [--speed SPEED]
             [--no-cookies] [--no-check-certificate] [--continue] [--folder] [--remaining-ok]
             [--format FORMAT] [--user-agent USER_AGENT]
             url_or_id
gdown: error: unrecognized arguments: \


In [None]:
import polars as pl
import numpy as np
import seaborn as sns
import math

In [29]:
# Load the movies and ratings CSV files fetched by the gdown cell into polars DataFrames.
df_movies = pl.read_csv('./movies.csv')
df_ratings = pl.read_csv('./ratings.csv')

In [30]:
# One rating per user: the first (movieId, rating) pair found in each user's group.
# maintain_order=True keeps the groups in first-appearance order, so the resulting
# row order is reproducible across runs (the default group order is not guaranteed).
# NOTE(review): this same statement is repeated later in the notebook, after
# `timestamp` is dropped and the frame is sorted — consider keeping only one copy.
sample = df_ratings.group_by("userId", maintain_order=True).agg([
    pl.col("movieId").first().alias("movieId"),
    pl.col("rating").first().alias("rating")
])

In [32]:
# (rows, columns) of the one-rating-per-user sample.
sample.shape

(610, 3)

In [33]:
# (rows, columns) of the full ratings table.
# NOTE(review): the saved output here reports 100,836 rows, while the describe()
# output further down reports 32,000,204 — the stored outputs appear to come from
# different datasets/runs. Re-run the notebook top to bottom to refresh them.
df_ratings.shape

(100836, 4)

In [None]:
# Drop the `timestamp` column. strict=False makes the cell idempotent: re-running it
# after the column is already gone is a no-op instead of raising ColumnNotFoundError.
df_ratings = df_ratings.drop('timestamp', strict=False)

In [None]:
# Order the ratings by user. maintain_order=True keeps rows with the same userId in
# their original relative order, so "first rating per user" computed later is
# reproducible (the default sort does not guarantee the order of ties).
df_ratings = df_ratings.sort('userId', maintain_order=True)

In [None]:
# Inspect the column names and dtypes that remain after dropping `timestamp`.
df_ratings.schema

Schema([('userId', Int64), ('movieId', Int64), ('rating', Float64)])

In [None]:
# Preview the first 5 rows of the sorted ratings table.
df_ratings.head(5)

userId,movieId,rating
i64,i64,f64
1,17,4.0
1,25,1.0
1,29,2.0
1,30,5.0
1,32,5.0


In [None]:
# Preview the last rows of the sorted ratings table (highest userId values).
df_ratings.tail()

userId,movieId,rating
i64,i64,f64
200948,79702,4.5
200948,79796,1.0
200948,80350,0.5
200948,80463,3.5
200948,87304,4.5


In [None]:
# Summary statistics (count, nulls, mean, std, quantiles, min/max) for every column.
df_ratings.describe()

statistic,userId,movieId,rating
str,f64,f64,f64
"""count""",32000204.0,32000204.0,32000204.0
"""null_count""",0.0,0.0,0.0
"""mean""",100278.506411,29318.610122,3.540396
"""std""",57949.046233,50958.16088,1.058986
"""min""",1.0,1.0,0.5
"""25%""",50053.0,1233.0,3.0
"""50%""",100297.0,3452.0,3.5
"""75%""",150451.0,44199.0,4.0
"""max""",200948.0,292757.0,5.0


In [None]:
# Distinct user IDs and movie IDs, in ascending order.
# unique() does not preserve the input order by default, so sorting has to happen
# AFTER de-duplication; sorting first (as before) gave no ordering guarantee and
# sorted the full table for nothing.
users = df_ratings.select('userId').unique().sort('userId')
itens = df_ratings.select('movieId').unique().sort('movieId')

In [None]:
class FunkSVDModelResult:
  def __init__(self, global_mean, users_bias, itens_bias, P_matrix, Q_matrix, errors, iterations):
    self.global_mean = global_mean
    self.users_bias = users_bias
    self.itens_bias = itens_bias
    self.P_matrix = P_matrix
    self.Q_matrix = Q_matrix
    errors_array = np.array([errors, list(range(1, iterations+1))])
    self.errors = pl.DataFrame(errors_array, schema=[("error", pl.Float32), ("iteration", pl.Int64)], orient="col")

  def predict(self, user_id, item_id):
    return round(self.global_mean + self.users_bias[user_id] + self.itens_bias[item_id] + np.dot(self.P_matrix[user_id], self.Q_matrix[item_id]), 2)


In [None]:
def funkSVD(rating_matrix, users, itens, k, learning_rate=0.05, regulation=0.02, iterations=10, seed=0):
  """Fit a biased matrix-factorisation (Funk SVD) model with stochastic gradient descent.

  Each rating is modelled as
  `global_mean + users_bias[u] + itens_bias[i] + dot(P_matrix[u], Q_matrix[i])`,
  where `u = userId - 1` and `i = movieId - 1`. Parameter arrays are sized by the
  largest userId / movieId present, so IDs must be integers >= 1.

  Args:
    rating_matrix: polars DataFrame with columns `userId`, `movieId` and `rating`.
    users: not used by the implementation; kept so existing calls keep working.
    itens: not used by the implementation; kept so existing calls keep working.
    k: number of latent factors per user/item.
    learning_rate: SGD step size.
    regulation: regularisation coefficient applied to biases and factors.
    iterations: number of full passes over `rating_matrix`.
    seed: seed for the random factor initialisation (default 0, for reproducible runs).

  Returns:
    FunkSVDModelResult holding the learned parameters and the per-iteration RMSE
    (computed from the prediction errors accumulated during each pass).

  Raises:
    ValueError: if `rating_matrix` has no rows, or contains a userId/movieId below 1.
  """
  rating_n_rows = rating_matrix.shape[0]
  if rating_n_rows == 0:
    # Without this guard the RMSE computation would divide by zero.
    raise ValueError("rating_matrix must contain at least one rating")

  # Extract the columns once. Slicing the DataFrame row by row inside the training
  # loop (three select()/rows() calls per rating per iteration) dominated the runtime.
  user_idx = rating_matrix['userId'].to_numpy() - 1
  item_idx = rating_matrix['movieId'].to_numpy() - 1
  ratings = rating_matrix['rating'].to_numpy()

  if user_idx.min() < 0 or item_idx.min() < 0:
    # A negative index would silently wrap around to another user's/item's parameters.
    raise ValueError("userId and movieId must be >= 1")

  global_mean = rating_matrix.select('rating').mean()['rating'][0]

  # Largest ID == number of rows needed, because ID n lives at index n - 1.
  n_users = int(user_idx.max()) + 1
  n_items = int(item_idx.max()) + 1

  users_bias = np.zeros(shape=n_users)
  itens_bias = np.zeros(shape=n_items)

  # Random initialisation breaks the symmetry between factors. With every entry set to
  # the same constant, all k factors receive identical updates at every step and stay
  # equal forever, so the model collapses to a single effective factor regardless of k.
  rng = np.random.default_rng(seed)
  P_matrix = rng.normal(loc=0.0, scale=0.1, size=(n_users, k))
  Q_matrix = rng.normal(loc=0.0, scale=0.1, size=(n_items, k))

  errors = list()

  for _ in range(iterations):
    sq_error = 0.0
    for user, item, real_r in zip(user_idx, item_idx, ratings):
      pred_r = (global_mean + users_bias[user]
       + itens_bias[item] + np.dot(P_matrix[user], Q_matrix[item]))
      error_ui = real_r - pred_r
      sq_error += error_ui**2

      users_bias[user] += learning_rate*(error_ui - regulation*users_bias[user])
      itens_bias[item] += learning_rate*(error_ui - regulation*itens_bias[item])

      # Update all factors at once. The user factors are copied first so the item
      # update uses the values from BEFORE this step, exactly like the per-factor loop.
      p_old = P_matrix[user].copy()
      P_matrix[user] += learning_rate*(error_ui*Q_matrix[item] - regulation*p_old)
      Q_matrix[item] += learning_rate*(error_ui*p_old - regulation*Q_matrix[item])
    errors.append(math.sqrt(sq_error/rating_n_rows))  # RMSE

  return FunkSVDModelResult(global_mean, users_bias, itens_bias, P_matrix, Q_matrix, errors, iterations)


In [None]:
# Training sample: one rating per user (the first movieId/rating pair of each user's group).
# maintain_order=True keeps the groups in first-appearance order; without it the row
# order can change between runs, and since funkSVD applies its SGD updates in row
# order, the trained model would not be reproducible.
sample = df_ratings.group_by("userId", maintain_order=True).agg([
    pl.col("movieId").first().alias("movieId"),
    pl.col("rating").first().alias("rating")
])

In [None]:
# Train the factorisation model on the per-user sample with 50 latent factors.
# `users` and `itens` are passed through but funkSVD does not read them.
# NOTE(review): the saved output "10 11" below is not produced by funkSVD as written
# (it contains no print) — it looks like a leftover from an earlier run.
model = funkSVD(sample, users, itens, k=50)

10 11


In [None]:
# Per-iteration training RMSE recorded by funkSVD (columns: error, iteration).
model.errors

In [None]:
# Predict the rating for user_id=5 and item_id=10.
# NOTE(review): the original comment mentioned user 500 and item 100, which did not
# match the arguments actually passed — confirm which pair was intended.
model.predict(user_id=5, item_id=10)