# Data preparation

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

In [None]:
# Load the two raw tables from the ml-latest-small data directory.
# `ratings` supplies the userId / movieId / rating / timestamp columns used
# below; `movies` supplies movieId / title / genres.
ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
movies = pd.read_csv("../data/ml-latest-small/movies.csv")

### Exclude movies with 20 or fewer ratings

In [None]:
# Count the non-null ratings each movie received (one row per movieId).
num_ratings_per_movie = ratings.groupby("movieId")[["rating"]].count()

# Keep only movies with strictly more than 20 ratings; a movie with exactly
# 20 ratings is dropped.
popular_movie_ids = num_ratings_per_movie.loc[
    num_ratings_per_movie["rating"].gt(20)
].index

# Restrict the ratings table to those movies. The explicit copy lets later
# cells add columns to `df` without touching `ratings`.
df = ratings.loc[ratings["movieId"].isin(popular_movie_ids)].copy()
df

### Remap user and movie IDs to consecutive integers, since the original IDs are not sequential

In [None]:
# Distinct original IDs, in order of first appearance in `df`.
user_ids = df["userId"].unique()
movie_ids = df["movieId"].unique()

# Original ID -> new zero-based consecutive ID (0, 1, 2, ...).
user_id_map = dict(zip(user_ids, range(len(user_ids))))
movie_id_map = dict(zip(movie_ids, range(len(movie_ids))))

# Add the remapped IDs as new columns; the original columns are kept.
df["user_id"] = df["userId"].map(user_id_map)
df["movie_id"] = df["movieId"].map(movie_id_map)

df

In [None]:
# Apply the same movie ID remapping to the movies table: keep only the movies
# that survived the rating filter, then attach their new `movie_id`.
df_movies = movies.loc[movies["movieId"].isin(movie_ids)].assign(
    movie_id=lambda kept: kept["movieId"].map(movie_id_map)
)
df_movies

In [None]:
# Compute the mean rating per (remapped) movie as a two-column table:
# `movie_id` and `rating`.
# NOTE: this rebinds `ratings`, replacing the raw ratings table loaded at the
# top of the notebook; cells that expect the raw table must not be re-run
# after this one without reloading it.
ratings = df.groupby("movie_id", as_index=False)["rating"].mean()
ratings.head()

In [None]:
# Left-join the per-movie table held in `ratings` onto the movies table by
# `movie_id`, so every row of `df_movies` is kept and gains a `rating` column.
df_movies = pd.merge(df_movies, ratings, on="movie_id", how="left")

### Save prepared data

In [None]:
# Write the prepared ratings, ordered by user then movie, keeping only the
# remapped IDs, the rating and the timestamp. The index is not written.
(
    df.sort_values(["user_id", "movie_id"])
    .loc[:, ["user_id", "movie_id", "rating", "timestamp"]]
    .to_csv("../data/ratings_prepared.csv", index=False)
)

In [None]:
# Write the prepared movies table, ordered by the remapped movie ID, keeping
# only the ID, title, genres and the merged-in `rating` column. The index is
# not written.
(
    df_movies.sort_values("movie_id")
    .loc[:, ["movie_id", "title", "genres", "rating"]]
    .to_csv("../data/movies_prepared.csv", index=False)
)