In [2]:
from movie_lens_lib import *
import pandas as pd
from sklearn.model_selection import train_test_split

## Constants

In [3]:
n_movie_clusters = 5
rating_multiplier = 5
year_multiplier = 0.05
test_sample_size = 100_000
weight_genre, weight_cluster, weight_movie = 0.35, 0.45, 0.2
train_size = 0.9
random_state = 42

## Importing & Split of the dataset

In [4]:
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

X = ratings_df.drop(["rating"], axis=1)
y = ratings_df["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=random_state)
ratings_train_df = pd.concat([X_train, y_train], axis=1)
ratings_test_df = pd.concat([X_test, y_test], axis=1)

## Preprocessing & Fitting

In [5]:
movies_hot_df = PreProcessingAggregated().transform((movies_df, ratings_train_df))

genre_based_regressor = GenreBasedRegressor(movies_hot_df).fit(X_train, y_train)
cluster_based_regressor = ClusterBasedRegressor(
    movies_hot_df,
    n_movie_clusters,
    rating_multiplier,
    year_multiplier
).fit(X_train, y_train)
movie_based_regressor = MovieBasedRegressor().fit(movies_hot_df)

  super()._check_params_vs_input(X, default_n_init=10)


## Testing

In [6]:
x_test_sample = ratings_test_df.sample(frac=1, random_state=42)
x_test_sample = x_test_sample[:test_sample_size]
y_true = x_test_sample["rating"].values

### Prediction based on genre

In [7]:
genre_predictions = genre_based_regressor.predict(x_test_sample, False)
print_stats(get_performance_stats(y_true, genre_predictions))

MSE: 0.864
MAE: 0.714
ACCURACY: 0.749


### Prediction based on clustering

In [8]:
cluster_predictions = cluster_based_regressor.predict(x_test_sample, False)
print_stats(get_performance_stats(y_true, cluster_predictions))

MSE: 0.812
MAE: 0.673
ACCURACY: 0.786


### Prediction based on movie rating

In [9]:
movie_predictions = movie_based_regressor.predict(x_test_sample, False)
print_stats(get_performance_stats(y_true, cluster_predictions))

MSE: 0.812
MAE: 0.673
ACCURACY: 0.786


### Prediction based on hybrid

In [10]:
weights = np.array([weight_genre, weight_cluster, weight_movie])

y_pred = np.array([
    np.array([genre_pred, cluster_pred, movie_pred]).dot(weights)
    for genre_pred, cluster_pred, movie_pred
    in zip(genre_predictions, cluster_predictions, movie_predictions)
])

print_stats(get_performance_stats(y_true, y_pred))

MSE: 0.757
MAE: 0.667
ACCURACY: 0.778
