# Samples of library usage
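This notebook walks through typical use cases of `movie_lens_lib`: loading and splitting the ratings data, preprocessing the movie table, fitting the genre-based and movie-based regressors, and evaluating their predictions.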

## Imports

In [1]:
from movie_lens_lib import *
import pandas as pd
from sklearn.model_selection import train_test_split

## Constants

In [2]:
# --- Train/test split and evaluation sampling ---
train_size = 0.9            # passed as train_size to train_test_split
test_sample_size = 100_000  # number of held-out rows sampled for evaluation
random_state = 42           # seed passed to train_test_split

# --- Cluster-based regressor settings ---
# Only referenced by the cluster regressor code, which is currently commented out.
n_movie_clusters = 5
rating_multiplier = 5
year_multiplier = 0.05

# --- Hybrid blending weights ---
# Only referenced by the hybrid prediction code, which is currently commented out.
weight_genre = 0.35
weight_cluster = 0.45
weight_movie = 0.2

## Load & split the dataset

In [3]:
# Load the movie table (indexed by movieId) and the raw ratings.
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")
ratings_df = pd.read_csv("data/ratings.csv")

# Separate the target column ("rating") from the remaining columns.
y = ratings_df["rating"]
X = ratings_df.drop(columns=["rating"])

# Seeded split so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=random_state,
)

# Re-attach the target to each split; "rating" ends up as the last column.
ratings_train_df = pd.concat([X_train, y_train], axis=1)
ratings_test_df = pd.concat([X_test, y_test], axis=1)

## Preprocess

In [4]:
# Preview the base preprocessing of the movie table (result is displayed, not
# stored). The displayed frame has a Genres_Split list column plus one 0/1
# column per genre.
(
    PreProcessingBase()
    .fit_transform(movies_df)
    .head()
)

Unnamed: 0_level_0,Genres_Split,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[Adventure, Children, Fantasy]",0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[Comedy, Romance]",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,"[Comedy, Drama, Romance]",0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
5,[Comedy],0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Build the movie table that the regressors consume. The transformer takes the
# movie table and the *training* ratings together as one tuple; the displayed
# result adds rating_mean and year columns to the genre columns.
movies_hot_df = PreProcessingAggregated().transform(
    (movies_df, ratings_train_df)
)

# Show the first rows as the cell output.
movies_hot_df.head()

Unnamed: 0_level_0,Genres_Split,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating_mean,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,3.893497,1995.0
2,"[Adventure, Children, Fantasy]",0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3.278157,1995.0
3,"[Comedy, Romance]",0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,3.16946,1995.0
4,"[Comedy, Drama, Romance]",0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,2.866337,1995.0
5,[Comedy],0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,3.079414,1995.0


## Regression

In [6]:
# Genre-based model: constructed from the preprocessed movie table, then fitted
# on the training split.
genre_based_regressor = (
    GenreBasedRegressor(movies_hot_df)
    .fit(X_train, y_train)
)

# Cluster-based model is currently disabled; kept for reference.
# cluster_based_regressor = ClusterBasedRegressor(
#     movies_hot_df,
#     n_movie_clusters,
#     rating_multiplier,
#     year_multiplier,
#     random_state
# ).fit(X_train, y_train)

# Movie-based model.
# NOTE(review): unlike the genre model, this one is fitted on the preprocessed
# movie table only (no X_train / y_train) — confirm this is the intended API.
movie_based_regressor = (
    MovieBasedRegressor()
    .fit(movies_hot_df)
)

## Prediction

In [7]:
# Draw a reproducible evaluation sample from the held-out ratings.
# - The seed comes from the shared `random_state` constant rather than a
#   duplicated literal, so changing the seed in the Constants cell affects
#   the split and this sample consistently.
# - The size is capped at the number of held-out rows: DataFrame.sample draws
#   without replacement by default and raises ValueError when asked for more
#   rows than exist.
# NOTE(review): despite its name, X_test_sample still contains the "rating"
# column (y_true is read from it below) and is passed as-is to predict();
# confirm the regressors ignore that column.
X_test_sample = ratings_test_df.sample(
    n=min(test_sample_size, len(ratings_test_df)),
    random_state=random_state,
)
y_true = X_test_sample["rating"].values

In [8]:
# Predict ratings for the sampled test rows with each fitted regressor.
# NOTE(review): the meaning of the second positional argument (False) is
# defined by movie_lens_lib and is not visible here — confirm against the library.
genre_predictions = genre_based_regressor.predict(X_test_sample, False)
# Disabled: requires the cluster regressor, whose fitting code is commented out.
# cluster_predictions = cluster_based_regressor.predict(X_test_sample, False)
movie_predictions = movie_based_regressor.predict(X_test_sample, False)

# Disabled weighted blend of the three models. Re-enabling it needs
# cluster_predictions (above) and `np`, which this notebook does not import
# explicitly (it may or may not arrive via the star import — TODO confirm).
# weights = np.array([weight_genre, weight_cluster, weight_movie])
# hybrid_predictions = np.array([
#     np.array([genre_pred, cluster_pred, movie_pred]).dot(weights)
#     for genre_pred, cluster_pred, movie_pred
#     in zip(genre_predictions, cluster_predictions, movie_predictions)
# ])

## Evaluation

### Genre-based and movie-based predictions

In [9]:
def _report_stats(title, predictions):
    """Print a separator line, the title, then the stats of `predictions` against y_true."""
    print("-" * 20)
    print(title)
    print_stats(get_performance_stats(y_true, predictions))


_report_stats("Genre based prediction", genre_predictions)

# Disabled: cluster predictions are not computed in this notebook run.
# _report_stats("\nClustering based prediction", cluster_predictions)

_report_stats("Movie based prediction", movie_predictions)

# Disabled: hybrid predictions are not computed in this notebook run.
# _report_stats("\nHybrid prediction", hybrid_predictions)

# Closing separator after the last report.
print("-" * 20)

--------------------
Genre based prediction
MSE: 0.864
MAE: 0.714
ACCURACY: 0.749
--------------------
Movie based prediction
MSE: 0.933
MAE: 0.746
ACCURACY: 0.725
--------------------
