In [1]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from itertools import islice
from sklearn.preprocessing import StandardScaler

In [67]:
def validate_input(user_id, movie_id, default=3.5):
    user_flag = user_id in users_df.index.values
    movie_flag = movie_id in movies_hot_df.index.values
    if not user_flag and not movie_flag:
        return default
    if not user_flag and movie_flag:
        return movies_hot_df.loc[movie_id]["rating_mean"]
    if user_flag and not movie_flag:
        return users_df.loc[user_id]["rating_mean"]
    return None

def cluster_user_prediction(user_id, movie_id, rounded=True):
    if res := validate_input(user_id, movie_id):
        return res
    cluster = int(movies_hot_df.loc[movie_id]["Cluster"])
    user_pred = users_df.loc[user_id][f"Cluster_mean_{cluster}"]
    return round(user_pred * 2) / 2 if rounded else user_pred

def film_prediction(user_id, movie_id, rounded=True):
    if res := validate_input(user_id, movie_id):
        return res
    validate_input(user_id, movie_id)
    film_pred = movies_hot_df.loc[movie_id]["rating_mean"]
    return round(film_pred * 2) / 2 if rounded else film_pred

def cluster_film_prediction(user_id, movie_id, rounded=True):
    if res := validate_input(user_id, movie_id):
        return res
    cluster = int(movies_hot_df.loc[movie_id]["Cluster"])
    class_pred = cluster_ratings[cluster]
    return round(class_pred * 2) / 2 if rounded else class_pred

def genre_prediction(user_id, movie_id, rounded=True):
    if res := validate_input(user_id, movie_id):
        return res
    movie_genres = movies_hot_rule_df.loc[movie_id]["Genres_Split"]
    movie_genres = ["average_rating_"+x for x in movie_genres]
    res = user_genre_df.loc[user_id][movie_genres].mean()
    return round(res * 2) / 2 if rounded else res

class ClusterRuleBasedEstimator(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def predict(self, X):
        return np.array([cluster_user_prediction(user, movie) for user, movie in X])

def custom_mse_scorer(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def custom_mae_scorer(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

def custom_accuracy_scorer(y_true, y_pred, tol=(0.5 + 1e-9)):
    accuracy = np.isclose(y_pred, y_true, atol=tol).mean()
    return accuracy

In [3]:
RATING_MULTIPLIER   = 3
YEAR_MULTIPLIER     = 1 / 20
N_MOVIE_CLUSTERS    = 5

In [4]:
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

X = ratings_df.drop(["rating"], axis=1)
y = ratings_df["rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

ratings_train_df = pd.concat([X_train, y_train], axis=1)
ratings_test_df = pd.concat([X_test, y_test], axis=1)

movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

movies_df.reset_index(inplace=True)
movies_hot_df = pd.concat([movies_df, binary_df], axis=1)
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)
movies_hot_df = movies_hot_df.merge(ratings_train_df.groupby("movieId")["rating"].mean().reset_index(), on="movieId")
movies_hot_df.rename(columns={"rating": "rating_mean"}, inplace=True)
movies_hot_df = movies_hot_df.drop(["title"], axis=1).set_index("movieId")

movies_hot_df["rating_mean"] *= RATING_MULTIPLIER

# movies_hot_year_df = movies_hot_df.copy()
# years = movies_df[movies_df['movieId'].isin(movies_hot_df.index)]['title'].str.extract(r'\((\d{4})\)')
# years.index = movies_hot_year_df.index
# years[0] = pd.to_numeric(years[0], errors='coerce')
# years = years.fillna(years.median())
# scaler = StandardScaler()
# scaled_years = scaler.fit_transform(years)
# scaled_years_df = pd.DataFrame(scaled_years, index=years.index, columns=years.columns)
# movies_hot_df['year'] = scaled_years_df

movies_hot_year_df = movies_hot_df.copy()
years = movies_df[movies_df['movieId'].isin(movies_hot_df.index)]['title'].str.extract(r'\((\d{4})\)')
years.index = movies_hot_year_df.index
years[0] = pd.to_numeric(years[0], errors='coerce')
years = years.fillna(years.median())
scaled_years_df = years * YEAR_MULTIPLIER
movies_hot_df['year'] = scaled_years_df

kmeans = KMeans(n_clusters=N_MOVIE_CLUSTERS, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(movies_hot_df)

movies_hot_df["rating_mean"] /= RATING_MULTIPLIER

ratings_with_clusters = ratings_train_df.merge(movies_hot_df, left_on='movieId', right_index=True)

user_cluster_stats = ratings_with_clusters.groupby(['userId', 'Cluster'])['rating'].agg(['count', 'sum']).reset_index()
user_cluster_pivot = user_cluster_stats.pivot(index='userId', columns='Cluster', values=['count', 'sum']).fillna(0)
user_cluster_pivot.columns = [f'Cluster_{stat}_{cluster}' for stat, cluster in user_cluster_pivot.columns]

for cluster in range(N_MOVIE_CLUSTERS):
    count_col = f'Cluster_count_{cluster}'
    sum_col = f'Cluster_sum_{cluster}'
    mean_col = f'Cluster_mean_{cluster}'
    if count_col in user_cluster_pivot.columns and sum_col in user_cluster_pivot.columns:
        user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
    else:
        user_cluster_pivot[count_col] = 0
        user_cluster_pivot[sum_col] = 0
        user_cluster_pivot[mean_col] = 0

user_cluster_pivot = user_cluster_pivot.fillna(0)
users_df = user_cluster_pivot.reset_index()
users_df.set_index("userId", inplace=True)
cluster_ratings = ratings_train_df.merge(
    movies_hot_df.reset_index()[["Cluster", "movieId"]],
    on="movieId") \
    .groupby("Cluster")["rating"].mean()

  super()._check_params_vs_input(X, default_n_init=10)


In [56]:
users_df

Unnamed: 0_level_0,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_sum_0,Cluster_sum_1,Cluster_sum_2,Cluster_sum_3,Cluster_sum_4,Cluster_mean_0,Cluster_mean_1,Cluster_mean_2,Cluster_mean_3,Cluster_mean_4
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,14.0,35.0,0.0,0.0,6.0,52.5,145.5,0.0,0.0,22.5,3.750000,4.157143,0.0,0.000000,3.750000
2,50.0,31.0,0.0,6.0,0.0,168.0,126.0,0.0,15.0,0.0,3.360000,4.064516,0.0,2.500000,0.000000
3,2.0,27.0,0.0,0.0,0.0,10.0,132.0,0.0,0.0,0.0,5.000000,4.888889,0.0,0.000000,0.000000
4,2.0,24.0,0.0,0.0,1.0,7.5,106.5,0.0,0.0,4.0,3.750000,4.437500,0.0,0.000000,4.000000
5,9.0,23.0,0.0,5.0,1.0,27.0,93.0,0.0,11.0,2.0,3.000000,4.043478,0.0,2.200000,2.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330971,4.0,41.0,0.0,0.0,0.0,12.5,172.5,0.0,0.0,0.0,3.125000,4.207317,0.0,0.000000,0.000000
330972,61.0,35.0,0.0,18.0,2.0,199.0,134.0,0.0,53.0,7.0,3.262295,3.828571,0.0,2.944444,3.500000
330973,10.0,6.0,0.0,3.0,0.0,35.5,22.5,0.0,7.0,0.0,3.550000,3.750000,0.0,2.333333,0.000000
330974,45.0,52.0,0.0,2.0,0.0,128.0,201.5,0.0,2.5,0.0,2.844444,3.875000,0.0,1.250000,0.000000


In [62]:
sum_sums = users_df[["Cluster_sum_"+str(x) for x in range(N_MOVIE_CLUSTERS)]].sum(axis=1)
count_sums = users_df[["Cluster_count_"+str(x) for x in range(N_MOVIE_CLUSTERS)]].sum(axis=1)
users_df["rating_mean"] = sum_sums / count_sums

In [5]:
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

X = ratings_df.drop(["rating"], axis=1)
y = ratings_df["rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)
ratings_train_df = pd.concat([X_train, y_train], axis=1)
ratings_test_df = pd.concat([X_test, y_test], axis=1)

movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)
movies_hot_rule_df = movies_df.join(binary_df)

user_genre_df = pd.DataFrame({"userId": ratings_train_df["userId"].unique()})
genres = movies_hot_rule_df["Genres_Split"].explode().unique()

for genre in genres:
    user_genre_df[genre] = pd.Series(np.zeros(user_genre_df.shape[0]))
    user_genre_df["count_"+genre] = pd.Series(np.zeros(user_genre_df.shape[0]))

user_genre_df.set_index("userId", inplace=True)

movies_df = movies_df.explode('Genres_Split').rename(columns={'Genres_Split': 'genre'})
merged_df = ratings_train_df.merge(movies_df, on='movieId')

agg_df = merged_df.groupby(['userId', 'genre']).agg(
    total_rating=('rating', 'sum'),
    count=('rating', 'count')
).reset_index()

agg_df['average_rating'] = agg_df['total_rating'] / agg_df['count']

user_genre_df = agg_df.pivot(index='userId', columns='genre', values=['average_rating', 'count'])
user_genre_df.columns = [f'{stat}_{genre}' for stat, genre in user_genre_df.columns]
user_genre_df = user_genre_df.reset_index()
user_genre_df.fillna(3.5, inplace=True)
user_genre_df.drop(["count_"+col for col in genres], axis=1, inplace=True)
user_genre_df.set_index("userId", inplace=True)

In [6]:
y_pred, y_true = [], []
unknown_count = 0

for i, row in islice(ratings_test_df.iterrows(), 0, 100_000):
    try:
        y_pred.append(cluster_user_prediction(row["userId"], row["movieId"]))
    except:
        y_pred.append(3.5)
        unknown_count += 1

    y_true.append(row["rating"])

acc_mse = custom_mse_scorer(np.array(y_pred), np.array(y_true))
acc = custom_mae_scorer(np.array(y_pred), np.array(y_true))

print("MSE:", acc_mse)
print("MAE:", acc)

MSE: 0.838685
MAE: 0.66491


In [81]:
y_pred, y_true = [], []
unknown_count = 0

for i, row in islice(ratings_test_df.iterrows(), 0, 100_000):
    try:
        y_pred.append(cluster_film_prediction(row["userId"], row["movieId"]))
    except:
        y_pred.append(3.5)
        unknown_count += 1

    y_true.append(row["rating"])

acc_mse = custom_mse_scorer(np.array(y_pred), np.array(y_true))
acc = custom_mae_scorer(np.array(y_pred), np.array(y_true))
print("MSE:", acc_mse)
print("MAE:", acc)

MSE: 0.9946639852942936
MAE: 0.7516203481472026


## Hybrid

In [74]:
genre_prediction_arr = []
cluster_user_prediction_arr = []
film_prediction_arr = []
cluster_film_prediction_arr = []

for i, row in islice(ratings_test_df.iterrows(), 0, 100_000):
    genre_prediction_arr.append(genre_prediction(row["userId"], row["movieId"], False))
    cluster_user_prediction_arr.append(cluster_user_prediction(row["userId"], row["movieId"], False))
    film_prediction_arr.append(film_prediction(row["userId"], row["movieId"], False))
    cluster_film_prediction_arr.append(cluster_film_prediction(row["userId"], row["movieId"], False))

In [75]:
y_true = ratings_test_df["rating"].values[:100_000]

In [76]:
weight_combinations = []
coefs = [x/20 for x in range(0, 21)]
for coef_a in coefs:
    for coef_b in coefs:
        for coef_c in coefs:
            if coef_a + coef_b + coef_c > 1:
                continue
            coef_d = 1 - coef_a - coef_b - coef_c
            weight_combinations.append((coef_a, coef_b, coef_c, coef_d))

In [78]:
results = {}
for coef_a, coef_b, coef_c, coef_d in weight_combinations:
    y_pred = []
    
    for i in range(100_000):
        y_pred.append(
            genre_prediction_arr[i] * coef_a +
            cluster_user_prediction_arr[i] * coef_b +
            film_prediction_arr[i] * coef_c +
            cluster_film_prediction_arr[i] * coef_d
        )
        
    mse = custom_mse_scorer(np.array(y_pred), np.array(y_true))
    mae = custom_mae_scorer(np.array(y_pred), np.array(y_true))
    results[(coef_a, coef_b, coef_c, coef_d)] = {"mae": mae, "mse": mse}

In [79]:
print(results)

{(0.0, 0.0, 0.0): {'mae': 0.7697289022027327, 'mse': 0.9772664791403862}, (0.0, 0.0, 0.05): {'mae': 0.7678282479853855, 'mse': 0.9732774658638881}, (0.0, 0.0, 0.1): {'mae': 0.7659360548099654, 'mse': 0.9695023595923564}, (0.0, 0.0, 0.15): {'mae': 0.7640765128727499, 'mse': 0.9659411603257919}, (0.0, 0.0, 0.2): {'mae': 0.76226771097501, 'mse': 0.9625938680641944}, (0.0, 0.0, 0.25): {'mae': 0.760511936859007, 'mse': 0.9594604828075636}, (0.0, 0.0, 0.3): {'mae': 0.7588553629684116, 'mse': 0.9565410045558994}, (0.0, 0.0, 0.35): {'mae': 0.7573158025628091, 'mse': 0.9538354333092022}, (0.0, 0.0, 0.4): {'mae': 0.7558859658140964, 'mse': 0.9513437690674722}, (0.0, 0.0, 0.45): {'mae': 0.7545488834512597, 'mse': 0.949066011830709}, (0.0, 0.0, 0.5): {'mae': 0.7533265508562422, 'mse': 0.9470021615989124}, (0.0, 0.0, 0.55): {'mae': 0.7521816970898658, 'mse': 0.945152218372083}, (0.0, 0.0, 0.6): {'mae': 0.7511384213435287, 'mse': 0.9435161821502203}, (0.0, 0.0, 0.65): {'mae': 0.7502122512837441, 'ms

In [80]:
sorted(results.items(), key=lambda x: (x[1]["mse"], x[1]["mae"]))

[((0.35, 0.45, 0.2), {'mae': 0.6688697670875462, 'mse': 0.7570771423276302}),
 ((0.3, 0.45, 0.25), {'mae': 0.6692086348028187, 'mse': 0.7572918944780845}),
 ((0.35, 0.4, 0.25), {'mae': 0.6702461780015321, 'mse': 0.7573681292624124}),
 ((0.3, 0.5, 0.2), {'mae': 0.6679888521215285, 'mse': 0.7575390735717824}),
 ((0.4, 0.4, 0.2), {'mae': 0.6701771474848601, 'mse': 0.7579023510912829}),
 ((0.25, 0.5, 0.25), {'mae': 0.6686062479619439, 'mse': 0.7585027997015618}),
 ((0.35, 0.5, 0.15), {'mae': 0.6681445056317913, 'mse': 0.7585062116227378}),
 ((0.4, 0.35, 0.25), {'mae': 0.6716992732547293, 'mse': 0.7587315040545453}),
 ((0.3, 0.4, 0.3), {'mae': 0.671084015925607, 'mse': 0.7587647716142766}),
 ((0.4, 0.45, 0.15), {'mae': 0.6693061379814416, 'mse': 0.7587932543579103}),
 ((0.25, 0.55, 0.2), {'mae': 0.6675186789949804, 'mse': 0.7592881448237397}),
 ((0.35, 0.35, 0.3), {'mae': 0.6722700881126599, 'mse': 0.7593791724270844}),
 ((0.25, 0.45, 0.3), {'mae': 0.6703299369541929, 'mse': 0.7594375108092

# Baseline

In [2]:
def custom_mse_scorer(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def custom_mae_scorer(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

In [3]:
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

In [4]:
acc_mse = custom_mse_scorer(ratings_df["rating"], np.zeros(ratings_df.shape[0]) + 2.5)
acc_mae = custom_mae_scorer(ratings_df["rating"], np.zeros(ratings_df.shape[0]) + 2.5)

In [5]:
print("MSE:", acc_mse)
print("MAE:", acc_mae)

MSE: 2.218898410926266
MAE: 1.296070437354846


In [None]:

X = ratings_df.drop(["rating"], axis=1)
y = ratings_df["rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

ratings_train_df = pd.concat([X_train, y_train], axis=1)
ratings_test_df = pd.concat([X_test, y_test], axis=1)