In [1]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from itertools import islice

In [2]:
def validate_input(user_id, movie_id, default=3.5):
    """Return a fallback rating when the user and/or the movie is unknown.

    ``user_id`` is looked up in the index of the global ``users_df`` and
    ``movie_id`` in the index of the global ``movies_hot_df``.

    Returns:
        ``None`` when both ids are known (no fallback needed), the movie's
        ``rating_mean`` when only the movie is known, the user's
        ``rating_mean`` when only the user is known, and ``default`` when
        neither is known.
    """
    known_user = user_id in users_df.index.values
    known_movie = movie_id in movies_hot_df.index.values

    if known_user and known_movie:
        return None
    if known_movie:
        movie_row = movies_hot_df.loc[movie_id]
        return movie_row["rating_mean"]
    if known_user:
        user_row = users_df.loc[user_id]
        return user_row["rating_mean"]
    return default

def cluster_user_prediction(user_id, movie_id, rounded=True, default=3.5):
    """Predict a rating as the user's mean rating inside the movie's cluster.

    Reads the movie's ``Cluster`` label from the global ``movies_hot_df`` and
    the matching ``Cluster_mean_<k>`` value from the global ``users_df``.

    Args:
        user_id: Label looked up in the index of ``users_df``.
        movie_id: Label looked up in the index of ``movies_hot_df``.
        rounded: When true, round the prediction to the nearest 0.5.
        default: Value returned when the lookup cannot be completed.

    Returns:
        The (optionally rounded) prediction, or ``default`` when the user,
        the movie or the cluster column is missing or not usable.
    """
    try:
        cluster = int(movies_hot_df.loc[movie_id, "Cluster"])
        user_pred = users_df.loc[user_id, f"Cluster_mean_{cluster}"]
        return round(user_pred * 2) / 2 if rounded else user_pred
    except (KeyError, TypeError, ValueError):
        # KeyError: unknown id or column; ValueError: NaN cannot be
        # converted/rounded; TypeError: lookup did not yield a scalar.
        # Anything else (e.g. the data frames not being built yet) is a
        # real error and is allowed to surface.
        return default

def film_prediction(user_id, movie_id, rounded=True, default=3.5):
    """Predict a rating as the movie's mean rating.

    The value is read from the ``rating_mean`` column of the global
    ``movies_hot_df``. ``user_id`` is accepted so that all rule-based
    predictors share one call signature; it does not influence the result.

    Args:
        user_id: Unused.
        movie_id: Label looked up in the index of ``movies_hot_df``.
        rounded: When true, round the prediction to the nearest 0.5.
        default: Value returned when the movie cannot be looked up.

    Returns:
        The (optionally rounded) mean rating of the movie, or ``default``.
    """
    try:
        film_pred = movies_hot_df.loc[movie_id, "rating_mean"]
        return round(film_pred * 2) / 2 if rounded else film_pred
    except (KeyError, TypeError, ValueError):
        # Unknown movie, NaN mean, or a non-scalar lookup result.
        return default

def genre_prediction(X, rounded=True):
    """Predict ratings from a user's average rating of the movie's genres.

    For every (user, movie) pair the genres of the movie are read from the
    ``Genres_Split`` column of the global ``movies_hot_rule_df`` and the mean
    of the user's ``average_rating_<genre>`` values in the global
    ``user_genre_df`` is returned.

    Args:
        X: Either a DataFrame with ``userId`` and ``movieId`` columns, or a
            single row (Series / mapping) providing those two keys.
        rounded: When true, round each prediction to the nearest 0.5.

    Returns:
        A NumPy array with one prediction per row when ``X`` is a DataFrame,
        otherwise a single number. Pairs that cannot be looked up (unknown
        user, movie or genre column) are predicted as 3.5.
    """
    def predict_one(user_id, movie_id):
        try:
            genre_cols = [
                "average_rating_" + genre
                for genre in movies_hot_rule_df.loc[movie_id, "Genres_Split"]
            ]
            res = user_genre_df.loc[user_id, genre_cols].mean()
            return round(res * 2) / 2 if rounded else res
        except (KeyError, TypeError, ValueError):
            return 3.5

    if isinstance(X, pd.DataFrame):
        return np.array(
            [predict_one(user_id, movie_id)
             for user_id, movie_id in zip(X["userId"], X["movieId"])]
        )
    return predict_one(X["userId"], X["movieId"])

class ClusterRuleBasedEstimator(BaseEstimator):
    """Estimator wrapper around ``cluster_user_prediction``.

    Nothing is learned in ``fit``; predictions come from the global lookup
    tables that ``cluster_user_prediction`` reads.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def predict(self, X):
        """Return one prediction per (user, movie) pair in ``X``.

        Args:
            X: A DataFrame with ``userId`` and ``movieId`` columns, or any
                iterable of rows whose first two items are the user id and
                the movie id (2-tuples, rows of a 2-D array, ...).

        Returns:
            A NumPy array of predictions.
        """
        if isinstance(X, pd.DataFrame):
            # Iterating a DataFrame directly yields column names, not rows.
            pairs = zip(X["userId"], X["movieId"])
        else:
            pairs = ((row[0], row[1]) for row in X)
        return np.array([cluster_user_prediction(user, movie) for user, movie in pairs])

def custom_mse_scorer(y_true, y_pred):
    """Return the mean squared error between two sequences of ratings.

    Both inputs are converted to float arrays first, so plain lists work and
    values are compared by position (pandas index labels are ignored).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

def custom_mae_scorer(y_true, y_pred):
    """Return the mean absolute error between two sequences of ratings.

    Both inputs are converted to float arrays first, so plain lists work and
    values are compared by position (pandas index labels are ignored).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

def custom_accuracy_scorer(y_true, y_pred, tol=(0.5 + 1e-9)):
    """Return the share of predictions that ``np.isclose`` considers equal
    to the true value when ``tol`` is used as the absolute tolerance."""
    within_tolerance = np.isclose(y_pred, y_true, atol=tol)
    return within_tolerance.mean()

In [3]:
class GenreBasedClassifier(BaseEstimator, ClassifierMixin):
    """Rule-based rating predictor built on per-user genre averages.

    ``fit`` computes, for every user in the training ratings, the average
    rating given to each genre. ``predict`` averages those values over the
    genres of the requested movie.

    Args:
        movies_hot_rule_df: Movie table indexed by movie id with a
            ``Genres_Split`` column holding the list of genres of each movie.
        default_rating: Value used for genres a user never rated and for
            (user, movie) pairs that cannot be looked up.
    """

    def __init__(self, movies_hot_rule_df, default_rating=3.5):
        # Filled by ``fit``: index = userId, columns = average_rating_<genre>.
        self.user_genre_df = None
        self.movies_hot_rule_df = movies_hot_rule_df
        self.default_rating = default_rating

    def fit(self, X, y=None):
        """Build the per-user genre averages.

        Args:
            X: DataFrame with ``userId`` and ``movieId`` columns.
            y: Ratings for the rows of ``X`` (Series aligned with ``X`` or
                any sequence of the same length).

        Returns:
            ``self``.

        Raises:
            ValueError: If ``y`` is not given.
        """
        if y is None:
            raise ValueError("GenreBasedClassifier.fit requires the ratings as y")

        ratings = X[["userId", "movieId"]].assign(rating=y)

        # One row per (movie, genre); the movie id stays in the index.
        movie_genres = (
            self.movies_hot_rule_df[["Genres_Split"]]
            .explode("Genres_Split")
            .rename(columns={"Genres_Split": "genre"})
        )
        merged = ratings.merge(movie_genres, left_on="movieId", right_index=True)

        # Genres that never occur in the training ratings simply get no
        # column; ``_predict`` falls back to the default for them.
        self.user_genre_df = (
            merged.groupby(["userId", "genre"])["rating"]
            .mean()
            .unstack("genre")
            .add_prefix("average_rating_")
            .rename_axis(columns=None)
            .fillna(self.default_rating)
        )
        return self

    def predict(self, X, rounded=True):
        """Return a NumPy array with one prediction per row of ``X``.

        Args:
            X: DataFrame with ``userId`` and ``movieId`` columns.
            rounded: When true, round each prediction to the nearest 0.5.

        Raises:
            RuntimeError: If the estimator has not been fitted.
        """
        if self.user_genre_df is None:
            raise RuntimeError("GenreBasedClassifier must be fitted before predict")
        return np.array([
            self._predict(user_id, movie_id, rounded)
            for user_id, movie_id in zip(X["userId"], X["movieId"])
        ])

    def _predict(self, user_id, movie_id, rounded=True):
        """Predict one rating; falls back to ``default_rating`` when the
        user, the movie or one of its genre columns is unknown."""
        try:
            genre_cols = [
                "average_rating_" + genre
                for genre in self.movies_hot_rule_df.loc[movie_id, "Genres_Split"]
            ]
            res = self.user_genre_df.loc[user_id, genre_cols].mean()
            return round(res * 2) / 2 if rounded else res
        except (KeyError, TypeError, ValueError):
            return self.default_rating

class PreProcessingBase(BaseEstimator, TransformerMixin):
    """Split the ``genres`` string of each movie and one-hot encode it."""

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` extended with one 0/1 column per genre.

        Args:
            X: Movie DataFrame with a ``genres`` column of ``|``-separated
                genre names.

        Returns:
            A new DataFrame with the columns of ``X``, a ``Genres_Split``
            column (list of genres) and one indicator column per genre.

        Note:
            ``Genres_Split`` is also written into the frame that was passed
            in. This side effect is kept unchanged because callers may
            depend on it.
        """
        X['Genres_Split'] = X['genres'].apply(lambda x: x.split('|'))
        mlb = MultiLabelBinarizer()
        binary_matrix = mlb.fit_transform(X['Genres_Split'])
        # Reuse X's index: ``join`` aligns on index labels, so a default
        # 0..n-1 index would attach the indicators to the wrong movies.
        binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=X.index)
        movies_hot_rule_df = X.join(binary_df)
        return movies_hot_rule_df
    
    
class PreProcessingAggregated(PreProcessingBase):
    # Placeholder for a preprocessing step that builds on PreProcessingBase.
    # NOTE(review): as shown here, ``transform`` evaluates a bare ``...`` and
    # returns the ``Ellipsis`` object, so the class does not look usable yet
    # — TODO confirm what the aggregation step is supposed to produce.
    def fit(self, X, y=None):
        # Nothing is learned; returns the instance itself.
        return self
    
    def transform(self, X):     
        # Start from the parent's transform output.
        movies_hot_rule_df = super().transform(X)
        # TODO: aggregation logic is missing.
        ...
        return ...

In [4]:
# Load the raw tables; movies are indexed by their id.
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")
ratings_df = pd.read_csv("data/ratings.csv")

# Features are every column except the rating itself.
y = ratings_df["rating"]
X = ratings_df.drop(columns=["rating"])

# 90/10 split with a fixed seed so the notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, random_state=42
)
ratings_train_df = X_train.assign(rating=y_train)
ratings_test_df = X_test.assign(rating=y_test)

# Adds the genre list and the one-hot genre columns to the movie table.
preprocessor = PreProcessingBase()
movies_hot_rule_df = preprocessor.transform(movies_df)

In [5]:
# Build the per-user genre averages from the training split.
genre_based_classifier = GenreBasedClassifier(movies_hot_rule_df)
genre_based_classifier = genre_based_classifier.fit(X_train, y_train)

In [6]:
# Score the genre model on (at most) the first N rows of the shuffled test set.
N_VALIDATION_ROWS = 100_000

ratings_test_validate_df = ratings_test_df.sample(frac=1, random_state=42)[:N_VALIDATION_ROWS]
y_true = ratings_test_validate_df["rating"].values
genre_prediction_arr = genre_based_classifier.predict(ratings_test_validate_df, False)

# Scorers take (y_true, y_pred); both metrics are symmetric, so the numbers
# are the same as with the arguments swapped.
mse = custom_mse_scorer(np.array(y_true), np.array(genre_prediction_arr))
mae = custom_mae_scorer(np.array(y_true), np.array(genre_prediction_arr))
print("MSE:", mse)
print("MAE:", mae)

MSE: 0.863739474299469
MAE: 0.7136988382163852


# Testing

In [None]:
# ratings_df = pd.read_csv("data/ratings.csv")
# movies_df = pd.read_csv("data/movies.csv", index_col="movieId")
# 
# X = ratings_df.drop(["rating"], axis=1)
# y = ratings_df["rating"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)
# ratings_train_df = pd.concat([X_train, y_train], axis=1)
# ratings_test_df = pd.concat([X_test, y_test], axis=1)
# 
# 
# movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))
# mlb = MultiLabelBinarizer()
# binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
# binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)
# movies_hot_rule_df = movies_df.join(binary_df)

In [None]:
# Genre list of the movie in every row of X (one entry per rating).
movie_genres = movies_hot_rule_df["Genres_Split"].loc[X["movieId"]]
print(type(movie_genres))

In [None]:
# Turn genre names into the matching ``average_rating_<genre>`` column names.
# The prefix is only added when it is missing, so running this cell twice
# does not produce ``average_rating_average_rating_...``.
movie_genres = movie_genres.apply(
    lambda cell: [
        x if x.startswith("average_rating_") else "average_rating_" + x
        for x in cell
    ]
)
movie_genres.head()

In [None]:
# Peek at the feature frame instead of rendering all of it.
X.head()

In [None]:
# Peek at the fitted per-user genre averages.
genre_based_classifier.user_genre_df.head()

In [None]:
# True where the user of the row is present in the fitted genre table.
# ``isin`` does one hashed lookup per row instead of scanning the whole
# index array for every row.
X["userId"].isin(genre_based_classifier.user_genre_df.index)

In [None]:
# Scratch attempt at computing the genre-based prediction for all rows at once.
# NOTE(review): ``movie_genres`` is built in the cells above as a Series whose
# values are lists, so using it as a column selector here looks like it cannot
# work as written — TODO confirm the intended per-row computation.
res = genre_based_classifier.user_genre_df.loc[X["userId"]][movie_genres].mean()

# Hand clustering by genre

In [None]:
# Build ``user_genre_df``: one row per user and one
# ``average_rating_<genre>`` column per genre rated in the training split.
genres = movies_hot_rule_df["Genres_Split"].explode().unique()

# One row per (movie, genre), taken from the preprocessed movie table so that
# ``movies_df`` is neither required to carry ``Genres_Split`` nor overwritten
# (which would make this cell fail on a second run).
movie_genres_long_df = (
    movies_hot_rule_df[["Genres_Split"]]
    .explode("Genres_Split")
    .rename(columns={"Genres_Split": "genre"})
)
merged_df = ratings_train_df.merge(movie_genres_long_df, left_on="movieId", right_index=True)

agg_df = merged_df.groupby(['userId', 'genre']).agg(
    total_rating=('rating', 'sum'),
    count=('rating', 'count')
).reset_index()

agg_df['average_rating'] = agg_df['total_rating'] / agg_df['count']

# Only the averages are kept; genres a user never rated default to 3.5.
user_genre_df = (
    agg_df.pivot(index='userId', columns='genre', values='average_rating')
    .add_prefix('average_rating_')
    .rename_axis(columns=None)
    .fillna(3.5)
)

In [None]:
# Score ``genre_prediction`` on (at most) the first 100,000 shuffled test rows.
ratings_test_validate_df = ratings_test_df.sample(frac=1, random_state=42)
validation_rows_df = ratings_test_validate_df[:100_000]
y_true = validation_rows_df["rating"].values

genre_prediction_arr = genre_prediction(validation_rows_df)

# Scorers take (y_true, y_pred); both metrics are symmetric.
mse = custom_mse_scorer(np.array(y_true), np.array(genre_prediction_arr))
mae = custom_mae_scorer(np.array(y_true), np.array(genre_prediction_arr))
print("MSE:", mse)
print("MAE:", mae)

In [None]:
# weight_combinations = []
# coefs = [x/20 for x in range(0, 21)]
# for coef_a in coefs:
#     for coef_b in coefs:
#         if coef_a + coef_b  > 1:
#             continue
#         coef_c = 1 - coef_a - coef_b
#         weight_combinations.append((coef_a, coef_b, coef_c))

In [None]:
# Settings for the movie clustering below.
N_MOVIE_CLUSTERS = 5       # number of KMeans clusters
RATING_MULTIPLIER = 5      # factor applied to rating_mean while clustering
YEAR_MULTIPLIER = 0.05     # factor applied to the release year while clustering

# Blend weights for the genre, user-cluster and film-mean predictions.
coef_a = 0.35
coef_b = 0.45
coef_c = 0.2

In [None]:
# Build ``movies_hot_df`` (movie features + KMeans cluster label) and
# ``users_df`` (per-user rating statistics for every movie cluster).
movies_hot_df, users_df = None, None

ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

X = ratings_df.drop(["rating"], axis=1)
y = ratings_df["rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

ratings_train_df = pd.concat([X_train, y_train], axis=1)
ratings_test_df = pd.concat([X_test, y_test], axis=1)

# --- one-hot encode the genres -------------------------------------------
movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

# After reset_index both frames share the default 0..n-1 index, so the
# column-wise concat lines the indicator rows up with their movies.
movies_df.reset_index(inplace=True)
movies_hot_df = pd.concat([movies_df, binary_df], axis=1)
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)

# --- mean training rating per movie (movies without ratings are dropped) --
movies_hot_df = movies_hot_df.merge(ratings_train_df.groupby("movieId")["rating"].mean().reset_index(), on="movieId")
movies_hot_df.rename(columns={"rating": "rating_mean"}, inplace=True)
movies_hot_df = movies_hot_df.drop(["title"], axis=1).set_index("movieId")

# Scaled only for clustering; undone again after the KMeans fit.
movies_hot_df["rating_mean"] *= RATING_MULTIPLIER

# --- release year taken from the "(YYYY)" part of the title ---------------
movies_hot_year_df = movies_hot_df.copy()
years = movies_df[movies_df['movieId'].isin(movies_hot_df.index)]['title'].str.extract(r'\((\d{4})\)')
years.index = movies_hot_year_df.index
years[0] = pd.to_numeric(years[0], errors='coerce')
years = years.fillna(years.median())
scaled_years_df = years * YEAR_MULTIPLIER
movies_hot_df['year'] = scaled_years_df

# --- cluster the movies ---------------------------------------------------
kmeans = KMeans(n_clusters=N_MOVIE_CLUSTERS, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(movies_hot_df)

movies_hot_df["rating_mean"] /= RATING_MULTIPLIER

# --- per-user rating count / sum / mean for every cluster ------------------
ratings_with_clusters = ratings_train_df.merge(movies_hot_df, left_on='movieId', right_index=True)

user_cluster_stats = ratings_with_clusters.groupby(['userId', 'Cluster'])['rating'].agg(['count', 'sum']).reset_index()
user_cluster_pivot = user_cluster_stats.pivot(index='userId', columns='Cluster', values=['count', 'sum']).fillna(0)
user_cluster_pivot.columns = [f'Cluster_{stat}_{cluster}' for stat, cluster in user_cluster_pivot.columns]

count_cols = [f'Cluster_count_{cluster}' for cluster in range(N_MOVIE_CLUSTERS)]
sum_cols = [f'Cluster_sum_{cluster}' for cluster in range(N_MOVIE_CLUSTERS)]
mean_cols = [f'Cluster_mean_{cluster}' for cluster in range(N_MOVIE_CLUSTERS)]

for count_col, sum_col, mean_col in zip(count_cols, sum_cols, mean_cols):
    if count_col not in user_cluster_pivot.columns or sum_col not in user_cluster_pivot.columns:
        # Nobody rated a movie from this cluster.
        user_cluster_pivot[count_col] = 0
        user_cluster_pivot[sum_col] = 0
    # 0 / 0 gives NaN for users without ratings in this cluster.
    user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]

users_df = user_cluster_pivot.copy()  # index is userId

sum_sums = users_df[sum_cols].sum(axis=1)
count_sums = users_df[count_cols].sum(axis=1)
users_df["rating_mean"] = sum_sums / count_sums

# A user who never rated a movie from a cluster falls back to their overall
# mean rating. Filling with 0 (as before) made ``cluster_user_prediction``
# predict a rating of 0 for such pairs.
users_df[mean_cols] = users_df[mean_cols].apply(lambda col: col.fillna(users_df["rating_mean"]))

In [None]:
# Number of *training* ratings per user, attached to every test row.
ratings_test_grouped = ratings_train_df.groupby("userId").size().to_frame("rating_count")

# Dropping a previously attached column first keeps the cell re-runnable
# (a second merge would otherwise create rating_count_x / rating_count_y).
ratings_test_df = (
    ratings_test_df
    .drop(columns="rating_count", errors="ignore")
    .merge(ratings_test_grouped, on="userId")
)
ratings_test_df.head()

In [None]:
# Collect the three rule-based predictions for test rows of heavy raters
# (users with at least 1000 training ratings).
cluster_user_prediction_arr = []
film_prediction_arr = []
genre_prediction_arr = []

ratings_test_a_df = ratings_test_df[ratings_test_df["rating_count"] >= 1000]
ratings_test_validate_df = ratings_test_a_df.sample(frac=1, random_state=42)
validation_rows_df = ratings_test_validate_df[:100_000]
y_true = validation_rows_df["rating"].values

for i, row in validation_rows_df.iterrows():
    cluster_user_prediction_arr.append(cluster_user_prediction(row["userId"], row["movieId"], False))
    film_prediction_arr.append(film_prediction(row["userId"], row["movieId"], False))
    # genre_prediction takes the row itself (it reads "userId" / "movieId"),
    # followed by the ``rounded`` flag.
    genre_prediction_arr.append(genre_prediction(row, False))

In [None]:
# Weighted blend of the three predictors. Working on whole arrays uses
# however many rows were actually collected instead of assuming 100,000.
y_pred = (
    np.asarray(genre_prediction_arr, dtype=float) * coef_a +
    np.asarray(cluster_user_prediction_arr, dtype=float) * coef_b +
    np.asarray(film_prediction_arr, dtype=float) * coef_c
)

# Scorers take (y_true, y_pred); both metrics are symmetric.
mse = custom_mse_scorer(np.array(y_true), np.array(y_pred))
mae = custom_mae_scorer(np.array(y_true), np.array(y_pred))
print("MSE:", mse)
print("MAE:", mae)

# Baseline

In [None]:
def custom_mse_scorer(y_true, y_pred):
    """Return the mean squared error between two sequences of ratings.

    Redefines the scorer of the same name from the helper cell near the top
    of the notebook, presumably so the baseline section can run on its own.
    Inputs are converted to float arrays first, so plain lists work and
    values are compared by position (pandas index labels are ignored).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

def custom_mae_scorer(y_true, y_pred):
    """Return the mean absolute error between two sequences of ratings.

    Redefines the scorer of the same name from the helper cell near the top
    of the notebook, presumably so the baseline section can run on its own.
    Inputs are converted to float arrays first, so plain lists work and
    values are compared by position (pandas index labels are ignored).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

In [None]:
# Reload the raw tables for the baseline section; movies are indexed by id.
movies_df = pd.read_csv("data/movies.csv").set_index("movieId")
ratings_df = pd.read_csv("data/ratings.csv")

In [None]:
# Baseline: predict a constant rating of 2.5 for every row.
acc_mse = custom_mse_scorer(ratings_df["rating"], np.full(ratings_df.shape[0], 2.5))
acc_mae = custom_mae_scorer(ratings_df["rating"], np.full(ratings_df.shape[0], 2.5))

In [None]:
# Report the constant-baseline errors.
print(f"MSE: {acc_mse}")
print(f"MAE: {acc_mae}")

In [None]:
# Same 90/10 split (fixed seed) as in the sections above.
y = ratings_df["rating"]
X = ratings_df.drop(columns=["rating"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, random_state=42
)

ratings_train_df = X_train.assign(rating=y_train)
ratings_test_df = X_test.assign(rating=y_test)

In [None]:
def _score_rule(predict_fn, ratings, limit=100_000):
    """Return ``(mse, mae)`` of ``predict_fn(user_id, movie_id)`` on the
    first ``limit`` rows of ``ratings``.

    ``ratings`` must provide ``userId``, ``movieId`` and ``rating`` columns.
    The predictors already return their own fallback for unknown ids, so no
    exception handling is needed here; an unexpected error (for example a
    misspelled function name) is no longer hidden behind a constant 3.5.
    """
    rows = ratings.iloc[:limit]
    y_pred = np.array([
        predict_fn(user_id, movie_id)
        for user_id, movie_id in zip(rows["userId"], rows["movieId"])
    ])
    y_true = rows["rating"].to_numpy()
    return custom_mse_scorer(y_true, y_pred), custom_mae_scorer(y_true, y_pred)


# Per-user cluster mean first, then the per-film mean.
for predictor in (cluster_user_prediction, film_prediction):
    acc_mse, acc = _score_rule(predictor, ratings_test_df)
    print("MSE:", acc_mse)
    print("MAE:", acc)