In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.cluster import KMeans

In [2]:
def cluster_rule_based_prediction(user_id, movie_id):
    """Predict a rating as the user's mean rating for the movie's cluster.

    Reads two notebook-level tables at call time, so the result reflects
    whichever feature-building cell was executed most recently:

    * ``movies_hot_df`` -- indexed by movie id, with a ``Cluster`` column.
    * ``users_df`` -- indexed by user id, with one ``Cluster_mean_{k}``
      column per movie cluster ``k``.

    Args:
        user_id: Index label of the user in ``users_df``.
        movie_id: Index label of the movie in ``movies_hot_df``.

    Returns:
        The user's mean rating for the movie's cluster, snapped to the
        nearest multiple of 0.5 (ties are resolved by ``round``).

    Raises:
        KeyError: If either id, or the cluster's mean column, is missing.
    """
    # `.at` reads a single cell. The former `.loc[row][col]` chain first built
    # a Series holding every column of the row, once per prediction.
    cluster = int(movies_hot_df.at[movie_id, "Cluster"])
    cluster_mean = users_df.at[user_id, f"Cluster_mean_{cluster}"]
    # Double, round to an integer, halve: lands on the 0.5 grid.
    return round(cluster_mean * 2) / 2

class ClusterRuleBasedEstimator(BaseEstimator):
    """Estimator facade over ``cluster_rule_based_prediction``.

    It exists so the rule can be passed to scikit-learn utilities that expect
    ``fit``/``predict``. It holds no parameters and learns nothing from the
    data handed to ``fit``.
    """

    def fit(self, X, y=None):
        """Accept the training data, ignore it, and return ``self``."""
        return self

    def predict(self, X):
        """Return one prediction per ``(user, movie)`` pair in ``X``.

        Args:
            X: Iterable of two-element rows, user id first and movie id second.

        Returns:
            A NumPy array with the prediction for each row, in row order.
        """
        predictions = []
        for user, movie in X:
            predictions.append(cluster_rule_based_prediction(user, movie))
        return np.array(predictions)

def custom_scorer(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Both arguments must support element-wise subtraction with each other
    (for example two NumPy arrays of the same shape).
    """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

def custom_accuracy_scorer(y_true, y_pred, tol=(0.5 + 1e-9)):
    """Return the fraction of predictions within ``tol`` of the true value.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.
        tol: Largest absolute difference that still counts as correct. The
            default is 0.5 plus a small slack for floating-point error.

    Returns:
        The mean of the element-wise "within tolerance" flags, a value in
        ``[0, 1]``.
    """
    # rtol must be pinned to 0: np.isclose otherwise adds its default
    # 1e-5 * |y_true| on top of `tol`, so the check would not be the purely
    # absolute tolerance that `tol` describes.
    within_tol = np.isclose(y_pred, y_true, rtol=0.0, atol=tol)
    return within_tol.mean()

In [3]:
# Build the two lookup tables used by cluster_rule_based_prediction:
#   movies_hot_df -- one row per rated movie: genre flags, rating_mean, Cluster
#   users_df      -- one row per user: count / sum / mean of ratings per cluster
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")
n_movie_clusters = 600
n_user_clusters = 100  # not referenced anywhere in this cell

# One-hot encode the pipe-separated genre string.
movies_df['Genres_Split'] = movies_df['genres'].str.split('|')
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

# movieId becomes an ordinary column so that both frames share a positional
# index and the column-wise concat lines up row by row.
movies_df = movies_df.reset_index()
movie_rating_means = (
    ratings_df.groupby("movieId")["rating"].mean().rename("rating_mean").reset_index()
)
movies_hot_df = (
    pd.concat([movies_df, binary_df], axis=1)
    .drop(columns=['genres', 'Genres_Split', 'title'])
    # Inner merge: movies without any rating are dropped here.
    .merge(movie_rating_means, on="movieId")
    .set_index("movieId")
)

# Cluster movies on genre flags + mean rating. n_init is passed explicitly:
# scikit-learn warns when it is left implicit because its default is changing,
# and 10 is the value that warning reports as the current default.
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42, n_init=10)
movies_hot_df['Cluster'] = kmeans.fit_predict(movies_hot_df)

ratings_with_clusters = ratings_df.merge(movies_hot_df, left_on='movieId', right_index=True)

# Per-user, per-cluster rating count and sum (long format).
user_cluster_stats = (
    ratings_with_clusters
    .groupby(['userId', 'Cluster'])['rating']
    .agg(['count', 'sum'])
    .reset_index()
)

# Wide tables with one column per cluster id. reindex() guarantees a column
# for every cluster, including clusters that received no rating at all.
cluster_ids = range(n_movie_clusters)
count_wide = (
    user_cluster_stats.pivot(index='userId', columns='Cluster', values='count')
    .reindex(columns=cluster_ids)
    .fillna(0)
)
sum_wide = (
    user_cluster_stats.pivot(index='userId', columns='Cluster', values='sum')
    .reindex(columns=cluster_ids)
    .fillna(0)
)
# 0 / 0 (user never rated that cluster) yields NaN, which is mapped to 0.
mean_wide = sum_wide.div(count_wide).fillna(0)

count_wide.columns = [f'Cluster_count_{cluster}' for cluster in cluster_ids]
sum_wide.columns = [f'Cluster_sum_{cluster}' for cluster in cluster_ids]
mean_wide.columns = [f'Cluster_mean_{cluster}' for cluster in cluster_ids]

# A single concat replaces hundreds of one-at-a-time column insertions, which
# pandas flags as a fragmented frame once the column count grows large.
user_cluster_pivot = pd.concat([count_wide, sum_wide, mean_wide], axis=1)
users_df = user_cluster_pivot  # already indexed by userId
users_df.head()

  super()._check_params_vs_input(X, default_n_init=10)
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
  user_cluster_

Unnamed: 0_level_0,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,Cluster_count_9,...,Cluster_mean_590,Cluster_mean_591,Cluster_mean_592,Cluster_mean_593,Cluster_mean_594,Cluster_mean_595,Cluster_mean_596,Cluster_mean_597,Cluster_mean_598,Cluster_mean_599
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,5.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0


In [4]:
# Score the rule-based predictor on the first `n` rows of ratings_df.
# Caveat: ClusterRuleBasedEstimator.fit() learns nothing and predict() reads
# the notebook-level lookup tables, so every CV fold is scored against the
# same precomputed tables regardless of the train/test split.
n = 100_000
X = ratings_df[['userId', 'movieId']].to_numpy()[:n]
y = ratings_df['rating'].to_numpy()[:n]
estimator = ClusterRuleBasedEstimator()

# greater_is_better=False makes scikit-learn negate the MSE, hence the sign
# flip when the mean is printed.
mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
accuracy_scorer = make_scorer(custom_accuracy_scorer)

scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)
print(f"Cross-validated MSE: {-np.mean(scores)}")

accuracy_scores = cross_val_score(estimator, X, y, scoring=accuracy_scorer, cv=5)
print(f"Cross-validated Accuracy: {np.mean(accuracy_scores)}")

# Exact hits on the whole subset (no tolerance applied).
estimator.fit(X, y)
y_pred = estimator.predict(X)
num_exactly_correct = np.sum(y_pred == y)
print(f"Number of exactly correct predictions: {num_exactly_correct}")

Cross-validated MSE: 0.328895
Cross-validated Accuracy: 0.84142
Number of exactly correct predictions: 55206


In [9]:
# Variant of the feature-building step that clusters movies on genre flags
# only (no mean rating) and keeps every movie, rated or not.
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")
n_movie_clusters = 10
n_user_clusters = 100  # not referenced anywhere in this cell

# One-hot encode the pipe-separated genre string.
movies_df['Genres_Split'] = movies_df['genres'].str.split('|')
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
# The rows of binary_matrix follow movies_df order, so the frame must carry
# the movieId index: DataFrame.join aligns on index labels, and a default
# 0..n-1 index would attach each movie to the genre row whose position
# happens to equal its movieId.
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=movies_df.index)

movies_hot_df = movies_df.join(binary_df).drop(columns=['genres', 'Genres_Split'])

# n_init is passed explicitly: scikit-learn warns when it is left implicit
# because its default is changing, and 10 is the value that warning reports
# as the current default.
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42, n_init=10)
movies_hot_df['Cluster'] = kmeans.fit_predict(binary_df)
ratings_with_clusters = ratings_df.merge(movies_hot_df, left_on='movieId', right_index=True)

# Per-user, per-cluster rating count and sum (long format).
user_cluster_stats = (
    ratings_with_clusters
    .groupby(['userId', 'Cluster'])['rating']
    .agg(['count', 'sum'])
    .reset_index()
)

# Wide tables with one column per cluster id. reindex() guarantees a column
# for every cluster, including clusters that received no rating at all. That
# case used to go through a branch containing a misspelled variable name,
# which raised NameError.
cluster_ids = range(n_movie_clusters)
count_wide = (
    user_cluster_stats.pivot(index='userId', columns='Cluster', values='count')
    .reindex(columns=cluster_ids)
    .fillna(0)
)
sum_wide = (
    user_cluster_stats.pivot(index='userId', columns='Cluster', values='sum')
    .reindex(columns=cluster_ids)
    .fillna(0)
)
# 0 / 0 (user never rated that cluster) yields NaN, which is mapped to 0.
mean_wide = sum_wide.div(count_wide).fillna(0)

count_wide.columns = [f'Cluster_count_{cluster}' for cluster in cluster_ids]
sum_wide.columns = [f'Cluster_sum_{cluster}' for cluster in cluster_ids]
mean_wide.columns = [f'Cluster_mean_{cluster}' for cluster in cluster_ids]

user_cluster_pivot = pd.concat([count_wide, sum_wide, mean_wide], axis=1)
users_df = user_cluster_pivot  # already indexed by userId
users_df.head()

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0_level_0,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,Cluster_count_9,...,Cluster_mean_0,Cluster_mean_1,Cluster_mean_2,Cluster_mean_3,Cluster_mean_4,Cluster_mean_5,Cluster_mean_6,Cluster_mean_7,Cluster_mean_8,Cluster_mean_9
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,13.0,10.0,14.0,0.0,0.0,5.0,1.0,3.0,11.0,...,3.9,4.076923,3.7,4.035714,0.0,0.0,3.9,5.0,4.333333,4.090909
2,18.0,16.0,7.0,15.0,0.0,0.0,1.0,2.0,25.0,7.0,...,3.555556,3.9375,2.571429,3.533333,0.0,0.0,3.0,3.5,3.44,3.857143
3,3.0,3.0,2.0,0.0,0.0,0.0,3.0,1.0,7.0,11.0,...,5.0,5.0,4.0,0.0,0.0,0.0,5.0,5.0,4.857143,5.0
4,1.0,5.0,3.0,5.0,0.0,0.0,4.0,0.0,5.0,7.0,...,4.0,4.1,4.333333,4.6,0.0,0.0,4.625,0.0,4.1,4.5
5,6.0,16.0,1.0,4.0,1.0,0.0,1.0,3.0,10.0,1.0,...,2.333333,3.8125,5.0,3.75,4.0,0.0,3.0,3.0,3.3,3.0


In [10]:
# Score the rule-based predictor on the first `n` rows of ratings_df.
# Caveat: ClusterRuleBasedEstimator.fit() learns nothing and predict() reads
# the notebook-level lookup tables, so every CV fold is scored against the
# same precomputed tables regardless of the train/test split.
n = 100_000
X = ratings_df[['userId', 'movieId']].to_numpy()[:n]
y = ratings_df['rating'].to_numpy()[:n]
estimator = ClusterRuleBasedEstimator()

# greater_is_better=False makes scikit-learn negate the MSE, hence the sign
# flip when the mean is printed.
mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
accuracy_scorer = make_scorer(custom_accuracy_scorer)

scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)
print(f"Cross-validated MSE: {-np.mean(scores)}")

accuracy_scores = cross_val_score(estimator, X, y, scoring=accuracy_scorer, cv=5)
print(f"Cross-validated Accuracy: {np.mean(accuracy_scores)}")

# Exact hits on the whole subset (no tolerance applied).
estimator.fit(X, y)
y_pred = estimator.predict(X)
num_exactly_correct = np.sum(y_pred == y)
print(f"Number of exactly correct predictions: {num_exactly_correct}")

Cross-validated MSE: 0.772095
Cross-validated Accuracy: 0.65478
Number of exactly correct predictions: 26858


In [6]:
# movieId is the index of movies_df after some feature-building cells and an
# ordinary column after others; reset_index() exposes it as a column in both
# cases, so this cell does not depend on which of them ran last.
movies_df.reset_index()["movieId"].unique()

array([     1,      2,      3, ..., 288975, 288977, 288983], dtype=int64)

# Part with years

In [10]:
# Build the two lookup tables used by cluster_rule_based_prediction:
#   movies_hot_df -- one row per rated movie: genre flags, rating_mean, Cluster
#   users_df      -- one row per user: count / sum / mean of ratings per cluster
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")
n_movie_clusters = 100
n_user_clusters = 100  # not referenced anywhere in this cell

# One-hot encode the pipe-separated genre string.
movies_df['Genres_Split'] = movies_df['genres'].str.split('|')
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

# movieId becomes an ordinary column so that both frames share a positional
# index and the column-wise concat lines up row by row.
movies_df = movies_df.reset_index()
movie_rating_means = (
    ratings_df.groupby("movieId")["rating"].mean().rename("rating_mean").reset_index()
)
movies_hot_df = (
    pd.concat([movies_df, binary_df], axis=1)
    .drop(columns=['genres', 'Genres_Split', 'title'])
    # Inner merge: movies without any rating are dropped here.
    .merge(movie_rating_means, on="movieId")
    .set_index("movieId")
)

# Doubles the mean-rating feature before clustering -- presumably to give it
# more weight against the 0/1 genre flags in the KMeans distance (TODO confirm).
movies_hot_df["rating_mean"] *= 2

# Disabled experiment A: release year parsed from the title, standard-scaled.
# movies_hot_year_df = movies_hot_df.copy()
# years = movies_df[movies_df['movieId'].isin(movies_hot_df.index)]['title'].str.extract(r'\((\d{4})\)')
# years.index = movies_hot_year_df.index
# years[0] = pd.to_numeric(years[0], errors='coerce')
# years = years.fillna(years.median())
# scaler = StandardScaler()
# scaled_years = scaler.fit_transform(years)
# scaled_years_df = pd.DataFrame(scaled_years, index=years.index, columns=years.columns)
# movies_hot_df['year'] = scaled_years_df

# Disabled experiment B: release year parsed from the title, divided by 10.
# movies_hot_year_df = movies_hot_df.copy()
# years = movies_df[movies_df['movieId'].isin(movies_hot_df.index)]['title'].str.extract(r'\((\d{4})\)')
# years.index = movies_hot_year_df.index
# years[0] = pd.to_numeric(years[0], errors='coerce')
# years = years.fillna(years.median())
# scaled_years_df = years / 10
# movies_hot_df['year'] = scaled_years_df


# n_init is passed explicitly: scikit-learn warns when it is left implicit
# because its default is changing, and 10 is the value that warning reports
# as the current default.
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42, n_init=10)
movies_hot_df['Cluster'] = kmeans.fit_predict(movies_hot_df)

ratings_with_clusters = ratings_df.merge(movies_hot_df, left_on='movieId', right_index=True)

# Per-user, per-cluster rating count and sum (long format).
user_cluster_stats = (
    ratings_with_clusters
    .groupby(['userId', 'Cluster'])['rating']
    .agg(['count', 'sum'])
    .reset_index()
)

# Wide tables with one column per cluster id. reindex() guarantees a column
# for every cluster, including clusters that received no rating at all.
cluster_ids = range(n_movie_clusters)
count_wide = (
    user_cluster_stats.pivot(index='userId', columns='Cluster', values='count')
    .reindex(columns=cluster_ids)
    .fillna(0)
)
sum_wide = (
    user_cluster_stats.pivot(index='userId', columns='Cluster', values='sum')
    .reindex(columns=cluster_ids)
    .fillna(0)
)
# 0 / 0 (user never rated that cluster) yields NaN, which is mapped to 0.
mean_wide = sum_wide.div(count_wide).fillna(0)

count_wide.columns = [f'Cluster_count_{cluster}' for cluster in cluster_ids]
sum_wide.columns = [f'Cluster_sum_{cluster}' for cluster in cluster_ids]
mean_wide.columns = [f'Cluster_mean_{cluster}' for cluster in cluster_ids]

# A single concat replaces one-at-a-time column insertions, which pandas
# flags as a fragmented frame once the column count grows large.
user_cluster_pivot = pd.concat([count_wide, sum_wide, mean_wide], axis=1)
users_df = user_cluster_pivot  # already indexed by userId
users_df.head()

  super()._check_params_vs_input(X, default_n_init=10)
  user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]


Unnamed: 0_level_0,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,Cluster_count_9,...,Cluster_mean_90,Cluster_mean_91,Cluster_mean_92,Cluster_mean_93,Cluster_mean_94,Cluster_mean_95,Cluster_mean_96,Cluster_mean_97,Cluster_mean_98,Cluster_mean_99
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,4.1,0.0,3.75,0.0,0.0,0.0,0.0,4.333333,0.0
2,0.0,1.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,...,4.5,3.5,0.0,0.0,0.0,0.0,2.333333,0.0,4.25,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,4.333333,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
5,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,3.0,2.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


In [11]:
# Score the rule-based predictor on the first `n` rows of ratings_df.
# Caveat: ClusterRuleBasedEstimator.fit() learns nothing and predict() reads
# the notebook-level lookup tables, so every CV fold is scored against the
# same precomputed tables regardless of the train/test split.
n = 100_000
X = ratings_df[['userId', 'movieId']].to_numpy()[:n]
y = ratings_df['rating'].to_numpy()[:n]
estimator = ClusterRuleBasedEstimator()

# greater_is_better=False makes scikit-learn negate the MSE, hence the sign
# flip when the mean is printed.
mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
accuracy_scorer = make_scorer(custom_accuracy_scorer)

scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)
print(f"Cross-validated MSE: {-np.mean(scores)}")

accuracy_scores = cross_val_score(estimator, X, y, scoring=accuracy_scorer, cv=5)
print(f"Cross-validated Accuracy: {np.mean(accuracy_scores)}")

# Exact hits on the whole subset (no tolerance applied).
estimator.fit(X, y)
y_pred = estimator.predict(X)
num_exactly_correct = np.sum(y_pred == y)
print(f"Number of exactly correct predictions: {num_exactly_correct}")

Cross-validated MSE: 0.5195125
Cross-validated Accuracy: 0.75431
Number of exactly correct predictions: 37818
