In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.cluster import KMeans

In [41]:
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

### Rule based

In [21]:
def rule_based_prediction(user_id, movie_id):
    user_avg_rating = ratings_df[ratings_df['userId'] == user_id]['rating'].mean()
    film_avg_rating = ratings_df[ratings_df['movieId'] == movie_id]['rating'].mean()
    
    genres = movies_df.loc[movie_id]["genres"].split("|")
    movie_idx = set()
    for genre in genres:
        movie_idx.update(movies_df[movies_df["genres"].str.contains(genre)].index)
    genre_avg_rating = ratings_df[ratings_df['movieId'].isin(movie_idx)]['rating'].mean()

    return (user_avg_rating + film_avg_rating + genre_avg_rating) / 3

In [22]:
%%time
rule_based_prediction(1, 50)

CPU times: total: 2 s
Wall time: 2.91 s


3.9494551923200674

In [3]:
class RuleBasedEstimator(BaseEstimator):
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        return np.array([rule_based_prediction(user, movie) for user, movie in X])

def custom_scorer(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)  # Mean Squared Error

In [76]:
n = 100

mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = RuleBasedEstimator()
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

Cross-validated MSE: 0.7235005480434722


In [4]:
n_movie_clusters = 10
n_user_clusters = 100

In [5]:
# Step 1: Split the string values
movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))

# Step 2: Create a binary matrix (One-Hot Encoding)
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

# Combine the original DataFrame with the binary matrix
movies_hot_df = movies_df.join(binary_df)

# Drop the original and split columns as they are no longer needed
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)

# Step 3: Clustering
# For demonstration, we'll use K-means with 2 clusters
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(binary_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [6]:
movies_hot_df.head()

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Cluster
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9
3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
5,Father of the Bride Part II (1995),0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [7]:
ratings_with_clusters = ratings_df.merge(movies_hot_df, left_on='movieId', right_index=True)

# Group by userId and cluster to calculate counts and sums
user_cluster_stats = ratings_with_clusters.groupby(['userId', 'Cluster'])['rating'].agg(['count', 'sum']).reset_index()

# Pivot the table to get counts and sums for each cluster as separate columns
user_cluster_pivot = user_cluster_stats.pivot(index='userId', columns='Cluster', values=['count', 'sum']).fillna(0)

# Flatten the MultiIndex columns
user_cluster_pivot.columns = [f'Cluster_{stat}_{cluster}' for stat, cluster in user_cluster_pivot.columns]

# Calculate the mean ratings
for cluster in range(n_movie_clusters):
    count_col = f'Cluster_count_{cluster}'
    sum_col = f'Cluster_sum_{cluster}'
    mean_col = f'Cluster_mean_{cluster}'
    if count_col in user_cluster_pivot.columns and sum_col in user_cluster_pivot.columns:
        user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
    else:
        user_cluster_pivot[count_col] = 0
        user_cluster_pivot[sum_col] = 0
        user_cluster_pivot[mean_col] = 0

# Fill NaNs with 0s
user_cluster_pivot = user_cluster_pivot.fillna(0)

# Reset the index to make userId a column again
users_df = user_cluster_pivot.reset_index()

In [8]:
users_df.head()

Unnamed: 0,userId,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,...,Cluster_mean_0,Cluster_mean_1,Cluster_mean_2,Cluster_mean_3,Cluster_mean_4,Cluster_mean_5,Cluster_mean_6,Cluster_mean_7,Cluster_mean_8,Cluster_mean_9
0,1,5.0,13.0,10.0,14.0,0.0,0.0,5.0,1.0,3.0,...,3.9,4.076923,3.7,4.035714,0.0,0.0,3.9,5.0,4.333333,4.090909
1,2,18.0,16.0,7.0,15.0,0.0,0.0,1.0,2.0,25.0,...,3.555556,3.9375,2.571429,3.533333,0.0,0.0,3.0,3.5,3.44,3.857143
2,3,3.0,3.0,2.0,0.0,0.0,0.0,3.0,1.0,7.0,...,5.0,5.0,4.0,0.0,0.0,0.0,5.0,5.0,4.857143,5.0
3,4,1.0,5.0,3.0,5.0,0.0,0.0,4.0,0.0,5.0,...,4.0,4.1,4.333333,4.6,0.0,0.0,4.625,0.0,4.1,4.5
4,5,6.0,16.0,1.0,4.0,1.0,0.0,1.0,3.0,10.0,...,2.333333,3.8125,5.0,3.75,4.0,0.0,3.0,3.0,3.3,3.0


In [9]:
# Identify the columns to standardize
columns_to_standardize = [col for col in users_df.columns if 'Cluster_' in col]

# Apply the scaler to these columns
users_df[columns_to_standardize] = StandardScaler().fit_transform(users_df[columns_to_standardize])

In [10]:
users_df.head()

Unnamed: 0,userId,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,...,Cluster_mean_0,Cluster_mean_1,Cluster_mean_2,Cluster_mean_3,Cluster_mean_4,Cluster_mean_5,Cluster_mean_6,Cluster_mean_7,Cluster_mean_8,Cluster_mean_9
0,1,-0.286357,-0.15308,-0.14002,0.173657,-0.129211,-0.042162,-0.00633,-0.217978,-0.377171,...,0.671055,0.414833,0.478134,0.618124,-0.536168,-0.225102,0.8158,1.490465,0.775354,0.892812
1,2,0.009511,-0.101292,-0.239875,0.216273,-0.129211,-0.042162,-0.330334,-0.171019,0.164861,...,0.42545,0.300179,-0.278373,0.310484,-0.536168,-0.225102,0.343206,0.67481,0.131259,0.756896
2,3,-0.331876,-0.325708,-0.406301,-0.422965,-0.129211,-0.042162,-0.168332,-0.217978,-0.27862,...,1.455407,1.17392,0.679231,-1.853203,-0.536168,-0.225102,1.393415,1.490465,1.153021,1.421376
3,4,-0.377394,-0.291182,-0.373016,-0.209886,-0.129211,-0.042162,-0.087331,-0.264937,-0.327895,...,0.74236,0.43381,0.902672,0.963672,-0.536168,-0.225102,1.196501,-1.228384,0.60712,1.130666
4,5,-0.263598,-0.101292,-0.439587,-0.252501,-0.02664,-0.042162,-0.330334,-0.124061,-0.204706,...,-0.446051,0.197386,1.349553,0.443162,1.854018,-0.225102,0.343206,0.402925,0.030319,0.258536


In [11]:
# # Group by 'userId' and calculate mean rating
# mean_ratings = ratings_df.groupby('userId')['rating'].mean().reset_index()
# 
# # Convert to DataFrame
# mean_ratings_df = pd.DataFrame(mean_ratings)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=n_user_clusters, random_state=42)
users_df['User_cluster'] = kmeans.fit_predict(users_df[[col for col in users_df.columns if 'Cluster_mean' in col]])

  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
users_df.head()
# users_df["User_cluster"].unique()

Unnamed: 0,userId,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,...,Cluster_mean_1,Cluster_mean_2,Cluster_mean_3,Cluster_mean_4,Cluster_mean_5,Cluster_mean_6,Cluster_mean_7,Cluster_mean_8,Cluster_mean_9,User_cluster
0,1,-0.286357,-0.15308,-0.14002,0.173657,-0.129211,-0.042162,-0.00633,-0.217978,-0.377171,...,0.414833,0.478134,0.618124,-0.536168,-0.225102,0.8158,1.490465,0.775354,0.892812,34
1,2,0.009511,-0.101292,-0.239875,0.216273,-0.129211,-0.042162,-0.330334,-0.171019,0.164861,...,0.300179,-0.278373,0.310484,-0.536168,-0.225102,0.343206,0.67481,0.131259,0.756896,13
2,3,-0.331876,-0.325708,-0.406301,-0.422965,-0.129211,-0.042162,-0.168332,-0.217978,-0.27862,...,1.17392,0.679231,-1.853203,-0.536168,-0.225102,1.393415,1.490465,1.153021,1.421376,56
3,4,-0.377394,-0.291182,-0.373016,-0.209886,-0.129211,-0.042162,-0.087331,-0.264937,-0.327895,...,0.43381,0.902672,0.963672,-0.536168,-0.225102,1.196501,-1.228384,0.60712,1.130666,46
4,5,-0.263598,-0.101292,-0.439587,-0.252501,-0.02664,-0.042162,-0.330334,-0.124061,-0.204706,...,0.197386,1.349553,0.443162,1.854018,-0.225102,0.343206,0.402925,0.030319,0.258536,83


In [13]:
# clustering_corr = {}
# 
# for user in range(n_user_clusters):
#     user_idx = mean_ratings_df[mean_ratings_df["Cluster"] == user]["userId"]
#     for movie in range(n_movie_clusters):
#         movie_idx = movies_hot_df[movies_hot_df["Cluster"] == movie].index
#         summ = 0
#         count = 0
#         for index, c in ratings_df.iterrows():
#             if c["userId"] in user_idx and c["movieId"] in movie_idx:
#                 summ += c["rating"]
#                 count += 1
#         avg = summ / count
#         clustering_corr[f"{user} {movie}"] = avg

In [14]:
ratings_clustered_df = ratings_df.merge(users_df[['userId', 'User_cluster']], on='userId')
ratings_clustered_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,User_cluster
0,1,1,4.0,1225734739,34
1,1,110,4.0,1225865086,34
2,1,158,4.0,1225733503,34
3,1,260,4.5,1225735204,34
4,1,356,5.0,1225735119,34


In [15]:
ratings_clustered_df = ratings_clustered_df.merge(movies_hot_df[['Cluster']], left_on='movieId', right_index=True)
ratings_clustered_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,User_cluster,Cluster
0,1,1,4.0,1225734739,34,9
62,2,1,5.0,835815971,13,9
304,7,1,4.0,974518024,37,9
653,10,1,3.0,1430666394,13,9
852,12,1,5.0,862500738,13,9


In [16]:
clustering_corr = ratings_clustered_df.groupby(['User_cluster', 'Cluster'])['rating'].mean().unstack(fill_value=0)

print(clustering_corr)

Cluster              0         1         2         3         4         5  \
User_cluster                                                               
0             1.908994  1.013889  1.181004  2.114458  2.983871  1.500000   
1             2.976624  3.355277  3.089355  3.236632  1.076948  1.345771   
2             2.921900  3.357917  2.953490  3.391512  1.930894  0.916667   
3             3.457699  3.748533  3.565713  3.658491  3.106333  0.949153   
4             3.624202  3.972495  3.681002  3.862351  4.102640  4.114321   
...                ...       ...       ...       ...       ...       ...   
95            1.095779  4.097272  1.229870  4.053547  3.691781  0.833333   
96            3.574961  3.848537  1.468954  1.502008  3.500000  2.333333   
97            1.149371  4.063496  1.092885  3.998614  3.632353  0.000000   
98            1.030618  1.008132  1.023702  1.029768  3.129496  1.120000   
99            3.021562  3.331387  3.103715  3.325437  1.161209  1.322917   

Cluster    

In [17]:
clustering_corr[0][0]

1.9089935760171306

In [18]:
movies_hot_df

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Cluster
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9
3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
5,Father of the Bride Part II (1995),0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288967,State of Siege: Temple Attack (2021),,,,,,,,,,...,,,,,,,,,,1
288971,Ouija Japan (2021),,,,,,,,,,...,,,,,,,,,,7
288975,The Men Who Made the Movies: Howard Hawks (1973),,,,,,,,,,...,,,,,,,,,,4
288977,Skinford: Death Sentence (2023),,,,,,,,,,...,,,,,,,,,,8


In [19]:
class ClusteringBasedEstimator(BaseEstimator):
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        return np.array([get_clustering_prediction(user, movie) for user, movie in X])
    
def get_clustering_prediction(user, movie):
    return clustering_corr[int(movies_hot_df.loc[movie]['Cluster'])][int(users_df.loc[user]['User_cluster'])]

In [20]:
n = 100_000

mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = ClusteringBasedEstimator()
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

Cross-validated MSE: 2.0163583311663547


# New solutions

In [42]:
n_movie_clusters = 10
n_user_clusters = 100


movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

movies_hot_df = movies_df.join(binary_df)
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(binary_df)
ratings_with_clusters = ratings_df.merge(movies_hot_df, left_on='movieId', right_index=True)

user_cluster_stats = ratings_with_clusters.groupby(['userId', 'Cluster'])['rating'].agg(['count', 'sum']).reset_index()
user_cluster_pivot = user_cluster_stats.pivot(index='userId', columns='Cluster', values=['count', 'sum']).fillna(0)
user_cluster_pivot.columns = [f'Cluster_{stat}_{cluster}' for stat, cluster in user_cluster_pivot.columns]

for cluster in range(n_movie_clusters):
    count_col = f'Cluster_count_{cluster}'
    sum_col = f'Cluster_sum_{cluster}'
    mean_col = f'Cluster_mean_{cluster}'
    if count_col in user_cluster_pivot.columns and sum_col in user_cluster_pivot.columns:
        user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
    else:
        user_cluster_pivot[count_col] = 0
        user_cluster_pivot[sum_col] = 0
        user_cluster_pivot[mean_col] = 0

user_cluster_pivot = user_cluster_pivot.fillna(0)
users_df = user_cluster_pivot.reset_index()

  super()._check_params_vs_input(X, default_n_init=10)


In [43]:
users_df.set_index("userId", inplace=True)
users_df.head()

Unnamed: 0_level_0,Cluster_count_0,Cluster_count_1,Cluster_count_2,Cluster_count_3,Cluster_count_4,Cluster_count_5,Cluster_count_6,Cluster_count_7,Cluster_count_8,Cluster_count_9,...,Cluster_mean_0,Cluster_mean_1,Cluster_mean_2,Cluster_mean_3,Cluster_mean_4,Cluster_mean_5,Cluster_mean_6,Cluster_mean_7,Cluster_mean_8,Cluster_mean_9
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,13.0,10.0,14.0,0.0,0.0,5.0,1.0,3.0,11.0,...,3.9,4.076923,3.7,4.035714,0.0,0.0,3.9,5.0,4.333333,4.090909
2,18.0,16.0,7.0,15.0,0.0,0.0,1.0,2.0,25.0,7.0,...,3.555556,3.9375,2.571429,3.533333,0.0,0.0,3.0,3.5,3.44,3.857143
3,3.0,3.0,2.0,0.0,0.0,0.0,3.0,1.0,7.0,11.0,...,5.0,5.0,4.0,0.0,0.0,0.0,5.0,5.0,4.857143,5.0
4,1.0,5.0,3.0,5.0,0.0,0.0,4.0,0.0,5.0,7.0,...,4.0,4.1,4.333333,4.6,0.0,0.0,4.625,0.0,4.1,4.5
5,6.0,16.0,1.0,4.0,1.0,0.0,1.0,3.0,10.0,1.0,...,2.333333,3.8125,5.0,3.75,4.0,0.0,3.0,3.0,3.3,3.0


In [44]:
def cluster_rule_based_prediction(user_id, movie_id):
    cluster = movies_hot_df.loc[movie_id]["Cluster"]
    return round(users_df.loc[user_id][f"Cluster_mean_{cluster}"] * 2) / 2

class ClusterRuleBasedEstimator(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def predict(self, X):
        return np.array([cluster_rule_based_prediction(user, movie) for user, movie in X])

def custom_scorer(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

In [45]:
n = 100_000

mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = ClusterRuleBasedEstimator()
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

Cross-validated MSE: 0.772095


In [46]:
def custom_accuracy_scorer(y_true, y_pred, tol=(0.5 + 1e-9)):
    accuracy = np.isclose(y_pred, y_true, atol=tol).mean()
    return accuracy

accuracy_scorer = make_scorer(custom_accuracy_scorer)
accuracy_scores = cross_val_score(estimator, X, y, scoring=accuracy_scorer, cv=5)
print(f"Cross-validated Accuracy: {accuracy_scores.mean()}")

estimator.fit(X, y)
y_pred = estimator.predict(X)
num_exactly_correct = (y_pred == y).sum()
print(f"Number of exactly correct predictions: {num_exactly_correct}")

Cross-validated Accuracy: 0.65478
Number of exactly correct predictions: 26858
