In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.cluster import KMeans

In [None]:
# Ratings table; the cells below use its userId, movieId and rating columns.
ratings_df = pd.read_csv("data/ratings.csv")
# Movie metadata indexed by movieId, so movies_df.loc[movie_id] is a lookup by id;
# the cells below use its '|'-separated genres column.
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

### Rule-based prediction

In [None]:
def rule_based_prediction(user_id, movie_id):
    """Predict a rating as the unweighted mean of three average ratings.

    All three components are computed from the module-level ``ratings_df``:

    * the mean rating given by ``user_id``;
    * the mean rating received by ``movie_id``;
    * the mean rating over every movie that shares at least one genre with
      ``movie_id`` (genres are the ``|``-separated entries of the ``genres``
      column of ``movies_df``).

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        movie_id: Index label in ``movies_df``; also matched against
            ``ratings_df['movieId']``.

    Returns:
        The arithmetic mean of the three averages. The result is NaN when any
        of them is NaN (for instance when the user has no ratings at all).

    Raises:
        KeyError: If ``movie_id`` is not in the index of ``movies_df``.
    """
    user_avg_rating = ratings_df[ratings_df['userId'] == user_id]['rating'].mean()
    film_avg_rating = ratings_df[ratings_df['movieId'] == movie_id]['rating'].mean()

    genres = movies_df.loc[movie_id]["genres"].split("|")
    movie_idx = set()
    for genre in genres:
        # regex=False: genre names are literal text, not patterns. Without it
        # a name such as "(no genres listed)" is parsed as a regex group.
        has_genre = movies_df["genres"].str.contains(genre, regex=False)
        movie_idx.update(movies_df[has_genre].index)
    genre_avg_rating = ratings_df[ratings_df['movieId'].isin(movie_idx)]['rating'].mean()

    return (user_avg_rating + film_avg_rating + genre_avg_rating) / 3

In [None]:
%%time
# Spot check: rule-based prediction for user 1 and movie 50.
rule_based_prediction(1, 50)

In [None]:
class RuleBasedEstimator(BaseEstimator):
    """Estimator facade over ``rule_based_prediction``.

    Nothing is learned: ``fit`` ignores its arguments, and ``predict`` simply
    evaluates ``rule_based_prediction`` once per input row.
    """

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return this estimator unchanged."""
        return self

    def predict(self, X):
        """Return an array with one prediction per ``(user, movie)`` row of ``X``."""
        predictions = []
        for user, movie in X:
            predictions.append(rule_based_prediction(user, movie))
        return np.array(predictions)

def custom_scorer(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Both inputs are converted with ``np.asarray`` before subtracting, so the
    comparison is always positional: plain sequences are accepted, and pandas
    objects are not re-aligned on their index labels (which would turn
    non-matching labels into NaN).

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.

    Returns:
        The mean of the squared element-wise differences.
    """
    errors = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(errors ** 2)  # Mean Squared Error

In [None]:
# Evaluate on the first n rows of ratings_df only.
n = 100

# greater_is_better=False makes the scorer return the negated MSE; the sign is
# flipped back when the result is printed.
mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = RuleBasedEstimator()
# NOTE(review): RuleBasedEstimator.fit is a no-op and rule_based_prediction
# reads the full ratings_df, so the ratings being scored in each fold are also
# part of the averages used to predict them.
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

In [None]:
# Number of K-means clusters used for movies and for users, respectively.
n_movie_clusters = 10
n_user_clusters = 100

In [None]:
# Step 1: Split the '|'-separated genre string of every movie into a list
movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))

# Step 2: Create a binary matrix (multi-hot encoding, one column per genre)
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
# Reuse the movieId index of movies_df. With the default RangeIndex, the
# index-based join below would attach each genre row to whichever movieId
# happens to equal its row position, and leave NaNs for every other movieId.
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=movies_df.index)

# Combine the original DataFrame with the binary matrix
movies_hot_df = movies_df.join(binary_df)

# Drop the original and split columns as they are no longer needed
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)

# Step 3: Clustering
# K-means on the genre indicators, with n_movie_clusters clusters
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(binary_df)

In [None]:
# Preview the movie table with its genre indicator columns and Cluster label.
movies_hot_df.head()

In [None]:
# Attach the columns of movies_hot_df (including Cluster) to every rating.
ratings_with_clusters = ratings_df.merge(
    movies_hot_df,
    left_on='movieId',
    right_index=True,
)

# Number and sum of ratings for every (user, movie cluster) pair.
user_cluster_stats = (
    ratings_with_clusters
    .groupby(['userId', 'Cluster'])['rating']
    .agg(['count', 'sum'])
    .reset_index()
)

# One row per user, one (stat, cluster) column per pair; absent pairs become 0.
user_cluster_pivot = user_cluster_stats.pivot(
    index='userId',
    columns='Cluster',
    values=['count', 'sum'],
).fillna(0)

# Collapse the two-level column labels into names like 'Cluster_count_3'.
user_cluster_pivot.columns = [
    f'Cluster_{stat}_{cluster}'
    for stat, cluster in user_cluster_pivot.columns
]

# Derive the per-cluster mean rating of every user.
for cluster in range(n_movie_clusters):
    count_col = f'Cluster_count_{cluster}'
    sum_col = f'Cluster_sum_{cluster}'
    mean_col = f'Cluster_mean_{cluster}'
    if count_col not in user_cluster_pivot.columns or sum_col not in user_cluster_pivot.columns:
        # No statistics exist for this cluster: create all three columns as 0.
        user_cluster_pivot[count_col] = 0
        user_cluster_pivot[sum_col] = 0
        user_cluster_pivot[mean_col] = 0
        continue
    # A count of 0 gives 0 / 0 = NaN here; it is replaced by 0 just below.
    user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]

# Replace the remaining NaNs with 0.
user_cluster_pivot = user_cluster_pivot.fillna(0)

# Turn userId back into an ordinary column.
users_df = user_cluster_pivot.reset_index()

In [None]:
# Preview the per-user count / sum / mean columns for each movie cluster.
users_df.head()

In [None]:
# Identify the columns to standardize: every column whose name contains
# 'Cluster_' (the per-cluster count, sum and mean columns); userId is excluded.
columns_to_standardize = [col for col in users_df.columns if 'Cluster_' in col]

# Apply the scaler to these columns, overwriting the raw values in users_df.
users_df[columns_to_standardize] = StandardScaler().fit_transform(users_df[columns_to_standardize])

In [None]:
# Preview users_df after the Cluster_* columns have been standardized.
users_df.head()

In [None]:
# # Group by 'userId' and calculate mean rating
# mean_ratings = ratings_df.groupby('userId')['rating'].mean().reset_index()
# 
# # Convert to DataFrame
# mean_ratings_df = pd.DataFrame(mean_ratings)

# Apply KMeans clustering to the users (this rebinds `kmeans`, which previously
# held the movie clustering model).
kmeans = KMeans(n_clusters=n_user_clusters, random_state=42)
# Only the 'Cluster_mean_*' columns are used as features; the count and sum
# columns are not passed to the model.
users_df['User_cluster'] = kmeans.fit_predict(users_df[[col for col in users_df.columns if 'Cluster_mean' in col]])

In [None]:
# Preview users_df with the new User_cluster column.
users_df.head()
# users_df["User_cluster"].unique()

In [None]:
# clustering_corr = {}
# 
# for user in range(n_user_clusters):
#     user_idx = mean_ratings_df[mean_ratings_df["Cluster"] == user]["userId"]
#     for movie in range(n_movie_clusters):
#         movie_idx = movies_hot_df[movies_hot_df["Cluster"] == movie].index
#         summ = 0
#         count = 0
#         for index, c in ratings_df.iterrows():
#             if c["userId"] in user_idx and c["movieId"] in movie_idx:
#                 summ += c["rating"]
#                 count += 1
#         avg = summ / count
#         clustering_corr[f"{user} {movie}"] = avg

In [None]:
# Attach each user's User_cluster label to that user's ratings (merge on userId).
ratings_clustered_df = ratings_df.merge(users_df[['userId', 'User_cluster']], on='userId')
ratings_clustered_df.head()

In [None]:
# Attach each movie's Cluster label, matching ratings' movieId to the index of movies_hot_df.
# NOTE(review): the result is assigned back to the same name, so re-running
# this cell merges a second Cluster column onto the already-merged frame.
ratings_clustered_df = ratings_clustered_df.merge(movies_hot_df[['Cluster']], left_on='movieId', right_index=True)
ratings_clustered_df.head()

In [None]:
# Mean rating for every (user cluster, movie cluster) pair: rows are
# User_cluster values, columns are movie Cluster values. Pairs with no ratings
# are filled with 0.
clustering_corr = ratings_clustered_df.groupby(['User_cluster', 'Cluster'])['rating'].mean().unstack(fill_value=0)

print(clustering_corr)

In [None]:
# Column-first lookup: the first [0] selects the movie-cluster column, the
# second [0] selects the user-cluster row.
clustering_corr[0][0]

In [None]:
# Enumerate every 4-tuple of weights (a, b, c, d) taken from the grid
# 0, 0.05, ..., 1 whose entries sum to 1.
weight_combinations = []
coefs = [x/20 for x in range(0, 21)]
for idx_a, coef_a in enumerate(coefs):
    for idx_b, coef_b in enumerate(coefs):
        for idx_c, coef_c in enumerate(coefs):
            # Do the bookkeeping on integer grid positions. Summing the float
            # weights accumulates rounding error, which makes the "> 1" test
            # unreliable and leaves residues such as 2.8e-17 instead of 0.0
            # for the fourth weight.
            idx_d = (len(coefs) - 1) - idx_a - idx_b - idx_c
            if idx_d < 0:
                continue
            coef_d = coefs[idx_d]
            weight_combinations.append((coef_a, coef_b, coef_c, coef_d))

In [None]:
# NOTE(review): ad-hoc arithmetic on the number of weight combinations; the
# meaning of the constants 66 and 4 is not evident from this notebook — confirm.
len(weight_combinations) / 66 * 4

In [None]:
# Display movies_hot_df (genre indicator columns plus the Cluster label).
movies_hot_df

In [None]:
class ClusteringBasedEstimator(BaseEstimator):
    """Estimator facade over ``get_clustering_prediction``.

    Nothing is learned: ``fit`` ignores its arguments, and ``predict`` simply
    evaluates ``get_clustering_prediction`` once per input row.
    """

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return this estimator unchanged."""
        return self

    def predict(self, X):
        """Return an array with one prediction per ``(user, movie)`` row of ``X``."""
        predictions = []
        for user, movie in X:
            predictions.append(get_clustering_prediction(user, movie))
        return np.array(predictions)
    
def get_clustering_prediction(user, movie):
    """Return the mean rating of the user's cluster for the movie's cluster.

    The movie cluster is read from ``movies_hot_df`` (indexed by movieId), the
    user cluster from ``users_df``, and the prediction from ``clustering_corr``
    (rows: user clusters, columns: movie clusters).

    Args:
        user: A userId.
        movie: A movieId present in the index of ``movies_hot_df``.

    Returns:
        The ``clustering_corr`` entry for the (user cluster, movie cluster) pair.

    Raises:
        KeyError: If the user or the movie is unknown.
    """
    movie_cluster = int(movies_hot_df.loc[movie, 'Cluster'])

    if 'userId' in users_df.columns:
        # users_df was built with reset_index(), so its index holds row
        # positions, not userIds. users_df.loc[user] would therefore return
        # the row labelled `user`, which in general belongs to another user.
        matches = users_df.loc[users_df['userId'] == user, 'User_cluster']
        if matches.empty:
            raise KeyError(user)
        user_cluster = int(matches.iloc[0])
    else:
        # userId has been made the index (set_index("userId")).
        user_cluster = int(users_df.loc[user, 'User_cluster'])

    return clustering_corr.loc[user_cluster, movie_cluster]

In [None]:
# Evaluate on the first n rows of ratings_df.
n = 100_000

# greater_is_better=False makes the scorer return the negated MSE; the sign is
# flipped back when the result is printed.
mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = ClusteringBasedEstimator()
# NOTE(review): ClusteringBasedEstimator.fit is a no-op and clustering_corr was
# computed from all of ratings_df, so the folds do not hold any data out.
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

# New solutions

In [None]:
# Number of K-means clusters used for movies and for users, respectively.
n_movie_clusters = 10
n_user_clusters = 100


# Multi-hot encode the '|'-separated genres of every movie.
movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
# Reuse the movieId index of movies_df. With the default RangeIndex, the
# index-based join below would attach each genre row to whichever movieId
# happens to equal its row position, and leave NaNs for every other movieId.
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=movies_df.index)

movies_hot_df = movies_df.join(binary_df)
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)
# Cluster the movies on their genre indicators.
kmeans = KMeans(n_clusters=n_movie_clusters, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(binary_df)
ratings_with_clusters = ratings_df.merge(movies_hot_df, left_on='movieId', right_index=True)

# Number and sum of ratings per (user, movie cluster), pivoted to one row per
# user with columns named like 'Cluster_count_3' / 'Cluster_sum_3'.
user_cluster_stats = ratings_with_clusters.groupby(['userId', 'Cluster'])['rating'].agg(['count', 'sum']).reset_index()
user_cluster_pivot = user_cluster_stats.pivot(index='userId', columns='Cluster', values=['count', 'sum']).fillna(0)
user_cluster_pivot.columns = [f'Cluster_{stat}_{cluster}' for stat, cluster in user_cluster_pivot.columns]

# Per-user mean rating in every movie cluster.
for cluster in range(n_movie_clusters):
    count_col = f'Cluster_count_{cluster}'
    sum_col = f'Cluster_sum_{cluster}'
    mean_col = f'Cluster_mean_{cluster}'
    if count_col in user_cluster_pivot.columns and sum_col in user_cluster_pivot.columns:
        # A count of 0 gives 0 / 0 = NaN here; it is replaced by 0 below.
        user_cluster_pivot[mean_col] = user_cluster_pivot[sum_col] / user_cluster_pivot[count_col]
    else:
        # No statistics exist for this cluster: create all three columns as 0.
        user_cluster_pivot[count_col] = 0
        user_cluster_pivot[sum_col] = 0
        user_cluster_pivot[mean_col] = 0

user_cluster_pivot = user_cluster_pivot.fillna(0)
users_df = user_cluster_pivot.reset_index()

In [None]:
# Index users_df by userId so that users_df.loc[user_id] is a lookup by user.
# The guard keeps this cell re-runnable: once userId is the index it is no
# longer a column, and calling set_index("userId") again would raise KeyError.
if "userId" in users_df.columns:
    users_df = users_df.set_index("userId")
users_df.head()

In [None]:
def cluster_rule_based_prediction(user_id, movie_id):
    """Predict a rating from the user's mean rating in the movie's cluster.

    Looks up the movie's ``Cluster`` in ``movies_hot_df``, reads the matching
    ``Cluster_mean_<cluster>`` value from the ``users_df`` row labelled
    ``user_id``, and rounds it to a multiple of 0.5 using the built-in
    ``round``.

    Args:
        user_id: Row label in ``users_df``.
        movie_id: Row label in ``movies_hot_df``.

    Returns:
        The user's mean rating for that movie cluster, rounded to a multiple
        of 0.5.
    """
    cluster = movies_hot_df.loc[movie_id]["Cluster"]
    user_row = users_df.loc[user_id]
    cluster_mean = user_row[f"Cluster_mean_{cluster}"]
    # Doubling, rounding to an integer and halving snaps the value to a 0.5 step.
    doubled = cluster_mean * 2
    return round(doubled) / 2

class ClusterRuleBasedEstimator(BaseEstimator):
    """Estimator facade over ``cluster_rule_based_prediction``.

    Nothing is learned: ``fit`` ignores its arguments, and ``predict`` simply
    evaluates ``cluster_rule_based_prediction`` once per input row.
    """

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return this estimator unchanged."""
        return self

    def predict(self, X):
        """Return an array with one prediction per ``(user, movie)`` row of ``X``."""
        predictions = []
        for user, movie in X:
            predictions.append(cluster_rule_based_prediction(user, movie))
        return np.array(predictions)

def custom_scorer(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    Both inputs are converted with ``np.asarray`` before subtracting, so the
    comparison is always positional: plain sequences are accepted, and pandas
    objects are not re-aligned on their index labels (which would turn
    non-matching labels into NaN).

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.

    Returns:
        The mean of the squared element-wise differences.
    """
    errors = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(errors ** 2)

In [None]:
# Evaluate on the first n rows of ratings_df.
n = 100_000

# greater_is_better=False makes the scorer return the negated MSE; the sign is
# flipped back when the result is printed.
mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = ClusterRuleBasedEstimator()
# NOTE(review): ClusterRuleBasedEstimator.fit is a no-op and the Cluster_mean_*
# values in users_df were computed from all of ratings_df, so the ratings being
# scored in each fold are part of the means used to predict them.
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

In [None]:
def custom_accuracy_scorer(y_true, y_pred, tol=(0.5 + 1e-9)):
    """Return the fraction of predictions within ``tol`` of the true value.

    Args:
        y_true: Array-like of true values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.
        tol: Largest absolute difference that still counts as a hit.

    Returns:
        The share of positions where ``|y_pred - y_true| <= tol``.
    """
    # rtol=0 makes `tol` a purely absolute tolerance. np.isclose otherwise adds
    # its default relative term (1e-5 * |y_true|) on top of atol, which
    # silently widens the accepted band.
    accuracy = np.isclose(y_pred, y_true, rtol=0, atol=tol).mean()
    return accuracy

# Share of predictions within the scorer's default tolerance (0.5 + 1e-9),
# averaged over 5 folds. `estimator`, X and y come from the previous cell.
accuracy_scorer = make_scorer(custom_accuracy_scorer)
accuracy_scores = cross_val_score(estimator, X, y, scoring=accuracy_scorer, cv=5)
print(f"Cross-validated Accuracy: {accuracy_scores.mean()}")

# Predict for the same rows that were passed to fit and count exact matches.
# NOTE(review): presumably `estimator` is still the ClusterRuleBasedEstimator
# from the previous cell, whose fit is a no-op — confirm.
estimator.fit(X, y)
y_pred = estimator.predict(X)
num_exactly_correct = (y_pred == y).sum()
print(f"Number of exactly correct predictions: {num_exactly_correct}")