In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MultiLabelBinarizer

In [31]:
ratings_df = pd.read_csv("data/ratings.csv")
movies_df = pd.read_csv("data/movies.csv", index_col="movieId")

### Rule based

In [61]:
def rule_based_prediction(user_id, movie_id):
    user_avg_rating = ratings_df[ratings_df['userId'] == user_id]['rating'].mean()
    film_avg_rating = ratings_df[ratings_df['movieId'] == movie_id]['rating'].mean()
    
    genres = movies_df.loc[movie_id]["genres"].split("|")
    movie_idx = set()
    for genre in genres:
        movie_idx.update(movies_df[movies_df["genres"].str.contains(genre)].index)
    genre_avg_rating = ratings_df[ratings_df['movieId'].isin(movie_idx)]['rating'].mean()

    return (user_avg_rating + film_avg_rating + genre_avg_rating) / 3

In [75]:
%%time
rule_based_prediction(1, 50)

CPU times: total: 1.55 s
Wall time: 1.61 s


3.9494551923200674

In [73]:
class RuleBasedEstimator(BaseEstimator):
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        return np.array([rule_based_prediction(user, movie) for user, movie in X])

def custom_scorer(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)  # Mean Squared Error

In [76]:
n = 100

mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = RuleBasedEstimator()
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

Cross-validated MSE: 0.7235005480434722


In [83]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
from sklearn.cluster import KMeans

In [84]:
# Step 1: Split the string values
movies_df['Genres_Split'] = movies_df['genres'].apply(lambda x: x.split('|'))

# Step 2: Create a binary matrix (One-Hot Encoding)
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(movies_df['Genres_Split'])
binary_df = pd.DataFrame(binary_matrix, columns=mlb.classes_)

# Combine the original DataFrame with the binary matrix
movies_hot_df = movies_df.join(binary_df)

# Drop the original and split columns as they are no longer needed
movies_hot_df = movies_hot_df.drop(['genres', 'Genres_Split'], axis=1)

# Step 3: Clustering
# For demonstration, we'll use K-means with 2 clusters
kmeans = KMeans(n_clusters=10, random_state=42)
movies_hot_df['Cluster'] = kmeans.fit_predict(binary_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [85]:
movies_hot_df.head()

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Cluster
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9
3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
5,Father of the Bride Part II (1995),0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


userId
1         4.008065
2         3.527473
3         4.900000
4         4.366667
5         3.418605
            ...   
330971    4.068627
330972    3.343750
330973    3.309524
330974    3.356481
330975    2.403270
Name: rating, Length: 330975, dtype: float64

In [94]:
# Group by 'userId' and calculate mean rating
mean_ratings = ratings_df.groupby('userId')['rating'].mean().reset_index()

# Convert to DataFrame
mean_ratings_df = pd.DataFrame(mean_ratings)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=10, random_state=42)
mean_ratings_df['Cluster'] = kmeans.fit_predict(mean_ratings_df[['rating']])

  super()._check_params_vs_input(X, default_n_init=10)


In [97]:
mean_ratings_df.head()
mean_ratings_df["Cluster"].unique()

array([6, 9, 5, 2, 4, 0, 3, 1, 7, 8])

In [99]:
n_movie_clusters = 10
n_user_clusters = 10

clustering_corr = {}

for user in range(n_user_clusters):
    user_idx = mean_ratings_df[mean_ratings_df["Cluster"] == user]["userId"]
    for movie in range(n_movie_clusters):
        movie_idx = movies_hot_df[movies_hot_df["Cluster"] == movie].index
        summ = 0
        count = 0
        for index, c in ratings_df.iterrows():
            if c["userId"] in user_idx and c["movieId"] in movie_idx:
                summ += c["rating"]
                count += 1
        avg = summ / count
        clustering_corr[f"{user} {movie}"] = avg

KeyboardInterrupt: 

In [101]:
ratings_clustered_df = ratings_df.merge(mean_ratings_df[['userId', 'Cluster']], on='userId', suffixes=('', '_user_cluster'))
ratings_clustered_df

Unnamed: 0,userId,movieId,rating,timestamp,Cluster
0,1,1,4.0,1225734739,6
1,1,110,4.0,1225865086,6
2,1,158,4.0,1225733503,6
3,1,260,4.5,1225735204,6
4,1,356,5.0,1225735119,6
...,...,...,...,...,...
33832157,330975,8340,2.0,1091583256,8
33832158,330975,8493,2.5,1091585709,8
33832159,330975,8622,4.0,1091581777,8
33832160,330975,8665,3.0,1091581765,8


In [102]:
ratings_clustered_df = ratings_clustered_df.merge(movies_hot_df[['Cluster']], left_on='movieId', right_index=True, suffixes=('', '_movie_cluster'))
ratings_clustered_df

Unnamed: 0,userId,movieId,rating,timestamp,Cluster,Cluster_movie_cluster
0,1,1,4.0,1225734739,6,9
62,2,1,5.0,835815971,9,9
304,7,1,4.0,974518024,4,9
653,10,1,3.0,1430666394,4,9
852,12,1,5.0,862500738,9,9
...,...,...,...,...,...,...
33814417,330842,196133,0.5,1671069205,4,1
33816126,330852,217236,2.0,1594246738,9,2
33821252,330904,228179,4.5,1667167519,2,1
33821255,330904,261553,3.5,1667165898,2,1


In [103]:
# Merge the cluster information back to the original DataFrame
# ratings_clustered_df = ratings_df.merge(mean_ratings_df[['userId', 'Cluster']], on='userId', suffixes=('', '_user_cluster'))
# ratings_clustered_df = ratings_clustered_df.merge(movies_hot_df[['Cluster']], left_on='movieId', right_index=True, suffixes=('', '_movie_cluster'))

# Group by user and movie clusters and calculate the mean rating
clustering_corr = ratings_clustered_df.groupby(['Cluster', 'Cluster_movie_cluster'])['rating'].mean().unstack(fill_value=0)

print(clustering_corr)

Cluster_movie_cluster         0         1         2         3         4  \
Cluster                                                                   
0                      3.675031  3.982965  3.728013  3.894426  4.037184   
1                      2.501203  2.955822  2.605789  2.806497  3.024446   
2                      4.281988  4.477417  4.338197  4.420701  4.495810   
3                      1.311101  1.332141  1.440994  1.349796  1.149961   
4                      3.167752  3.550879  3.243169  3.442849  3.635586   
5                      4.775610  4.833253  4.787263  4.808381  4.893006   
6                      3.951254  4.206287  4.004413  4.133776  4.250565   
7                      2.882468  3.300822  2.961220  3.172756  3.390074   
8                      2.008568  2.365655  2.132028  2.267168  2.182740   
9                      3.426975  3.770643  3.486926  3.671739  3.837882   

Cluster_movie_cluster         5         6         7         8         9  
Cluster                  

In [114]:
clustering_corr[0][0]

3.675031136535825

In [108]:
movies_hot_df

Unnamed: 0_level_0,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Cluster
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9
3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
5,Father of the Bride Part II (1995),0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288967,State of Siege: Temple Attack (2021),,,,,,,,,,...,,,,,,,,,,1
288971,Ouija Japan (2021),,,,,,,,,,...,,,,,,,,,,7
288975,The Men Who Made the Movies: Howard Hawks (1973),,,,,,,,,,...,,,,,,,,,,4
288977,Skinford: Death Sentence (2023),,,,,,,,,,...,,,,,,,,,,8


In [115]:
class ClusteringBasedEstimator(BaseEstimator):
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        return np.array([get_clustering_prediction(user, movie) for user, movie in X])
    
def get_clustering_prediction(user, movie):
    return clustering_corr[int(movies_hot_df.loc[movie]['Cluster'])][int(mean_ratings_df.loc[user]['Cluster'])]

In [117]:
n = 100_000

mse_scorer = make_scorer(custom_scorer, greater_is_better=False)
X = ratings_df[['userId', 'movieId']].values[:n]
y = ratings_df['rating'].values[:n]
estimator = ClusteringBasedEstimator()
scores = cross_val_score(estimator, X, y, scoring=mse_scorer, cv=5)

print(f"Cross-validated MSE: {-scores.mean()}")

Cross-validated MSE: 1.4078750611104582
