In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install bayesian-optimization --quiet
!pip install yellowbrick --quiet

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
#from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

In [None]:
#Import the ratings data
df = pd.read_csv('/content/gdrive/MyDrive/ratings.csv')
df.drop(columns=['timestamp'],inplace=True)

# Assuming your original dataframe is named 'df'
movieId_to_idx = {movieId: idx for idx, movieId in enumerate(df.movieId.unique())}
idx_to_movieId = {idx: movieId for movieId, idx in movieId_to_idx.items()}

# Assuming df is your dataframe with columns ['userId', 'movieId', 'rating']
user_movie_matrix = df.pivot(index='userId', columns='movieId', values='rating')

# Fill NaNs with zeros for now
user_movie_matrix.fillna(0, inplace=True)

# Convert dataframe to numpy matrix
ratings = user_movie_matrix.values

# Now, let's split the data into train, validation, and test sets
# We'll use the mask method for this purpose
np.random.seed(0)  # for reproducibility
msk = np.random.rand(*ratings.shape) < 0.8
train_ratings = ratings.copy()
train_ratings[~msk] = 0

test_val = ratings.copy()
test_val[msk] = 0

msk_test_val = np.random.rand(*test_val.shape) < 0.5
val_ratings = test_val.copy()
val_ratings[~msk_test_val] = 0

test_ratings = test_val.copy()
test_ratings[msk_test_val] = 0

In [None]:
# Assuming df is your dataframe with columns ['userId', 'movieId', 'rating']
num_users = df.userId.nunique()
num_movies = df.movieId.nunique()

print(f'Number of unique users: {num_users}')
print(f'Number of unique movies: {num_movies}')

# Rating distribution
print('Rating distribution:')
print(df.rating.describe())

# Missing values (i.e., sparsity of the user-item matrix)
total_possible_ratings = num_users * num_movies
actual_ratings = len(df)
missing_ratings = total_possible_ratings - actual_ratings
sparsity = missing_ratings / total_possible_ratings
print(f'Sparsity of the user-item matrix: {sparsity * 100:.2f}%')

# User activity
user_activity = df.groupby('userId').size()
print('User activity:')
print(user_activity.describe())

# Item popularity
item_popularity = df.groupby('movieId').size()
print('Item popularity:')
print(item_popularity.describe())


Number of unique users: 668
Number of unique movies: 10325
Rating distribution:
count    105339.000000
mean          3.516850
std           1.044872
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64
Sparsity of the user-item matrix: 98.47%
User activity:
count     668.000000
mean      157.693114
std       319.712512
min        20.000000
25%        35.000000
50%        70.500000
75%       153.000000
max      5678.000000
dtype: float64
Item popularity:
count    10325.000000
mean        10.202324
std         22.832341
min          1.000000
25%          1.000000
50%          3.000000
75%          8.000000
max        325.000000
dtype: float64


In [None]:
num_users, num_movies = ratings.shape
num_factors = 10  # you can choose another number

P = np.random.normal(scale=1./num_factors, size=(num_users, num_factors+1))  # +1 for the bias
Q = np.random.normal(scale=1./num_factors, size=(num_movies, num_factors+1))  # +1 for the bias

# Initialize biases to the global average rating
global_average = np.mean(train_ratings[train_ratings != 0])
P[:, 0] = global_average
Q[:, 0] = global_average


In [None]:
def train(matrix, P, Q, num_epochs=50, learning_rate=0.01, regularization=0.1):
    Q = Q.T
    rows, cols = matrix.nonzero()

    for epoch in range(num_epochs):
        for row, col in zip(rows, cols):
            if matrix[row, col] > 0:
                prediction = P[row, :].dot(Q[:, col])
                error = matrix[row, col] - prediction

                # Update P and Q
                P[row, :] += learning_rate * (error * Q[:, col] - regularization * P[row, :])
                Q[:, col] += learning_rate * (error * P[row, :] - regularization * Q[:, col])

    return P, Q.T


In [None]:
def predict(P, Q):
    return np.dot(P, Q.T)


In [None]:
def calculate_rmse(actual, predicted):
    non_zero_indices = actual.nonzero()
    return np.sqrt(mean_squared_error(actual[non_zero_indices], predicted[non_zero_indices]))


In [None]:
def train_and_evaluate(num_epochs, learning_rate, regularization):
    num_epochs = int(num_epochs)
    P_local = P.copy()
    Q_local = Q.copy()

    P_local, Q_local = train(train_ratings, P_local, Q_local, num_epochs=num_epochs, learning_rate=learning_rate, regularization=regularization)
    predictions = predict(P_local, Q_local)
    val_rmse = calculate_rmse(val_ratings, predictions)

    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -val_rmse

# Bounded region of parameter space
pbounds = {'num_epochs': (10, 50), 'learning_rate': (0.0001, 0.01), 'regularization': (0.001, 0.1)}

optimizer = BayesianOptimization(
    f=train_and_evaluate,
    pbounds=pbounds,
    random_state=1,
)

optimizer.maximize(init_points=2, n_iter=10)

print(optimizer.max)

|   iter    |  target   | learni... | num_ep... | regula... |
-------------------------------------------------------------
| [0m1        [0m | [0m-0.912   [0m | [0m0.004229 [0m | [0m38.81    [0m | [0m0.001011 [0m |
| [0m2        [0m | [0m-0.9333  [0m | [0m0.003093 [0m | [0m15.87    [0m | [0m0.01014  [0m |
| [95m3        [0m | [95m-0.9093  [0m | [95m0.004028 [0m | [95m40.12    [0m | [95m0.03846  [0m |
| [0m4        [0m | [0m-0.9753  [0m | [0m0.008023 [0m | [0m49.73    [0m | [0m0.09437  [0m |
| [0m5        [0m | [0m-1.486   [0m | [0m0.0001   [0m | [0m28.69    [0m | [0m0.1      [0m |
| [0m6        [0m | [0m-0.9468  [0m | [0m0.01     [0m | [0m10.0     [0m | [0m0.001    [0m |
| [0m7        [0m | [0m-1.273   [0m | [0m0.0001   [0m | [0m45.17    [0m | [0m0.001    [0m |
| [0m8        [0m | [0m-1.7     [0m | [0m0.0001   [0m | [0m20.03    [0m | [0m0.1      [0m |
| [0m9        [0m | [0m-0.9369  [0m | [0m0.00938  

In [None]:
P, Q = train(train_ratings, P, Q, num_epochs=40, learning_rate= 0.004028, regularization=0.03846)

predictions = predict(P, Q)

print('Train RMSE:', calculate_rmse(train_ratings, predictions))

print('Test RMSE:', calculate_rmse(test_ratings, predictions))


Train RMSE: 0.8065121443686154
Test RMSE: 0.9332725735485567


In [None]:
movies_with_cluster = pd.read_csv('/content/gdrive/MyDrive/movies_clustered_final.csv')
movies_with_cluster.head()

Unnamed: 0,MovieId,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,cluster_label
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,4,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [None]:
movies_df = pd.read_csv('/content/gdrive/MyDrive/movies.csv',delimiter=";")

In [None]:
movies_dict = pd.Series(movies_df.title.values, index=movies_df.movieId).to_dict()

In [None]:
# Get the non-zero indices in the test set
test_indices = np.nonzero(test_ratings)

# Get the corresponding predicted ratings
test_predictions = predictions[test_indices]

# Get the actual ratings
actual_ratings = test_ratings[test_indices]

# Print a few samples
for i in range(10):
    print(f"User: {test_indices[0][i]}, Movie: {test_indices[1][i]}, Actual Rating: {actual_ratings[i]}, Predicted Rating: {test_predictions[i]}")

User: 0, Movie: 98, Actual Rating: 4.0, Predicted Rating: 4.247761366480355
User: 0, Movie: 279, Actual Rating: 4.0, Predicted Rating: 4.464571833296821
User: 0, Movie: 958, Actual Rating: 4.5, Predicted Rating: 4.353730298124919
User: 0, Movie: 974, Actual Rating: 5.0, Predicted Rating: 4.084172830434199
User: 0, Movie: 3859, Actual Rating: 3.5, Predicted Rating: 3.760060786713402
User: 1, Movie: 24, Actual Rating: 3.0, Predicted Rating: 4.120616967979471
User: 1, Movie: 230, Actual Rating: 4.0, Predicted Rating: 4.703929864478155
User: 2, Movie: 121, Actual Rating: 4.0, Predicted Rating: 3.5343065369926534
User: 2, Movie: 326, Actual Rating: 4.0, Predicted Rating: 3.2229231853759037
User: 2, Movie: 365, Actual Rating: 4.0, Predicted Rating: 3.481102195205332


In [None]:
import random

def select_users_with_min_ratings(user_movie_matrix, min_ratings=400, num_users_to_select=10):
    # Filter users who have rated a minimum number of movies
    eligible_users = user_movie_matrix[(user_movie_matrix != 0).sum(axis=1) >= min_ratings].index.tolist()

    if len(eligible_users) > num_users_to_select:
        # Randomly select a subset of users
        selected_users = random.sample(eligible_users, num_users_to_select)
    else:
        selected_users = eligible_users

    return selected_users

# Example usage
selected_users = select_users_with_min_ratings(user_movie_matrix, min_ratings=400, num_users_to_select=10)
print("Selected Users:", selected_users)

Selected Users: [130, 567, 627, 179, 403, 599, 194, 62, 244, 303]


In [None]:
# Let's say we want to recommend movies for a user
user_id = 668

# Get the user's ratings from the user-item matrix
user_ratings = user_movie_matrix.loc[user_id]

# Get the indices of the movies the user has rated
rated_movie_indices = np.where(user_ratings > 0)[0]

# Adjust the indices to match the indices of the user-item matrix
rated_movie_indices = [user_movie_matrix.columns[idx] for idx in rated_movie_indices]

# Sort by the user's ratings
top_rated_movie_ids = np.argsort(user_ratings[rated_movie_indices])[::-1]
top_rated_movie_ids = [idx_to_movieId[id] for id in top_rated_movie_ids]

print("Top 10 rated movies by user:")
for movie_id in top_rated_movie_ids[:20]:  # top 10 rated movies
    movie_name = movies_dict[movie_id]
    movie = movies_with_cluster[movies_with_cluster['MovieId'] == movie_id]
    movie_cluster = movie['cluster_label'].values[0]
    print(f"{movie_name} ({movie_cluster})")

Top 10 rated movies by user:
Mighty Wind, A (2003) (5)
Tommy Boy (1995) (5)
How to Rob a Bank (2007) (10)
Monsters vs. Aliens (2009) (1)
Kung Fu Panda (2008) (5)
Enchanted (2007) (4)
Stardust (2007) (4)
Night at the Museum (2006) (5)
Funny Girl (1968) (6)
Temple Grandin (2010) (8)
Mothman Prophecies, The (2002) (10)
Rise of the Planet of the Apes (2011) (3)
Patch of Blue, A (1965) (6)
Click (2006) (0)
Planes, Trains & Automobiles (1987) (5)
Limitless (2011) (3)
Airplane II: The Sequel (1982) (5)
Replacement Killers, The (1998) (9)
Days of Thunder (1990) (6)
From Justin to Kelly (2003) (4)


In [None]:
# Get the user's ratings from the user-item matrix
user_ratings = user_movie_matrix.loc[user_id]

# Get the indices of the movies the user hasn't rated yet
unseen_movie_indices = np.where(user_ratings == 0)[0]

# Get the user's predicted ratings
user_predictions = predictions[user_id-1]

# Recommend top N unseen movies for the user
recommended_unseen_movie_ids = np.argsort(user_predictions[unseen_movie_indices])[::-1]
recommended_unseen_movie_ids = [idx_to_movieId[id] for id in recommended_unseen_movie_ids]

print("Top 10 recommended unseen movies for user:")
for movie_id in recommended_unseen_movie_ids[:20]:  # top 20 recommended unseen movies
    movie_name = movies_dict[movie_id]
    movie = movies_with_cluster[movies_with_cluster['MovieId'] == movie_id]
    movie_cluster = movie['cluster_label'].values[0]
    print(f"{movie_name} ({movie_cluster})")

Top 10 recommended unseen movies for user:
Man on a Ledge (2012) (9)
Chappie (2015) (3)
Trailer Park Boys: The Movie (2006) (5)
Longest Yard, The (2005) (5)
Tenacious D in The Pick of Destiny (2006) (5)
Far from Heaven (2002) (6)
Hello Ladies: The Movie (2014) (5)
East-West (Est-ouest) (1999) (6)
Men with Brooms (2002) (0)
Bullitt (1968) (9)
Kate & Leopold (2001) (4)
Last Temptation of Christ, The (1988) (8)
Great Locomotive Chase, The (1956) (7)
Three O'Clock High (1987) (5)
Good Girl, The (2002) (5)
Cube Zero (2004) (2)
Junior (1994) (5)
Weight of Water, The (2000) (3)
Girl Walks Home Alone at Night, A (2014) (2)
Machine, The (2013) (3)


**Backup**

SIMPLE ONLINE LEARNING APPROACH

In [None]:
def add_new_user(P):
    avg_user_factors = np.mean(P, axis=0)
    P = np.vstack([P, avg_user_factors])
    return P

def add_new_movie(Q):
    avg_movie_factors = np.mean(Q, axis=0)
    Q = np.vstack([Q, avg_movie_factors])
    return Q

def update_with_new_rating(user_id, movie_id, rating, P, Q, learning_rate=0.01, regularization=0.1):
    prediction = P[user_id, :].dot(Q[movie_id, :])
    error = rating - prediction
    P[user_id, :] += learning_rate * (error * Q[movie_id, :] - regularization * P[user_id, :])
    Q[movie_id, :] += learning_rate * (error * P[user_id, :] - regularization * Q[movie_id, :])
    return P, Q
