In [44]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dropout
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [65]:
PROCESSED_DATA_DIR = "../data/processed/"
MODEL_DIR = "../models/"
MODEL_PATH = os.path.join(MODEL_DIR, "recommender_model_02_ranking.keras")

In [46]:
df = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, "movielens_1m_preprocessed.csv"), low_memory=False)

In [47]:
# Create a new dataframe with renamed columns
ratings = df.rename(columns={
    'MovieID': 'movie_id',
    'UserID': 'user_id', 
    'Rating': 'user_rating'
})
ratings = ratings[['user_id', 'movie_id', 'user_rating']]

# Per Movie Ratings Distribution

Snažíme sa získať perspektívu o distribúcii hodnotení jednotlivých filmov. Dataset MOVIELENS 1M nám garantuje, že každý používateľ ohodnotil aspoň 20 filmov. Nemáme však žiadnu garanciu o tom, či každý film dostal aspoň X hodnotení. 
Zrejme bude vhodné zbaviť sa filmov, ktoré dostali menej hodnotení ako X, pričom nechceme vybrať X také, aby sme nevyhodnotili príliš veľký počet filmov. Odstránenie filmov s menej  hodnoteniami zabezpečí spoľahlivejšie odporúčania.

In [48]:
# Get rating counts per movie
movie_rating_counts = ratings.groupby('movie_id')['user_rating'].count().reset_index()
movie_rating_counts = movie_rating_counts.rename(columns={'user_rating': 'rating_count'})

# Create interactive histogram
fig = px.histogram(
    movie_rating_counts, 
    x='rating_count',
    title='Distribution of Ratings per Movie',
    labels={'rating_count': 'Number of Ratings', 'count': 'Number of Movies'},
    nbins=100
)
fig.show()

# Print some statistics
print("Rating count statistics:")
print(movie_rating_counts['rating_count'].describe())

# Get movies with very few ratings (e.g., less than 10)
print(f"\nTotal movie count: {len(movie_rating_counts)}")
low_rated_movies = movie_rating_counts[movie_rating_counts['rating_count'] < 5]
print(f"\nMovies with less than 5 ratings: {len(low_rated_movies)} - {len(low_rated_movies)/len(movie_rating_counts)*100:.2f}%")
low_rated_movies = movie_rating_counts[movie_rating_counts['rating_count'] < 10]
print(f"\nMovies with less than 10 ratings: {len(low_rated_movies)} - {len(low_rated_movies)/len(movie_rating_counts)*100:.2f}%")
low_rated_movies = movie_rating_counts[movie_rating_counts['rating_count'] < 15]
print(f"\nMovies with less than 15 ratings: {len(low_rated_movies)} - {len(low_rated_movies)/len(movie_rating_counts)*100:.2f}%")
low_rated_movies = movie_rating_counts[movie_rating_counts['rating_count'] < 20]
print(f"\nMovies with less than 20 ratings: {len(low_rated_movies)} - {len(low_rated_movies)/len(movie_rating_counts)*100:.2f}%")
low_rated_movies = movie_rating_counts[movie_rating_counts['rating_count'] < 25]
print(f"\nMovies with less than 25 ratings: {len(low_rated_movies)} - {len(low_rated_movies)/len(movie_rating_counts)*100:.2f}%")
low_rated_movies = movie_rating_counts[movie_rating_counts['rating_count'] < 30]
print(f"\nMovies with less than 30 ratings: {len(low_rated_movies)} - {len(low_rated_movies)/len(movie_rating_counts)*100:.2f}%")

Rating count statistics:
count    3706.000000
mean      269.889099
std       384.047838
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: rating_count, dtype: float64

Total movie count: 3706

Movies with less than 5 ratings: 290 - 7.83%

Movies with less than 10 ratings: 446 - 12.03%

Movies with less than 15 ratings: 572 - 15.43%

Movies with less than 20 ratings: 663 - 17.89%

Movies with less than 25 ratings: 772 - 20.83%

Movies with less than 30 ratings: 870 - 23.48%


## Odstránenie málo hodnotených filmov

Vhodný hodnota thersholdu minima hodnotení filmu sa javí byť 10. Ak odstránime všetky filmy, ktoré majú menej hodnotení ako 10 ostane nám stále 88% filmov a zbavíme sa menej výzbnamných filmov.

In [49]:
MIN_RATINGS = 10
valid_movies = movie_rating_counts[movie_rating_counts['rating_count'] >= MIN_RATINGS]['movie_id']
filtered_ratings = ratings[ratings['movie_id'].isin(valid_movies)]

print(f"Original ratings count: {len(ratings)}")
print(f"Filtered ratings count: {len(filtered_ratings)}")
print(f"Removed movies: {len(ratings) - len(filtered_ratings)}")

Original ratings count: 1000209
Filtered ratings count: 998539
Removed movies: 1670


In [50]:
original_ratings = ratings.copy()
ratings = filtered_ratings.copy()

In [51]:
unique_user_ids = ratings['user_id'].unique()
unique_movie_ids = ratings['movie_id'].unique()

user_id_to_index = {user_id: i for i, user_id in enumerate(unique_user_ids)}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(unique_movie_ids)}
ratings["user_id"] = ratings["user_id"].map(user_id_to_index)
ratings["movie_id"] = ratings["movie_id"].map(movie_id_to_index)

In [52]:
tf.random.set_seed(42)
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

In [58]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 64

    initializer = tf.keras.initializers.RandomUniform(minval=0.5, maxval=1.0)
    self.user_embeddings = tf.keras.layers.Embedding(
        len(unique_user_ids) + 1, embedding_dimension, embeddings_initializer=initializer
    )
    self.movie_embeddings = tf.keras.layers.Embedding(
        len(unique_movie_ids) + 1, embedding_dimension, embeddings_initializer=initializer
    )

    self.mlp = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=l2(1e-6)),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=l2(1e-6)),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=l2(1e-6)),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(1e-6)),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(1),
    ])

    # self.final_dense = tf.keras.layers.Dense(1)

  def call(self, inputs):
    user_id, movie_id = inputs

    user_embedding = tf.nn.l2_normalize(self.user_embeddings(user_id), axis=1)
    movie_embedding = tf.nn.l2_normalize(self.movie_embeddings(movie_id), axis=1)

    mlp_output = self.mlp(tf.concat([user_embedding, movie_embedding], axis=1))
    # dot_product = tf.reduce_sum(user_embedding * movie_embedding, axis=1, keepdims=True)

    # combined = tf.concat([mlp_output, dot_product], axis=1)
    # return self.final_dense(combined)
    return mlp_output


In [59]:
user_id = tf.convert_to_tensor([ratings.iloc[0]['user_id']])
movie_id = tf.convert_to_tensor([ratings.iloc[0]['movie_id']])

print(user_id.shape)
RankingModel()((user_id, movie_id))

(1,)


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00720191]], dtype=float32)>

In [60]:
model = RankingModel()

model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

train_features = {
    'user_id': tf.convert_to_tensor(train['user_id'].values, dtype=tf.int64),
    'movie_id': tf.convert_to_tensor(train['movie_id'].values, dtype=tf.int64)
}
train_labels = tf.convert_to_tensor(train['user_rating'].values, dtype=tf.float32)

test_features = {
    'user_id': tf.convert_to_tensor(test['user_id'].values, dtype=tf.int64),
    'movie_id': tf.convert_to_tensor(test['movie_id'].values, dtype=tf.int64)
}
test_labels = tf.convert_to_tensor(test['user_rating'].values, dtype=tf.float32)

In [61]:
print("Max user ID:", ratings['user_id'].max(), test['user_id'].max())
print("Max movie ID:", max(ratings['movie_id'].max(), test['movie_id'].max()))

Max user ID: 6039 6039
Max movie ID: 3259


In [62]:
history = model.fit(
    x=(train_features['user_id'], train_features['movie_id']),
    y=train_labels,
    batch_size=64,
    epochs=30,
    validation_data=((test_features['user_id'], test_features['movie_id']), test_labels),
    # callbacks=[
    #     tf.keras.callbacks.EarlyStopping(
    #         monitor='val_root_mean_squared_error',
    #         patience=3,
    #         restore_best_weights=True
    #     )
    # ]
)

Epoch 1/30


[1m12482/12482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step - loss: 1.2650 - root_mean_squared_error: 1.1220 - val_loss: 1.0606 - val_root_mean_squared_error: 1.0295
Epoch 2/30
[1m12482/12482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 4ms/step - loss: 0.9903 - root_mean_squared_error: 0.9946 - val_loss: 0.9202 - val_root_mean_squared_error: 0.9589
Epoch 3/30
[1m12482/12482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 4ms/step - loss: 0.8834 - root_mean_squared_error: 0.9395 - val_loss: 0.8794 - val_root_mean_squared_error: 0.9373
Epoch 4/30
[1m12482/12482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step - loss: 0.8493 - root_mean_squared_error: 0.9211 - val_loss: 0.8612 - val_root_mean_squared_error: 0.9276
Epoch 5/30
[1m12482/12482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 4ms/step - loss: 0.8333 - root_mean_squared_error: 0.9124 - val_loss: 0.8509 - val_root_mean_squared_error: 0.9220
Epoch 6/30
[1m12482/12482

In [63]:

# Create subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=('Model Loss', 'Model RMSE'))

# Add Loss traces
fig.add_trace(
    go.Scatter(y=history.history['loss'], name="Training Loss", mode='lines'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(y=history.history['val_loss'], name="Validation Loss", mode='lines'),
    row=1, col=1
)

# Add RMSE traces
fig.add_trace(
    go.Scatter(y=history.history['root_mean_squared_error'], name="Training RMSE", mode='lines'),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(y=history.history['val_root_mean_squared_error'], name="Validation RMSE", mode='lines'),
    row=1, col=2
)

# Update layout
fig.update_layout(
    height=500,
    width=1200,
    showlegend=True,
    title_text="Training Metrics Over Time"
)

# Update axes labels
fig.update_xaxes(title_text="Epoch", row=1, col=1)
fig.update_xaxes(title_text="Epoch", row=1, col=2)
fig.update_yaxes(title_text="Loss", row=1, col=1)
fig.update_yaxes(title_text="RMSE", row=1, col=2)

fig.show()

In [66]:
os.makedirs(MODEL_DIR, exist_ok=True)
model.save(MODEL_PATH)
print("Model trained and saved!")

Model trained and saved!


[0.923197329044342,
 0.9144569635391235,
 0.9119369387626648,
 0.9105700254440308,
 0.9095427989959717,
 0.9085302352905273,
 0.9073735475540161,
 0.9059902429580688,
 0.9043887257575989,
 0.9027706384658813,
 0.9012628793716431,
 0.899894654750824,
 0.8986436128616333,
 0.8974978923797607,
 0.8964415788650513,
 0.8954850435256958,
 0.8946393728256226,
 0.8938671946525574,
 0.8931801319122314,
 0.8925918936729431,
 0.8920712471008301,
 0.8916212916374207,
 0.8912313580513,
 0.8909320831298828,
 0.8907355070114136,
 0.8905999660491943,
 0.8905472159385681,
 0.8905727863311768,
 0.8906946182250977,
 0.890887975692749]