<a href="https://colab.research.google.com/github/vipashaaV321/Session-Based-Movie-Recommendation-GRU4RECBE/blob/main/GRU4RecBE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Reshape, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics import ndcg_score

In [None]:
datapath= "dataset/movie_rating_plot.csv"
data= pd.read_csv(datapath)

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp,title,genres,plot
0,0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...
1,1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...
2,2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...
3,3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...
4,4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,A cowboy doll is profoundly threatened and jea...


In [None]:
# Generate negative samples for training
def generate_negative_samples(train_data):
    neg_samples = []
    for idx, row in train_data.iterrows():
        user = row['userId']
        interacted_movie = row['movieId']

        # Extract movies not interacted by the user
        negative_movies = list(set(train_data[train_data['userId'] == user]['movieId'].unique()) - set([interacted_movie]))

        # Generate negative samples for each positive interaction
        num_neg_samples = min(4, len(negative_movies))  # Limit negative samples to available movies
        if num_neg_samples > 0:
            neg_movie_sample = np.random.choice(negative_movies, size=num_neg_samples, replace=False)
            for neg_movie in neg_movie_sample:
                neg_samples.append((user, interacted_movie, neg_movie))
    return pd.DataFrame(neg_samples, columns=['userId', 'movieId', 'negative_movieId'])

def bpr_loss(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.squeeze(y_pred)
    loss = -tf.reduce_mean(tf.math.log_sigmoid(y_pred))  # BPR loss
    return loss

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Activation, LeakyReLU, Add
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer
from tensorflow.keras.layers import Reshape

# Encode user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

data['userId'] = user_encoder.fit_transform(data['userId'])
data['movieId'] = movie_encoder.fit_transform(data['movieId'])

# Convert 'plot' column to strings
data['plot'] = data['plot'].astype(str)

# Tokenize and pad movie plots using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_plots = tokenizer(data['plot'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Train-test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define model architecture
def create_gru4recbe_model():
    K = 50  # Dimensionality of the movie ID embeddings
    B = 128  # Dimensionality of the BERT plot embeddings
    D = 100  # Dimensionality of the hidden state in GRU
    sequence_length = 100
    # Input layers
    input_movie = Input(shape=(1,), name='movie_id_input')
    input_plot = Input(shape=(B,), name='plot_input')
    input_negative_movie = Input(shape=(1,), name='negative_movie_id_input')  # Negative movie ID input

    #input length=seq. Length
    # # Embedding layers
    embedding_movie = Embedding(input_length=sequence_length,input_dim=len(movie_encoder.classes_), output_dim=K, name='movie_embedding')(input_movie)
    embedding_negative_movie = Embedding(input_length=sequence_length,input_dim=len(movie_encoder.classes_), output_dim=K, name='negative_movie_embedding')(input_negative_movie)

    # Projection layer for plot embeddings
    projected_plot = Dense(K, activation=LeakyReLU(), name='plot_projection')(input_plot)
    reshaped_plot = Reshape((1, K))(projected_plot)  # Reshape the plot tensor to match the movie embedding shape

    # Input Aggregation for positive and negative movie IDs
    merged_input = Concatenate(axis=1)([embedding_movie, reshaped_plot])
    merged_input_negative = Concatenate(axis=1)([embedding_negative_movie, reshaped_plot])  # Merged input for negative samples

    # Gated Recurrent Unit (GRU) layer for positive samples
    gru_output = GRU(D, name='gru_layer')(merged_input)

    # Dot product for positive and negative interactions
    reshaped_merged_input = Reshape((D,))(gru_output)
    reshaped_merged_input_negative = Reshape((D,))(merged_input_negative)

    dot_product = tf.keras.layers.Dot(axes=1)([reshaped_merged_input, reshaped_merged_input_negative])

    # Output activation for BPR loss
    output_layer_bpr = Activation('softmax', name='output_bpr')(dot_product)

     # Create the BPR model
    model_bpr = Model(inputs=[input_movie, input_plot, input_negative_movie], outputs=output_layer_bpr)
    model_bpr.compile(optimizer=Adam(learning_rate=0.001), loss=bpr_loss, metrics=['accuracy'])


    return model_bpr

In [None]:
# Compile the model with both losses (binary cross-entropy and BPR)
model = create_gru4recbe_model()
# model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
# Display model summary
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 plot_input (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 movie_id_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 plot_projection (Dense)        (None, 50)           6450        ['plot_input[0][0]']             
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 50)        147250      ['movie_id_input[0][0]']         
                                                                                            

In [None]:
import numpy as np

# Check the shape of input data
print("Shape of movie IDs:", train_data['movieId'].shape)
print("Shape of tokenized plots:", tokenized_plots['input_ids'].numpy().shape)

Shape of movie IDs: (46800,)
Shape of tokenized plots: (58501, 128)


In [None]:
# Generate negative samples for training
negative_samples = generate_negative_samples(train_data)

print("Shape of negative movie IDs:", negative_samples['negative_movieId'].shape)

# Check the shape of target labels
print("Shape of ratings:", train_data['rating'].shape)
print("Shape of binary labels for BPR loss:", np.ones(len(train_data)).shape)

Shape of negative movie IDs: (187116,)
Shape of ratings: (46800,)
Shape of binary labels for BPR loss: (46800,)


In [None]:
# Train the model with positive and negative samples
model.fit(
    x=[train_data['movieId'], tokenized_plots['input_ids'].numpy(), negative_samples['negative_movieId']],
    y=np.ones(len(train_data)),  # Binary labels for BPR loss
    epochs=1,
    batch_size=16,
    validation_split=0.1
)



<keras.callbacks.History at 0x1db2a255c00>

In [None]:
negative_samples

Unnamed: 0,userId,movieId,negative_movieId
0,473,1082,1574
1,473,1082,2881
2,473,1082,503
3,473,1082,1758
4,139,93,2044
...,...,...,...
187111,482,2670,1566
187112,609,2658,509
187113,609,2658,1374
187114,609,2658,126


In [None]:
# Preprocess input data for prediction
user_ids = test_data['userId'].values  # Replace with the appropriate user IDs
movie_ids = test_data['movieId'].values
plots = test_data['plot'].values

user_ids_encoded = user_encoder.transform(user_ids)
movie_ids_encoded = movie_encoder.transform(movie_ids)

# Tokenize and pad movie plots using BERT tokenizer
tokenized_plots_test = tokenizer(plots.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Create placeholder data for negative movie IDs
num_samples = len(test_data)
negative_movie_ids_placeholder = np.zeros(num_samples)  # You may need to replace this with the actual negative movie IDs

# Make predictions
predictions = model.predict({
    'movie_id_input': movie_ids_encoded,
    'plot_input': tokenized_plots_test['input_ids'].numpy(),
    'negative_movie_id_input': negative_movie_ids_placeholder  # Provide placeholder data here
})

# Combine user, movie, and predicted ratings into a DataFrame
results_df = pd.DataFrame({'userId': user_ids, 'movieId': movie_ids, 'predicted_rating': predictions.flatten()})

# Get top 5 movie recommendations for each user
top_5_recommendations = results_df.groupby('userId').apply(lambda x: x.nlargest(5, 'predicted_rating')).reset_index(drop=True)

# Display the top 5 recommendations
print(top_5_recommendations[['userId', 'movieId', 'predicted_rating']])


In [None]:
# Assuming user_id_input is the user ID provided by the user
user_id_input = 123  # Replace this with the actual user ID

# Filter recommendations for the specified user
user_recommendations = top_5_recommendations[top_5_recommendations['userId'] == user_id_input]

# Display the recommendations for the specified user
print(f"Top 5 recommendations for User {user_id_input}:")
print(user_recommendations[['userId', 'movieId', 'predicted_rating']])


In [None]:
import numpy as np

def hit_rate(actual, predicted):
    """
    Calculate Hit Rate.

    Parameters:
    - actual: Set of actual items the user interacted with.
    - predicted: Set of items predicted by the model.

    Returns:
    - Hit Rate.
    """
    intersection = set(actual) & set(predicted)
    return len(intersection) / len(actual)

def reciprocal_rank(actual, predicted):
    """
    Calculate Mean Reciprocal Rank (MRR).

    Parameters:
    - actual: Set of actual items the user interacted with.
    - predicted: Ordered list of items predicted by the model.

    Returns:
    - MRR.
    """
    for i, item in enumerate(predicted, 1):
        if item in actual:
            return 1 / i
    return 0

def ndcg(actual, predicted, k=None):
    """
    Calculate Normalized Discounted Cumulative Gain (NDCG).

    Parameters:
    - actual: Set of actual items the user interacted with.
    - predicted: Ordered list of items predicted by the model.
    - k: The maximum number of items to consider.

    Returns:
    - NDCG.
    """
    if k is None:
        k = len(predicted)

    dcg = 0
    idcg = 0

    for i in range(min(k, len(predicted))):
        item = predicted[i]
        if item in actual:
            dcg += 1 / np.log2(i + 2)
        idcg += 1 / np.log2(i + 2)

    return dcg / idcg if idcg != 0 else 0


In [None]:
# Extract actual and predicted items for each user
actual_items = {}
predicted_items = {}

for user_id, group in top_5_recommendations.groupby('userId'):
    actual_items[user_id] = set(group['movieId'])
    predicted_items[user_id] = list(group['movieId'])

# Calculate metrics for each user
hit_rates = [hit_rate(actual_items[user], predicted_items[user]) for user in actual_items]
mrr = np.mean([reciprocal_rank(actual_items[user], predicted_items[user]) for user in actual_items])
ndcg_5 = np.mean([ndcg(actual_items[user], predicted_items[user], k=5) for user in actual_items])
ndcg_10 = np.mean([ndcg(actual_items[user], predicted_items[user], k=10) for user in actual_items])

print("Hit Rates:", hit_rates)
print("MRR:", mrr)
print("NDCG@5:", ndcg_5)
print("NDCG@10:", ndcg_10)

Hit Rates: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

In [None]:
# Extract actual and predicted items for each user
actual_items = {}
predicted_items = {}

for user_id, group in top_5_recommendations.groupby('userId'):
    actual_items[user_id] = set(group['movieId'])
    predicted_items[user_id] = list(group['movieId'])

    # Optional: Print for debugging
    # print(f"User {user_id} - Actual: {actual_items[user_id]}, Predicted: {predicted_items[user_id]}")

# Calculate metrics for each user
hit_rates = [hit_rate(actual_items[user], predicted_items[user]) for user in actual_items]
mrr = np.mean([reciprocal_rank(actual_items[user], predicted_items[user]) for user in actual_items])
ndcg_5 = np.mean([ndcg(actual_items[user], predicted_items[user], k=5) for user in actual_items])
ndcg_10 = np.mean([ndcg(actual_items[user], predicted_items[user], k=10) for user in actual_items])

print("Hit Rates:", hit_rates)
print("MRR:", mrr)
print("NDCG@5:", ndcg_5)
print("NDCG@10:", ndcg_10)


Hit Rates: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

In [None]:
# Extract actual and predicted items for each user
actual_items = {}
predicted_items = {}

for user_id, group in test_data.groupby('userId'):
    actual_items[user_id] = set(group['movieId'])
    predicted_items[user_id] = list(top_5_recommendations.loc[top_5_recommendations['userId'] == user_id, 'movieId'])

    # Optional: Print for debugging
    print(f"User {user_id} - Actual: {actual_items[user_id]}, Predicted: {predicted_items[user_id]}")

# Calculate metrics for each user
hit_rates = [hit_rate(actual_items[user], predicted_items[user]) for user in actual_items]
mrr = np.mean([reciprocal_rank(actual_items[user], predicted_items[user]) for user in actual_items])
ndcg_5 = np.mean([ndcg(actual_items[user], predicted_items[user], k=5) for user in actual_items])
ndcg_10 = np.mean([ndcg(actual_items[user], predicted_items[user], k=10) for user in actual_items])

print("Hit Rates:", hit_rates)



User 0 - Actual: {5, 781, 783, 913, 786, 1690, 291, 1956, 1575, 43, 2604, 436, 954, 1083, 956, 828, 1598, 62, 320, 1474, 1219, 1479, 201, 2761, 1996, 461, 1616, 594, 995, 484, 2794, 1903, 1521, 632, 1145, 1916, 2429, 510}, Predicted: [1616, 201, 43, 1956, 62]
User 2 - Actual: {961, 565, 831, 2941, 30, 1823}, Predicted: [2941, 565, 30, 831, 1823]
User 3 - Actual: {897, 2306, 135, 2827, 783, 1423, 1297, 2323, 278, 2077, 1182, 31, 1570, 2083, 1444, 676, 680, 686, 816, 690, 819, 694, 696, 701, 702, 326, 2507, 977, 211, 1752, 2523, 1639, 2411, 2035}, Predicted: [977, 686, 676, 2523, 135]
User 4 - Actual: {32, 260, 198, 398, 251, 464, 337, 52, 123, 413}, Predicted: [32, 464, 198, 398, 251]
User 5 - Actual: {257, 2, 3, 514, 5, 140, 14, 655, 398, 401, 20, 277, 407, 25, 543, 801, 418, 422, 298, 555, 300, 815, 306, 180, 308, 55, 568, 444, 192, 450, 325, 71, 455, 77, 209, 594, 340, 212, 470, 469, 90, 220, 93, 221, 229, 232, 236, 238, 124, 115, 630, 760, 506, 379, 252, 639}, Predicted: [90, 655, 2

In [None]:
def precision_at_k(actual, predicted, k):
    """
    Calculate Precision@k.

    Parameters:
    - actual: Set of actual items the user interacted with.
    - predicted: Ordered list of items predicted by the model.
    - k: The number of top items to consider.

    Returns:
    - Precision@k.
    """
    intersection = set(predicted[:k]) & set(actual)
    return len(intersection) / k if k != 0 else 0

def recall_at_k(actual, predicted, k):
    """
    Calculate Recall@k.

    Parameters:
    - actual: Set of actual items the user interacted with.
    - predicted: Ordered list of items predicted by the model.
    - k: The number of top items to consider.

    Returns:
    - Recall@k.
    """
    intersection = set(predicted[:k]) & set(actual)
    return len(intersection) / len(actual) if len(actual) != 0 else 0

# Extract actual and predicted items for each user
actual_items = {}
predicted_items = {}

for user_id, group in test_data.groupby('userId'):
    actual_items[user_id] = set(group['movieId'])
    predicted_items[user_id] = list(top_5_recommendations.loc[top_5_recommendations['userId'] == user_id, 'movieId'])

# Calculate Precision@k and Recall@k for each user
k_values = [1, 5, 10]  # You can modify this list as needed

precision_at_k_values = {}
recall_at_k_values = {}

for k in k_values:
    precision_at_k_values[k] = np.mean([precision_at_k(actual_items[user], predicted_items[user], k) for user in actual_items])
    recall_at_k_values[k] = np.mean([recall_at_k(actual_items[user], predicted_items[user], k) for user in actual_items])

# Print Precision@k and Recall@k values for each k
for k in k_values:
    print(f"Precision@{k}: {precision_at_k_values[k]:.4f}")
    print(f"Recall@{k}: {recall_at_k_values[k]:.4f}")


Precision@1: 1.0000
Recall@1: 0.1786
Precision@5: 0.8919
Recall@5: 0.5573
Precision@10: 0.4460
Recall@10: 0.5573


In [None]:
mrr = np.mean([reciprocal_rank(actual_items[user], predicted_items[user]) for user in actual_items])
ndcg_5 = np.mean([ndcg(actual_items[user], predicted_items[user], k=5) for user in actual_items])
ndcg_10 = np.mean([ndcg(actual_items[user], predicted_items[user], k=10) for user in actual_items])


In [None]:
# Example post-processing to diversify recommendations
def diversify_recommendations(predictions, diversity_factor=0.5):
    unique_predictions = set(predictions)
    num_additional_items = int(len(predictions) * diversity_factor)

    # Add diverse items to the recommendations
    diverse_recommendations = list(unique_predictions)

    # Ensure the final list has the desired length
    if len(diverse_recommendations) < len(predictions):
        diverse_recommendations.extend(list(unique_predictions)[:num_additional_items])

    return diverse_recommendations

# Usage
predicted_items[user_id] = diversify_recommendations(predicted_items[user_id])


In [None]:
predicted_items

{0: [1616, 201, 43, 1956, 62],
 2: [2941, 565, 30, 831, 1823],
 3: [977, 686, 676, 2523, 135],
 4: [32, 464, 198, 398, 251],
 5: [90, 655, 238, 14, 470],
 6: [1904, 908, 1430, 224, 2006],
 7: [297, 398, 472, 504],
 8: [899, 329, 704, 1486],
 9: [2895, 1004, 257, 506, 2144],
 10: [398, 9, 412, 142, 510],
 11: [2727, 1536, 476, 1290, 1044],
 12: [2475, 43, 899, 2804, 1602],
 13: [297, 472, 314, 163, 325],
 14: [131, 1548, 1485, 1390, 899],
 15: [599, 895, 945, 922, 2795],
 16: [46, 933, 922, 0, 989],
 17: [1294, 33, 2941, 1242, 907],
 18: [376, 358, 1319, 1012, 1627],
 19: [1710, 788, 2475, 649, 1828],
 20: [2887, 2755, 436, 2038, 2438],
 21: [217, 1525, 95, 1710, 923],
 22: [602, 254, 1095, 28, 2941],
 23: [46, 963, 1804, 484, 1052],
 24: [1938],
 25: [32, 398, 509, 314],
 26: [2377, 1556, 1290, 907, 1865],
 27: [2456, 2221, 5, 1374, 2013],
 28: [908, 899, 2156, 275, 1083],
 29: [793, 224, 910, 897, 901],
 30: [815, 9, 297, 302, 1040],
 31: [658, 793, 319, 88, 126],
 32: [32, 1737, 1078

In [None]:
# Extract actual and predicted items for each user
actual_items = {}
predicted_items = {}

for user_id, group in top_5_recommendations.groupby('userId'):
    actual_items[user_id] = set(group['movieId'])
    predicted_items[user_id] = list(group['movieId'])


# Check for data leakage
train_users = set(train_data['userId'])
test_users = set(test_data['userId'])

intersection_users = train_users.intersection(test_users)

mrr = np.mean([reciprocal_rank(actual_items[user], predicted_items[user]) for user in actual_items])

print("MRR:", mrr)

MRR: 1.0


In [None]:
from sklearn.metrics import ndcg_score

# Assuming actual_items and predicted_items are defined
# ...

# Convert the dictionary to a list of lists
actual_list = [list(actual_items[user]) for user in actual_items]
predicted_list = [list(predicted_items[user]) for user in actual_items]

# Flatten the lists
flat_actual = [item for sublist in actual_list for item in sublist]
flat_predicted = [item for sublist in predicted_list for item in sublist]

# Calculate NDCG using sklearn
ndcg_sklearn = ndcg_score([flat_actual], [flat_predicted], k=k_values[-1])

# Print NDCG using sklearn
print(f"NDCG@{k_values[-1]} (sklearn): {ndcg_sklearn:.4f}")


NDCG@10 (sklearn): 0.5328


In [None]:
from sklearn.metrics import ndcg_score

# Assuming actual_items and predicted_items are defined
# ...

# Convert the dictionary to a list of lists
actual_list = [list(actual_items[user]) for user in actual_items]
predicted_list = [list(predicted_items[user]) for user in actual_items]

# Flatten the lists
flat_actual = [item for sublist in actual_list for item in sublist]
flat_predicted = [item for sublist in predicted_list for item in sublist]

# Calculate NDCG using sklearn for k = 1, 5, and 10
k_values = [1, 5, 10]
for k in k_values:
    ndcg_sklearn = ndcg_score([flat_actual], [flat_predicted], k=k)
    print(f"NDCG@{k} (sklearn): {ndcg_sklearn:.4f}")


In [None]:
# Extract actual and predicted items for each user
actual_items = {}
predicted_items = {}

for user_id, group in test_data.groupby('userId'):
    actual_items[user_id] = set(group['movieId'])
    predicted_items[user_id] = list(top_5_recommendations.loc[top_5_recommendations['userId'] == user_id, 'movieId'])


# Calculate metrics for each user
hit_rates = [hit_rate(actual_items[user], predicted_items[user]) for user in actual_items]

print("Hit Rates for 5 Items:", hit_rates)

print(np.mean(hit_rates))

Hit Rates for 5 Items: [0.13157894736842105, 0.8333333333333334, 0.14705882352941177, 0.5, 0.08928571428571429, 0.45454545454545453, 1.0, 1.0, 0.625, 0.29411764705882354, 0.7142857142857143, 0.5555555555555556, 0.8333333333333334, 0.45454545454545453, 0.2777777777777778, 0.2777777777777778, 0.14705882352941177, 0.03164556962025317, 0.16666666666666666, 0.16666666666666666, 0.5555555555555556, 0.20833333333333334, 0.5555555555555556, 1.0, 1.0, 0.2631578947368421, 0.08620689655172414, 0.38461538461538464, 1.0, 0.4166666666666667, 0.21739130434782608, 0.21739130434782608, 1.0, 0.7142857142857143, 0.5, 1.0, 0.3333333333333333, 0.23809523809523808, 0.29411764705882354, 0.29411764705882354, 0.06578947368421052, 0.2, 0.5, 0.07142857142857142, 1.0, 0.4166666666666667, 1.0, 1.0, 0.29411764705882354, 0.09433962264150944, 0.8333333333333334, 1.0, 0.8333333333333334, 1.0, 0.625, 0.052083333333333336, 0.1724137931034483, 0.20833333333333334, 1.0, 1.0, 0.35714285714285715, 0.25, 0.07575757575757576,

In [None]:
def hit_rate_at_k(actual, predicted, k):
    """
    Calculate Hit Rate@k.

    Parameters:
    - actual: Set of actual items the user interacted with.
    - predicted: Ordered list of items predicted by the model.
    - k: The number of top items to consider.

    Returns:
    - Hit Rate@k.
    """
    intersection = set(actual) & set(predicted[:k])
    return len(intersection) / min(k, len(actual))


In [None]:
# Calculate Hit Rate@k for each k
k_values = [ 10]
hit_rate_at_k_values = {}

for k in k_values:
    hit_rate_at_k_values[k] = np.mean([hit_rate_at_k(actual_items[user], predicted_items[user], k) for user in actual_items])

# Print Hit Rate@k values for each k
for k in k_values:
    print(f"Hit Rate@{k}: {hit_rate_at_k_values[k]:.4f}")



Hit Rate@10: 0.6857
