<a href="https://colab.research.google.com/github/vipashaaV321/Session-Based-Movie-Recommendation-GRU4RECBE/blob/main/GRU4REC_baseLine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GRU4Rec BaseLine Algorithm

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
import string
from tqdm import tqdm
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from collections import Counter

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#### MovieLens Dataset

In [None]:
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
)
movielens_zipped_file = keras.utils.get_file(
    "ml-latest-small.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-latest-small"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    with ZipFile(movielens_zipped_file, "r") as zip:
        # Extract files
        print("Extracting all the files now...")
        zip.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
ratings = pd.read_csv(ratings_file)

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import TopKCategoricalAccuracy

# Load MovieLens 100k dataset
df=ratings

# Mapping user and item IDs to sequential integers
user_mapping = {user: idx for idx, user in enumerate(df['userId'].unique())}
item_mapping = {item: idx for idx, item in enumerate(df['movieId'].unique())}

df['user_idx'] = df['userId'].map(user_mapping)
df['item_idx'] = df['movieId'].map(item_mapping)

# Split the data into training and testing sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Model parameters
num_users = len(user_mapping)
num_items = len(item_mapping)
embedding_dim = 50
hidden_units = 100
sequence_length = 5  # You can adjust this based on your preference

# Model architecture
model = Sequential([
    Embedding(input_dim=num_items, output_dim=embedding_dim, input_length=sequence_length),
    GRU(hidden_units),
    Dense(num_items, activation='softmax')
])

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=[TopKCategoricalAccuracy(k=10)])

# Training data preparation
X_train = []
y_train = []

for user_id, group in train.groupby('user_idx'):
    items = group['item_idx'].values
    for i in range(len(items) - sequence_length):
        X_train.append(items[i:i + sequence_length])
        y_train.append(items[i + sequence_length])

X_train = np.array(X_train)
y_train = np.array(y_train)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

# Testing data preparation
X_test = []
y_test = []

for user_id, group in test.groupby('user_idx'):
    items = group['item_idx'].values
    for i in range(len(items) - sequence_length):
        X_test.append(items[i:i + sequence_length])
        y_test.append(items[i + sequence_length])

X_test = np.array(X_test)
y_test = np.array(y_test)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f'Test Loss: {test_loss}')
print(f'Top-10 Categorical Accuracy: {test_accuracy}')

# Generate top N recommendations for a user
def generate_recommendations(user_idx, top_n=10):
    user_sequence = X_test[user_idx].reshape(1, -1)
    predictions = model.predict(user_sequence)
    top_n_indices = np.argpartition(predictions[0], -top_n)[-top_n:]
    recommendations = [item for item, idx in item_mapping.items() if idx in top_n_indices]
    return recommendations

# Example: Generate top 5 recommendations for user with index 0
user_idx_to_recommend = 1
top_n_recommendations = generate_recommendations(user_idx_to_recommend, top_n=5)
print(f'Top 5 Recommendations for User {user_idx_to_recommend}: {top_n_recommendations}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 9.482385635375977
Top-10 Categorical Accuracy: 0.056801021099090576
Top 5 Recommendations for User 1: [1080, 1198, 1517, 1580, 3753]


In [None]:
# Generate top N recommendations for a user with movie names and titles
def generate_recommendations_with_names(user_idx, top_n=10):
    user_sequence = X_test[user_idx].reshape(1, -1)
    predictions = model.predict(user_sequence)
    top_n_indices = np.argpartition(predictions[0], -top_n)[-top_n:]

    recommendations = [(item, item_mapping.get(item, 'Unknown')) for item in top_n_indices]
    return recommendations

# Example: Generate top 5 recommendations with movie names and titles for user with index 0
user_idx_to_recommend = 1
top_n_recommendations_with_names = generate_recommendations_with_names(user_idx_to_recommend, top_n=5)

print(f'Top 5 Recommendations for User {user_idx_to_recommend}:')
for item, title in top_n_recommendations_with_names:
    print(f'Movie ID: {item}')

Top 5 Recommendations for User 1:
Movie ID: 1015
Movie ID: 70
Movie ID: 61
Movie ID: 101
Movie ID: 98


In [None]:
import numpy as np

def mean_reciprocal_rank(ranks):
    return np.mean(1.0 / ranks)

def normalized_discounted_cumulative_gain(ranks):
    # Ideal ranking
    ideal_ranks = np.sort(ranks)
    dcg = np.sum(1.0 / np.log2(ideal_ranks + 2))
    idcg = np.sum(1.0 / np.log2(np.arange(2, len(ideal_ranks) + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def hit_rate(ranks, threshold=10):
    return np.mean(ranks <= threshold)

# Testing data preparation for evaluation
X_test_eval = []
y_test_eval = []

for user_id, group in test.groupby('user_idx'):
    items = group['item_idx'].values
    # Extract the actual ratings/preferences for the user from the last item in the sequence
    actual_ranking = items[-1]  # Assuming the last item in the sequence is the user's preference
    # Append the actual ranking to the ground_truth_rankings list
    ground_truth_rankings.append(actual_ranking)

    for i in range(len(items) - sequence_length):
        X_test_eval.append(items[i:i + sequence_length])
        y_test_eval.append(items[i + sequence_length])

X_test_eval = np.array(X_test_eval)
y_test_eval = np.array(y_test_eval)


In [None]:
# Evaluate the model and calculate metrics
predictions = model.predict(X_test_eval)
top_n_indices = np.argpartition(predictions, -10, axis=1)[:, -10:]



In [None]:
top_n_indices

array([[ 232,    0,   65, ...,   42,   34,  101],
       [ 197,   75,  141, ...,   61,  101, 1015],
       [  15,    7,   68, ...,  101,  322,  335],
       ...,
       [ 415, 1500, 1758, ..., 3300,  330,  104],
       [2175,  224, 1470, ..., 2087, 2161, 2188],
       [  59, 1115,  191, ..., 3574, 4870, 1564]], dtype=int64)

In [None]:
type(top_n_recommendations)

list

In [None]:
import numpy as np

# Function to calculate Mean Reciprocal Rank (MRR)
def calculate_mrr(rankings):
    reciprocal_ranks = [1 / (rank + 1) for rank in rankings]
    return np.mean(reciprocal_ranks)

# Function to calculate Hit Rate at a given position
def calculate_hit_rate(rankings, position):
    hits = [1 if rank <= position else 0 for rank in rankings]
    return np.mean(hits)

# Function to calculate Discounted Cumulative Gain (DCG) at a given position
def calculate_dcg(rankings, position):
    dcg = np.sum([(2 ** relevance - 1) / np.log2(rank + 2) for rank, relevance in enumerate(rankings[:position])])
    return dcg

# Function to calculate Normalized Discounted Cumulative Gain (NDCG) at a given position
def calculate_ndcg(rankings, position):
    ideal_rankings = sorted(rankings, reverse=True)
    ideal_dcg = calculate_dcg(ideal_rankings, position)
    ndcg = calculate_dcg(rankings, position) / ideal_dcg if ideal_dcg > 0 else 0
    return ndcg


# Extract movie IDs from recommendations
recommended_movie_ids = [item for item, _ in top_n_recommendations_with_names]

# Find the position of each recommended movie in the ground truth rankings
rankings = [ground_truth_rankings.index(movie_id) + 1 if movie_id in ground_truth_rankings else 0 for movie_id in recommended_movie_ids]

# Calculate and print metrics
print(f'Mean Reciprocal Rank (MRR): {calculate_mrr(rankings)}')
print(f'Hit Rate at position 1: {calculate_hit_rate(rankings, position=1)}')
print(f'Normalized Discounted Cumulative Gain (NDCG) at position 5: {calculate_ndcg(rankings, position=5)}')

Mean Reciprocal Rank (MRR): 0.20541776725366478
Hit Rate at position 1: 0.2
Normalized Discounted Cumulative Gain (NDCG) at position 5: 0.3985488025814338


In [None]:
# Calculate Hit Rate at positions 5 and 10
hit_rate_at_5 = calculate_hit_rate(rankings, position=5)
hit_rate_at_10 = calculate_hit_rate(rankings, position=53)

# Calculate NDCG at positions 5 and 10
ndcg_at_5 = calculate_ndcg(rankings, position=5)
ndcg_at_10 = calculate_ndcg(rankings, position=100)

# Print the updated metrics
print(f'Hit Rate at position 5: {hit_rate_at_5}')
print(f'Hit Rate at position 10: {hit_rate_at_10}')
print(f'NDCG at position 5: {ndcg_at_5}')
print(f'NDCG at position 10: {ndcg_at_10}')

Hit Rate at position 5: 0.2
Hit Rate at position 10: 0.4
NDCG at position 5: 0.3985488025814338
NDCG at position 10: 0.3985488025814338
