# DATA EXPLORATION


In [1]:
# IMPORTS AND SEED SETTING

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



torch.manual_seed(33)  


<torch._C.Generator at 0x7429599ea310>

In [2]:

ratingsRaw = pd.read_csv("ml-100k/u.data", sep = "\t", header = None, names=["userId", "movieId", "rating", "timestamp"]) 

ratingsRaw.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


I am also going to use the genres of each movie, for this, instead of one hot encoded variables, I will use embeddings. This will allow the model to learn the relationship between similar genres like Action and Adventure and viceversa, how dissimilar some genres are like Comedy and Horror.

In [3]:

# Define column names based on the MovieLens 100k dataset  
column_names = [
    "movieId", "movie title", "release date", "video release date", "IMDb URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", 
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", 
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]  

# Read the dataset with the correct separator (|)  
moviesRaw = pd.read_csv("ml-100k/u.item", sep="|", encoding="latin-1", header=None, names=column_names)  

# Extract the last 19 columns (genre columns)
genre_columns = column_names[-19:]  
moviesRaw["genres"] = moviesRaw[genre_columns].values.tolist()

# Display the first few rows  
moviesRaw[["movieId", "movie title", "genres"]].head()


Unnamed: 0,movieId,movie title,genres
0,1,Toy Story (1995),"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,GoldenEye (1995),"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,Four Rooms (1995),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,Get Shorty (1995),"[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,5,Copycat (1995),"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [4]:
# Merge ratings with movies on 'movieId', keeping the 'genres_list' column
ratingsRaw = ratingsRaw.merge(moviesRaw[["movieId", "genres"]], on="movieId", how="left")


ratingsRaw[["userId", "movieId", "rating","genres"]].head()

Unnamed: 0,userId,movieId,rating,genres
0,196,242,3,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,186,302,3,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, ..."
2,22,377,1,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,244,51,2,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
4,166,346,1,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [6]:

# Function to translate from movie id to movie title
def idToTitle(movie_id):
    # Filter the movies DataFrame where movieId matches and return the title
    movie = moviesRaw[moviesRaw['movieId'] == movie_id]['movie title'].values
    # Return the first match (or None if no match found)
    return movie[0] if len(movie) > 0 else None

# Example usage
movie_title = idToTitle(2)
print(f"Movie with ID 2: {movie_title}")

Movie with ID 2: GoldenEye (1995)


It is important to check that the distribution of the ID's is sequential. So that we do not confuse the model and make more inneficient.


In [7]:


print("Min User ID:", ratingsRaw["userId"].min(),  "    |    Max User ID:", ratingsRaw["userId"].max())
print("Min Movie ID:", ratingsRaw["movieId"].min(),"   |    Max Movie ID:", ratingsRaw["movieId"].max())

print("\n")

print("Unique User IDs:", len(ratingsRaw["userId"].unique()))
print("Unique Movie IDs:", len(ratingsRaw["movieId"].unique()))


Min User ID: 1     |    Max User ID: 943
Min Movie ID: 1    |    Max Movie ID: 1682


Unique User IDs: 943
Unique Movie IDs: 1682


In [8]:
print("Shape of ratings dataset:",ratingsRaw.shape)


Shape of ratings dataset: (100000, 5)


### Divide the dataset into training, validation and test

In [9]:

# Convert to Pandas DataFrame for splitting
train_data, test_data = train_test_split(ratingsRaw, test_size=0.2, random_state=33)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=33)  # 10% of training as validation


## DATASET 

In [11]:

class MovieLensDataset(Dataset):
    def __init__(self, ratings_df):
        
        self.ratings = ratings_df

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        row = self.ratings.iloc[idx]
        user_id = torch.tensor(row["userId"], dtype=torch.long)   
        movie_id = torch.tensor(row["movieId"], dtype=torch.long)   
        rating = torch.tensor(row["rating"], dtype=torch.float32)
        genres = torch.tensor(row["genres"], dtype=torch.float32)
        return user_id, movie_id, rating, genres



# Convert splits back to Dataset format
train_dataset = MovieLensDataset(train_data)
val_dataset = MovieLensDataset(val_data)
test_dataset = MovieLensDataset(test_data)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Check a batch
for batch in train_loader:
    u, m, r ,g = batch
    print("First 5 users:", u[:5])
    print("First 5 mapped movies:", m[:5])
    print("First 5 ratings:", r[:5])
    
    
    break


First 5 users: tensor([927, 924, 359, 537, 174])
First 5 mapped movies: tensor([866, 200, 323, 433, 278])
First 5 ratings: tensor([4., 4., 3., 4., 5.])


# BUILD THE MODEL

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

class RecommenderNN(nn.Module):
    def __init__(self, num_users, num_movies, num_genres, embedding_dim=32):
        super(RecommenderNN, self).__init__()

        # Embedding layers for users and movies
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

         # Genre Embedding (using Linear Layer)
        self.genre_fc = nn.Linear(num_genres, embedding_dim)


        # Feedforward layers
        self.fc1 = nn.Linear(embedding_dim * 3, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)  # Output is a single predicted rating

        # Activation and dropout
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, user_ids, movie_ids,genresOneHot):
        # Get embeddings
        user_embeds = self.user_embedding(user_ids)
        movie_embeds = self.movie_embedding(movie_ids)
        

        # Convert one-hot genres into embeddings
        genre_emb = self.relu(self.genre_fc(genresOneHot))

        # Concatenate user and movie embeddings
        x = torch.cat([user_embeds, movie_embeds,genre_emb], dim=1)

        # Forward pass through feedforward layers
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.fc3(x)  # Output raw prediction

        return x.squeeze()  # Remove extra dimension


In [15]:
num_users = num_movies = len(ratingsRaw) 
num_genres = len(ratingsRaw["genres"][0])

# Initialize the model
model = RecommenderNN(num_users, num_movies, num_genres, embedding_dim=32)

criterion = nn.MSELoss()  # Regression task
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [16]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop with Early Stopping
num_epochs = 50  # Max epochs (can be large since early stopping will prevent overfitting)
patience = 5     # Number of epochs to wait before stopping
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_train_loss = 0

    for user_ids, movie_ids, ratings , genres in train_loader:
        user_ids, movie_ids, ratings, genres = user_ids.to(device), movie_ids.to(device), ratings.to(device), genres.to(device)

        optimizer.zero_grad()
        predictions = model(user_ids, movie_ids,genres)
        loss = criterion(predictions, ratings)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation step (NO gradient update)
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for user_ids, movie_ids, ratings ,genres in val_loader:
            user_ids, movie_ids, ratings, genres = user_ids.to(device), movie_ids.to(device), ratings.to(device), genres.to(device)
            predictions = model(user_ids, movie_ids,genres)
            loss = criterion(predictions, ratings)
            total_val_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping logic
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # Reset counter
        torch.save(model, "genresModel.pth")  # Save the best model
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}. Best Validation Loss: {best_val_loss:.4f}")
        break  # Stop training



Epoch 1/50, Train Loss: 1.4212, Val Loss: 1.0284
Epoch 2/50, Train Loss: 1.0377, Val Loss: 0.9571
Epoch 3/50, Train Loss: 0.9763, Val Loss: 0.9178
Epoch 4/50, Train Loss: 0.9431, Val Loss: 0.9033
Epoch 5/50, Train Loss: 0.9149, Val Loss: 0.9042
Epoch 6/50, Train Loss: 0.8972, Val Loss: 0.8913
Epoch 7/50, Train Loss: 0.8842, Val Loss: 0.8848
Epoch 8/50, Train Loss: 0.8725, Val Loss: 0.8793
Epoch 9/50, Train Loss: 0.8633, Val Loss: 0.8943
Epoch 10/50, Train Loss: 0.8510, Val Loss: 0.8798
Epoch 11/50, Train Loss: 0.8416, Val Loss: 0.8823
Epoch 12/50, Train Loss: 0.8316, Val Loss: 0.8837
Epoch 13/50, Train Loss: 0.8164, Val Loss: 0.8764
Epoch 14/50, Train Loss: 0.8099, Val Loss: 0.8929
Epoch 15/50, Train Loss: 0.7967, Val Loss: 0.8772
Epoch 16/50, Train Loss: 0.7830, Val Loss: 0.8843
Epoch 17/50, Train Loss: 0.7740, Val Loss: 0.8876
Epoch 18/50, Train Loss: 0.7635, Val Loss: 0.8833
Early stopping triggered at epoch 18. Best Validation Loss: 0.8764


## Test set evaluation

In [None]:
model.eval()  # Set model to evaluation mode
total_test_loss = 0

for user_ids, movie_ids, ratings in test_loader:
    user_ids, movie_ids, ratings = user_ids.to(device), movie_ids.to(device), ratings.to(device)

    predictions = model(user_ids,movie_ids)
    loss = criterion(predictions, ratings)
    total_test_loss += loss.item()

print(f"MSE for test set: {total_test_loss/len(test_loader):.4f}")

## Reccomendation example

In [None]:
def predict_rating(model, user_id, movie_id):
    # Set model to evaluation mode
    model.eval()
    
    # Convert IDs to tensors
    user = torch.tensor([user_id], dtype=torch.long)
    movie = torch.tensor([movie_id], dtype=torch.long)
    
    # Get prediction
    with torch.no_grad():
        prediction = model(user, movie)
    
    # Convert prediction to rating scale
    predicted_rating = prediction.item()
    
    # Get movie title for better context
    movie_title = idToTitle(movie_id)
    
    return predicted_rating
    
    print(f"Predicted rating for User {user_id} watching '{movie_title}': {predicted_rating:.2f}")

 

In [None]:
def getRecommnedationsForUser(userID):
    
    ratedMovies = {}
    
    for movie_id in np.unique(ratingsRaw["movieId"]):
        rating = predict_rating(model, userID, movie_id)
        ratedMovies[idToTitle(movie_id)] = rating
        
    sorted_movies = dict(sorted(ratedMovies.items(), key=lambda item: item[1], reverse=True)[:10])
    
    for movie in sorted_movies:
        print(f"{movie}, {sorted_movies[movie]:.2f}")

In [None]:
getRecommnedationsForUser(1)

In [None]:
getRecommnedationsForUser(2)

In [None]:
getRecommnedationsForUser(3)

## LOAD THE MODEL

In [None]:
model = torch.load("+genresModel.pth",weights_only=False)