In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm

import pandas as pd
import numpy as np

import os, time, random
from pathlib import Path
from collections import Counter
from typing import Callable, List
from functools import cache
import re
import sys


In [None]:
# Anchor paths on the notebook's working directory; the data folder sits
# next to it, one level up.
cur_dir = Path.cwd()
data_dir = cur_dir.parent / 'data'

In [None]:
list(data_dir.iterdir())

## Exploring Movie df

In [None]:
movie_df = pd.read_csv(data_dir/'movie.csv')

In [None]:
movie_df.info()

In [None]:
movie_df.head()

In [None]:
movie_df.isna().sum() # no nulls

In [None]:
movie_df.duplicated().sum() #no duplicates

In [None]:
movie_df.title.nunique()

In [None]:

# Count how many rows carry each title.
title_value_counts = movie_df['title'].value_counts()

# Keep only the titles whose count exceeds one, in value_counts order.
repeated_mask = title_value_counts.gt(1)
duplicate_titles = title_value_counts.loc[repeated_mask].index.tolist()

print(duplicate_titles)

Some movies have multiple entries with different `movieId` values, but this does not matter much here.

The `genres` column appears to have no spaces between genres (they are separated by `|`), so let's see which unique genres there are.

In [None]:
# Gather every distinct whitespace-delimited token found in the
# pipe-separated `genres` strings.
all_genres = set()
for genre_field in movie_df.genres.values.tolist():
    # '|' separates the genres of one movie; the whitespace split that follows
    # also cuts any multi-word label into separate tokens.
    all_genres.update(str(genre_field).replace('|', ' ').split())

In [None]:
print(all_genres, len(all_genres))

There are 20 genres, plus one entry for movies with no genre, `(no genres listed)`, which the whitespace split above breaks into separate tokens such as `'(no'` and `'listed)'`.

## Exploring User data

In [None]:
# Load only the three columns used below and parse them at reduced precision
# right away, so the large ratings table is never held as 64-bit columns.
user_df = pd.read_csv(
    data_dir/'rating.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [None]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

In [None]:
# These columns hold small values, so 64-bit storage is wasteful; narrow each
# one to a 32-bit dtype.
user_df = user_df.astype({
    'movieId': 'int32',
    'userId': 'int32',
    'rating': 'float32',
})
print()

In [None]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

In [None]:
user_df.shape #(20000263,3)

In [None]:
user_df.info()

In [None]:
user_df.head()

# Text Vectorization

In [None]:
def clean_text(x: str) -> str:
    """Strip punctuation from ``x`` and lower-case what is left.

    Any character that is neither a regex word character (letter, digit or
    underscore) nor whitespace is removed; whitespace itself is kept as-is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', x)
    return without_punctuation.lower()

In [None]:
a = 'helo#$#@$#%$@%@#$ 44 sir'
clean_text(a)

In [None]:
# Define a custom dataset class
class MovieRatingDataset(Dataset):
    """Map-style dataset yielding ``(user_id, movie_id, rating)`` per row.

    The frame must provide ``userId``, ``movieId`` and ``rating`` columns.
    Samples are addressed by position (like ``DataFrame.iloc``), not by the
    frame's index labels.

    Each item is a triple of 0-d tensors: ``user_id`` and ``movie_id`` as
    ``torch.int32`` and ``rating`` as ``torch.float32``.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        # Pull the three columns out as NumPy arrays once. Going through
        # ``dataframe.iloc[idx][col]`` for every sample builds a temporary row
        # Series each time and upcasts the mixed int/float row to float64.
        self._user_ids = dataframe['userId'].to_numpy()
        self._movie_ids = dataframe['movieId'].to_numpy()
        self._ratings = dataframe['rating'].to_numpy()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        user_id = torch.tensor(self._user_ids[idx], dtype=torch.int32)
        movie_id = torch.tensor(self._movie_ids[idx], dtype=torch.int32)
        # float32 matches the dtype of the model's output, so the loss never
        # sees a float32 prediction paired with a float64 target.
        rating = torch.tensor(self._ratings[idx], dtype=torch.float32)
        return user_id, movie_id, rating


# Model

In [None]:
import torch
import torch.nn as nn
from pathlib import Path

class RecommenderModel(nn.Module):
    """Embedding-based rating predictor.

    A user embedding and a movie embedding are multiplied element-wise,
    averaged over dimension 1 and passed through a single linear layer that
    produces one value per sample.

    Args:
        num_users: Number of rows in the user embedding table. Ids passed to
            ``forward`` index this table directly, so they must lie in
            ``[0, num_users)``.
        num_movies: Number of rows in the movie embedding table (same
            constraint for movie ids).
        embedding_dim: Width of both embedding tables.
        model_path: Default location used by ``load_model`` / ``save_model``.
    """

    def __init__(self, num_users, num_movies, embedding_dim, model_path:Path=None):
        super().__init__()
        self.model_path = model_path
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.out = nn.Linear(embedding_dim, 1)

    def forward(self, user_ids, movie_tags, debug=False):
        """Predict one value per sample.

        Args:
            user_ids: Integer tensor of user ids, shape ``(batch,)`` or
                ``(batch, k)``.
            movie_tags: Integer tensor of movie ids, shape ``(batch,)`` or
                ``(batch, k)``.
            debug: When true, print the intermediate tensor shapes.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        user_ids = user_ids.to(torch.long)  # Convert to Long data type
        movie_tags = movie_tags.to(torch.long)  # Convert to Long data type

        # Bring flat id vectors to (batch, 1). Without this, flat ids have no
        # dimension 1 to average over, and mixing a (batch, 1) tensor with a
        # (batch,) tensor would broadcast the product to (batch, batch, dim),
        # pairing every user with every movie of the batch.
        if user_ids.dim() < 2:
            user_ids = user_ids.reshape(-1, 1)
        if movie_tags.dim() < 2:
            movie_tags = movie_tags.reshape(-1, 1)

        user_emb = self.user_embedding(user_ids)
        movie_emb = self.movie_embedding(movie_tags)
        interaction = user_emb * movie_emb
        x = interaction.mean(dim=1)
        output = self.out(x)

        if debug:
            print('user_emb.shape: ',user_emb.shape)
            print('movie_emb.shape: ',movie_emb.shape)
            print('interaction.shape: ',interaction.shape)
            print('output.shape:',output.shape)

        return output

    def load_model(self, model_path=None):
        """Load weights from ``model_path`` (default: ``self.model_path``).

        A missing file is reported with a message and leaves the current
        weights untouched. A state dict that does not fit this model raises
        ``RuntimeError`` from ``load_state_dict``.

        Raises:
            ValueError: If no path was given and ``self.model_path`` is unset.
        """
        if model_path is None:
            model_path = self.model_path
        if model_path is None:
            raise ValueError('No model path given and self.model_path is not set.')

        try:
            # Deserialize onto the CPU so a checkpoint written on a GPU machine
            # loads anywhere; load_state_dict then copies the values into the
            # parameters wherever they currently live.
            state_dict = torch.load(model_path, map_location='cpu')
        except FileNotFoundError as e:
            print(f'Weights not found. {e}')
            return

        self.load_state_dict(state_dict)
        print('Model weights loaded.')

    def save_model(self, model_path=None):
        """Write the state dict to ``model_path`` (default: ``self.model_path``).

        Missing parent directories are created first when the target is a
        string or ``Path``.

        Raises:
            ValueError: If no path was given and ``self.model_path`` is unset.
        """
        if model_path is None:
            model_path = self.model_path
        if model_path is None:
            raise ValueError('No model path given and self.model_path is not set.')

        # torch.save does not create directories; other targets (for example
        # file-like objects) are passed through untouched.
        if isinstance(model_path, (str, Path)):
            Path(model_path).parent.mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), model_path)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [None]:
model = RecommenderModel(10, 20, 8)
model

In [None]:
test_out = model(torch.randint(1,10,(8,1)), torch.randint(1,20,(8,1)))
test_out.shape

In [None]:
def train_model(model, dataloader, optimizer, loss_function, num_epochs=10, device='cpu', data_percent=1.0, steps_per_epoch=None):
    """Train ``model`` on batches of ``(user_ids, movie_ids, ratings)``.

    Args:
        model: Module called as ``model(user_ids, movie_ids)``; both id
            tensors are passed with shape ``(batch, 1)``.
        dataloader: Iterable of ``(user_ids, movie_ids, ratings)`` batches.
            It is restarted whenever it runs out, and its position carries
            over from one epoch to the next.
        optimizer: Optimizer over the model's parameters.
        loss_function: Callable ``loss_function(predictions, ratings)``.
        num_epochs: Number of epochs to run.
        device: Device the model and every batch are moved to.
        data_percent: Fraction of ``len(dataloader)`` to use as the number of
            steps per epoch. Ignored when ``steps_per_epoch`` is given.
        steps_per_epoch: Explicit number of optimisation steps per epoch.

    Returns:
        A list with one ``{'loss', 'mse', 'mae'}`` dict of per-step averages
        for each epoch.

    Raises:
        ValueError: If the resulting number of steps per epoch is not positive.
    """
    model.to(device)
    model.train()
    print(f'{model.__class__.__name__} Running on: {device}')

    if steps_per_epoch is not None:
        num_steps = int(steps_per_epoch)
    else:
        num_steps = int(data_percent * len(dataloader))
    if num_steps <= 0:
        raise ValueError(f'Number of steps per epoch must be positive, got {num_steps}.')

    # Keep `dataloader` itself around: only the loader can hand out a fresh
    # iterator once the current one is exhausted.
    batch_iter = iter(dataloader)
    history = []

    for epoch in range(num_epochs):
        total_loss = 0.0
        total_mse = 0.0
        total_mae = 0.0

        epoch_progress = tqdm(range(num_steps), desc=f"Epoch [{epoch+1:2}/{num_epochs:2}]")

        last_update_time = time.time() - 1.0  # Initialize to ensure the first update

        for _ in epoch_progress:
            try:
                batch = next(batch_iter)
            except StopIteration:
                batch_iter = iter(dataloader)
                batch = next(batch_iter)

            user_ids, movie_ids, ratings = batch

            user_ids = user_ids.view(-1, 1).to(device)
            movie_ids = movie_ids.view(-1, 1).to(device)
            # Flatten and cast the targets so they line up element-for-element
            # (and dtype-for-dtype) with the flattened predictions.
            ratings = ratings.to(device=device, dtype=torch.float32).reshape(-1)

            optimizer.zero_grad()

            # reshape(-1) rather than squeeze(): a batch of one stays 1-D.
            outputs = model(user_ids, movie_ids).reshape(-1)

            loss = loss_function(outputs, ratings)

            # Reporting-only metrics; no graph is needed for them.
            with torch.no_grad():
                mse = F.mse_loss(outputs, ratings)
                mae = F.l1_loss(outputs, ratings)

            loss.backward()
            optimizer.step()

            total_mse += mse.item()
            total_mae += mae.item()
            total_loss += loss.item()

            # Iterating the bar already advances it; only the postfix text is
            # refreshed here, throttled to the bar's own refresh interval.
            current_time = time.time()
            if current_time - last_update_time > epoch_progress.mininterval:
                epoch_progress.set_postfix({
                    "Loss": f"{loss.item():.8f}",
                    "MSE": f"{mse.item():.8f}",
                    "MAE": f"{mae.item():.8f}",
                })
                last_update_time = current_time

        # Exactly `num_steps` steps ran, so that is the averaging denominator.
        average_loss = total_loss / num_steps
        average_mse = total_mse / num_steps
        average_mae = total_mae / num_steps
        history.append({'loss': average_loss, 'mse': average_mse, 'mae': average_mae})

        print(f"Epoch [{epoch+1:2}/{num_epochs:2}] - Average Loss: {average_loss:.8f} - Average MSE: {average_mse:.8f} - Average MAE: {average_mae:.8f}")
        print()

    return history

# training

## make Dataset

In [None]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

In [None]:
user_df.movieId.max(), user_df.movieId.min(), user_df.movieId.nunique()

In [None]:
# Set batch size for DataLoader
batch_size = 32

# Wrap the full ratings frame in the dataset and a shuffling DataLoader
dataset = MovieRatingDataset(user_df)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Sanity check: pull one batch and print its three fields.
# NOTE: `batch`, `user_ids`, `movie_tags` and `ratings` stay defined at
# notebook scope after this cell has run.
for batch in dataloader:
    user_ids, movie_tags, ratings = batch
    print("User IDs:", user_ids)
    print("Movie ids:", movie_tags)
    print("Ratings:", ratings)
    break  # only print the first batch

In [None]:
user_df.userId.nunique(),movie_df.movieId.nunique()

In [None]:
print('max userid value: ',user_df.userId.max())
print('unique userid: ',user_df.userId.nunique())


In [None]:
print('max movieid value: ',movie_df.movieId.max())
print('unique movieid: ',movie_df.movieId.nunique())

In [None]:
# Initialize your model, optimizer, and loss function
# The model looks embeddings up by raw id, and nn.Embedding(n, ...) only
# accepts indices 0..n-1. Each table is therefore sized by the largest id + 1;
# sizing by the number of distinct ids is too small whenever ids do not run
# contiguously from 0 (the max/nunique cells above show the gap).
num_users = int(user_df.userId.max()) + 1
num_movies = int(max(movie_df.movieId.max(), user_df.movieId.max())) + 1
dim = 8
model = RecommenderModel(num_users, num_movies, dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.MSELoss()

# Training loop
num_epochs = 1  # Set the number of training epochs

In [None]:
# set model path
model_dir = cur_dir.parent/'models'
# Create the folder up front; saving the weights does not create it.
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir/'model.pth'
model.model_path=model_path

In [None]:
# load the model is exists
# model.load_model()

In [None]:
device

In [None]:
train_model(model, dataloader,  optimizer, loss_function, num_epochs=2, device=device, data_percent=0.01, steps_per_epoch=None)

In [None]:
# save the model
model.save_model()

In [None]:
# labels, predictions = predict(model, dataloader)

# Getting recommendations

In [None]:
trained_movie_embedding = model.movie_embedding.weight.data.cpu().numpy()
trained_movie_embedding.shape

In [None]:
a = trained_movie_embedding[0]
a.shape, a

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans = KMeans(n_clusters=10,random_state=0).fit(trained_movie_embedding)

In [None]:
kmeans.labels_

In [None]:
# for cluster in range(10):
#     print('Cluster: ',cluster)
#     movs = []
    
#     for movidx in np.where(kmeans.labels_==cluster)[0]:
#         print(movidx)
#         movie_id, movie_vector, movie_rating = train_dataset.__getitem__(movidx)
#         print(movie_vector)
#         break

In [None]:
import torch.nn.functional as F

def find_similar_movies(target_movie_embedding, all_movie_embeddings, top_n=5):
    """Rank the rows of ``all_movie_embeddings`` by similarity to the target.

    Cosine similarity is computed along dimension 1 between
    ``target_movie_embedding`` and ``all_movie_embeddings``; the indices of
    the ``top_n`` highest scores are returned, best match first. If fewer
    than ``top_n`` rows exist, all indices are returned.
    """
    with torch.inference_mode():
        scores = F.cosine_similarity(target_movie_embedding, all_movie_embeddings, dim=1)
        ranking = scores.argsort(descending=True)
        return ranking[:top_n]

target_movie_id = 72  # Replace with the target movie's ID
all_movie_embeddings = model.movie_embedding.weight.data
# Read the target row straight from the weight matrix. This yields the same
# vector as calling the embedding layer, but needs no separate index tensor,
# so it works on whichever device the model currently lives on and records no
# autograd graph.
target_movie_embedding = all_movie_embeddings[target_movie_id].unsqueeze(0)
print('all_movie_embeddings: ',all_movie_embeddings.shape)

# Find similar movies
similar_movie_indices = find_similar_movies(target_movie_embedding, all_movie_embeddings, top_n=51)

# Print or use the similar movie indices
print("Similar movie indices:", similar_movie_indices.shape)
print("Similar movie indices:", similar_movie_indices)

In [None]:
def get_movie_name(idx):
    """Return the title of the first ``movie_df`` row whose ``movieId`` is ``idx``.

    Raises ``IndexError`` when no row has that id.
    """
    matching_titles = movie_df.loc[movie_df.movieId == idx, 'title']
    return matching_titles.values[0]

def get_movie_id(movie_name):
    """Return the ``movieId`` of the first ``movie_df`` row titled ``movie_name``.

    Raises ``IndexError`` when no row has that title.
    """
    matching_ids = movie_df.loc[movie_df.title == movie_name, 'movieId']
    return matching_ids.values[0]

In [None]:
a = get_movie_name(4)
b = get_movie_id(a)
a,b

In [None]:
def more_movies(target_movie_id, top_n=50):
    """Print the titles of the movies most similar to ``target_movie_id``.

    The target's title is printed first, followed by one ``index :title``
    line per similar movie, best match first. Embedding row indices are
    passed to ``get_movie_name`` as movie ids; an index with no matching row
    in ``movie_df`` is reported as ``Error at : <index>`` instead of
    aborting the listing.

    Args:
        target_movie_id: Id of the movie to compare against; also used as the
            row index into the movie embedding table.
        top_n: Number of similar movies to list.
    """
    print(get_movie_name(target_movie_id))
    all_movie_embeddings = model.movie_embedding.weight.data
    # Index the weight matrix directly: no separate index tensor is created,
    # so this works on whichever device the model currently lives on.
    target_movie_embedding = all_movie_embeddings[int(target_movie_id)].unsqueeze(0)

    # Find similar movies
    similar_movie_indices = find_similar_movies(target_movie_embedding, all_movie_embeddings, top_n=top_n)

    # tolist() yields plain Python ints for tensors on any device, whereas
    # calling .numpy() requires the tensor to be on the CPU.
    for movie_idx in similar_movie_indices.tolist():
        try:
            print(f"{movie_idx} :{get_movie_name(movie_idx)}")
        except IndexError:
            print(f'Error at : {movie_idx}')
            

In [None]:
get_movie_id('War, Inc. (2008)')
get_movie_name(get_movie_id('War, Inc. (2008)'))
more_movies(get_movie_id('War, Inc. (2008)'))

In [None]:
movie_df.sample(33)

In [None]:
movie_df.movieId.max(), movie_df.movieId.nunique()