In [1]:
## read data 

In [157]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn, optim
# Load the ratings, users and movies tables from the CSV files under ./processed_data.
# Later cells refer to these three frames by name (df_ratings, df_users, df_items).
df_ratings = pd.read_csv("./processed_data/rating_data.csv")
df_users = pd.read_csv("./processed_data/users_data.csv")
df_items = pd.read_csv("./processed_data/movies_data.csv")

In [158]:
# df_items

In [159]:
def combine_genres(df):
    """Collapse the one-hot genre columns of a movies frame into a single ``genres`` column.

    Every column that is not a known metadata column is treated as a genre flag.
    For each row, the names of the flag columns whose value equals 1 are joined
    with ``', '`` (in the sorted order returned by ``Index.difference``); rows
    with no flag set get ``'Unknown'``.

    The input frame is never modified, and calling the function on its own
    output returns an unchanged copy, so re-running the notebook cell is safe.

    :param df: movies DataFrame holding metadata columns plus one column per genre
    :return: new DataFrame without the genre flag columns and with a ``genres`` column
    """
    # 'genres' is listed so an already-combined frame is not mistaken for a flag column.
    metadata_columns = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'genres']
    genre_columns = df.columns.difference(metadata_columns)

    if 'genres' in df.columns and len(genre_columns) == 0:
        # Already combined: there is nothing left to collapse.
        return df.copy()

    genre_names = list(genre_columns)
    # Boolean matrix: one row per movie, one column per genre flag.
    flag_rows = df[genre_columns].eq(1).to_numpy()
    genres = [
        ', '.join(name for name, is_set in zip(genre_names, flags) if is_set) or 'Unknown'
        for flags in flag_rows
    ]

    # drop() returns a new frame, so the caller's frame keeps its original columns.
    combined = df.drop(columns=genre_columns)
    combined['genres'] = genres
    return combined
# Apply the function to the movies dataframe.
# This rebinds df_items to the combined frame; get_df later reads its 'genres' column.
df_items = combine_genres(df_items)

# Display the first rows of the updated dataframe to check the new 'genres' column
df_items.head()


Unnamed: 0,movie id,movie title,release date,IMDb URL,genres
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"Animation, Children's, Comedy"
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"Action, Adventure, Thriller"
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,Thriller
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"Action, Comedy, Drama"
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),"Crime, Drama, Thriller"


In [7]:
# create custom dataset

In [132]:
class MovieLensDataset(Dataset):
    """Torch dataset over three parallel sequences: user ids, movie ids and ratings.

    Sample ``i`` is assembled from ``users[i]``, ``movies[i]`` and ``ratings[i]``.
    """

    def __init__(self, users, movies, ratings):
        # Only references are stored; the sequences are not copied or validated.
        self.users, self.movies, self.ratings = users, movies, ratings

    def __len__(self):
        # The users sequence defines the number of samples.
        return len(self.users)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors (ids as long, rating as float)."""
        return {
            "users": torch.tensor(self.users[item], dtype=torch.long),
            "movies": torch.tensor(self.movies[item], dtype=torch.long),
            "ratings": torch.tensor(self.ratings[item], dtype=torch.float),
        }

In [133]:


import torch
from torch import nn

class RatingScaler(nn.Module):
    """Clamp predicted ratings into the closed interval ``[min_rating, max_rating]``.

    The defaults (0.0 and 5.0) are the bounds that used to be hard-coded, so
    ``RatingScaler()`` behaves exactly as before. The bounds are plain Python
    floats, not parameters or buffers, so the module adds nothing to a
    ``state_dict``.

    :param min_rating: lower bound applied to the predictions
    :param max_rating: upper bound applied to the predictions
    :raises ValueError: if ``min_rating`` is greater than ``max_rating``
    """

    def __init__(self, min_rating=0.0, max_rating=5.0):
        super().__init__()
        if min_rating > max_rating:
            raise ValueError(
                f"min_rating ({min_rating}) must not exceed max_rating ({max_rating})"
            )
        self.min_rating = float(min_rating)
        self.max_rating = float(max_rating)

    def forward(self, x):
        # Values outside the bounds are replaced by the nearest bound.
        return torch.clamp(x, min=self.min_rating, max=self.max_rating)

class RecommendationSystemModel(nn.Module):
    """Rating predictor built from user and movie embeddings followed by a small MLP.

    A user index and a movie index are each looked up in their own embedding
    table; the two vectors are concatenated and passed through
    ``fc1 -> ReLU -> dropout -> fc2``, and the scalar output is clamped by
    ``RatingScaler``.

    :param num_users: number of rows in the user embedding table
    :param num_movies: number of rows in the movie embedding table
    :param embedding_size: dimensionality of both embedding tables
    :param hidden_dim: width of the hidden fully connected layer
    :param dropout_rate: dropout probability applied after the hidden layer
    """

    def __init__(
        self,
        num_users,
        num_movies,
        embedding_size=256,
        hidden_dim=256,
        dropout_rate=0.2,
    ):
        super().__init__()
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.hidden_dim = hidden_dim
        self.user_embedding = nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.embedding_size
        )
        self.movie_embedding = nn.Embedding(
            num_embeddings=self.num_movies, embedding_dim=self.embedding_size
        )
        # The MLP input is the concatenation of one user and one movie vector.
        self.fc1 = nn.Linear(2 * self.embedding_size, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, 1)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.relu = nn.ReLU()
        self.rating_scaler = RatingScaler()

    def forward(self, users, movies):
        """Predict one rating per (user, movie) pair.

        :param users: 1-D long tensor of user indices
        :param movies: 1-D long tensor of movie indices, same length as ``users``
        :return: tensor of shape ``(batch, 1)`` with the clamped predictions
        """
        user_embedded = self.user_embedding(users)
        movie_embedded = self.movie_embedding(movies)
        combined = torch.cat([user_embedded, movie_embedded], dim=1)

        x = self.relu(self.fc1(combined))
        x = self.dropout(x)
        output = self.fc2(x)
        output = self.rating_scaler(output)

        return output

    def recommend_top_5(self, user_id, all_movie_ids, device='cpu'):
        """Return up to five movie ids with the highest predicted rating for one user.

        :param user_id: user index as used by the user embedding table
        :param all_movie_ids: sequence of candidate movie indices
        :param device: device on which the input tensors are created; the model
            itself is not moved, so it must already live on that device
        :return: list of at most five entries of ``all_movie_ids``, best first;
            empty when no candidates are given
        """
        num_candidates = len(all_movie_ids)
        if num_candidates == 0:
            # torch.topk cannot select from an empty tensor.
            return []

        user_tensor = torch.tensor([user_id] * num_candidates, dtype=torch.long).to(device)
        movie_tensor = torch.tensor(all_movie_ids, dtype=torch.long).to(device)

        # Score in eval mode (dropout off) but restore the caller's mode afterwards,
        # so calling this in the middle of training does not silently disable dropout.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                predicted_ratings = self(user_tensor, movie_tensor).squeeze(1)
        finally:
            self.train(was_training)

        # topk raises when k exceeds the number of candidates, so cap it.
        k = min(5, num_candidates)
        _, top_indices = torch.topk(predicted_ratings, k=k)
        return [all_movie_ids[i] for i in top_indices.cpu().tolist()]


In [134]:

from collections import defaultdict
from tqdm import tqdm

def train_model(model, train_loader, val_loader, num_epochs, k=50, threshold=3, lr=0.001, device='cuda'):
    """Train ``model`` with MSE loss and Adam, reporting precision/recall@k after each epoch.

    :param model: module called as ``model(users, movies)`` returning ``(batch, 1)`` predictions
    :param train_loader: loader yielding dicts with ``'users'``, ``'movies'`` and ``'ratings'`` tensors
    :param val_loader: loader of the same format used for validation; ``None`` skips validation
    :param num_epochs: number of passes over ``train_loader``
    :param k: cut-off passed to ``validate_model``
    :param threshold: relevance threshold passed to ``validate_model``
    :param lr: Adam learning rate
    :param device: device to train on; falls back to the CPU when CUDA is requested but unavailable
    :return: None; the model is updated in place
    """
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        # The default is 'cuda'; without this guard a CPU-only machine fails in model.to().
        print("CUDA is not available; training on CPU instead.")
        device = 'cpu'

    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        samples_seen = 0

        with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as pbar:
            for batch in train_loader:
                users = batch['users'].to(device)
                movies = batch['movies'].to(device)
                ratings = batch['ratings'].to(device)

                optimizer.zero_grad()
                outputs = model(users, movies).squeeze(1)
                loss = criterion(outputs, ratings)

                loss.backward()
                optimizer.step()

                # criterion returns the batch mean, so weight it by the batch size.
                batch_size = len(users)
                running_loss += loss.item() * batch_size
                samples_seen += batch_size
                # Average over the samples processed so far; dividing by the full
                # dataset size would under-report the loss until the last batch.
                pbar.set_postfix(loss=running_loss / samples_seen)
                pbar.update(1)

        if val_loader is not None:
            precision, recall = validate_model(model, val_loader, k, threshold, device)
            print(f"Validation Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}")

    print("Training complete.")

def calculate_precision_recall(user_ratings, k, threshold):
    """Compute precision@k and recall@k for one user's predictions.

    An item is *relevant* when its true rating is at least ``threshold`` and
    *recommended* when it is among the ``k`` highest estimates and its estimate
    is at least ``threshold``.

    The input list is not modified. When a metric is undefined (nothing
    recommended for precision, nothing relevant for recall) it is reported as
    0 rather than 1, so that a user for whom the model recommends nothing does
    not count as a perfect score in the averages.

    :param user_ratings: iterable of ``(estimated_rating, true_rating)`` pairs
    :param k: number of top-ranked items to consider
    :param threshold: minimum rating for an item to count as relevant / recommended
    :return: ``(precision, recall)``
    """
    # sorted() leaves the caller's list in its original order.
    ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
    top_k = ranked[:k]

    n_rel = sum(true_r >= threshold for _, true_r in ranked)
    n_rec_k = sum(est >= threshold for est, _ in top_k)
    n_rel_and_rec_k = sum(
        (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
    )

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precision, recall

def validate_model(model, val_loader, k, threshold, device='cpu'):
    """Average per-user precision@k and recall@k over a validation loader.

    Predictions are grouped by user, each user's metrics are computed with
    ``calculate_precision_recall``, and the per-user values are averaged.
    The model is switched to eval mode and left there.

    :param model: module called as ``model(users, movies)`` returning ``(batch, 1)`` predictions
    :param val_loader: iterable of dicts with ``'users'``, ``'movies'`` and ``'ratings'`` tensors
    :param k: cut-off passed to ``calculate_precision_recall``
    :param threshold: relevance threshold passed to ``calculate_precision_recall``
    :param device: device on which the model inputs are placed
    :return: ``(average_precision, average_recall)``; ``(0.0, 0.0)`` when the loader yields nothing
    """
    model.eval()
    user_ratings_comparison = defaultdict(list)

    with torch.no_grad():
        for valid_data in val_loader:
            users = valid_data["users"]
            ratings = valid_data["ratings"]
            output = model(users.to(device), valid_data["movies"].to(device))

            # One bulk tolist() per tensor instead of an .item() call per element.
            predictions = output[:, 0].tolist()
            for user, pred, true in zip(users.tolist(), predictions, ratings.tolist()):
                user_ratings_comparison[user].append((pred, true))

    if not user_ratings_comparison:
        # Nothing to average; avoid dividing by zero below.
        return 0.0, 0.0

    user_precisions = []
    user_based_recalls = []
    for user_ratings in user_ratings_comparison.values():
        precision, recall = calculate_precision_recall(user_ratings, k, threshold)
        user_precisions.append(precision)
        user_based_recalls.append(recall)

    average_precision = sum(user_precisions) / len(user_precisions)
    average_recall = sum(user_based_recalls) / len(user_based_recalls)

    return average_precision, average_recall


In [135]:
# Data Preparation
def prepare_data(df_ratings, batch_size=16,shuffle=True):
    """Build a DataLoader of (user, movie, rating) samples from a ratings frame.

    The raw ``user_id`` and ``item_id`` values are shifted down by one to obtain
    the indices handed to the dataset. The shift is applied to copies of the
    columns, so ``df_ratings`` itself is left unchanged and calling this
    function repeatedly on the same frame always yields the same indices.

    :param df_ratings: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns
    :param batch_size: number of samples per batch
    :param shuffle: whether the loader reshuffles the samples
    :return: DataLoader over a ``MovieLensDataset``
    """
    # Arithmetic on the extracted arrays creates new arrays; the frame is not written to.
    users = df_ratings['user_id'].to_numpy() - 1
    movies = df_ratings['item_id'].to_numpy() - 1
    ratings = df_ratings['rating'].to_numpy()

    dataset = MovieLensDataset(users, movies, ratings)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [136]:
from sklearn import model_selection
# Hold out 10% of the ratings for validation. Stratifying on the rating value keeps the
# rating distribution similar in both parts, and random_state makes the split repeatable.
df_train, df_val = model_selection.train_test_split(
    df_ratings, test_size=0.1, random_state=3, stratify=df_ratings.rating.values
)

In [137]:
# Training batches of 64 use prepare_data's default shuffle=True;
# validation is read one sample at a time, in a fixed order.
train_loader = prepare_data(df_train,batch_size=64)
val_loader = prepare_data(df_val,batch_size=1,shuffle=False)

In [138]:
# Size the embedding tables by the largest raw id, not by the number of distinct ids:
# prepare_data turns raw id N into index N - 1, so a gap in the ids would otherwise
# produce an index past the end of the table. For contiguous 1..N ids both agree.
num_users = int(df_users['user_id'].max())
num_movies = int(df_items["movie id"].max())
model = RecommendationSystemModel(num_users=num_users, num_movies=num_movies)

In [None]:
# Train for 10 epochs; no device is passed, so train_model's default ('cuda') applies.
train_model(model, train_loader, val_loader, num_epochs=10, k=50, threshold=3, lr=0.001)

In [139]:
# map_location="cpu" lets the checkpoint load on a machine without a GPU even if it was
# saved from a CUDA model; load_state_dict then copies the values into the model's tensors.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
weights = torch.load("models/recommendation_model.pth", map_location="cpu")
model.load_state_dict(weights)

<All keys matched successfully>

In [140]:
# Show the module tree to check the layer sizes.
model

RecommendationSystemModel(
  (user_embedding): Embedding(943, 256)
  (movie_embedding): Embedding(1682, 256)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (rating_scaler): RatingScaler()
)

In [217]:
def recommend_movies_for_user_with_details(model, user_id, df_users, df_movies,df_ratings, top_k=5, device='cpu'):
    """
    Recommend top K movies for a given user with user and movie details.

    ``user_id`` is the 0-based index used by the model; the raw id stored in the
    data frames is ``user_id + 1``. Movies the user has already rated are
    excluded from the candidates.

    :param model: Trained recommendation model
    :param user_id: 0-based index of the user to predict for
    :param df_users: DataFrame containing user data (raw ``user_id`` column)
    :param df_movies: DataFrame containing movie data (raw ``movie id`` column)
    :param df_ratings: DataFrame of known ratings (raw ``user_id`` / ``item_id`` columns)
    :param top_k: Number of top recommendations to return
    :param device: 'cpu' or 'cuda'; the model is moved there before scoring
    :return: Dictionary containing user details, movie details and recommendation scores
    :raises IndexError: if the user is not present in ``df_users``
    """
    model.to(device)
    model.eval()

    # The frames hold raw ids, which are one greater than the model's indices.
    raw_user_id = user_id + 1

    user_rows = df_users[df_users['user_id'] == raw_user_id].to_dict('records')
    if not user_rows:
        raise IndexError(f"user id {raw_user_id} not found in df_users")
    user_details = user_rows[0]

    # Filter on the raw id: comparing the 0-based index here would remove the
    # movies rated by the previous user instead of this one.
    watched_ids = set(df_ratings.loc[df_ratings["user_id"] == raw_user_id, "item_id"].unique())
    # Sorting makes the candidate order, and therefore tie-breaking, repeatable.
    candidate_ids = sorted(set(df_movies['movie id'].unique()) - watched_ids)
    candidate_indices = [int(movie_id) - 1 for movie_id in candidate_ids]

    if not candidate_indices:
        # The user has rated every movie; there is nothing left to score.
        return {
            'user_details': user_details,
            'recommendations': []
        }

    user_tensor = torch.tensor([user_id] * len(candidate_indices), dtype=torch.long).to(device)
    movie_tensor = torch.tensor(candidate_indices, dtype=torch.long).to(device)

    with torch.no_grad():
        predictions = model(user_tensor, movie_tensor).squeeze(1)

    predictions = predictions.cpu().numpy()
    ranked = sorted(zip(candidate_indices, predictions), key=lambda pair: pair[1], reverse=True)

    recommended_movies = []
    for movie_index, score in ranked[:top_k]:
        # Map the model index back to the raw movie id to fetch its details.
        movie_details = df_movies[df_movies['movie id'] == movie_index + 1].to_dict('records')
        if movie_details:
            recommended_movies.append({
                'movie_details': movie_details[0],
                'score': score
            })

    return {
        'user_details': user_details,
        'recommendations': recommended_movies
    }

In [218]:
# Raw id of the user to inspect.
sample_user_id = 186
# The recommendation helper expects the 0-based index, i.e. the raw id minus one.
sample_user_id = sample_user_id - 1 
top_k = 5
recommendations = recommend_movies_for_user_with_details(model, sample_user_id, df_users, df_items,df_ratings, top_k=top_k)

# The printed user id comes from the df_users record, so it is the raw id again.
print(f"Top {top_k} recommended movies for user {recommendations['user_details']['user_id']}:")
for i, movie_rec in enumerate(recommendations['recommendations'], 1):
    print(f" {movie_rec['movie_details']['movie id']} . {movie_rec['movie_details']['movie title']} (Score: {movie_rec['score']:.2f}) ( genre : {movie_rec['movie_details']['genres'] }) ")

Top 5 recommended movies for user 186:
 1125 . Innocents, The (1961) (Score: 5.00) ( genre : Thriller) 
 64 . Shawshank Redemption, The (1994) (Score: 4.94) ( genre : Drama) 
 1467 . Saint of Fort Washington, The (1993) (Score: 4.77) ( genre : Drama) 
 300 . Air Force One (1997) (Score: 4.64) ( genre : Action, Thriller) 
 174 . Raiders of the Lost Ark (1981) (Score: 4.60) ( genre : Action, Adventure) 


In [112]:
# user past behaviour 

In [219]:

def get_df(df_ratings,sample_user_id, df_movies=None):
    """Return one user's rated movies with title and genres, highest rating first.

    :param df_ratings: ratings DataFrame with raw ``user_id``, ``item_id`` and ``rating`` columns
    :param sample_user_id: 0-based user index; the raw id looked up is ``sample_user_id + 1``
    :param df_movies: movies DataFrame with ``movie id``, ``movie title`` and ``genres``
        columns; defaults to the notebook-level ``df_items`` when omitted
    :return: DataFrame with columns ``item_id``, ``movie title``, ``genres``, ``rating``
    """
    if df_movies is None:
        # Backward-compatible default: fall back to the notebook-level movies frame.
        df_movies = df_items

    user_ratings = df_ratings[df_ratings["user_id"] == sample_user_id + 1]
    # Inner merge: ratings for movies missing from the movies frame are dropped.
    user_ratings_with_genres = pd.merge(
        user_ratings, df_movies, left_on='item_id', right_on='movie id', how='inner'
    )

    sorted_user_ratings_with_genres = user_ratings_with_genres.sort_values('rating', ascending=False)
    return sorted_user_ratings_with_genres[["item_id","movie title","genres","rating"]]


In [220]:
# sample_user_id is still the 0-based index here; get_df adds one to obtain the raw id.
sorted_user_ratings_with_genres = get_df(df_ratings,sample_user_id)

In [221]:
# Show the user's rating history (sorted by rating, highest first) next to the recommendations above.
sorted_user_ratings_with_genres

Unnamed: 0,item_id,movie title,genres,rating
46,79,"Fugitive, The (1993)","Action, Thriller",5
45,1016,Con Air (1997),"Action, Adventure, Thriller",5
33,939,Murder in the First (1995),"Drama, Thriller",5
27,117,"Rock, The (1996)","Action, Adventure, Thriller",5
26,44,Dolores Claiborne (1994),"Drama, Thriller",5
...,...,...,...,...
79,12,"Usual Suspects, The (1995)","Crime, Thriller",1
24,554,Waterworld (1995),"Action, Adventure",1
71,258,Contact (1997),"Drama, Sci-Fi",1
65,1083,Albino Alligator (1996),"Crime, Thriller",1


In [39]:
# recommendations["recommendations"]