In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm

import pandas as pd
import numpy as np

import time
from pathlib import Path

In [2]:
cur_dir = Path('.').absolute()
data_dir=cur_dir.parent/ 'data'

print('current directory: ',cur_dir)

current directory:  /home/t/aproject/movie-recommender-system-collaborative-filtering/notebooks


In [3]:
list(data_dir.iterdir())

[PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/tag.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/movie.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/link.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/rating.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/new_movie_df.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/movies.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/plot_embedding.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/new_user_df.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/genome_tags.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/genome_scores.csv')]

## Exploring Movie df

In [4]:
movie_df = pd.read_csv(data_dir/'movie.csv')

In [5]:
def get_movie_name(idx, df):
    try:
        return df[df.movieId==idx].title.values[0]
    except IndexError as e:
        print("IndexError:", idx)

def get_movie_id(movie_name, df):
    try:
        return df[df.title==movie_name].movieId.values[0]
    except IndexError as e:
        print("Movie not found in dataset")

In [6]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [7]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movie_df.isna().sum() # no nulls

movieId    0
title      0
genres     0
dtype: int64

In [9]:
movie_df.duplicated().sum() #no duplicates

0

In [10]:
movie_df.title.nunique()

27262

In [11]:

# Calculate the value counts for each movie title
title_value_counts = movie_df['title'].value_counts()

# Filter titles that appear more than once
duplicate_titles = title_value_counts[title_value_counts > 1].index.tolist()

print(duplicate_titles)

['Aladdin (1992)', 'Johnny Express (2014)', 'Chaos (2005)', 'Hamlet (2000)', '20,000 Leagues Under the Sea (1997)', 'Darling (2007)', 'Casanova (2005)', 'Paradise (2013)', 'Beneath (2013)', 'Girl, The (2012)', 'Clear History (2013)', 'Emma (1996)', 'Offside (2006)', 'Blackout (2007)', 'Men with Guns (1997)', 'War of the Worlds (2005)']


Some movies have multiple entries with different `movieid` , but it doesn't affect much 

so in genres column there seems to have no spaces bw genres, lets see want unique genres as there

In [12]:
all_genres = movie_df.genres.apply(lambda x : ' '.join(str(x).split('|'))).values.tolist() # split from |
all_genres = ' '.join(set(all_genres)).split() # join all strings and break them into words
all_genres = set(all_genres)  # make a set to find unique ones

In [13]:
print(all_genres, len(all_genres))

{'Horror', '(no', 'War', 'Comedy', 'Animation', 'Romance', 'Drama', 'Sci-Fi', 'Mystery', 'genres', 'Children', 'Action', 'Musical', 'IMAX', 'Adventure', 'listed)', 'Documentary', 'Film-Noir', 'Thriller', 'Crime', 'Western', 'Fantasy'} 22


there are 20 genres and 1 for movies with no genre (which is (no listed)) which is broken as '(no' and 'listed)'

<br>

there are alot of movies that only has 1,2 reviews , so now we will only consider mmovies which has more than 100 review.

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

## Exploraing User data

In [15]:
user_df = pd.read_csv(data_dir/'rating.csv', usecols=['userId','movieId','rating'])

In [16]:
# this columns are using too much precision for very low values, lowering the datatype precision
user_df['movieId'] = user_df['movieId'].astype('int32') # dont lower too much as it changes the numbers to accomodate to the range
user_df['userId'] = user_df['userId'].astype('int32')
user_df['rating'] = user_df['rating'].astype('float32')
print()




In [17]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

(138493, 1, 138493)

In [18]:
user_df.shape #(20000263,3)

(20000263, 3)

In [19]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 228.9 MB


In [20]:
user_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


finding out reviews per movies

In [21]:
movie_vote_count = user_df.movieId.value_counts()
movie_vote_count

movieId
296       67310
356       66172
318       63366
593       63299
480       59715
          ...  
125545        1
78873         1
112907        1
112909        1
110510        1
Name: count, Length: 26744, dtype: int64

In [22]:
print('Most voted movies')
i=20
for idx, count in zip(movie_vote_count.index ,movie_vote_count.to_list()):
    print(f"{get_movie_name(idx, movie_df):35} : {count}")
    i -= 1
    if i==0:
        break

Most voted movies
Pulp Fiction (1994)                 : 67310
Forrest Gump (1994)                 : 66172
Shawshank Redemption, The (1994)    : 63366
Silence of the Lambs, The (1991)    : 63299
Jurassic Park (1993)                : 59715
Star Wars: Episode IV - A New Hope (1977) : 54502
Braveheart (1995)                   : 53769
Terminator 2: Judgment Day (1991)   : 52244
Matrix, The (1999)                  : 51334
Schindler's List (1993)             : 50054
Toy Story (1995)                    : 49695
Fugitive, The (1993)                : 49581
Apollo 13 (1995)                    : 47777
Independence Day (a.k.a. ID4) (1996) : 47048
Usual Suspects, The (1995)          : 47006
Star Wars: Episode VI - Return of the Jedi (1983) : 46839
Batman (1989)                       : 46054
Star Wars: Episode V - The Empire Strikes Back (1980) : 45313
American Beauty (1999)              : 44987
Twelve Monkeys (a.k.a. 12 Monkeys) (1995) : 44980


In [23]:
movie_df_rating_filter = 100

In [24]:
popular_movieIds = movie_vote_count[movie_vote_count>=movie_df_rating_filter].index
popular_movieIds

Index([   296,    356,    318,    593,    480,    260,    110,    589,   2571,
          527,
       ...
       112911,   8201,  30867,    687,  71878,   4208,  59915,   5256,  51127,
        26746],
      dtype='int32', name='movieId', length=8546)

In [25]:
new_user_df = user_df[user_df['movieId'].isin(popular_movieIds)]
new_user_df.shape

(19706281, 3)

we only lost ~300k ratings while droppin alot of redundent movies while extremly low reviews.

same for the users, if users with certain number of reviews  will be used.

In [26]:
user_vote_count = user_df.userId.value_counts()
user_vote_count

userId
118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
89305       20
110463      20
96990       20
134747      20
6526        20
Name: count, Length: 138493, dtype: int64

In [27]:
print('Most user reviews')
i=20
for idx, count in zip(user_vote_count.index ,user_vote_count.to_list()):
    print(f"{idx:35} : {count}")
    i -= 1
    if i==0:
        break

Most user reviews
                             118205 : 9254
                               8405 : 7515
                              82418 : 5646
                             121535 : 5520
                             125794 : 5491
                              74142 : 5447
                              34576 : 5356
                             131904 : 5330
                              83090 : 5169
                              59477 : 4988
                             130767 : 4785
                              79159 : 4707
                               8963 : 4524
                              15617 : 4354
                              92011 : 4236
                              71975 : 4182
                              20132 : 4101
                              46470 : 4094
                              88820 : 4093
                              63147 : 3958


In [28]:
user_df_vote_filter = 500

In [29]:
popular_userIds = user_vote_count[user_vote_count>=user_df_vote_filter].index
popular_userIds

Index([118205,   8405,  82418, 121535, 125794,  74142,  34576, 131904,  83090,
        59477,
       ...
       134897,  50213,  63094, 111821, 103323,  86186,  63958,  28709,  80789,
        62328],
      dtype='int32', name='userId', length=7491)

In [30]:
new_user_df = new_user_df[new_user_df['userId'].isin(popular_userIds)]
new_user_df.shape

(6370818, 3)

In [31]:
new_movie_df = movie_df[movie_df['movieId'].isin(popular_movieIds)]
new_movie_df.shape

(8546, 3)

In [32]:
new_movie_df.to_csv(data_dir/'new_movie_df.csv')
new_user_df.to_csv(data_dir/'new_user_df.csv')

## dataset

In [33]:
user_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [34]:
# Define a custom dataset class
class MovieRatingDataset(Dataset):
    def __init__(self, df):
        self.df = df.copy()
        
        users = df.userId.sort_values().unique()
        movies = df.movieId.sort_values().unique()
        
        self.num_users = len(users)
        self.num_movies = len(movies) 
        
        self.userId2idx = {userId:idx for idx, userId in enumerate(users)}
        self.movieId2idx = {movieId:idx for idx, movieId in enumerate(movies)}
        
        self.idx2userId = {idx:userId for userId, idx in self.userId2idx.items()}
        self.idx2movieId = {idx:movieId for movieId, idx in self.movieId2idx.items()}
        
        self.df.movieId =  self.df.movieId.map(self.movieId2idx)
        self.df.userId =  self.df.userId.map(self.userId2idx)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        user_id = torch.tensor(self.df.iloc[idx]['userId'], dtype=torch.int32)
        movie_id = torch.tensor(self.df.iloc[idx]['movieId'], dtype=torch.int32)
        rating = torch.tensor(self.df.iloc[idx]['rating'], dtype=torch.float32)
        return user_id, movie_id, rating


# Model

In [35]:
import torch
import torch.nn as nn
from pathlib import Path

class RecommenderModel(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim, model_path:Path=None):
        super(RecommenderModel, self).__init__()
        self.model_path = model_path
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.user_embedding_bias = nn.Embedding(num_users, 1)
        self.movie_embedding_bias = nn.Embedding(num_movies, 1)
        self.out = nn.Linear(embedding_dim, 1)
        
        
        self.user_embedding.weight.data.uniform_(0, 0.05)
        self.movie_embedding.weight.data.uniform_(0, 0.05)
        self.user_embedding_bias.weight.data.uniform_(-0.01, 0.01)
        self.movie_embedding_bias.weight.data.uniform_(-0.01, 0.01)
        
    def forward(self, user_ids, movie_tags, debug=False):
   
        user_emb = self.user_embedding(user_ids)
        movie_emb = self.movie_embedding(movie_tags)
        
        user_emb_bias = self.user_embedding_bias(user_ids)
        movie_emb_bias = self.movie_embedding_bias(movie_tags)

        interaction = (user_emb * movie_emb) + user_emb_bias + movie_emb_bias
        output = self.out(interaction) 
        if debug:
            print('user_emb.shape   : ',user_emb.shape)
            print('movie_emb.shape  : ',movie_emb.shape)
            print('interaction.shape: ',interaction.shape)
            
            print('user_emb_bias.shape  : ',user_emb_bias.shape)
            print('movie_emb_bias.shape: ',movie_emb_bias.shape)
            
            print('output.shape     :',output.shape)

        return output
    
    def load_model(self, model_path=None):
        if model_path is None:
            model_path = self.model_path
        
        try:
            self.load_state_dict(torch.load(model_path))
            print('Model weights loaded.')
        except FileNotFoundError as e:
            print(f'Weights not found. {e}')
        except RuntimeError as e:
            raise(e)
            
            
    def save_model(self, model_path=None):
        if model_path is None:
            model_path = self.model_path
        torch.save(self.state_dict(), model_path)

In [36]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [37]:
model = RecommenderModel(10, 45, 8)
model

RecommenderModel(
  (user_embedding): Embedding(10, 8)
  (movie_embedding): Embedding(45, 8)
  (user_embedding_bias): Embedding(10, 1)
  (movie_embedding_bias): Embedding(45, 1)
  (out): Linear(in_features=8, out_features=1, bias=True)
)

In [38]:
test_out = model(torch.randint(1,10,(32,1)).squeeze(), torch.randint(1,45,(32,1)).squeeze(), debug=True).squeeze()
test_out.shape

user_emb.shape   :  torch.Size([32, 8])
movie_emb.shape  :  torch.Size([32, 8])
interaction.shape:  torch.Size([32, 8])
user_emb_bias.shape  :  torch.Size([32, 1])
movie_emb_bias.shape:  torch.Size([32, 1])
output.shape     : torch.Size([32, 1])


torch.Size([32])

In [39]:
def train_model(model, dataloader, optimizer, loss_function, num_epochs=10, device='cpu', data_percent=1.0, steps_per_epoch=None):
    model.to(device)
    print(f'{model.__class__.__name__} Running on: {device}')

    data_size = int(data_percent * len(dataloader))
    dataloader = iter(dataloader)

    for epoch in range(num_epochs):
        total_loss = 0.0
        total_mse = 0.0
        total_mae = 0.0
        total_samples = 0

        epoch_progress = tqdm(range(data_size), desc=f"Epoch [{epoch+1:2}/{num_epochs:2}]")
        
        if steps_per_epoch is not None:
            epoch_progress = tqdm(range(steps_per_epoch), desc=f"Epoch [{epoch+1:2}/{num_epochs:2}]")

        last_update_time = time.time() - 1.0  # Initialize to ensure the first update
        
        for _ in epoch_progress:
            try:
                batch = next(dataloader)
            except StopIteration:
                print("Dataloader is exhausted. Resetting or stopping training.")
                # You might want to break the loop or take some other action here
                break

            user_ids, movie_ids, ratings = batch

            
            user_ids = user_ids.to(device)
            movie_ids = movie_ids.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()
            
            outputs = model(user_ids, movie_ids).squeeze()
            
            
            loss = loss_function(outputs, ratings)
            mse = F.mse_loss(outputs, ratings)
            mae = F.l1_loss(outputs, ratings)
            
            loss.backward()
            optimizer.step()
            
            total_mse += mse.item()
            total_mae += mae.item()
            total_samples += len(ratings)
            total_loss += loss.item()

            formatted_loss = f"{loss.item():.8f}"
            formatted_mse = f"{mse.item():.8f}"
            formatted_mae = f"{mae.item():.8f}"
            
            current_time = time.time()
            if current_time - last_update_time > epoch_progress.mininterval:
                epoch_progress.set_postfix({"Loss": formatted_loss, "MSE": formatted_mse, "MAE": formatted_mae})
                epoch_progress.update()
                last_update_time = current_time

            if steps_per_epoch is not None and _ + 1 >= steps_per_epoch:
                break

        # epoch_progress.close()
        average_loss = total_loss / min(data_size, steps_per_epoch) if steps_per_epoch is not None else total_loss / data_size
        average_mse = total_mse / min(data_size, steps_per_epoch) if steps_per_epoch is not None else total_mse / data_size
        average_mae = total_mae / min(data_size, steps_per_epoch) if steps_per_epoch is not None else total_mae / data_size
        
        print(f"Epoch [{epoch+1:2}/{num_epochs:2}] - Average Loss: {average_loss:.8f} - Average MSE: {average_mse:.8f} - Average MAE: {average_mae:.8f}")
        print()

# training

## make Dataset

In [40]:
# Set batch size for DataLoader
batch_size = 32

# train
dataset = MovieRatingDataset(new_user_df)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

print('users : ',dataset.num_users)
print('movies: ',dataset.num_movies)

users :  7491
movies:  8546


In [41]:
# # Iterate through the DataLoader during training
# for batch in dataloader:

#     user_ids, movie_tags, ratings = batch
    
#     print("User IDs:", user_ids, user_ids.shape)
#     print("Movie ids:", movie_tags, movie_tags.shape)
#     print("Ratings:", ratings, ratings.shape)
#     break  # only print the first batch

In [42]:
# Initialize your model, optimizer, and loss function
num_users = dataset.num_users  # actual number of users #
num_movies = dataset.num_movies  # actual number of tokens
dim = 8

model = RecommenderModel(num_users, num_movies, dim)

# print(f'{num_users=}')
# print(f'{num_movies=}')
# print(f'{model=}')

In [43]:
optimizer = optim.AdamW(model.parameters(), lr=0.0015)
loss_function = nn.MSELoss()

In [44]:
# set model path
model_dir = cur_dir.parent/'models'
model_path = model_dir/'model.pth'
model.model_path=model_path

In [45]:
# load the model is exists
# model.load_model()

In [46]:
# device = 'cpu'
num_epochs = 10
device

'cuda'

In [47]:
train_model(model, dataloader,  optimizer, loss_function, num_epochs=num_epochs, device=device, data_percent=0.1)

RecommenderModel Running on: cuda


Epoch [ 1/10]: 100%|██████████| 19908/19908 [02:59<00:00, 111.17it/s, Loss=0.87323749, MSE=0.87323749, MAE=0.75021380]


Epoch [ 1/10] - Average Loss: 0.98994047 - Average MSE: 0.98994047 - Average MAE: 0.74334480



Epoch [ 2/10]: 100%|██████████| 19908/19908 [02:51<00:00, 116.24it/s, Loss=0.54065311, MSE=0.54065311, MAE=0.61686003]


Epoch [ 2/10] - Average Loss: 0.67889333 - Average MSE: 0.67889333 - Average MAE: 0.63396478



Epoch [ 3/10]: 100%|██████████| 19908/19908 [02:51<00:00, 116.37it/s, Loss=0.81010938, MSE=0.81010938, MAE=0.67042243]


Epoch [ 3/10] - Average Loss: 0.66091714 - Average MSE: 0.66091714 - Average MAE: 0.62414582



Epoch [ 4/10]: 100%|██████████| 19908/19908 [02:51<00:00, 115.92it/s, Loss=0.61759698, MSE=0.61759698, MAE=0.59600157]


Epoch [ 4/10] - Average Loss: 0.65272011 - Average MSE: 0.65272011 - Average MAE: 0.61955556



Epoch [ 5/10]: 100%|██████████| 19908/19908 [02:51<00:00, 115.95it/s, Loss=0.60617924, MSE=0.60617924, MAE=0.65021020]


Epoch [ 5/10] - Average Loss: 0.64523059 - Average MSE: 0.64523059 - Average MAE: 0.61606590



Epoch [ 6/10]: 100%|██████████| 19908/19908 [02:52<00:00, 115.35it/s, Loss=0.46859556, MSE=0.46859556, MAE=0.56067967]


Epoch [ 6/10] - Average Loss: 0.64124235 - Average MSE: 0.64124235 - Average MAE: 0.61392863



Epoch [ 7/10]: 100%|██████████| 19908/19908 [02:48<00:00, 118.13it/s, Loss=0.65299827, MSE=0.65299827, MAE=0.61116153]


Epoch [ 7/10] - Average Loss: 0.63439447 - Average MSE: 0.63439447 - Average MAE: 0.61050153



Epoch [ 8/10]: 100%|██████████| 19908/19908 [02:47<00:00, 118.61it/s, Loss=0.56063366, MSE=0.56063366, MAE=0.59850627]


Epoch [ 8/10] - Average Loss: 0.62847652 - Average MSE: 0.62847652 - Average MAE: 0.60706778



Epoch [ 9/10]: 100%|██████████| 19908/19908 [02:48<00:00, 118.44it/s, Loss=0.87602448, MSE=0.87602448, MAE=0.66765004]


Epoch [ 9/10] - Average Loss: 0.62261133 - Average MSE: 0.62261133 - Average MAE: 0.60451194



Epoch [10/10]: 100%|██████████| 19908/19908 [02:47<00:00, 118.63it/s, Loss=0.36934984, MSE=0.36934984, MAE=0.53011513]

Epoch [10/10] - Average Loss: 0.62109601 - Average MSE: 0.62109601 - Average MAE: 0.60386907






In [49]:
# save the model
model.save_model()

# Getting recommendations

In [50]:
trained_movie_embedding = model.movie_embedding.weight.data.cpu().numpy()
trained_movie_embedding.shape

(8546, 8)

In [51]:
from sklearn.cluster import KMeans

In [52]:
clusters = 10
kmeans = KMeans(n_clusters=clusters,random_state=0).fit(trained_movie_embedding)

  super()._check_params_vs_input(X, default_n_init=10)


In [53]:
for cluster in range(clusters):
    print('Cluster: ',cluster)
    movs = []
    
    for movidx in np.where(kmeans.labels_==cluster)[0]:
        # print(movidx)
        movieid = dataset.idx2movieId[movidx]
        movie_title = movie_df[movie_df.movieId==movieid].title.values
        movs.append(movie_title)
        print('\t',movie_title)
        
        if len(movs)==15:
            break

Cluster:  0
	 ['Jumanji (1995)']
	 ['Othello (1995)']
	 ['When Night Is Falling (1995)']
	 ['Two Bits (1995)']
	 ['Last Summer in the Hamptons (1995)']
	 ['Mary Reilly (1996)']
	 ['City Hall (1996)']
	 ['Muppet Treasure Island (1996)']
	 ['Catwalk (1996)']
	 ['Rumble in the Bronx (Hont faan kui) (1995)']
	 ['Before and After (1996)']
	 ['If Lucy Fell (1996)']
	 ['Steal Big, Steal Little (1995)']
	 ['Boomerang (1992)']
	 ['Pie in the Sky (1996)']
Cluster:  1
	 ['Leaving Las Vegas (1995)']
	 ['Richard III (1995)']
	 ['To Die For (1995)']
	 ['White Balloon, The (Badkonake sefid) (1995)']
	 ['Bottle Rocket (1996)']
	 ['Taxi Driver (1976)']
	 ['Chungking Express (Chung Hing sam lam) (1994)']
	 ['Beauty of the Day (Belle de jour) (1967)']
	 ['Clockers (1995)']
	 ['Crumb (1994)']
	 ['Devil in a Blue Dress (1995)']
	 ['Safe (1995)']
	 ['Smoke (1995)']
	 ['Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les) (1964)']
	 ['Burnt by the Sun (Utomlyonnye solntsem) (1994)']
Cluster:  2
	 ['Nix

In [54]:
import torch.nn.functional as F

def find_similar_movies(target_movie_embedding, all_movie_embeddings, top_n=5):
    with torch.inference_mode():
        # Calculate cosine similarity
        similarity_scores = F.cosine_similarity(target_movie_embedding, all_movie_embeddings, dim=1)
        
        # Sort movies based on similarity scores
        sorted_indices = torch.argsort(similarity_scores, descending=True)
        
        # Get top N similar movie indices
        top_indices = sorted_indices[:top_n]
        
        # return top_indices
        return top_indices

In [55]:
def more_movies(idx, n:int=10):
    if isinstance(idx, int):
        new_movie_id = dataset.movieId2idx[idx]
        print(f'Movie : {get_movie_name(idx, new_movie_df)}')

    elif isinstance(idx, str):
        new_movie_id = dataset.movieId2idx[get_movie_id(idx, new_movie_df)]
        print(f'Movie: {idx}')
        
    
    target_movie_embedding = model.movie_embedding(torch.tensor(new_movie_id).to(device)).unsqueeze(0)
    all_movie_embeddings = model.movie_embedding.weight.data

    # Find similar movies
    similar_movie_indices = find_similar_movies(target_movie_embedding, all_movie_embeddings, top_n=n+1)

    movies = []
    for num, i in enumerate(similar_movie_indices,1):
        # print(i)
        movies.append(movie_df[movie_df.movieId==dataset.idx2movieId[i.item()]]['title'].values[0])     
    return movies      

In [56]:
# movie_df[movie_df.title.str.contains('venger')]

In [57]:
random_movie_id = int(new_user_df.sample(1).movieId.values[0])
print(random_movie_id)

46664


In [58]:
random_movie_name = new_movie_df.sample(1).title.values[0]
print(random_movie_name)

Sanjuro (Tsubaki Sanjûrô) (1962)


In [59]:
# use this as search bar
search = 'avenger'
new_movie_df[new_movie_df.title.str.lower().str.contains(str(search))]

Unnamed: 0,movieId,title,genres
2069,2153,"Avengers, The (1998)",Action|Adventure
3602,3693,"Toxic Avenger, The (1985)",Comedy|Horror
3603,3694,"Toxic Avenger, Part II, The (1989)",Comedy|Horror
3604,3695,Toxic Avenger Part III: The Last Temptation of...,Comedy|Horror
17506,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
17874,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX


In [60]:
pp = ['Avengers, The (1998)','Avengers, The (2012)','Captain America: The First Avenger (2011)' ]

for i in pp:
    display(more_movies(idx = i))

Movie: Avengers, The (1998)


['Avengers, The (1998)',
 'Levity (2003)',
 'Resident Evil: Extinction (2007)',
 'Imagine Me & You (2005)',
 'Carrie (2013)',
 'Priest (2011)',
 'Tomcats (2001)',
 'Johnny Be Good (1988)',
 'Fatal Beauty (1987)',
 'Whole Ten Yards, The (2004)',
 'Original Sin (2001)']

Movie: Avengers, The (2012)


['Avengers, The (2012)',
 'Star Trek (2009)',
 'Captain America: The Winter Soldier (2014)',
 'How to Train Your Dragon (2010)',
 'Kung Fu Panda (2008)',
 'Iron Man (2008)',
 'Street Kings (2008)',
 'X-Men: First Class (2011)',
 'Sherlock Holmes (2009)',
 'Guardians of the Galaxy (2014)',
 'Batman Begins (2005)']

Movie: Captain America: The First Avenger (2011)


['Captain America: The First Avenger (2011)',
 'John Carter (2012)',
 'Street Kings (2008)',
 'Mission: Impossible - Ghost Protocol (2011)',
 'Hellboy (2004)',
 'Monster House (2006)',
 'Iron Man (2008)',
 'Killer, The (Die xue shuang xiong) (1989)',
 'Arthur Christmas (2011)',
 'Baby Mama (2008)',
 'Fast Five (Fast and the Furious 5, The) (2011)']

Our prediction looks perfect. 
> Note: more the ratings well adjusted the embeddings, movies with less ratings can be seen randomly at any place, because there place in the embedding space is not adjusted enough as highly rated movies.