###**1. Preprocessing the data**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
##importing the necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import re
import os

In [3]:
##Reading the datasets
movies_df = pd.read_csv('/content/drive/MyDrive/ml-25m/movies.csv')
ratings_df = pd.read_csv('/content/drive/MyDrive/ml-25m/ratings.csv')
imdb_df = pd.read_csv('/content/drive/MyDrive/imdb_ratings.csv')
links_df = pd.read_csv('/content/drive/MyDrive/ml-25m/links.csv')

In [4]:
##Checking the number of features and entries in our datasets
movies_df.shape, ratings_df.shape

((62423, 3), (25000095, 4))

In [5]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [7]:
##Checking for null values in movies dataset
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [8]:
##checking for null values in ratings dataset
ratings_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [9]:
##Checking the number of unique users in the ratings dataset
print(f"There are {len(pd.unique(ratings_df['userId']))} unique users")

There are 162541 unique users


In [10]:
##A function for flattening the columns when performing join operation
def flatten_cols(df):
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df
pd.DataFrame.flatten_cols = flatten_cols

In [11]:
##Information about movie ratings and count
movie_ratings = movies_df.merge(
    ratings_df.groupby('movieId', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols(),
    on='movieId'
)
movie_ratings.head()

Unnamed: 0,movieId,title,genres,rating count,rating mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,57309,3.893708
1,2,Jumanji (1995),Adventure|Children|Fantasy,24228,3.251527
2,3,Grumpier Old Men (1995),Comedy|Romance,11804,3.142028
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2523,2.853547
4,5,Father of the Bride Part II (1995),Comedy,11714,3.058434


In [12]:
links_imdb = pd.merge(links_df, imdb_df, on="imdbId")
links_imdb.shape
links_imdb.head()

Unnamed: 0,movieId,imdbId,tmdbId,averageRating
0,1,114709,862.0,8.3
1,2,113497,8844.0,7.1
2,3,113228,15602.0,6.6
3,4,114885,31357.0,6.0
4,5,113041,11862.0,6.1


In [13]:
imdb_ratings=links_imdb.drop(['tmdbId'], axis=1)

In [14]:
imdb_ratings.head(), imdb_ratings.shape

(   movieId  imdbId  averageRating
 0        1  114709            8.3
 1        2  113497            7.1
 2        3  113228            6.6
 3        4  114885            6.0
 4        5  113041            6.1,
 (53789, 3))

In [15]:
##Merging the imdb ratings with movies
movie_imdb_ratings = pd.merge(movies_df, imdb_ratings, on="movieId")
movie_imdb_ratings.head()

Unnamed: 0,movieId,title,genres,imdbId,averageRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,8.3
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,7.1
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,6.6
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,6.0
4,5,Father of the Bride Part II (1995),Comedy,113041,6.1


In [16]:
movies_df = movies_df.drop(movies_df[movies_df['genres'] == '(no genres listed)'].index)
##Store these movie_ids and delete from user_ratings

In [17]:
##Ranking the movies based on their rating count
movie_ratings[['title','rating count','rating mean']].sort_values('rating count', ascending=False).head()

Unnamed: 0,title,rating count,rating mean
351,Forrest Gump (1994),81491,4.048011
314,"Shawshank Redemption, The (1994)",81482,4.413576
292,Pulp Fiction (1994),79672,4.188912
585,"Silence of the Lambs, The (1991)",74127,4.151342
2480,"Matrix, The (1999)",72674,4.154099


In [18]:
##Defining a mask function to return a filtered DF based on mean ratings
def mask(df, key, function):
  return df[function(df[key])]
pd.DataFrame.mask = mask

In [19]:
##Ranking movies by their average ratings when the count is greater than 10000
(movie_ratings[['title', 'rating count', 'rating mean']]
 .mask('rating count', lambda x: x > 10000)
 .sort_values('rating mean', ascending=False)
 .head(10))
##Take mean ratings from here

Unnamed: 0,title,rating count,rating mean
314,"Shawshank Redemption, The (1994)",81482,4.413576
840,"Godfather, The (1972)",52498,4.324336
49,"Usual Suspects, The (1995)",55366,4.284353
1190,"Godfather: Part II, The (1974)",34188,4.261759
1930,Seven Samurai (Shichinin no samurai) (1954),13367,4.254769
522,Schindler's List (1993),60411,4.247579
1173,12 Angry Men (1957),16569,4.243014
883,Rear Window (1954),20162,4.237948
2867,Fight Club (1999),58773,4.228311
1164,One Flew Over the Cuckoo's Nest (1975),36058,4.218662


In [20]:
##One hot coding for genres
genre = movies_df['genres']
genre_bin = genre.str.get_dummies()
genre_bin

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62417,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
62418,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
62419,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
62420,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [21]:
movie_ids = movies_df['movieId'].values
movie_ids

array([     1,      2,      3, ..., 209159, 209163, 209171])

In [22]:
movie_ratings = ratings_df.groupby('movieId')['rating'].mean()
movie_ratings
##This includes movies with no genres as well delete them first and take mean of these ratings

movieId
1         3.893708
2         3.251527
3         3.142028
4         2.853547
5         3.058434
            ...   
209157    1.500000
209159    3.000000
209163    4.500000
209169    3.000000
209171    3.000000
Name: rating, Length: 59047, dtype: float64

###**Part 2: Applying train test split**

In [23]:
movie_ratings=movie_ratings.sample(n=57361)

In [24]:
feature_vector = np.column_stack((genre_bin, movie_ratings))

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, _, _ = train_test_split(feature_vector, movie_ids, test_size=0.2, random_state=42)

In [46]:
X_test

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.48493976],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        2.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        2.60169492],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.125     ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.75      ],
       [1.        , 0.        , 1.        , ..., 1.        , 0.        ,
        2.92857143]])

###**Part 3: Training the Neural Network**

In [26]:
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='linear'))

In [27]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [28]:
model.fit(X_train, X_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d3e4c3bd300>

In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               5376      
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 38401 (150.00 KB)
Trainable params: 38401 (150.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
#model trained after adding dropout layers to hidden layers

In [30]:
model2 = Sequential()

model2.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='linear'))

model2.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

model2.fit(X_train, X_train, epochs=10, batch_size=32)

model2.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               5376      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 38401 (150.00 KB)
Trainable params: 38401 (150.00 KB)
Non-tra

In [31]:
from tensorflow.keras.optimizers import Adam
model3 = Sequential()

model3.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model3.add(Dropout(0.2))
model3.add(Dense(256, activation='relu'))
model3.add(Dropout(0.2))
model3.add(Dense(128, activation='relu'))
model3.add(Dropout(0.2))
model3.add(Dense(1, activation='sigmoid'))

model3.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

model3.fit(X_train, X_train, epochs=10, batch_size=32)

model3.summary()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 256)               5376      
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 256)               65792     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_8 (Dense)             (None, 128)               32896     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
         













###**Part 4: Generating Recommendations**

In [41]:
target_movie_id = 12  # Change this to the movie you want recommendations for
target_movie_features = feature_vector[target_movie_id - 1].reshape(1, -1)
movie_similarity = cosine_similarity(target_movie_features, feature_vector)

In [42]:
# Get movie recommendations based on similarity
similar_movies = np.argsort(movie_similarity, axis=1)[:, ::-1]

In [43]:
# Display top N movie recommendations (e.g., top 10)
top_n = 10
recommended_movie_ids = [movie_ids[i] for i in similar_movies[0][:top_n]]

In [44]:
# Print the recommended movie titles
recommended_movie_titles = movie_imdb_ratings[movie_imdb_ratings['movieId'].isin(recommended_movie_ids)][['title','averageRating']]
print(recommended_movie_titles)

                                                   title  averageRating
11                    Dracula: Dead and Loving It (1995)            5.8
821    Tales from the Crypt Presents: Bordello of Blo...            5.4
3585                  Toxic Avenger, Part II, The (1989)            5.0
22381                                    Hellgate (1989)            3.6
26661                             Suburban Gothic (2014)            5.5
32867                                 The Willies (1990)            5.1
36058               I Survived a Zombie Holocaust (2014)            5.0
50522                               Blood Salvage (1990)            4.9
