## Install Libraries

In [1]:
import os
import json
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
import seaborn as sns
import botocore.exceptions


from io import StringIO
from pandas import DataFrame
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate


## Data Preparation

### Load the datasets

In [2]:
#handle data types when loading file
movies_metadata_df = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv', dtype={'id': 'str', 'popularity': 'str'})
ratings_df = pd.read_csv("/kaggle/input/the-movies-dataset/ratings.csv")
links_df = pd.read_csv('/kaggle/input/the-movies-dataset/links.csv')


# Display the first few rows of each DataFrame
print("Ratings DataFrame:")
print(ratings_df.head())

print("\nLinks DataFrame:")
print(links_df.head())

print("\nMovies Metadata DataFrame:")
print(movies_metadata_df.head())

Ratings DataFrame:
   userId  movieId  rating   timestamp
0       1      110     1.0  1425941529
1       1      147     4.5  1425942435
2       1      858     5.0  1425941523
3       1     1221     5.0  1425941546
4       1     1246     5.0  1425941556

Links DataFrame:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0

Movies Metadata DataFrame:
   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {

### Merge datasets into one dataframe

In [3]:
# Merge 'ratings_df' and 'links_df' based on the "movieId" column
ratings_links_df = pd.merge(ratings_df, links_df, on='movieId', how='inner')

# Convert 'imdbId' in the merged DataFrame to object type
ratings_links_df['imdbId'] = ratings_links_df['imdbId'].astype(str)

# Merge based on the "imdbId" and "id" columns
df = pd.merge(ratings_links_df, movies_metadata_df, left_on='imdbId', right_on='id', how='inner')

# Display the resulting DataFrame
print("Merged DataFrame:")
print(df.head())

Merged DataFrame:
   userId  movieId  rating   timestamp  imdbId  tmdbId  adult  \
0       1      858     5.0  1425941523   68646   238.0  False   
1       2     1210     4.0   867039325   86190  1892.0  False   
2       2     1233     4.0   867039820   82096   387.0  False   
3       3      858     4.0  1048076945   68646   238.0  False   
4       4      223     4.0  1042668576  109445  2292.0  False   

                               belongs_to_collection     budget  \
0                                                NaN          0   
1  {'id': 329220, 'name': 'Adventures of a ...', ...          0   
2                                                NaN     947000   
3                                                NaN          0   
4  {'id': 386382, 'name': 'Frozen Collection', 'p...  150000000   

                                              genres  ... release_date  \
0                                                 []  ...   2002-02-28   
1                     [{'id': 35, 'name'

In [4]:
df.shape

(2040650, 30)

In [5]:
df.describe(include='object')

Unnamed: 0,imdbId,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,popularity,poster_path,production_companies,production_countries,release_date,spoken_languages,status,tagline,title,video
count,2040650,2040650,222008,2040650,2040650,318174,2040650,2040650,2040646,2040650,...,2040650.0,2029476,2040650,2040650,2039524,2040650,1978831,845822,2040650,2040650
unique,4289,2,322,237,941,604,4289,4289,50,4261,...,4216.0,4273,2400,370,3473,330,4,1844,4242,2
top,86190,False,"{'id': 329220, 'name': 'Adventures of a ...', ...",0,"[{'id': 35, 'name': 'Comedy'}]",http://www.thefinalmovie.com/,86190,tt0074094,en,Adventures Of A Taxi Driver,...,0.418006,/ApTxiWvCDeEMSDH5ILI0WNmQLIw.jpg,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",1976-04-30,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He gets more than his fare share...!,Adventures Of A Taxi Driver,False
freq,62714,2040540,62714,1757722,280319,30043,62714,62714,1459131,62714,...,62714.0,62714,697681,725488,62714,812645,1964029,62714,62714,2038129


## Data Cleaning and Transformation

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2040650 entries, 0 to 2040649
Data columns (total 30 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   userId                 int64  
 1   movieId                int64  
 2   rating                 float64
 3   timestamp              int64  
 4   imdbId                 object 
 5   tmdbId                 float64
 6   adult                  object 
 7   belongs_to_collection  object 
 8   budget                 object 
 9   genres                 object 
 10  homepage               object 
 11  id                     object 
 12  imdb_id                object 
 13  original_language      object 
 14  original_title         object 
 15  overview               object 
 16  popularity             object 
 17  poster_path            object 
 18  production_companies   object 
 19  production_countries   object 
 20  release_date           object 
 21  revenue                float64
 22  runtime           

In [7]:
#drop irrelevant columns
df = df.drop(['tmdbId', 'homepage', 'belongs_to_collection', 'poster_path', 'original_title', 'overview', 'status', 'tagline', 'video'], axis=1)

In [8]:
#Check which columns have nulls
df.isnull().sum()

userId                      0
movieId                     0
rating                      0
timestamp                   0
imdbId                      0
adult                       0
budget                      0
genres                      0
id                          0
imdb_id                     0
original_language           4
popularity                  0
production_companies        0
production_countries        0
release_date             1126
revenue                     0
runtime                 12096
spoken_languages            0
title                       0
vote_average                0
vote_count                  0
dtype: int64

In [9]:
#Handle missing values
df['original_language'] = df['original_language'].fillna(df['original_language'].mode()[0])
df = df.dropna(subset=['release_date'])
df['runtime'] = df['runtime'].fillna(df['runtime'].median())

# Convert Data Types
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Check for remaining missing values
print(df.isnull().sum())

userId                  0
movieId                 0
rating                  0
timestamp               0
imdbId                  0
adult                   0
budget                  0
genres                  0
id                      0
imdb_id                 0
original_language       0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
title                   0
vote_average            0
vote_count              0
dtype: int64


In [10]:
# Convert 'release_date' to datetime and extract the year
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = df['release_date'].dt.year

# Check the data types
print(df.dtypes)

userId                           int64
movieId                          int64
rating                         float64
timestamp                        int64
imdbId                          object
adult                           object
budget                          object
genres                          object
id                              object
imdb_id                         object
original_language               object
popularity                      object
production_companies            object
production_countries            object
release_date            datetime64[ns]
revenue                        float64
runtime                        float64
spoken_languages                object
title                           object
vote_average                   float64
vote_count                     float64
year                             int32
dtype: object


In [11]:
# Convert 'popularity' to float in the 'data' DataFrame
df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')


# Select relevant features
selected_features = df[['userId', 'movieId', 'rating', 'title', 'vote_average', 'popularity']]

# Display the updated DataFrame
print(selected_features.head())

# Drop columns not in the list
final_data = selected_features.copy()

# Specify the file path where you want to save the CSV file
csv_file_path = '/kaggle/working/final_data.csv'

# Save the DataFrame to CSV
final_data.to_csv(csv_file_path, index=False)

# Print a message indicating the successful save
print(f"DataFrame saved to {csv_file_path}")



final_data.info()

   userId  movieId  rating                        title  vote_average  \
0       1      858     5.0               Superprodukcja           2.0   
1       2     1210     4.0  Adventures Of A Taxi Driver           6.8   
2       2     1233     4.0             The Painted Veil           5.2   
3       3      858     4.0               Superprodukcja           2.0   
4       4      223     4.0                       Frozen           7.3   

   popularity  
0    0.053215  
1    0.418006  
2    0.631309  
3    0.053215  
4   24.248243  
DataFrame saved to /kaggle/working/final_data.csv
<class 'pandas.core.frame.DataFrame'>
Index: 2039524 entries, 0 to 2040649
Data columns (total 6 columns):
 #   Column        Dtype  
---  ------        -----  
 0   userId        int64  
 1   movieId       int64  
 2   rating        float64
 3   title         object 
 4   vote_average  float64
 5   popularity    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 108.9+ MB


In [12]:
# Load data using Surprise
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(final_data[['userId', 'movieId', 'rating']], reader)


In [13]:
algo = SVD()
trainset = surprise_data.build_full_trainset()
algo.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7cbe3e55ffd0>

In [14]:
# Mapping between raw and inner movie IDs
raw_movie_ids = [trainset.to_raw_iid(inner_id) for inner_id in range(trainset.n_items)]
movie_titles = final_data.loc[final_data['movieId'].isin(raw_movie_ids), ['movieId', 'title']].drop_duplicates()

# Make predictions
user_id = 1
item_id = 1
raw_item_id = trainset.to_raw_iid(item_id)
movie_title = movie_titles.loc[movie_titles['movieId'] == raw_item_id, 'title'].values[0]

prediction = algo.predict(user_id, item_id)
print(f"Predicted rating for user {user_id} and movie {movie_title}: {prediction.est}")

# Evaluate the model (optional)
cross_validate(algo, surprise_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Generate top-N recommendations for a user
user_id = 1
user_items = trainset.ur[user_id]
user_unseen_items = [item for item in trainset.all_items() if item not in user_items]
predictions = [algo.predict(user_id, item) for item in user_unseen_items]

Predicted rating for user 1 and movie Adventures Of A Taxi Driver: 3.6563104295639577
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8877  0.8865  0.8887  0.8878  0.8896  0.8881  0.0010  
MAE (testset)     0.6825  0.6818  0.6829  0.6823  0.6831  0.6825  0.0005  
Fit time          37.07   38.35   38.31   38.00   37.43   37.83   0.50    
Test time         5.91    5.77    5.88    5.92    7.20    6.13    0.54    


In [15]:
# Get top-N recommendations with movie titles
top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:10]
top_n_movie_ids = [trainset.to_raw_iid(prediction.iid) for prediction in top_n]
top_n_movie_titles = movie_titles.loc[movie_titles['movieId'].isin(top_n_movie_ids), 'title']

# Create a DataFrame for better presentation
top_n_df = pd.DataFrame({'Movie Title': top_n_movie_titles, 'Predicted Rating': [pred.est for pred in top_n]})

print(f"\nTop 10 recommended movies for user {user_id}:\n")
print(top_n_df)



Top 10 recommended movies for user 1:

                                   Movie Title  Predicted Rating
189                                 Neighbours          4.462816
11984               ...tick... tick... tick...          4.381289
19462                      Alone Yet Not Alone          4.372779
28651                                Blind Man          4.347605
30735    How Much Wood Would a Woodchuck Chuck          4.302680
65892             Nora Roberts' Midnight Bayou          4.266929
255934                The Trial of Joan of Arc          4.262242
348823                      Band Baaja Baaraat          4.257988
349336          After the Wall: A World United          4.242694
1139038                      The Invisible Man          4.232530
