In [None]:

import numpy as np 
import pandas as pd 
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.dump import dump
from surprise.dump import load

import seaborn as sns

# Apply seaborn's "whitegrid" theme; set_theme is the preferred spelling of sns.set.
sns.set_theme(style="whitegrid")

# Load Data

In [None]:
# Read the ratings sample into a DataFrame; later cells use its
# 'userId', 'movieId' and 'rating' columns.
ratings_path = '/kaggle/input/the-movies-dataset/ratings_small.csv'
df = pd.read_csv(ratings_path)

# Report (rows, columns), then let the notebook render the first rows.
print(df.shape)
df.head()

# Basic Statistics

In [None]:
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would append a stray "None").
df.info()

**No null values**

**Plot the Distribution of ratings**

In [None]:
# Histogram of the rating values; missing ratings are dropped before binning.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['rating'].dropna(), bins=10, color='blue', edgecolor='black', alpha=0.7)

# Title and axis labels so the figure can be read on its own.
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')

# Render the figure.
plt.show()

**Plot Average rating per user**

In [None]:
import warnings

# Hide the "use_inf_as_na option is deprecated" warning so it does not clutter
# the cell output (presumably raised while seaborn draws the plot — TODO confirm).
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")

# Mean rating given by each user, sorted ascending.
average_rating_per_user = df.groupby('userId')['rating'].mean().sort_values()

# Histogram with a KDE overlay of the per-user means.
fig, ax = plt.subplots(figsize=(14, 7))
sns.histplot(average_rating_per_user, bins=30, kde=True, color='purple', ax=ax)
ax.set_title('Average Rating per User')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Frequency')
plt.show()



# Build Interaction Matrix

In [None]:
# Give every distinct movie / user ID a dense 0-based position, numbered in
# order of first appearance in the ratings frame.
movie_index_map = {
    movie_id: position for position, movie_id in enumerate(df['movieId'].unique())
}
user_index_map = {
    user_id: position for position, user_id in enumerate(df['userId'].unique())
}

# Translate each rating row into (row, column) coordinates: users are rows,
# movies are columns, and the rating is the stored value.
row_indices = [user_index_map[user_id] for user_id in df['userId']]
col_indices = [movie_index_map[movie_id] for movie_id in df['movieId']]
interaction_values = df['rating']

matrix_shape = (len(user_index_map), len(movie_index_map))
interaction_matrix = csr_matrix(
    (interaction_values, (row_indices, col_indices)),
    shape=matrix_shape,
)

print(interaction_matrix)

# Train SVD Model

In [None]:
# Fix the random seed so cross-validation scores and the fitted model are
# reproducible under "Restart & Run All". SVD takes the seed explicitly;
# cross_validate(cv=5) builds its folds without an explicit seed, so NumPy's
# global RNG is seeded as well (per the Surprise reproducibility FAQ).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Use the observed rating bounds as the scale instead of a hard-coded (0, 5),
# so predictions are clipped to the range the data actually spans.
rating_bounds = (df['rating'].min(), df['rating'].max())
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

model = SVD(random_state=RANDOM_SEED)

# 5-fold cross-validation; verbose=True prints the per-fold RMSE / MAE table.
# The returned dict is kept so the scores can be inspected later.
cv_results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on every available rating to obtain the model used for recommendations.
trainset = data.build_full_trainset()
model.fit(trainset)

In [None]:

# Serialise the fitted algorithm to disk with surprise.dump.dump.
model_filename = 'trained_model.pkl'
dump(file_name=model_filename, algo=model)
print(f"Model saved to {model_filename}")

In [None]:
# surprise.dump.load returns a (predictions, algo) pair; only the algorithm
# (the second element) is kept here.
_saved_predictions, loaded_model = load(model_filename)
print("Model loaded successfully")

**Load metadata to get movie titles**

Movie titles in the metadata table are keyed by TMDB ID, so each `movieId` must first be mapped to its `tmdbId` through the links table before the title can be looked up in the metadata table.

In [None]:
# The title lookup compares the 'id' column against a string, so read it as
# text explicitly rather than relying on pandas' dtype inference.
meta_df = pd.read_csv(
    '/kaggle/input/the-movies-dataset/movies_metadata.csv',
    dtype={'id': str},
)

In [None]:
# Links table: used later to map a 'movieId' to its 'tmdbId'.
links_path = '/kaggle/input/the-movies-dataset/links.csv'
links_df = pd.read_csv(links_path)

**Print the top 10 recommendations for a given user**

In [None]:
def get_top_n_recommendations(model, user_id, num_items, n=10, item_ids=None):
    """Return the ``n`` items with the highest predicted rating for one user.

    The candidate items are chosen in this order:

    1. ``item_ids`` when it is given;
    2. otherwise the raw item IDs of ``model.trainset`` when the model has a
       ``trainset`` attribute, i.e. exactly the items the model was fitted on;
    3. otherwise the legacy range ``1..num_items`` (inclusive).

    The original implementation always used option 3, which is only correct
    when item IDs form the dense range ``1..num_items``. With sparse IDs it
    skips every item whose ID exceeds the count and scores IDs the model has
    never seen.

    Items the user has already rated are not filtered out.

    Args:
        model: Object exposing ``predict(user_id, item_id)`` whose result has
            an ``est`` attribute (the estimated rating).
        user_id: Raw ID of the user to recommend for.
        num_items: Upper bound of the legacy ``1..num_items`` scan; only used
            when neither ``item_ids`` nor ``model.trainset`` is available.
        n: Maximum number of recommendations to return.
        item_ids: Optional iterable of raw item IDs to score.

    Returns:
        A list of at most ``n`` ``(item_id, estimated_rating)`` tuples sorted
        by estimated rating, highest first. Ties keep candidate order.
    """
    if item_ids is None:
        trainset = getattr(model, "trainset", None)
        if trainset is not None:
            # The trainset iterates inner IDs; convert them back to the raw
            # IDs that predict() expects.
            item_ids = [
                trainset.to_raw_iid(inner_id) for inner_id in trainset.all_items()
            ]
        else:
            item_ids = range(1, num_items + 1)

    recommendations = [
        (item_id, model.predict(user_id, item_id).est) for item_id in item_ids
    ]

    # Stable sort: highest estimated rating first.
    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    return recommendations[:n]

user_id = 10
# movieId values are dataset identifiers and are not guaranteed to form the
# dense range 1..N, so the 1..num_items scan must reach the largest ID rather
# than stop at the number of distinct IDs.
num_items = int(df['movieId'].max())
top_recommendations = get_top_n_recommendations(model, user_id, num_items, n=10)

# Compare metadata IDs as strings regardless of how the column was parsed;
# computed once, outside the loop.
metadata_ids = meta_df['id'].astype(str)

print("Top Recommendations for User", user_id)
for item_id, estimated_rating in top_recommendations:
    # movieId -> tmdbId via the links table; a movie may have no usable link.
    tmdb_ids = links_df.loc[links_df['movieId'] == item_id, 'tmdbId'].dropna()
    if tmdb_ids.empty:
        title = f"<movieId {item_id}: no TMDB link>"
    else:
        # tmdbId is cast to int first to drop any ".0" before the string match.
        tmdb_id = str(int(tmdb_ids.iloc[0]))
        matching_titles = meta_df.loc[metadata_ids == tmdb_id, 'title']
        if matching_titles.empty:
            title = f"<tmdbId {tmdb_id}: no metadata title>"
        else:
            title = matching_titles.iloc[0]

    print("Item:", title, "Estimated Rating:", estimated_rating)