In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd 
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Ignore a specific warning by category

In [3]:
credits=pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/credits.csv")
movies=pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/movies_metadata.csv",low_memory=False)
keywords=pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/keywords.csv")
links=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/links.csv')
links_small=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/links_small.csv')
ratings=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/ratings_small.csv')

In [4]:
df=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/movie.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,movie_tags
0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [6]:
df.shape

(45369, 4)

In [7]:

df.drop(labels='Unnamed: 0',axis=1,inplace=True)

In [8]:
ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [9]:
ratings.shape

(100004, 4)

In [10]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [12]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
### The Surprise library is a Python scikit for building and evaluating recommender systems. It provides a high-level, user-friendly interface for implementing collaborative filtering and other recommendation algorithms.

In [14]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [15]:
from sklearn.metrics.pairwise import cosine_similarity


 Collaborative Filtering

In [16]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Create the collaborative filtering model (SVD)
model = SVD()

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model
accuracy.rmse(predictions)

RMSE: 0.8993


0.8993151412129219

In [17]:
model.predict(1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.5752206549749577, details={'was_impossible': False})

For movie with ID 302, we get an estimated prediction of 2.686. One startling feature of this recommender system is that it doesn't care what the movie is (or what it contains). It works purely on the basis of an assigned movie ID and tries to predict ratings based on how the other users have perceive the movie.

In [18]:
model.predict(3, 300)

Prediction(uid=3, iid=300, r_ui=None, est=3.577079784178824, details={'was_impossible': False})

In [19]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split


reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Create the item-based collaborative filtering model (KNNBasic)
model = KNNBasic(sim_options={'user_based': False})

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Example usage to recommend movies for a specific user
user_id = 1  # Modify the user ID as needed
num_recommendations = 5

# Get a list of all movie IDs
movie_ids = ratings['movieId'].unique()

# Filter out the movies already rated by the user
unrated_movies = [movie_id for movie_id in movie_ids if movie_id not in ratings[ratings['userId'] == user_id]['movieId'].unique()]

# Make predictions for the unrated movies
user_predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]

# Sort the predictions by estimated rating in descending order
user_predictions.sort(key=lambda x: x.est, reverse=True)

# Extract the top recommended movie IDs
recommended_movie_ids = [prediction.iid for prediction in user_predictions[:num_recommendations]]

print("Recommended movies for User", user_id)
print(recommended_movie_ids)


Computing the msd similarity matrix...
Done computing similarity matrix.
Recommended movies for User 1
[665, 3010, 149406, 6985, 44204]


In [36]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Create the item-based collaborative filtering model (KNNBasic)
model = KNNBasic(sim_options={'user_based': False})

# Train the model on the training set
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Get user input for user ID and number of recommendations
user_id = int(input("Enter user ID: "))
num_recommendations = int(input("Enter number of recommendations: "))

# Get a list of all movie IDs
movie_ids = ratings['movieId'].unique()

# Filter out the movies already rated by the user
unrated_movies = [movie_id for movie_id in movie_ids if movie_id not in ratings[ratings['userId'] == user_id]['movieId'].unique()]

# Make predictions for the unrated movies
user_predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]

# Sort the predictions by estimated rating in descending order
user_predictions.sort(key=lambda x: x.est, reverse=True)

# Extract the top recommended movie IDs
recommended_movie_ids = [prediction.iid for prediction in user_predictions[:num_recommendations]]

print("Recommended movies for User", user_id)
print(recommended_movie_ids)


Computing the msd similarity matrix...
Done computing similarity matrix.
Enter user ID: 5
Enter number of recommendations: 4
Recommended movies for User 5
[301, 1575, 2190, 69280]


In [21]:
# from surprise import Dataset, Reader, KNNBasic
# from surprise.model_selection import train_test_split

# reader = Reader(rating_scale=(1, 5))
# data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# # Split the data into training and test sets
# trainset, testset = train_test_split(data, test_size=0.2)

# # Create the item-based collaborative filtering model (KNNBasic)
# model = KNNBasic(sim_options={'user_based': False})

# # Train the model on the training set
# model.fit(trainset)

# # Make predictions on the test set
# predictions = model.test(testset)

# # Function to get user input for user ID and number of recommendations
# def get_user_input():
#     user_id = int(input("Enter user ID: "))
#     num_recommendations = int(input("Enter number of recommendations: "))
#     return user_id, num_recommendations

# # Get user input for user ID and number of recommendations
# user_id, num_recommendations = get_user_input()

# # Get a list of all movie IDs
# movie_ids = ratings['movieId'].unique()

# # Filter out the movies already rated by the user
# unrated_movies = [movie_id for movie_id in movie_ids if movie_id not in ratings[ratings['userId'] == user_id]['movieId'].unique()]

# # Make predictions for the unrated movies
# user_predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movies]

# # Sort the predictions by estimated rating in descending order
# user_predictions.sort(key=lambda x: x.est, reverse=True)

# # Extract the top recommended movie IDs
# recommended_movie_ids = [prediction.iid for prediction in user_predictions[:num_recommendations]]

# print("Recommended movies for User", user_id)
# print(recommended_movie_ids)


In [22]:
merged_df = pd.merge(ratings, df, left_on='movieId', right_on='id')

# Print the merged dataframe
merged_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp,id,title,movie_tags
44970,652,129009,4.0,1442690827,129009,Love Is a Ball,Etienne makes a good living out of marrying of...
44971,653,2103,3.0,948161066,2103,Solaris,Upon arrival at the space station orbiting an ...
44972,659,167,4.0,836137550,167,K-PAX,Prot is a patient at a mental hospital who cla...
44973,659,563,3.0,834694187,563,Starship Troopers,"Set in the future, the story follows a young s..."
44974,665,129,3.0,995232528,129,Spirited Away,A ten year old girl who wanders away from her ...


In [23]:
merged_df = merged_df.reset_index(drop=True)

**Hybrid recommendation system**

In [31]:
# Content-based filtering
tfidf = TfidfVectorizer(stop_words='english')
item_title_profiles = tfidf.fit_transform(merged_df['title'])
item_tags_profiles = tfidf.fit_transform(merged_df['movie_tags'])
item_profiles = hstack([item_title_profiles, item_tags_profiles])

# Collaborative filtering
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_df[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

# Train the KNN model on the training set
model = KNNBasic(sim_options={'user_based': False})
model.fit(trainset)

# Function to get hybrid recommendations
def get_hybrid_recommendations(user_id, num_recommendations):
    # Content-based recommendations
    unrated_movies = merged_df[merged_df['movieId'].isin(merged_df[merged_df['userId'] == user_id]['movieId'])]
    item_indices = unrated_movies.index.values
    content_recommendations = item_indices[
        model.compute_similarities().mean(axis=1).argsort()[::-1][:num_recommendations]
    ]

    # Collaborative filtering recommendations
    collab_recommendations = model.get_neighbors(trainset.to_inner_uid(user_id), k=num_recommendations)
    collab_recommendations = [trainset.to_raw_iid(movie_id) for movie_id in collab_recommendations]

    # Hybrid recommendations with movie titles
    hybrid_recommendations = []
    for movie_id in content_recommendations:
        hybrid_recommendations.append((merged_df.loc[movie_id, 'movieId'], merged_df.loc[movie_id, 'title']))

    for movie_id in collab_recommendations:
        hybrid_recommendations.append((movie_id, merged_df[merged_df['movieId'] == movie_id]['title'].iloc[0]))

    hybrid_recommendations = hybrid_recommendations[:num_recommendations]

    return hybrid_recommendations

Computing the msd similarity matrix...
Done computing similarity matrix.


In [32]:
user_id = 6
num_recommendations = 5
recommendations = get_hybrid_recommendations(user_id, num_recommendations)
print(f"Recommended movies for User {user_id}:")
for movie_id, title in recommendations:
    print(f"{movie_id}: {title}")


Computing the msd similarity matrix...
Done computing similarity matrix.
Recommended movies for User 6:
903: Cool Hand Luke
1285: Torrente 3 The Protector
173: 20,000 Leagues Under the Sea
596: The Grapes of Wrath
3114: The Searchers
