In [1]:
# Importing important libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the movie catalogue. The file is '::'-delimited with no header row;
# a multi-character separator requires the (slower) python parsing engine.
columns_movies = ['ID', 'Title', 'Genres']
movies = (
    pd.read_csv('movies.dat', sep='::', engine='python', header=None, names=columns_movies)
    # '|' is a regex metacharacter (alternation), so replace it as a literal
    # string explicitly instead of relying on the pandas-version-dependent
    # default of `regex`.
    .assign(Genres=lambda frame: frame['Genres'].str.replace('|', ', ', regex=False))
    # 'Info' is the text that gets vectorized later: title followed by genres.
    .assign(Info=lambda frame: frame['Title'] + " " + frame['Genres'])
)
movies

Unnamed: 0,ID,Title,Genres,Info
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy","Toy Story (1995) Adventure, Animation, Childre..."
1,2,Jumanji (1995),"Adventure, Children, Fantasy","Jumanji (1995) Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance","Grumpier Old Men (1995) Comedy, Romance"
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance","Waiting to Exhale (1995) Comedy, Drama, Romance"
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy
...,...,...,...,...
10676,65088,Bedtime Stories (2008),"Adventure, Children, Comedy","Bedtime Stories (2008) Adventure, Children, Co..."
10677,65091,Manhattan Melodrama (1934),"Crime, Drama, Romance","Manhattan Melodrama (1934) Crime, Drama, Romance"
10678,65126,Choke (2008),"Comedy, Drama","Choke (2008) Comedy, Drama"
10679,65130,Revolutionary Road (2008),"Drama, Romance","Revolutionary Road (2008) Drama, Romance"


In [3]:
# Bag-of-words model over each movie's 'Info' text (title + genres)
cv = CountVectorizer()
# Learn the vocabulary and build the movie-by-token count matrix in one pass
count_matrix = cv.fit_transform(raw_documents=movies['Info'])
# Pairwise cosine similarity between every pair of movies' count vectors
similarity = cosine_similarity(X=count_matrix)

In [4]:
# Sanity check: the matrix should be square, with one row and one column per movie
similarity.shape

(10681, 10681)

In [5]:
# A function that returns the top 5 movies most similar to the entered movie, along with their similarity scores
def get_movie_recommendations(movie_title, top_n=5):
    """Return the movies most similar to ``movie_title`` with their scores.

    The title is looked up in the module-level ``movies`` DataFrame and every
    other row is ranked using the matching row of the module-level
    ``similarity`` matrix.

    Parameters
    ----------
    movie_title : str
        Exact value from ``movies['Title']`` (e.g. ``"Bedtime Stories (2008)"``).
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of (title, score) tuples
        Ordered by descending similarity; tied scores keep row order. The
        queried movie itself is never part of the result.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``movies['Title']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Rows of `similarity` are positional, so resolve the title to a row
    # position rather than to an index label.
    title_positions = (movies['Title'] == movie_title).to_numpy().nonzero()[0]
    if len(title_positions) == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    movie_position = int(title_positions[0])

    # Exclude the queried movie by position instead of assuming it sorts
    # first: another movie can tie with it at the top score and, because the
    # sort is stable, end up ahead of it.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[movie_position])
        if position != movie_position
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    titles = movies['Title']
    return [(titles.iloc[position], score) for position, score in candidates[:top_n]]

In [6]:
# Look up recommendations for one example movie and print them as a short report
movie_of_interest = "Bedtime Stories (2008)"
recommendations = get_movie_recommendations(movie_of_interest)

# Header line first, then one line per recommended movie
report_lines = [f"Movies similar to '{movie_of_interest}':"]
report_lines += [
    f"{movie} (Similarity Score: {score:.2f})"
    for movie, score in recommendations
]
print("\n".join(report_lines))

Movies similar to 'Bedtime Stories (2008)':
Beverly Hills Chihuahua (2008) (Similarity Score: 0.62)
Bolt (2008) (Similarity Score: 0.62)
Horton Hears a Who! (2008) (Similarity Score: 0.58)
Paulie (1998) (Similarity Score: 0.55)
Candleshoe (1977) (Similarity Score: 0.55)


In [7]:
# Persist the similarity matrix to disk as a pickle object
with open('similarity_matrix.pkl', mode='wb') as pickle_file:
    pickle.dump(similarity, pickle_file)

In [8]:
# Calculate the in-memory size of the similarity matrix
matrix_size_in_bytes = similarity.nbytes
# Convert bytes to units of 1024 ** 2 bytes for display
matrix_size_in_mb = matrix_size_in_bytes / (1024 ** 2)
print(f"Size of the similarity matrix: {matrix_size_in_mb:.2f} MB")

Size of the similarity matrix: 870.39 MB
