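Link to dataset: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip (MovieLens "latest-small"; the 9,742 movies loaded below match this release rather than the much larger full ml-latest.zip)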

In [None]:
import pandas as pds
import numpy as np

In [None]:
# Load the movie table; expects movies.csv in the notebook's working directory.
movies = pds.read_csv("movies.csv")
# NOTE(review): movie_genres is not used in any of the visible cells — confirm whether it is still needed.
movie_genres = {}

In [None]:
# Each row of the movies table is one movie, so the row count is the number of
# movies (the previous label called it the number of ratings).
total_movies = len(movies)
print(f"Total number of movies: {total_movies}")

Total number of ratings: 9742


In [None]:
# Peek at the first ten rows to see the columns and how genres are encoded ('|'-separated).
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Summary statistics for the numeric columns. movieId is the only one here and is
# presumably just an identifier, so only the count is really informative.
movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [None]:
# Column dtypes and non-null counts.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [None]:
# Count missing values per column.
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Treat each '|'-separated genre as exactly one token. The default word-level
# tokenizer splits any genre that contains punctuation or spaces (for example a
# hyphenated name) into several features and drops English stop words, which
# inflates the vocabulary and distorts the similarity scores.
# No stop-word list or max_features cap is needed: the vocabulary is only the
# set of distinct genre names. Tokens are still lower-cased by default.
cv = CountVectorizer(token_pattern=r"[^|]+")

In [None]:
# Display the vectorizer's configuration.
cv

In [None]:
# Build the movie-by-token count matrix from the genre strings.
# astype('U') casts every value to a unicode string before vectorizing;
# .toarray() converts the fitted result into a dense NumPy array.
vector = cv.fit_transform(movies['genres'].values.astype('U')).toarray()

In [None]:
# (number of movies, number of distinct tokens found in the genre strings)
vector.shape

(9742, 23)

**Cosine Similarity**
Cosine similarity is a mathematical measure that compares the orientation of two vectors in a multidimensional space. It doesn't consider the magnitude (length) of the vectors, but only focuses on the angle between them. *This makes it a valuable tool for tasks like:*
  - Comparing documents: By representing documents as vectors based on their word frequencies, cosine similarity can measure how similar their topics are, regardless of their length.
  - Recommending items: Recommender systems can use cosine similarity to find items similar to those a user has liked in the past, based on their feature vectors.
  - Clustering data: Cosine similarity can be used to group data points that have similar characteristics, even if their individual values differ.


**How does it work:**
- **Represent data as vectors**: Each data point is assigned a vector with a dimension for each feature or variable. The values in the vector represent the importance of each feature to that data point.
- **Calculate the dot product:** This measures how aligned the two vectors are. It's the sum of the products of corresponding elements in each vector.
- **Normalize by magnitudes**: This ensures the similarity is independent of the vector lengths. You divide the dot product by the product of the vector magnitudes.
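- **Interpret the result**: The cosine similarity ranges from -1 (vectors pointing in opposite directions) to 1 (vectors pointing in the same direction, even if their lengths differ), with 0 meaning the vectors are orthogonal, i.e. they share no features. A value closer to 1 indicates higher similarity. For non-negative vectors such as the genre counts used here, the value always lies between 0 and 1.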

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between every pair of rows in `vector`, giving a
# square movies-by-movies matrix.
# NOTE(review): with 9742 movies this dense matrix has ~95 million entries
# (roughly 760 MB if stored as float64) — confirm that fits in memory.
similarity = cosine_similarity(vector)

In [None]:
# Inspect the matrix: the diagonal is 1 (each movie compared with itself) and the
# displayed corners mirror each other across the diagonal.
similarity

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [None]:
# Row label of the movie with this exact title; .index[0] raises IndexError if no title matches.
movies[movies['title'] == "Sabrina (1995)" ].index[0]

6

In [None]:
# Look the query movie up by title instead of hard-coding a row number, so the
# ranking below is guaranteed to belong to the movie named here.
query_index = movies[movies['title'] == "Sabrina (1995)"].index[0]
# Pair every row position with its similarity to the query movie and sort from
# most to least similar. sorted() is stable, so tied scores keep dataset order.
distance = sorted(enumerate(similarity[query_index]), key=lambda pair: pair[1], reverse=True)

In [None]:
# Show the titles of the five highest-scoring entries.
for movie_pos, _score in distance[:5]:
    print(movies.iloc[movie_pos]["title"])

Grumpier Old Men (1995)
Sabrina (1995)
Clueless (1995)
Two if by Sea (1996)
French Twist (Gazon maudit) (1995)


In [None]:
def recommend(movies, title, top_n=5, similarity_matrix=None):
    """Print the titles of the movies most similar to ``title``.

    Args:
        movies: Pandas DataFrame with a 'title' column. Its rows must be in
            the same order as the rows/columns of the similarity matrix.
        title: Movie title to base the recommendations on. An exact match is
            tried first; if there is none, ``title`` is treated as a literal
            substring and the matching titles are listed instead.
        top_n: Maximum number of recommendations to print. Defaults to 5.
        similarity_matrix: Square array of pairwise similarity scores.
            Defaults to the notebook-level ``similarity`` matrix.

    Returns:
        None. Prints one recommended title per line, or the list of titles
        containing ``title``, or "Movie not found.".
    """
    if similarity_matrix is None:
        similarity_matrix = similarity

    titles = movies['title']

    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so labels would pick the wrong row (or fail) whenever the
    # DataFrame does not carry a default 0..n-1 index.
    exact_positions = np.flatnonzero((titles == title).to_numpy())
    if exact_positions.size == 0:
        # Literal substring search: titles contain characters such as '(' that
        # would be misread as (possibly invalid) regex syntax. na=False keeps
        # missing titles out of the mask.
        partial_mask = titles.str.contains(title, regex=False, na=False)
        if not partial_mask.any():
            print("Movie not found.")
        else:
            print(f"Similar movies: {', '.join(titles[partial_mask])}")
        return

    query_position = exact_positions[0]
    scores = np.asarray(similarity_matrix[query_position])
    # Stable descending sort: tied scores keep their dataset order.
    ranked_positions = np.argsort(-scores, kind="stable")
    # Skip the query movie itself. It always scores 1.0, but with ties it is not
    # necessarily ranked first, so it has to be filtered by position.
    recommended = [pos for pos in ranked_positions if pos != query_position][:top_n]
    for pos in recommended:
        print(titles.iloc[pos])

In [None]:
# Try the recommender on one title from the dataset.
recommend(movies, "Two if by Sea (1996)")

Grumpier Old Men (1995)
Sabrina (1995)
Clueless (1995)
Two if by Sea (1996)
French Twist (Gazon maudit) (1995)


movies['title'].dtype
type(['movieId'])
movies['title'] == pds.Series(['movieId'] * len(movies), index=movies.index)

In [None]:
import pickle

In [None]:
# Persist the movie table. The context manager closes the file handle even if
# pickling fails, so the data is flushed to disk and no descriptor is leaked.
with open('movies.pk1', "wb") as movies_file:
    pickle.dump(movies, movies_file)

In [None]:
# Persist the similarity matrix. The context manager closes the file handle even
# if pickling fails, so the data is flushed to disk and no descriptor is leaked.
with open('similarity.pk1', "wb") as similarity_file:
    pickle.dump(similarity, similarity_file)

In [None]:
# Round-trip check: reload the pickled movie table and display it.
# Only unpickle files you created yourself — pickle.load can execute arbitrary code.
with open('movies.pk1', "rb") as movies_file:
    movies_reloaded = pickle.load(movies_file)
movies_reloaded

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
