In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies=pd.read_csv("datasets/ml-latest-small/movies.csv")
ratings=pd.read_csv("datasets/ml-latest-small/ratings.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [6]:
ratings=ratings.drop(['timestamp'],axis='columns')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
movies.genres[:4]

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
Name: genres, dtype: object

In [8]:
movies.genres=movies.genres.apply(lambda x:x.replace("|",' ') if isinstance(x, str) else [])
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Initialize TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the genres column
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Check the shape of the resulting matrix
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (9742, 23)


In [18]:
tfidf.get_feature_names_out()

array(['action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'fi', 'film', 'genres',
       'horror', 'imax', 'listed', 'musical', 'mystery', 'noir',
       'romance', 'sci', 'thriller', 'war', 'western'], dtype=object)

In [10]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [58]:
movie_indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
movie_indices[:4]

title
Toy Story (1995)            0
Jumanji (1995)              1
Grumpier Old Men (1995)     2
Waiting to Exhale (1995)    3
dtype: int64

In [59]:
def recommend_movies(title, cosine_sim=cosine_sim, movies=movies,movie_indices=movie_indices):
    # Get the index of the movie that matches the title
    idx = movie_indices.get(title, None)
    if idx is None:
        return f"Movie '{title}' not found in the dataset."

    # Get similarity scores for all movies with the selected movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get indices of the top 10 similar movies (excluding the input movie)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies['title'].iloc[movie_indices]

In [89]:
movie_name = "Blue Is the Warmest Color (La vie d'Adèle) (2013)"
recommendations = recommend_movies(movie_name)
print(f"Recommendations for '{movie_name}':\n{recommendations}")

Recommendations for 'Blue Is the Warmest Color (La vie d'Adèle) (2013)':
24                            Leaving Las Vegas (1995)
27                                   Persuasion (1995)
42                How to Make an American Quilt (1995)
45                        When Night Is Falling (1995)
66                                 Bed of Roses (1996)
75     Once Upon a Time... When We Were Colored (1995)
76                           Angels and Insects (1995)
93               Bridges of Madison County, The (1995)
115                       Up Close and Personal (1996)
151                                    Mad Love (1995)
Name: title, dtype: object


In [87]:
movies[movies['title'].apply(lambda x:x.lower().strip()).str.contains("blue is")]

Unnamed: 0,movieId,title,genres
8272,105355,Blue Is the Warmest Color (La vie d'Adèle) (2013),Drama Romance
