In [1]:
import pandas as pd
# abandoned version

movies = pd.read_csv('movies.csv')

In [2]:
# get_year extracts year from the title column
def get_year(title):
    """Return the four-digit release year from a "Title (YYYY)" string.

    Surrounding whitespace is ignored. If the string does not end in a
    parenthesised four-digit year, an empty string is returned instead of
    an arbitrary four-character slice of the title.

    Parameters
    ----------
    title : str
        Raw value of the combined title/year column.

    Returns
    -------
    str
        The year as a string (e.g. '1995'), or '' when no year suffix exists.
    """
    cleaned = title.strip()
    candidate = cleaned[-5:-1]
    has_year_suffix = (
        len(cleaned) >= 6
        and cleaned[-6] == '('
        and cleaned[-1] == ')'
        and candidate.isdigit()
    )
    return candidate if has_year_suffix else ''

# get_title extracts the movie title from the title column
def get_title(title):
    """Return the movie name with a trailing " (YYYY)" year suffix removed.

    Surrounding whitespace is stripped first. A title that does not end in a
    parenthesised four-digit year is returned (stripped) in full rather than
    losing its last seven characters.

    Parameters
    ----------
    title : str
        Raw value of the combined title/year column.

    Returns
    -------
    str
        The title without the year suffix.
    """
    cleaned = title.strip()
    has_year_suffix = (
        len(cleaned) >= 6
        and cleaned[-6] == '('
        and cleaned[-1] == ')'
        and cleaned[-5:-1].isdigit()
    )
    if not has_year_suffix:
        return cleaned
    # Drop "(YYYY)" and the whitespace that separated it from the name
    return cleaned[:-6].rstrip()

# Keep the raw "Title (Year)" string under 'movie_year'. The rename is guarded
# so that re-running this cell does not produce a second 'movie_year' column
# (the derived 'title' column would otherwise be renamed as well).
if 'movie_year' not in movies.columns:
    movies = movies.rename(columns={'title': 'movie_year'})

# Derive the clean title and the release year from the combined column
movies['title'] = movies['movie_year'].apply(get_title)
movies['year'] = movies['movie_year'].apply(get_year)

In [3]:
# Turn the '|' delimiter into spaces so each genre is a whitespace-separated
# token. regex=False makes '|' a literal character (as a regular expression it
# is the alternation operator) whatever the installed pandas default is.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

# Map each space-separated token to the number of times it occurs.
# NOTE(review): a genre entry that itself contains spaces is split into
# several keys here — confirm whether that is intended.
freq_map = movies['genres'].str.split(' ').explode().value_counts().to_dict()

# create a bar chart (requires `import matplotlib.pyplot as plt`)
# plt.bar(list(freq_map.keys()), list(freq_map.values()), color='g')
# plt.xticks(rotation=45)
# plt.xlabel('Genres')
# plt.ylabel('Counts')


In [4]:
# Step 1: Quantify the features for each movie (tf-idf)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rewrite the hyphenated genre names without the hyphen before vectorizing
# (the fitted vocabulary contains them as 'noir' and 'scifi')
movies['genres'] = (
    movies['genres']
    .str.replace('Film-Noir', 'Noir')
    .str.replace('Sci-Fi', 'SciFi')
)

# Fit TF-IDF on the genre strings: one row per movie, one column per term
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(movies['genres'])

tfidf_matrix.shape

(9742, 21)

In [5]:
# List every term of the fitted vocabulary next to its position in that list
print(list(enumerate(tfidf_vector.get_feature_names_out())))

[(0, 'action'), (1, 'adventure'), (2, 'animation'), (3, 'children'), (4, 'comedy'), (5, 'crime'), (6, 'documentary'), (7, 'drama'), (8, 'fantasy'), (9, 'genres'), (10, 'horror'), (11, 'imax'), (12, 'listed'), (13, 'musical'), (14, 'mystery'), (15, 'noir'), (16, 'romance'), (17, 'scifi'), (18, 'thriller'), (19, 'war'), (20, 'western')]


In [10]:
# Step 2: Calculate the Cosine Similarity between movies

In [6]:
from sklearn.metrics.pairwise import linear_kernel

# Dot product between every pair of rows of tfidf_matrix.
# NOTE(review): this equals cosine similarity only when the rows are unit
# length, which TfidfVectorizer's default normalisation should provide — confirm.
sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix) 
print(sim_matrix)

[[1.         0.81357774 0.15276924 ... 0.         0.4210373  0.26758648]
 [0.81357774 1.         0.         ... 0.         0.         0.        ]
 [0.15276924 0.         1.         ... 0.         0.         0.57091541]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.4210373  0.         0.         ... 0.         1.         0.        ]
 [0.26758648 0.         0.57091541 ... 0.         0.         1.        ]]


In [7]:
# Square matrix: one row and one column per row of tfidf_matrix
sim_matrix.shape

(9742, 9742)

In [8]:
# apply Levenshtein Distance, in case of typo
from fuzzywuzzy import fuzz
# create a function to find the closest title
def matching_score(a,b):
    """Score how closely string `a` matches string `b` via `fuzz.ratio`.

    With no typo (identical strings) the score is 100.
    """
    score = fuzz.ratio(a, b)
    return score

In [9]:
# get_title_year_from_index converts index to title_year
def get_title_year_from_index(index):
    """Return the 'movie_year' value of the row of `movies` labelled `index`.

    Raises IndexError when no row carries that index label.
    """
    row_mask = movies.index == index
    matching_values = movies.loc[row_mask, 'movie_year']
    return matching_values.values[0]

# get_title_from_index converts index to title
def get_title_from_index(index):
    """Return the 'title' value of the row of `movies` labelled `index`.

    Raises IndexError when no row carries that index label.
    """
    row_mask = movies.index == index
    matching_values = movies.loc[row_mask, 'title']
    return matching_values.values[0]

# get_index_from_title converts title to index
def get_index_from_title(title):
    """Return the index label of the first row of `movies` whose title equals `title`.

    Raises IndexError when no row has exactly that title.
    """
    title_matches = movies['title'] == title
    matching_rows = movies.loc[title_matches]
    return matching_rows.index.values[0]

# find_closest_title returns the stored title most similar to what the user typed, along with its match score
def find_closest_title(title):
    """Find the stored movie title that best matches the text a user typed.

    Every title in `movies` is scored against `title` with `matching_score`
    and the highest-scoring one is returned. On ties the first row wins.
    The best row is located with `idxmax`, so no full sort is needed and the
    lookup uses the row's index label rather than an enumeration position.

    Parameters
    ----------
    title : str
        Title as typed by the user (may contain typos).

    Returns
    -------
    tuple
        (closest_title, distance_score), where distance_score is the match
        score of the returned title; 100 means no typo.

    Raises
    ------
    ValueError
        If `movies` has no rows.
    """
    scores = movies['title'].apply(matching_score, b=title)
    best_index = scores.idxmax()
    closest_title = get_title_from_index(best_index)
    distance_score = int(scores.max())
    return closest_title, distance_score

In [10]:
def recommender(movie_titles_user_like, number):
    """Print the movies most similar to the one the user named.

    The input is matched against the known titles with `find_closest_title`.
    If the match is not exact (score below 100) a "Did you mean ...?" line is
    printed first. The recommendations are then always computed for the
    matched title, so both cases share one code path.

    Parameters
    ----------
    movie_titles_user_like : str
        Title typed by the user; typos are tolerated.
    number : int
        How many similar movies to print.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    closest_title, distance_score = find_closest_title(movie_titles_user_like)

    # if a typo appears
    if distance_score != 100:
        print('Did you mean '+'\033[1m'+str(closest_title)+'\033[0m'+'?','\n')

    movie_index = int(get_index_from_title(closest_title))

    # Rank every movie by its similarity to the chosen one, highest first;
    # sorted() is stable, so equal scores keep their original row order.
    ranked = sorted(enumerate(sim_matrix[movie_index]), key=lambda pair: pair[1], reverse=True)

    # remove the typed movie itself
    similar_movies = [pair for pair in ranked if pair[0] != movie_index]

    print('Here\'s the list of movies similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')
    for i, _ in similar_movies[:number]:
        print(get_title_year_from_index(i))

In [11]:
recommender('Monsters, Inc.', 20)

Here's the list of movies similar to [1mMonsters, Inc.[0m.

Toy Story (1995)
Antz (1998)
Toy Story 2 (1999)
Adventures of Rocky and Bullwinkle, The (2000)
Emperor's New Groove, The (2000)
Wild, The (2006)
Shrek the Third (2007)
Tale of Despereaux, The (2008)
Asterix and the Vikings (Astérix et les Vikings) (2006)
Turbo (2013)
The Good Dinosaur (2015)
Moana (2016)
Inside Out (2015)
Black Cauldron, The (1985)
Lord of the Rings, The (1978)
We're Back! A Dinosaur's Story (1993)
Atlantis: The Lost Empire (2001)
Land Before Time, The (1988)
Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie) (2002)
Sinbad: Legend of the Seven Seas (2003)
