# Building a plot-line-based recommender

We will use TF-IDF scores and cosine similarity to build a recommender system that suggests movies based on their overviews.

The task is to build a system that takes in a movie title and outputs a list of movies that have similar plot lines.

In [299]:
import pandas as pd

In [300]:
# Load the movie dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv("movie_overviews.csv")

In [301]:
# Preview the first rows to check which columns were loaded
print(df.head())

      id                        title  \
0    862                    Toy Story   
1   8844                      Jumanji   
2  15602             Grumpier Old Men   
3  31357            Waiting to Exhale   
4  11862  Father of the Bride Part II   

                                            overview  \
0  Led by Woody, Andy's toys live happily in his ...   
1  When siblings Judy and Peter discover an encha...   
2  A family wedding reignites the ancient feud be...   
3  Cheated on, mistreated and stepped on, the wom...   
4  Just when George Banks has recovered from his ...   

                                             tagline  
0                                                NaN  
1          Roll the dice and unleash the excitement!  
2  Still Yelling. Still Fighting. Still Ready for...  
3  Friends are the people who let you be yourself...  
4  Just When His World Is Back To Normal... He's ...  


In [312]:
# Inspect the data type of each column
print(df.dtypes)

id           int64
title       object
overview    object
tagline     object
dtype: object


In [302]:
# Count the missing values in each column
df.isna().sum()

id             0
title          0
overview      12
tagline     2066
dtype: int64

In [303]:
# Remove rows with missing values and repeated titles, then renumber the
# remaining rows from 0 so that row labels match row positions.
# NOTE(review): dropna() without `subset` also discards rows whose only
# missing field is `tagline`, a column the code in this notebook never reads.
df = (
    df.dropna(axis=0)
    .drop_duplicates(['title'])
    .reset_index(drop=True)
)

In [304]:
# Build a TF-IDF vectorizer that ignores common English stop words
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english")

In [305]:
# Fit the vectorizer on the movie overviews and transform them into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [306]:
# Pairwise cosine similarity between all overview vectors
# (with a single argument the matrix is compared against itself)
from sklearn.metrics.pairwise import cosine_similarity
sim_score = cosine_similarity(tfidf_matrix)

In [307]:
# Create a title -> row index lookup so a movie title can be mapped to its row
indices = pd.Series(df.index, index=df['title'])
print(indices)

title
Jumanji                                                  0
Grumpier Old Men                                         1
Waiting to Exhale                                        2
Father of the Bride Part II                              3
Heat                                                     4
                                                      ... 
Kingsglaive: Final Fantasy XV                         6838
Sharknado 4: The 4th Awakens                          6839
Rustom                                                6840
Shin Godzilla                                         6841
The Beatles: Eight Days a Week - The Touring Years    6842
Length: 6843, dtype: int64


In [308]:
# Function to get movie recommendations
def get_recommendations(title, sim_score, indices, top_n=10, titles=None):
    """Return the titles of the movies most similar to *title*.

    Args:
        title: Title of the reference movie; must be a label of ``indices``.
        sim_score: Square similarity matrix where ``sim_score[i][j]`` is the
            similarity between the movies in rows ``i`` and ``j``.
        indices: ``pd.Series`` mapping each movie title to its row position
            in ``sim_score``.
        top_n: Maximum number of recommendations to return (default 10).
        titles: Optional ``pd.Series`` of titles aligned with the rows of
            ``sim_score``. Defaults to the module-level ``df['title']``.

    Returns:
        A ``pd.Series`` holding at most ``top_n`` titles, most similar first.
        The reference movie itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if titles is None:
        titles = df['title']
    # Get index of movie that matches title
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"Movie title not found: {title!r}") from None
    # Pair every other movie with its similarity score. The reference movie
    # is excluded by index rather than by assuming it sorts first: a movie
    # with a tied top score (or an all-zero similarity row) would otherwise
    # push the reference movie into the results.
    scored = [
        (position, score)
        for position, score in enumerate(sim_score[idx])
        if position != idx
    ]
    # Stable sort: movies with equal scores keep their original row order
    scored.sort(key=lambda pair: pair[1], reverse=True)
    # Get the row positions of the most similar movies
    movie_indices = [position for position, _ in scored[:top_n]]
    # Return the titles of the most similar movies
    return titles.iloc[movie_indices]

In [309]:
# Recommend movies whose plot lines are similar to 'Batman'
print(get_recommendations('Batman', sim_score, indices))

6029                      The Dark Knight Rises
5760                 Batman: Under the Red Hood
4714                              Batman Begins
5289                            The Dark Knight
6199    Batman: The Dark Knight Returns, Part 1
910                              Batman Returns
1010                             Batman & Robin
6005                           Batman: Year One
2108               Batman: Mask of the Phantasm
6793                    A Very Murray Christmas
Name: title, dtype: object


In [310]:
# Recommend movies whose plot lines are similar to 'The Prince of Tides' (love story)
print(get_recommendations('The Prince of Tides', sim_score, indices))

2793     How to Marry a Millionaire
3822                   The Hospital
5187    I Could Never Be Your Woman
2586                         Bounce
2802                Crazy/Beautiful
2970        The Hotel New Hampshire
6626                 American Heist
2990                    On the Edge
617                           Laura
5386                           1984
Name: title, dtype: object


In [311]:
# Recommend movies whose plot lines are similar to 'Black Moon' (adventure)
print(get_recommendations('Black Moon', sim_score, indices))

3553                          The Last Unicorn
2924                           Little Monsters
3444                              Femme Fatale
3675                      The Railway Children
3737                                Le Divorce
2987                                The Others
1843    Little Nemo: Adventures In Slumberland
5972                              Freakonomics
3309                       Sorry, Wrong Number
2969                                 Def-Con 4
Name: title, dtype: object
