## Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Load dataset

In [2]:
movies = pd.read_csv('./dataset/imbd/movies_metadata.csv')

  movies = pd.read_csv('./dataset/imbd/movies_metadata.csv')


## Overview of Toy Story

In [3]:
movies['overview'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

## TfidfVectorizer - Convert to Numerical

In [14]:
tfidf = TfidfVectorizer(stop_words='english')
movies['overview'] = movies['overview'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
overview_matrix = tfidf.fit_transform(movies['overview'])

#Output the shape of tfidf_matrix
overview_matrix.shape

#Every movie has 75927 number of features (words )

(45466, 75827)

In [16]:
print(overview_matrix)

  (0, 17764)	0.13483149538639247
  (0, 4388)	0.1474882034218405
  (0, 38030)	0.10142919482788751
  (0, 21887)	0.10438761058719498
  (0, 19641)	0.13281884272823927
  (0, 48558)	0.10339358185033234
  (0, 59519)	0.13008016104455086
  (0, 12490)	0.12544427954397822
  (0, 51108)	0.13434817283119177
  (0, 29238)	0.10093917370354445
  (0, 50914)	0.09190797940163035
  (0, 39423)	0.11907123344715953
  (0, 1847)	0.140911774178889
  (0, 58571)	0.1135591886873686
  (0, 38693)	0.20627924682810617
  (0, 9874)	0.5028038686135609
  (0, 9087)	0.10635375129287977
  (0, 7491)	0.12380553184830104
  (0, 56872)	0.111248510865236
  (0, 28729)	0.13311522181618415
  (0, 39012)	0.08718689178959059
  (0, 67874)	0.14878284660693247
  (0, 3159)	0.41178365711725945
  (0, 73468)	0.4809827114790237
  (0, 38088)	0.10739705953465473
  :	:
  (45464, 26957)	0.07350962631701621
  (45464, 18919)	0.09271509240923419
  (45464, 18119)	0.07466631763708827
  (45464, 39012)	0.06829617779135382
  (45465, 16520)	0.3237330788694511

## Building the similarity matrix

In [5]:
similarity_matrix = linear_kernel(overview_matrix,overview_matrix)

In [6]:
similarity_matrix

array([[1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
        0.        ],
       [0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
        0.00929411],
       [0.        , 0.04681953, 1.        , ..., 0.        , 0.01402548,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00595453, 0.02198641, 0.01402548, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00929411, 0.        , ..., 0.        , 0.        ,
        1.        ]])

## Movies index mapping

In [8]:
mapping = pd.Series(movies.index,index = movies['title'])
mapping

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45461
Century of Birthing            45462
Betrayal                       45463
Satan Triumphant               45464
Queerama                       45465
Length: 45466, dtype: int64

## Building Recommender System

In [12]:
def recommend_movies_based_on_plot(movie_input):
    
    movie_index = mapping[movie_input]
    #get similarity values with other movies
    
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[movie_index]))
    
    #sort in descending order the similarity score of movie inputted with all the other movies
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies. Ignore the first movie.
    similarity_score = similarity_score[1:10]
    
    #return movie names using the mapping series
    movie_indices = [i[0] for i in similarity_score]
    
    return (movies['title'].iloc[movie_indices])

In [13]:
recommend_movies_based_on_plot('Life Begins for Andy Hardy')

23530                      Andy Hardy Meets Debutante
21422                                 A Family Affair
26304                          You're Only Young Once
10301                          The 40 Year Old Virgin
29369                  Andy Hardy's Private Secretary
23843                     Andy Hardy's Blonde Trouble
15348                                     Toy Story 3
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
28128                       The Mayor of Casterbridge
21359                        Andy Hardy's Double Life
32086                                Brother's Keeper
Name: title, dtype: object