### ***Necessary Imports***

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style

# string maniulation 
import re
import ast 

# preprocessing
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stemmer = PorterStemmer()
stopwords_ = stopwords.words('english')

### ***We'll make a basic content based recommender system or in other words which assume that the users are independent***

In [2]:
credits_path = 'tmdb_5000_credits.csv'
movies_path = 'tmdb_5000_movies.csv'

In [3]:
credits = pd.read_csv(credits_path)
print(credits.columns)
credits.head(1)

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
movies = pd.read_csv(movies_path)
print(movies.columns)
movies.head(1)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
# now first we should combine the dataset and ultimately we have to make the movie tags in vectorized forms
# we can merge the dataset on the title column
movies = movies.merge(credits, on='title')
print(movies.columns)
movies.head()

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [7]:
# now we should first have to manually select which columns will affect the recommendation of a movie
# id and title will be required for any further references
# budget might be a factor for recommendation as mostly high budget movies include high VFX, so might be a good movie
# genres is strictly required
# keywords is required as it tells many things about movie, so will be useful in creating tags
# overview will also be required in a similarity metric
# popularity might be required beacuse of the sheep mentatlity of humans
# release date do affect recommendation but we aren't including as that will become hectic to incorporate
# runtime do affect the movie recommendation 
# have to remove tagline because there is a lot of missing data
# title, vote_average(won't include vote_count), 
# will include cast(only top 3 or 5) {as they are the superstars who can affect the movie recommendation}
# will include director only amongst crew only as they are the ones who can be known by public {which can affect the 
# recommendation, producer also affects but i am avoiding that}
string_columns_to_select = ['title', 'genres', 'keywords', 'overview', 'cast', 'crew']
numerical_columns_to_select = ['id', 'budget', 'popularity', 'runtime', 'vote_average']
movies = movies.loc[:, string_columns_to_select + numerical_columns_to_select]

In [8]:
movies.head(1)

Unnamed: 0,title,genres,keywords,overview,cast,crew,id,budget,popularity,runtime,vote_average
0,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",19995,237000000,150.437577,162.0,7.2


In [9]:
# let us first check if there are any null values 
# if there are too many in a column then we can drop that column, if there are very less that we can drop the rows
# and if there are some countable null values then we can fill with the mode values in nuerical columns
print(movies.isnull().sum())
print(movies.shape)
movies.dropna(inplace=True)
print(movies.isnull().sum())
print(movies.shape)
# dealt with missing data, now preprocessing

title           0
genres          0
keywords        0
overview        3
cast            0
crew            0
id              0
budget          0
popularity      0
runtime         2
vote_average    0
dtype: int64
(4809, 11)
title           0
genres          0
keywords        0
overview        0
cast            0
crew            0
id              0
budget          0
popularity      0
runtime         0
vote_average    0
dtype: int64
(4806, 11)


In [10]:
# now from these data we have to form vectors somehow by preprocessing
# for string columns we will form vectors by NLP techniques such as BAG of words, or BERT but for this time i am gonna use 
# simple BAG of words by forming dictionary 
# for numerical columns except id and vote_average we will normalize them 
# and then form a vector for a movie by concatenating the vectors for string and numerical columns

def genre_keywords_preprocess(string):
    # we have to take the name out of the dictionary as that will mean some tags
    # strip out white spaces 
    # convert all of them to lower characters
    # stem the tags so that no redundant tags will be created 
    return [stemmer.stem(re.sub(" ", '', x['name'].lower())) for _,x in enumerate(ast.literal_eval(string))]

def overview_preprocess(string):
    # remove any charcters other than alpabets
    # convert to lowercase
    # split on white spaces
    # stem the words and remove the stopwords
    return [stemmer.stem(x) for x in re.sub(r'[^a-zA-Z]', ' ', string.lower()).split() if x not in stopwords_ and x!='']

def cast_preprocess(cast, take_top=5):
    # will have to take names of cast only
    # convert to lowercase
    # remove whitespaces
    return [re.sub(" ", "", x['name'].lower()) for i,x in enumerate(ast.literal_eval(cast)) if i<take_top]


def crew_preprocess(crew):
    # will have to find the name of the person whose job is 'Director'
    # convert name to lowercase
    # remove whitespaces from name
    for _,x in enumerate(ast.literal_eval(crew)):
        if x['job']=='Director':
            return [re.sub(" ", "", x['name'].lower())]

    return []
        
        
def numerical_column_normalizer(column):
    mu = column.mean()
    sigma = column.std()
    return (column-mu)/sigma

In [11]:
# preprocessing string columns
movies['genres'] = movies['genres'].apply(genre_keywords_preprocess)
movies['keywords'] = movies['keywords'].apply(genre_keywords_preprocess)
movies['overview'] = movies['overview'].apply(overview_preprocess)
movies['cast'] = movies['cast'].apply(cast_preprocess)
movies['crew'] = movies['crew'].apply(crew_preprocess)

# preprocessing numerical columns
movies['budget'] = numerical_column_normalizer(movies['budget'])
movies['popularity'] = numerical_column_normalizer(movies['popularity'])
movies['runtime'] = numerical_column_normalizer(movies['runtime'])

In [12]:
# now we have to create a column of tags which will comprise of all the string elements
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview'] + movies['cast'] + movies['crew']
movies = movies.loc[:, ['id', 'title'] + numerical_columns_to_select[1:] + ['tags']]

In [13]:
movies.head()

Unnamed: 0,id,title,budget,popularity,runtime,vote_average,tags
0,19995,Avatar,5.107917,4.053347,2.438374,7.2,"[action, adventur, fantasi, sciencefict, cultu..."
1,285,Pirates of the Caribbean: At World's End,6.655344,3.696374,2.748078,6.9,"[adventur, fantasi, action, ocean, drugabus, e..."
2,206647,Spectre,5.304416,2.699617,1.818967,6.3,"[action, adventur, crime, spi, basedonnovel, s..."
3,49026,The Dark Knight Rises,5.427227,2.854798,2.571105,7.6,"[action, crime, drama, thriller, dccomic, crim..."
4,49529,John Carter,5.672851,0.704903,1.111074,6.1,"[action, adventur, sciencefict, basedonnovel, ..."


### ***Now we have to select the tags column and and make a vector for any movie through the BAG of words approach***

In [14]:
def make_dict(tags, select=5000):
    # we will make a count dictionary and select the most common 5000 words
    open_vocab = {}
    for _,tag in enumerate(tags, 0):
        #if type(tag)==float:
            #print(f"i, tag, type(tag) tags = {i}, {tag}, {type(tag)}, {tags}")
        for word in tag:
            if word in open_vocab.keys():
                open_vocab[word]+=1
            else:
                open_vocab[word]=1
    sorted_open_vocab = sorted(open_vocab.items(), key=lambda x:x[1])[::-1] # descending order
    closed_vocab = dict(sorted_open_vocab[:select])
    print(f"Length of open vocab is {len(open_vocab)}\nLength of closed_vocab is {len(closed_vocab)}")
    return list(closed_vocab.keys())  # as this will be our dictionary 

select = 5000 # hyperparameter
vocab = make_dict(movies['tags'], select=select)
word2ind = {word:i for i,word in enumerate(vocab)}
ind2word = {i:word for i, word in enumerate(vocab)}

Length of open vocab is 32546
Length of closed_vocab is 5000


In [15]:
def vectorize(tags, word2ind):
    vectors =  []
    for tag in tags:
        vector = np.zeros(len(word2ind))
        for word in tag:
            if word in word2ind.keys():
                vector[word2ind[word]]+=1
        vectors.append(vector)
    vectors = np.array(vectors)
    print(f"vectors.shape = {vectors.shape}")
    return vectors

string_vectors = vectorize(movies['tags'], word2ind)

vectors.shape = (4806, 5000)


In [16]:
# now we have to form the final data by concatenating the string vectors with numerical vectors
numerical_vectors = movies.iloc[:, 2:-1].values
print(numerical_vectors.shape)
data = np.concatenate((string_vectors, numerical_vectors), axis=1)
print(data.shape)

(4806, 4)
(4806, 5004)


## ***Calculate the similarity metric***

In [17]:
def cosine_similarity_matrix(data):
    norm_vec = np.linalg.norm(data, axis=1)
    norm_vec = norm_vec.reshape(1, norm_vec.shape[0])
    norm_mat = norm_vec.T@norm_vec
    mat = (data@data.T)/norm_mat
    return mat

similarity_matrix_numerical = cosine_similarity_matrix(data)

### ***Now given a movie we have to recommend 5 most similar movies to it***

In [18]:
def recommend(movie_name, movies, similarity_matrix):
    # remember movies is a dataframe
    index = movies[movies['title']==movie_name].index
    index = index[0]
    # now we have the index number of the movie and we have to determine the top 5 movies of this dataset
    recommended_movies_index = list(np.argsort(similarity_matrix[index]))[::-1][1:6] # to recommend top 6 movies
    recommended_movies = movies['title'][recommended_movies_index]
    return list(recommended_movies)

In [19]:
idx = 5
print(f"Movies similar to {movies['title'][idx]} are - ")
recommend(movies['title'][idx], movies, similarity_matrix_numerical)

Movies similar to Spider-Man 3 are - 


['Spider-Man',
 'Spider-Man 2',
 "Pirates of the Caribbean: At World's End",
 'The Hobbit: The Desolation of Smaug',
 'Furious 7']

### ***Movies recommended are not good, let's try by dropping numerical_columns***

In [20]:
similarity_matrix_string_only = cosine_similarity_matrix(string_vectors)

In [21]:
idx = 90
print(f"Movies similar to {movies['title'][idx]} are - ")
recommend(movies['title'][idx], movies, similarity_matrix_string_only)

Movies similar to The Polar Express are - 


['The Calling', 'The Book of Life', 'The Hunt', 'Gun Shy', 'The Jungle Book']

In [22]:
idx = 5
print(f"Movies similar to {movies['title'][idx]} are - ")
recommend(movies['title'][idx], movies, similarity_matrix_string_only)

Movies similar to Spider-Man 3 are - 


['Spider-Man',
 'Spider-Man 2',
 'The Amazing Spider-Man 2',
 'The Amazing Spider-Man',
 'Arachnophobia']

In [23]:
recommend('Batman Begins', movies, similarity_matrix_string_only)

['The Dark Knight',
 'The Dark Knight Rises',
 'Batman',
 'Batman',
 'Batman & Robin']

### ***Now the recommendations are better, we can try out by bringing some valuable numerical columns but i am finally going with string_vectors only as they are performing better***
### ***The reason for better suggestion is that fact that the general perception of similar is that the movies should talk about same things but having same budget and rating does not ensure this. Thus. i dropped the numerical columns***

In [None]:
# for deployment purposes, then download the pickle files simply and do the work
# in local IDEs
movies = movies.loc[:, ['id', 'title']]
pickle.dump(movies, open('movies.pkl', 'wb'))
pickle.dump(similarity_matrix_string_only, open('similarity.pkl', 'wb'))
movies.head()