# Import 

In [1]:
import warnings;warnings.filterwarnings('ignore')
import os,sys,joblib,json,math
from datetime import date,time,datetime,timedelta
from tqdm import tqdm;tqdm.pandas()
import numpy as np,pandas as pd
pd.set_option('display.max_columns',None)

from ast import literal_eval


from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Helper Functions

In [3]:
def convert_genres_and_keywords(obj):
    obj = literal_eval(obj)
    extracted_genres = [i.get('name') for i in obj]
    return extracted_genres

def convert_cast(obj):
    obj = literal_eval(obj)
    if len(obj)>=3:
        obj = obj[:3]
    top_3_cast_name = [i.get('name') for i in obj]
    return top_3_cast_name

def fetch_director_from_crew(obj):
    obj= literal_eval(obj)
    director_list = [i.get('name') for i in obj if i.get('job') == 'Director']
    return director_list

def removing_spaces(obj_list):
    obj_list =[i.title().replace(" ","") for i in obj_list]
    return obj_list

# Loading Data

In [10]:
# Upload Data
df_movies = pd.read_csv('data/tmdb_5000_movies.csv')
df_credits = pd.read_csv('data/tmdb_5000_credits.csv')
df_movies = df_movies.merge(df_credits,on='title')
df_movies= df_movies[['genres', 'id', 'keywords', 'title', 'overview', 'cast', 'crew']]
df_movies.dropna(inplace=True)

In [None]:
# PostProcessing
df_movies['genres'] = df_movies.genres.progress_apply(lambda x:convert_genres_and_keywords(x))
df_movies['keywords'] = df_movies.keywords.progress_apply(lambda x:convert_genres_and_keywords(x))
df_movies['cast'] = df_movies.cast.progress_apply(lambda x : convert_cast(x))
df_movies['director_name'] = df_movies.crew.progress_apply(lambda x : fetch_director_from_crew(x))

# Removng Spaces
df_movies['genres'] = df_movies.genres.progress_apply(lambda x:removing_spaces(x))
df_movies['keywords'] = df_movies.keywords.progress_apply(lambda x:removing_spaces(x))
df_movies['cast'] = df_movies.cast.progress_apply(lambda x : removing_spaces(x))
df_movies['director_name'] = df_movies.director_name.progress_apply(lambda x : removing_spaces(x))

# Converting Overview into string
df_movies['overview'] = df_movies.overview.progress_apply(lambda x: x.split())

# Creating tgs
df_movies['tags'] = df_movies.overview + df_movies.genres + df_movies.cast + df_movies.director_name
df_movies['tags'] = df_movies.tags.progress_apply(lambda x:' '.join(x)).str.lower()


# Taking only required columns
df_movies = df_movies['id title tags'.split()]

100%|██████████| 4806/4806 [00:00<00:00, 24070.88it/s]
100%|██████████| 4806/4806 [00:00<00:00, 14832.89it/s]
100%|██████████| 4806/4806 [00:02<00:00, 1803.94it/s]
100%|██████████| 4806/4806 [00:02<00:00, 1640.07it/s]
100%|██████████| 4806/4806 [00:00<00:00, 480738.00it/s]
100%|██████████| 4806/4806 [00:00<00:00, 244713.98it/s]
100%|██████████| 4806/4806 [00:00<00:00, 369109.81it/s]
100%|██████████| 4806/4806 [00:00<00:00, 599560.54it/s]
100%|██████████| 4806/4806 [00:00<00:00, 177884.09it/s]
100%|██████████| 4806/4806 [00:00<00:00, 370160.40it/s]


# Text Pre-Processing

In [None]:
# Converting title into lower case
df_movies['title'] = df_movies.title.str.lower()

In [6]:
# DO the stemming of the tags
def get_steming_of_tag(str_obj):
    lst_obj = str_obj.split()
    lst_obj_stem = [ps_obj.stem(i) for i in lst_obj]
    str_obj_stemped = " ".join(lst_obj_stem)
    return str_obj_stemped

ps_obj = PorterStemmer()
df_movies['tags'] = df_movies.tags.progress_apply(lambda x:get_steming_of_tag(x))

100%|██████████| 4806/4806 [00:04<00:00, 1192.27it/s]


# Converting string into respective vector

### BOW

In [7]:
# Reset Index of DataFrame
df_movies = df_movies.reset_index(drop=True)

# Create Text to Vector Embedding of tags
count_vec_obj = CountVectorizer(max_features=5000,stop_words='english')
tag_vector = count_vec_obj.fit_transform(df_movies.tags).toarray()

# Create Similarity Matrics from vector
similarity_matrics = cosine_similarity(tag_vector)

In [8]:
def recommend(movie_title, k=5):
    movie_index = df_movies[df_movies.title == movie_title].index[0]
    movie_distances = similarity_matrics[movie_index]
    sorted_index = np.argsort(movie_distances)[::-1]
    sorted_scores =movie_distances[sorted_index]
    top_reco_indexes = sorted_index[1:k+1]
    top_movies_list = df_movies.title.iloc[top_reco_indexes].tolist()
    return top_movies_list

df_movies['reco_movies'] = df_movies.title.progress_apply(lambda x: recommend(x))

100%|██████████| 4806/4806 [00:03<00:00, 1351.62it/s]


In [9]:
df_movies.head(10)

Unnamed: 0,id,title,tags,reco_movies
0,19995,avatar,"in the 22nd century, a parapleg marin is dispa...","[mad max beyond thunderdome, the helix... load..."
1,285,pirates of the caribbean: at world's end,"captain barbossa, long believ to be dead, ha c...","[pirates of the caribbean: dead man's chest, t..."
2,206647,spectre,a cryptic messag from bond’ past send him on a...,"[quantum of solace, never say never again, fro..."
3,49026,the dark knight rises,follow the death of district attorney harvey d...,"[the dark knight, batman forever, batman, amid..."
4,49529,john carter,"john carter is a war-weary, former militari ca...","[krrish, stripes, get carter, the other side o..."
5,559,spider-man 3,the seemingli invinc spider-man goe up against...,"[spider-man 2, spider-man, the amazing spider-..."
6,38757,tangled,when the kingdom' most wanted-and most charmin...,"[out of inferno, the thief and the cobbler, at..."
7,99861,avengers: age of ultron,when toni stark tri to jumpstart a dormant pea...,"[iron man 2, iron man 3, iron man, the helix....."
8,767,harry potter and the half-blood prince,"as harri begin hi sixth year at hogwarts, he d...","[harry potter and the goblet of fire, harry po..."
9,209112,batman v superman: dawn of justice,fear the action of a god-lik super hero left u...,"[thor, last action hero, man of steel, defendo..."
