## Library

In [47]:
%matplotlib inline
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import nltk
import string
import pickle
import re
import math

from scipy import stats
from ast import literal_eval
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords

import warnings; warnings.simplefilter('ignore')

### Merging cast crew keyword data into main dataframe

#### Loading crew cast keyword into dataframe

In [48]:
credits = pd.read_csv('../the-movies-dataset/credits.csv')
keywords = pd.read_csv('../the-movies-dataset/keywords.csv')
smd = pd.read_csv('../the-movies-dataset/movies_metadata_equal_ratings.csv')

#### Loading TMDB_id value 

In [49]:
smd['genres'] = smd['genres'].apply(ast.literal_eval)

In [50]:
smd['genres'].head()

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
2               [Romance, Comedy]
3        [Comedy, Drama, Romance]
4                        [Comedy]
Name: genres, dtype: object

In [51]:
smd.shape

(9025, 26)

In [52]:
credits[:5]

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [53]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
cast    45476 non-null object
crew    45476 non-null object
id      45476 non-null int64
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [54]:
keywords[:5]

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [55]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
id          46419 non-null int64
keywords    46419 non-null object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [56]:
keywords = keywords.drop(keywords[keywords.duplicated(['id'])].index)
keywords.shape

(45432, 2)

In [57]:
credits = credits.drop(credits[credits.duplicated(['id'])].index)
credits.shape

(45432, 3)

In [58]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
smd['id'] = smd['id'].astype('int')

In [59]:
smd = smd.merge(credits, on='id')
smd = smd.merge(keywords, on='id')

In [60]:
smd.shape

(9025, 29)

In [61]:
N = len(smd)
N

9025

In [62]:
smd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,description,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,A family wedding reignites the ancient feud be...,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Just when George Banks has recovered from his ...,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9020,False,"{'id': 286023, 'name': 'Sharknado Collection',...",0,"[Comedy, Horror, Science Fiction]",http://www.syfy.com/sharknado4,390989,tt4831420,en,Sharknado 4: The 4th Awakens,The new installment of the Sharknado franchise...,...,"What happens in Vegas, stays in Vegas. Unless ...",Sharknado 4: The 4th Awakens,False,4.3,88.0,The new installment of the Sharknado franchise...,2016,"[{'cast_id': 0, 'character': 'Fin Shepard', 'c...","[{'credit_id': '56ffae0cc3a3686ea7001e00', 'de...","[{'id': 2988, 'name': 'shark attack'}, {'id': ..."
9021,False,,8000000,[Drama],,159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,...,,The Last Brickmaker in America,False,7.0,1.0,A man must cope with the loss of his wife and ...,2001,"[{'cast_id': 1, 'character': 'Henry Cobb', 'cr...","[{'credit_id': '544475aac3a36819fb000578', 'de...","[{'id': 6054, 'name': 'friendship'}, {'id': 20..."
9022,False,,1000000,"[Thriller, Romance]",,392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",...,Decorated Officer. Devoted Family Man. Defendi...,Rustom,False,7.3,25.0,"Rustom Pavri, an honourable officer of the Ind...",2016,"[{'cast_id': 0, 'character': 'Rustom Pavri', '...","[{'credit_id': '5951baf692514129c4016600', 'de...","[{'id': 10540, 'name': 'bollywood'}]"
9023,False,,15050000,"[Adventure, Drama, History, Romance]",,402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",...,,Mohenjo Daro,False,6.7,26.0,"Village lad Sarman is drawn to big, bad Mohenj...",2016,"[{'cast_id': 0, 'character': 'Sarman', 'credit...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...","[{'id': 10540, 'name': 'bollywood'}]"


### Preprocessing

#### Cast crew 

In [63]:
# by using literal_eval => '[1,2]' would be converted to [1,2]

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [64]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
             return i['name']
    return np.nan        

In [65]:
# get director from 
smd['director'] = smd['crew'].apply(get_director)
smd['director']

0             John Lasseter
1              Joe Johnston
2             Howard Deutch
3           Forest Whitaker
4             Charles Shyer
               ...         
9020    Anthony C. Ferrante
9021         Gregg Champion
9022      Tinu Suresh Desai
9023     Ashutosh Gowariker
9024             Ron Howard
Name: director, Length: 9025, dtype: object

In [66]:
smd.loc[8711]['cast']

[{'cast_id': 76,
  'character': 'Tony Stark / Iron Man',
  'credit_id': '55e256d292514162cd000e40',
  'gender': 2,
  'id': 3223,
  'name': 'Robert Downey Jr.',
  'order': 0,
  'profile_path': '/1YjdSym1jTG7xjHSI0yGGWEsw5i.jpg'},
 {'cast_id': 8,
  'character': 'Thor Odinson',
  'credit_id': '52fe4a449251416c750e3455',
  'gender': 2,
  'id': 74568,
  'name': 'Chris Hemsworth',
  'order': 1,
  'profile_path': '/tlkDiLn2G75Xr7m1ybK8QFzZBso.jpg'},
 {'cast_id': 15,
  'character': 'Bruce Banner / Hulk',
  'credit_id': '52fe4a449251416c750e3471',
  'gender': 2,
  'id': 103,
  'name': 'Mark Ruffalo',
  'order': 2,
  'profile_path': '/zdM6RgCR5LpZwnL8UA3m7CfVpiq.jpg'},
 {'cast_id': 12,
  'character': 'Steve Rogers / Captain America',
  'credit_id': '52fe4a449251416c750e3465',
  'gender': 2,
  'id': 16828,
  'name': 'Chris Evans',
  'order': 3,
  'profile_path': '/8CgFKCZJVwZxa1F88n8drEux0vT.jpg'},
 {'cast_id': 10,
  'character': 'Natasha Romanoff / Black Widow',
  'credit_id': '52fe4a449251416c7

In [69]:
smd['character'] = smd['cast'].apply(lambda x: [i['character'] for i in x] if isinstance(x, list) else [])
smd['character'] = smd['character'].apply(lambda x: x[:18] if len(x) >= 18 else x)

In [70]:
smd['character'][8711]

['Tony Stark / Iron Man',
 'Thor Odinson',
 'Bruce Banner / Hulk',
 'Steve Rogers / Captain America',
 'Natasha Romanoff / Black Widow',
 'Clint Barton / Hawkeye',
 'Ultron (voice)',
 'Nick Fury',
 'James Rhodes / War Machine',
 'Pietro Maximoff / Quicksilver',
 'Wanda Maximoff / Scarlet Witch',
 'Jarvis (voice) / Vision',
 'Maria Hill',
 'Sam Wilson / The Falcon',
 'Peggy Carter',
 'Heimdall',
 'Laura Barton',
 'Erik Selvig']

In [71]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 18 else x)

In [72]:
smd['cast'].head()

0    [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2    [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...
3    [Whitney Houston, Angela Bassett, Loretta Devi...
4    [Steve Martin, Diane Keaton, Martin Short, Kim...
Name: cast, dtype: object

In [23]:
# creating a metadata dump for every movie which consists of 
# genres, director, main actors and keywords. 

#Lower Capital letter
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ","")))

In [24]:
# Mention Director 3 times to give it more weight relative to the 
# entire cast.

smd['director'] = smd['director'].apply(lambda x: [x,x,x])
smd['director'][:5]

0          [johnlasseter, johnlasseter, johnlasseter]
1             [joejohnston, joejohnston, joejohnston]
2          [howarddeutch, howarddeutch, howarddeutch]
3    [forestwhitaker, forestwhitaker, forestwhitaker]
4          [charlesshyer, charlesshyer, charlesshyer]
Name: director, dtype: object

#### Keywords

In [73]:
smd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_count,description,year,cast,crew,keywords,cast_size,crew_size,director,character
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",13,106,John Lasseter,"[Woody (voice), Buzz Lightyear (voice), Mr. Po..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,2413.0,When siblings Judy and Peter discover an encha...,1995,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",26,16,Joe Johnston,"[Alan Parrish, Samuel Alan Parrish / Van Pelt,..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,92.0,A family wedding reignites the ancient feud be...,1995,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",7,4,Howard Deutch,"[Max Goldman, John Gustafson, Ariel Gustafson,..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,34.0,"Cheated on, mistreated and stepped on, the wom...",1995,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",10,10,Forest Whitaker,"[Savannah 'Vannah' Jackson, Bernadine 'Bernie'..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,173.0,Just when George Banks has recovered from his ...,1995,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",12,7,Charles Shyer,"[George Banks, Nina Banks, Franck Eggelhoffer,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9020,False,"{'id': 286023, 'name': 'Sharknado Collection',...",0,"[Comedy, Horror, Science Fiction]",http://www.syfy.com/sharknado4,390989,tt4831420,en,Sharknado 4: The 4th Awakens,The new installment of the Sharknado franchise...,...,88.0,The new installment of the Sharknado franchise...,2016,"[Ian Ziering, Tara Reid, David Hasselhoff]","[{'credit_id': '56ffae0cc3a3686ea7001e00', 'de...","[{'id': 2988, 'name': 'shark attack'}, {'id': ...",76,15,Anthony C. Ferrante,"[Fin Shepard, April Wexler, Colonel Gilbert Sh..."
9021,False,,8000000,[Drama],,159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,...,1.0,A man must cope with the loss of his wife and ...,2001,"[Sidney Poitier, Wendy Crewson, Jay O. Sanders...","[{'credit_id': '544475aac3a36819fb000578', 'de...","[{'id': 6054, 'name': 'friendship'}, {'id': 20...",7,2,Gregg Champion,"[Henry Cobb, Karen Potter, Mike Potter, Doroth..."
9022,False,,1000000,"[Thriller, Romance]",,392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",...,25.0,"Rustom Pavri, an honourable officer of the Ind...",2016,"[Akshay Kumar, Ileana D'Cruz, Esha Gupta, Arja...","[{'credit_id': '5951baf692514129c4016600', 'de...","[{'id': 10540, 'name': 'bollywood'}]",14,16,Tinu Suresh Desai,"[Rustom Pavri, Cynthia Rustom Pavri, Priti Mak..."
9023,False,,15050000,"[Adventure, Drama, History, Romance]",,402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",...,26.0,"Village lad Sarman is drawn to big, bad Mohenj...",2016,"[Hrithik Roshan, Pooja Hegde, Kabir Bedi, Arun...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...","[{'id': 10540, 'name': 'bollywood'}]",12,16,Ashutosh Gowariker,"[Sarman, Chaani, Maham, Senate Chief, Moonja, ..."


#### Stop word

In [26]:
def remove_stop_word(data):
    stop_word = stopwords.words('english')

    if data not in stop_word: return data
    
    return ""

In [27]:
remove_stop_word("this is a mouse")

'this is a mouse'

In [28]:
smd.loc[0]['keywords']

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [74]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s

0           jealousy
0                toy
0                boy
0         friendship
0            friends
            ...     
9021    brick making
9022       bollywood
9023       bollywood
9024           music
9024     documentary
Name: keyword, Length: 63191, dtype: object

In [75]:
s = s.value_counts()
s[:5]

independent film        601
woman director          535
murder                  392
duringcreditsstinger    326
based on novel          307
Name: keyword, dtype: int64

In [76]:
smd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_count,description,year,cast,crew,keywords,cast_size,crew_size,director,character
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",13,106,John Lasseter,"[Woody (voice), Buzz Lightyear (voice), Mr. Po..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,2413.0,When siblings Judy and Peter discover an encha...,1995,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",26,16,Joe Johnston,"[Alan Parrish, Samuel Alan Parrish / Van Pelt,..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,92.0,A family wedding reignites the ancient feud be...,1995,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",7,4,Howard Deutch,"[Max Goldman, John Gustafson, Ariel Gustafson,..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,34.0,"Cheated on, mistreated and stepped on, the wom...",1995,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",10,10,Forest Whitaker,"[Savannah 'Vannah' Jackson, Bernadine 'Bernie'..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,173.0,Just when George Banks has recovered from his ...,1995,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",12,7,Charles Shyer,"[George Banks, Nina Banks, Franck Eggelhoffer,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9020,False,"{'id': 286023, 'name': 'Sharknado Collection',...",0,"[Comedy, Horror, Science Fiction]",http://www.syfy.com/sharknado4,390989,tt4831420,en,Sharknado 4: The 4th Awakens,The new installment of the Sharknado franchise...,...,88.0,The new installment of the Sharknado franchise...,2016,"[Ian Ziering, Tara Reid, David Hasselhoff]","[{'credit_id': '56ffae0cc3a3686ea7001e00', 'de...","[shark attack, sequel, farce, las vegas, creat...",76,15,Anthony C. Ferrante,"[Fin Shepard, April Wexler, Colonel Gilbert Sh..."
9021,False,,8000000,[Drama],,159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,...,1.0,A man must cope with the loss of his wife and ...,2001,"[Sidney Poitier, Wendy Crewson, Jay O. Sanders...","[{'credit_id': '544475aac3a36819fb000578', 'de...","[friendship, brick making]",7,2,Gregg Champion,"[Henry Cobb, Karen Potter, Mike Potter, Doroth..."
9022,False,,1000000,"[Thriller, Romance]",,392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",...,25.0,"Rustom Pavri, an honourable officer of the Ind...",2016,"[Akshay Kumar, Ileana D'Cruz, Esha Gupta, Arja...","[{'credit_id': '5951baf692514129c4016600', 'de...",[bollywood],14,16,Tinu Suresh Desai,"[Rustom Pavri, Cynthia Rustom Pavri, Priti Mak..."
9023,False,,15050000,"[Adventure, Drama, History, Romance]",,402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",...,26.0,"Village lad Sarman is drawn to big, bad Mohenj...",2016,"[Hrithik Roshan, Pooja Hegde, Kabir Bedi, Arun...","[{'credit_id': '57cd5d3592514179d50018e8', 'de...",[bollywood],12,16,Ashutosh Gowariker,"[Sarman, Chaani, Maham, Senate Chief, Moonja, ..."


In [78]:
# fillter keyword - only take keyword that present in more
# than one document
def filter_keyword(x):
    words = []
    for i in x:
            words.append(i)
    return words        

In [79]:
smd['keywords'] = smd['keywords'].apply(filter_keyword)

In [81]:
smd['keywords']

0       [jealousy, toy, boy, friendship, friends, riva...
1       [board game, disappearance, based on children'...
2       [fishing, best friend, duringcreditsstinger, o...
3       [based on novel, interracial relationship, sin...
4       [baby, midlife crisis, confidence, aging, daug...
                              ...                        
9020    [shark attack, sequel, farce, las vegas, creat...
9021                           [friendship, brick making]
9022                                          [bollywood]
9023                                          [bollywood]
9024                                 [music, documentary]
Name: keywords, Length: 9025, dtype: object

In [35]:
stemmer = SnowballStemmer('english')

In [36]:
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [remove_stop_word(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [93]:
smd["soup"] = smd['character'] + smd['keywords'] + smd['cast']  

In [94]:
smd["soup"]

0       [Woody (voice), Buzz Lightyear (voice), Mr. Po...
1       [Alan Parrish, Samuel Alan Parrish / Van Pelt,...
2       [Max Goldman, John Gustafson, Ariel Gustafson,...
3       [Savannah 'Vannah' Jackson, Bernadine 'Bernie'...
4       [George Banks, Nina Banks, Franck Eggelhoffer,...
                              ...                        
9020    [Fin Shepard, April Wexler, Colonel Gilbert Sh...
9021    [Henry Cobb, Karen Potter, Mike Potter, Doroth...
9022    [Rustom Pavri, Cynthia Rustom Pavri, Priti Mak...
9023    [Sarman, Chaani, Maham, Senate Chief, Moonja, ...
9024    [Himself, Himself, Himself (archive footage), ...
Name: soup, Length: 9025, dtype: object

In [95]:
# changing ',' with ' '
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [96]:
smd["soup"]

0       Woody (voice) Buzz Lightyear (voice) Mr. Potat...
1       Alan Parrish Samuel Alan Parrish / Van Pelt Ju...
2       Max Goldman John Gustafson Ariel Gustafson Mar...
3       Savannah 'Vannah' Jackson Bernadine 'Bernie' H...
4       George Banks Nina Banks Franck Eggelhoffer Ann...
                              ...                        
9020    Fin Shepard April Wexler Colonel Gilbert Shepa...
9021    Henry Cobb Karen Potter Mike Potter Dorothy Co...
9022    Rustom Pavri Cynthia Rustom Pavri Priti Makhij...
9023    Sarman Chaani Maham, Senate Chief Moonja, Maha...
9024    Himself Himself Himself (archive footage) Hims...
Name: soup, Length: 9025, dtype: object

In [97]:
cast_crew_text = pd.DataFrame(smd["soup"])
cast_crew_text

Unnamed: 0,soup
0,Woody (voice) Buzz Lightyear (voice) Mr. Potat...
1,Alan Parrish Samuel Alan Parrish / Van Pelt Ju...
2,Max Goldman John Gustafson Ariel Gustafson Mar...
3,Savannah 'Vannah' Jackson Bernadine 'Bernie' H...
4,George Banks Nina Banks Franck Eggelhoffer Ann...
...,...
9020,Fin Shepard April Wexler Colonel Gilbert Shepa...
9021,Henry Cobb Karen Potter Mike Potter Dorothy Co...
9022,Rustom Pavri Cynthia Rustom Pavri Priti Makhij...
9023,"Sarman Chaani Maham, Senate Chief Moonja, Maha..."


In [100]:
cast_crew_text.to_csv(r'../the-movies-dataset/cast_crew_text.csv', index=False)

#### Count Vectorizer Using Sklearn Library

In [39]:
# we do not want to down-weight the presence of an actor/director if he or she has acted or directed in
# relatively more movies.

count = CountVectorizer(analyzer = 'word',ngram_range=(1, 2),min_df=0,stop_words=None,tokenizer=None)
count_matrix = count.fit_transform(smd['soup'])
count_matrix.shape

(9025, 106807)

In [40]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [41]:
cosine_sim

array([[1.        , 0.02441931, 0.02738955, ..., 0.        , 0.        ,
        0.        ],
       [0.02441931, 1.        , 0.        , ..., 0.        , 0.02973505,
        0.        ],
       [0.02738955, 0.        , 1.        , ..., 0.03456506, 0.03335187,
        0.        ],
       ...,
       [0.        , 0.        , 0.03456506, ..., 1.        , 0.07147417,
        0.        ],
       [0.        , 0.02973505, 0.03335187, ..., 0.07147417, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [42]:
np.save('cosine_sim_ver2', cosine_sim)

In [43]:
cosine_sim = np.load('cosine_sim_ver2.npy')

In [44]:
cosine_sim

array([[1.        , 0.02441931, 0.02738955, ..., 0.        , 0.        ,
        0.        ],
       [0.02441931, 1.        , 0.        , ..., 0.        , 0.02973505,
        0.        ],
       [0.02738955, 0.        , 1.        , ..., 0.03456506, 0.03335187,
        0.        ],
       ...,
       [0.        , 0.        , 0.03456506, ..., 1.        , 0.07147417,
        0.        ],
       [0.        , 0.02973505, 0.03335187, ..., 0.07147417, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [45]:
#When we reset the index, the old index is added as a column, and a new sequential index is used
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [46]:
indices['The Dark Knight']

6873

#### From scatch

In [43]:
def cosine_sim_calculate(a,b):
    cos_sim = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

##### create a processed_text by getting 'soup' row in smd

In [44]:
processed_text = {}
count = 0

for index, row in smd.iterrows():
    soup = row['soup']
    processed_text[count] = word_tokenize(str(soup))
    count += 1


##### Create Corpus Bag

In [45]:
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

total_vocab = [x for x in DF]
total_vocab_size = len(total_vocab)

In [46]:
D = np.zeros((N,total_vocab_size))

for i in processed_text:
    counter = Counter(processed_text[i])
    document = processed_text[i]
    for token in document:
        try:
            ind = total_vocab.index(token)
            D[i][ind] = counter[token]
        except:
            pass 
   

In [47]:
def gen_vector(id):
    result = []
    for i in D:
        result.append(cosine_sim_calculate(D[id],i))
    
    return result

In [48]:
def get_recommendations_ver2(title,k):
    idx = []
    idx.append(indices[title])
    result = {}
    
    
    for i in idx:
        out = np.array(gen_vector(i)).argsort()[::-1]
        result[i,title] = out[1:k]
        
    return result

In [49]:
#result = get_recommendations_ver2('The Dark Knight',10)
#for i in result:
 #   print(smd.iloc[result[i]].title)

In [50]:
f = open('Store_Cast_Director_Genres_contentBased.pckl','wb')
pickle.dump([titles,indices],f)
f.close()

In [51]:
f = open('Store_Cast_Director_Genres_contentBased_2.pckl','wb')
pickle.dump(cosine_sim,f)
f.close()

In [None]:
np.save('count_vector_matrix.npy', D)