## Library

In [1]:
%matplotlib inline
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import nltk
import string
import pickle
import re
import math

from scipy import stats
from ast import literal_eval
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords

import warnings; warnings.simplefilter('ignore')

### Merging cast crew keyword data into main dataframe

#### Loading crew cast keyword into dataframe

In [2]:
credits = pd.read_csv('../the-movies-dataset/credits.csv')
keywords = pd.read_csv('../the-movies-dataset/keywords.csv')
md = pd.read_csv('../the-movies-dataset/movies_metadata.csv')

#### Loading TMDB_id value 

In [3]:
links = pd.read_csv('../the-movies-dataset/links_small.csv')
links[:10]

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


In [4]:
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')

In [5]:
links.shape

(9112,)

In [6]:
# this shits have wost id value 
md = md.drop([19730, 29503, 35587])

In [7]:
md['genres'] = md['genres'].apply(ast.literal_eval)
md['genres'] = md['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [8]:
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links)]
smd.shape

(9099, 24)

In [9]:
credits[:5]

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [10]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
cast    45476 non-null object
crew    45476 non-null object
id      45476 non-null int64
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [11]:
keywords[:5]

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [12]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
id          46419 non-null int64
keywords    46419 non-null object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [13]:
duplicateRowsDF = keywords[keywords.duplicated(['id'])]
duplicateRowsDF.shape

(987, 2)

In [14]:
keywords[keywords['id'] == 105045]

Unnamed: 0,id,keywords
676,105045,"[{'id': 7059, 'name': 'anti-communism'}, {'id'..."
1465,105045,"[{'id': 7059, 'name': 'anti-communism'}, {'id'..."


In [15]:
md[md['id'] == '1997-08-20']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count


In [16]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
smd['id'] = smd['id'].astype('int')

In [17]:
smd = smd.merge(credits, on='id')
smd = smd.merge(keywords, on='id')

In [18]:
smd.shape

(9219, 27)

In [19]:
N = len(smd)

#### Split date into year values

In [20]:
# when split release_date into 3 element array just take the first element which mean the year 
smd['year'] = pd.to_datetime(smd['release_date'], errors = 'coerce').apply(lambda x: str(x).split('-')[0] if x!= np.nan else np.nan)

#### HTML Image for views

In [21]:
base_poster_url = 'http://image.tmdb.org/t/p/w185/'
smd['poster_path'] = "<img src='" + base_poster_url + smd['poster_path'] + "' style='height:100px;'>"

#### Extract data-frame

In [22]:
smd.to_csv(r'../the-movies-dataset/movies_metadata_merge_crew_keywords.csv')

### Preprocessing

#### Cast crew 

In [22]:
# by using literal_eval => '[1,2]' would be converted to [1,2]

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [23]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
             return i['name']
    return np.nan        

In [24]:
# get director from 
smd['director'] = smd['crew'].apply(get_director)
smd['director']

0            John Lasseter
1             Joe Johnston
2            Howard Deutch
3          Forest Whitaker
4            Charles Shyer
               ...        
9214        Gregg Champion
9215     Tinu Suresh Desai
9216    Ashutosh Gowariker
9217          Hideaki Anno
9218            Ron Howard
Name: director, Length: 9219, dtype: object

In [25]:
smd.loc[0]['cast'][:3]

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15,
  'character': 'Buzz Lightyear (voice)',
  'credit_id': '52fe4284c3a36847f8024f99',
  'gender': 2,
  'id': 12898,
  'name': 'Tim Allen',
  'order': 1,
  'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'},
 {'cast_id': 16,
  'character': 'Mr. Potato Head (voice)',
  'credit_id': '52fe4284c3a36847f8024f9d',
  'gender': 2,
  'id': 7167,
  'name': 'Don Rickles',
  'order': 2,
  'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}]

In [26]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)

In [27]:
smd['cast'][:5]

0                  [Tom Hanks, Tim Allen, Don Rickles]
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2           [Walter Matthau, Jack Lemmon, Ann-Margret]
3    [Whitney Houston, Angela Bassett, Loretta Devine]
4           [Steve Martin, Diane Keaton, Martin Short]
Name: cast, dtype: object

In [28]:
# creating a metadata dump for every movie which consists of 
# genres, director, main actors and keywords. 

#Lower Capital letter
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ","")))

In [29]:
# Mention Director 3 times to give it more weight relative to the 
# entire cast.

smd['director'] = smd['director'].apply(lambda x: [x,x, x])
smd['director'][:5]

0          [johnlasseter, johnlasseter, johnlasseter]
1             [joejohnston, joejohnston, joejohnston]
2          [howarddeutch, howarddeutch, howarddeutch]
3    [forestwhitaker, forestwhitaker, forestwhitaker]
4          [charlesshyer, charlesshyer, charlesshyer]
Name: director, dtype: object

#### Keywords

#### Stop word

In [30]:
def remove_stop_word(data):
    stop_word = stopwords.words('english')

    if data not in stop_word: return data
    
    return ""

In [31]:
remove_stop_word("this is a mouse")

'this is a mouse'

In [32]:
smd.loc[0]['keywords']

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [33]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s

0          jealousy
0               toy
0               boy
0        friendship
0           friends
           ...     
9217    destruction
9217          kaiju
9217          toyko
9218          music
9218    documentary
Name: keyword, Length: 64407, dtype: object

In [34]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [35]:
s = s[s > 1]

In [36]:
# fillter keyword - only take keyword that present in more
# than one document
def filter_keyword(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words        

In [37]:
smd['keywords'] = smd['keywords'].apply(filter_keyword)

In [38]:
stemmer = SnowballStemmer('english')

In [39]:
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [remove_stop_word(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [40]:
smd["soup"] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']

In [41]:
# changing ',' with ' '
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

#### Count Vectorizer Using Sklearn Library

In [42]:
# we do not want to down-weight the presence of an actor/director if he or she has acted or directed in
# relatively more movies.

count = CountVectorizer(analyzer = 'word',ngram_range=(1, 2),min_df=0,stop_words=None,tokenizer=None)
count_matrix = count.fit_transform(smd['soup'])
count_matrix.shape

(9219, 107488)

In [43]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [44]:
#When we reset the index, the old index is added as a column, and a new sequential index is used
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [45]:
indices['The Dark Knight']

6981

#### From scatch

In [46]:
def cosine_sim_calculate(a,b):
    cos_sim = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [47]:
processed_text = {}
count = 0

for index, row in smd.iterrows():
    soup = row['soup']
    processed_text[count] = word_tokenize(str(soup))
    count += 1


##### Create Corpus Bag

In [48]:
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

total_vocab = [x for x in DF]
total_vocab_size = len(total_vocab)

In [49]:
D = np.zeros((N,total_vocab_size))

for i in processed_text:
    counter = Counter(processed_text[i])
    document = processed_text[i]
    for token in document:
        try:
            ind = total_vocab.index(token)
            D[i][ind] = counter[token]
        except:
            pass 
   

In [50]:
def gen_vector(id):
    result = []
    for i in D:
        result.append(cosine_sim_calculate(D[id],i))
    
    return result

In [51]:
def get_recommendations_ver2(title,k):
    idx = []
    idx.append(indices[title])
    result = {}
    
    
    for i in idx:
        out = np.array(gen_vector(i)).argsort()[::-1]
        result[i,title] = out[1:k]
        
    return result

In [52]:
result = get_recommendations_ver2("The Dark Knight",11)

In [53]:
f = open('Store_Cast_Director_Genres_contentBased.pckl','wb')
pickle.dump([titles,indices],f)
f.close()

In [54]:
f = open('Store_Cast_Director_Genres_contentBased_2.pckl','wb')
pickle.dump(cosine_sim,f)
f.close()

In [None]:
np.save('count_vector_matrix.npy', D)