In [1]:
%matplotlib inline
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

from IPython.display import Image, HTML, display
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise.model_selection import cross_validate
from surprise import Reader, SVD, Dataset


import warnings; warnings.simplefilter('ignore')

## Preprocess DataFrame

In [2]:
# Load the movie metadata CSV into the `md` DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
md = pd.read_csv('../the-movies-dataset/movies_metadata.csv')

In [3]:
# Print the per-column summary (non-null counts and dtypes) of `md`.
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [4]:
# Preview the first five movies transposed (one row per field of `md`),
# keeping only the first 20 fields.
md.head().T.iloc[:20]

Unnamed: 0,0,1,2,3,4
adult,False,False,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...",,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",,"{'id': 96871, 'name': 'Father of the Bride Col..."
budget,30000000,65000000,0,16000000,0
genres,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 35, 'name': 'Comedy'}]"
homepage,http://toystory.disney.com/toy-story,,,,
id,862,8844,15602,31357,11862
imdb_id,tt0114709,tt0113497,tt0113228,tt0114885,tt0113041
original_language,en,en,en,en,en
original_title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...,A family wedding reignites the ancient feud be...,"Cheated on, mistreated and stepped on, the wom...",Just when George Banks has recovered from his ...


### Split date into year values

In [5]:
# Extract the release year as a string (e.g. '1995') from `release_date`.
# Dates that cannot be parsed are coerced to NaT and mapped to NaN.
# The missing-value test must be pd.notnull: `x != np.nan` is always True,
# which previously turned every missing date into the literal string 'NaT'.
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [6]:
# Summarise the `genres` column on its own as a one-column frame.
md['genres'].to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 1 columns):
genres    45466 non-null object
dtypes: object(1)
memory usage: 355.3+ KB


In [7]:
# Look at the raw `genres` value of the first movie (single label-based lookup).
md.loc[0, 'genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [8]:
# List the rows whose `genres` value is missing.
md.loc[md['genres'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year


In [9]:
# Parse each stringified genre list into Python objects.
# Only strings are parsed: values that are already parsed (e.g. when this cell
# is re-run) or that are not strings are passed through unchanged instead of
# making ast.literal_eval raise.
md['genres'] = md['genres'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [10]:
# Reduce every parsed genre entry to the list of its genre names;
# any value that is not a list becomes an empty list.
md['genres'] = md['genres'].map(
    lambda entry: [genre['name'] for genre in entry] if isinstance(entry, list) else []
)

### Get the list of genres

In [11]:
# Show the genre-name lists of the first 20 movies.
md['genres'].head(20)

0            [Animation, Comedy, Family]
1           [Adventure, Fantasy, Family]
2                      [Romance, Comedy]
3               [Comedy, Drama, Romance]
4                               [Comedy]
5       [Action, Crime, Drama, Thriller]
6                      [Comedy, Romance]
7     [Action, Adventure, Drama, Family]
8          [Action, Adventure, Thriller]
9          [Adventure, Action, Thriller]
10              [Comedy, Drama, Romance]
11                      [Comedy, Horror]
12        [Family, Animation, Adventure]
13                      [History, Drama]
14                   [Action, Adventure]
15                        [Drama, Crime]
16                      [Drama, Romance]
17                       [Crime, Comedy]
18            [Crime, Comedy, Adventure]
19               [Action, Comedy, Crime]
Name: genres, dtype: object

In [12]:
# Turn each movie's genre list into a Series, row by row, so the result is a
# wide frame that keeps the movie index of `md`.
s = md.apply(lambda row: pd.Series(row['genres']), axis='columns')

In [13]:
# NOTE(review): this cell repeats the computation of the previous cell
# unchanged; it rebuilds `s` from `md` a second time.
s = md.apply(lambda x: pd.Series(x['genres']), axis = 1)

In [14]:
# Flatten the wide frame to one genre per row, then drop the inner index
# level so every value is indexed by its movie only.
s = (
    s.stack()
     .reset_index(level=1, drop=True)
)

### List genres

In [15]:
# Show the first five flattened genre values.
s.head(5)

0    Animation
0       Comedy
0       Family
1    Adventure
1      Fantasy
dtype: object

In [16]:
# Name the series 'genre'; the join in the next step uses this name as the
# new column name.
s = s.rename('genre')

In [17]:
# Replace the list-valued `genres` column by the flattened `genre` series:
# joining on the index repeats each movie once per genre.
gen_md = md.drop('genres', axis='columns').join(s, how='left')

In [18]:
# Show the first five rows of the one-genre-per-row frame.
gen_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,genre
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Animation
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Comedy
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,Family
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Adventure
1,False,,65000000,,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,Fantasy


In [19]:
# Count the distinct (non-missing) genre values.
gen_md['genre'].nunique()

32

In [20]:
# Count how many rows each genre has and turn the counts into a frame
# with the genre moved from the index into a regular column.
pop_gen = gen_md['genre'].value_counts().to_frame().reset_index()

In [21]:
# NOTE(review): this assigns to an attribute named `column`, not `columns`,
# so no column of `pop_gen` is renamed; the following cell still reads the
# 'index' column. Presumably `pop_gen.columns = [...]` was intended. Confirm
# before changing it, because cells that use the current column names would
# have to be updated together with this line.
pop_gen.column = ['genre','movies']

In [22]:
# Show the five most frequent genres (the 'index' column holds the genre names).
pop_gen['index'].head()

0       Drama
1      Comedy
2    Thriller
3     Romance
4      Action
Name: index, dtype: object