In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import json

In [2]:
import ast
import scipy as stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

# Suppress every warning for the rest of the session. Note that this also hides
# warnings emitted by pandas and the other libraries imported above.
import warnings; warnings.simplefilter('ignore')

In [3]:
# Load the raw CSV exports from ./test_data.
credits = pd.read_csv('./test_data/credits.csv')
keywords = pd.read_csv('./test_data/keywords.csv')
links = pd.read_csv('./test_data/links.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file.
# The default chunked parser may infer a different type per chunk and leave a
# single column holding a mix of numbers and strings.
metaData = pd.read_csv('./test_data/movies_metadata.csv', low_memory=False)
ratings = pd.read_csv('./test_data/ratings.csv')

In [4]:
# Preview the first five rows of the raw metadata.
metaData.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# Print each column's dtype and non-null count.
metaData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [6]:
# Show the column labels of the metadata frame.
metaData.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [7]:
# (rows, columns) of the metadata frame as loaded.
metaData.shape

(45466, 24)

In [8]:
# Count the missing values in every column of the raw metadata.
metaData.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [9]:
def _genre_names(raw):
    """Return the list of genre names held in one ``genres`` cell.

    ``raw`` is either the stringified list of ``{'id': ..., 'name': ...}``
    dicts read from the CSV, or a list left behind by an earlier run of this
    cell. Anything that does not evaluate to a list yields ``[]``. A string
    that is not a valid Python literal still raises from ``ast.literal_eval``.
    """
    parsed = ast.literal_eval(raw) if isinstance(raw, str) else raw
    if not isinstance(parsed, list):
        return []
    # Entries that are already plain names (cell re-run) are kept unchanged.
    return [entry['name'] if isinstance(entry, dict) else entry for entry in parsed]


# Replace each stringified genre list with the plain list of genre names.
# Going through the helper keeps the cell safe to re-run: values that are
# already lists are no longer fed to ast.literal_eval, which rejects them.
metaData['genres'] = metaData['genres'].fillna('[]').apply(_genre_names)

In [10]:
# Inspect the raw belongs_to_collection values.
metaData.loc[:, 'belongs_to_collection']

0        {'id': 10194, 'name': 'Toy Story Collection', ...
1                                                      NaN
2        {'id': 119050, 'name': 'Grumpy Old Men Collect...
3                                                      NaN
4        {'id': 96871, 'name': 'Father of the Bride Col...
5                                                      NaN
6                                                      NaN
7                                                      NaN
8                                                      NaN
9        {'id': 645, 'name': 'James Bond Collection', '...
10                                                     NaN
11                                                     NaN
12       {'id': 117693, 'name': 'Balto Collection', 'po...
13                                                     NaN
14                                                     NaN
15                                                     NaN
16                                                     N

In [11]:
# Missing collections become the placeholder string '{}'.
metaData['belongs_to_collection'] = metaData['belongs_to_collection'].fillna(value='{}')

In [12]:
# Re-count missing values per column after filling belongs_to_collection.
metaData.isnull().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [13]:
# Inspect the homepage column.
metaData.loc[:, 'homepage']

0                     http://toystory.disney.com/toy-story
1                                                      NaN
2                                                      NaN
3                                                      NaN
4                                                      NaN
5                                                      NaN
6                                                      NaN
7                                                      NaN
8                                                      NaN
9             http://www.mgm.com/view/movie/757/Goldeneye/
10                                                     NaN
11                                                     NaN
12                                                     NaN
13                                                     NaN
14                                                     NaN
15                                                     NaN
16                                                     N

In [14]:
# Movies without a homepage get the placeholder string 'NA'.
metaData['homepage'] = metaData['homepage'].fillna(value='NA')

In [15]:
# Re-count missing values per column after filling homepage.
metaData.isnull().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                     0
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [16]:
def convert_int(x):
    """Convert ``x`` to an ``int``, returning ``np.nan`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a numeric string or a number.

    Returns
    -------
    int or float
        ``int(x)`` on success; ``np.nan`` if ``int(x)`` raises ``TypeError``,
        ``ValueError`` or ``OverflowError`` (for example ``None``, a
        non-numeric string, NaN or infinity).
    """
    try:
        return int(x)
    # Catch only conversion failures: a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [17]:
# Coerce every id through convert_int, then list the rows whose id came back
# as NaN, i.e. could not be converted.
metaData['id'] = metaData['id'].map(convert_int)
metaData[metaData['id'].isna()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[Carousel Productions, Vision View Entertainme...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...","[{'iso_3166_1': 'US', 'name': 'United States o...",,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[Odyssey Media, Pulser Productions, Rogue Stat...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [18]:
# Remove every row whose id could not be converted to an integer (NaN after
# convert_int). Selecting on the null mask instead of hard-coded index labels
# keeps the cell correct if the input file changes, and lets it be re-run:
# drop() raises KeyError once the listed labels are gone.
metaData = metaData.dropna(subset=['id'])

In [19]:
# Re-count missing values per column after removing the rows without a valid id.
metaData.isnull().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                     0
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25051
title                        3
video                        3
vote_average                 3
vote_count                   3
dtype: int64

In [20]:
# Keep only the rows that have both an imdb_id and a title.
metaData = metaData.dropna(subset=['imdb_id', 'title'])

In [21]:
# Re-count missing values per column after removing rows lacking imdb_id or title.
metaData.isnull().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                     0
id                           0
imdb_id                      0
original_language           11
original_title               0
overview                   952
popularity                   0
poster_path                378
production_companies         0
production_countries         0
release_date                81
revenue                      0
runtime                    255
spoken_languages             0
status                      81
tagline                  25035
title                        0
video                        0
vote_average                 0
vote_count                   0
dtype: int64

In [22]:
# Inspect the original_language column.
metaData.loc[:, 'original_language']

0        en
1        en
2        en
3        en
4        en
5        en
6        en
7        en
8        en
9        en
10       en
11       en
12       en
13       en
14       en
15       en
16       en
17       en
18       en
19       en
20       en
21       en
22       en
23       en
24       en
25       en
26       en
27       en
28       fr
29       zh
         ..
45436    en
45437    en
45438    nl
45439    en
45440    en
45441    en
45442    en
45443    fr
45444    fr
45445    en
45446    en
45447    fr
45448    fr
45449    en
45450    fr
45451    fr
45452    en
45453    hi
45454    en
45455    it
45456    en
45457    en
45458    en
45459    en
45460    en
45461    fa
45462    tl
45463    en
45464    en
45465    en
Name: original_language, Length: 45443, dtype: object

In [23]:
# Unknown original languages get the placeholder string 'NA'.
metaData['original_language'] = metaData['original_language'].fillna(value='NA')

In [24]:
# Re-count missing values per column after filling original_language.
metaData.isnull().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                     0
id                           0
imdb_id                      0
original_language            0
original_title               0
overview                   952
popularity                   0
poster_path                378
production_companies         0
production_countries         0
release_date                81
revenue                      0
runtime                    255
spoken_languages             0
status                      81
tagline                  25035
title                        0
video                        0
vote_average                 0
vote_count                   0
dtype: int64

In [25]:
# Missing free-text fields (overview, tagline) become empty strings.
metaData[['overview', 'tagline']] = metaData[['overview', 'tagline']].fillna('')

In [26]:
# Re-count missing values per column after filling overview and tagline.
metaData.isnull().sum()

adult                      0
belongs_to_collection      0
budget                     0
genres                     0
homepage                   0
id                         0
imdb_id                    0
original_language          0
original_title             0
overview                   0
popularity                 0
poster_path              378
production_companies       0
production_countries       0
release_date              81
revenue                    0
runtime                  255
spoken_languages           0
status                    81
tagline                    0
title                      0
video                      0
vote_average               0
vote_count                 0
dtype: int64

In [27]:
# Inspect the popularity column.
metaData.loc[:, 'popularity']

0         21.9469
1         17.0155
2         11.7129
3         3.85949
4         8.38752
5         17.9249
6         6.67728
7         2.56116
8         5.23158
9          14.686
10        6.31844
11        5.43033
12        12.1407
13          5.092
14        7.28448
15        10.1374
16        10.6732
17        9.02659
18        8.20545
19        7.33791
20        12.6696
21        10.7018
22        11.0659
23        12.1331
24         10.332
25         1.8459
26        8.68132
27        2.22843
28        9.82242
29        1.10091
           ...   
45436    1.270832
45437    20.82178
45438    0.590087
45439    0.143223
45440    0.767762
45441    4.392389
45442     0.21926
45443    1.618458
45444    0.208349
45445    0.148131
45446    0.725084
45447    0.213973
45448    0.071782
45449    2.568495
45450    1.109068
45451    0.225432
45452    0.011025
45453    1.559596
45454    0.139936
45455    0.225051
45456    0.222814
45457    0.076061
45458     0.38645
45459    0.661558
45460    5

In [28]:
# popularity is loaded as an object column, so convert it to real floats first
# (entries that cannot be parsed become NaN) and only then fill the gaps with a
# numeric zero. Filling with the string '0.0' would leave text in a column that
# represents a number.
metaData['popularity'] = pd.to_numeric(metaData['popularity'], errors='coerce').fillna(0.0)

In [29]:
# Re-count missing values per column after filling popularity.
metaData.isnull().sum()

adult                      0
belongs_to_collection      0
budget                     0
genres                     0
homepage                   0
id                         0
imdb_id                    0
original_language          0
original_title             0
overview                   0
popularity                 0
poster_path              378
production_companies       0
production_countries       0
release_date              81
revenue                    0
runtime                  255
spoken_languages           0
status                    81
tagline                    0
title                      0
video                      0
vote_average               0
vote_count                 0
dtype: int64

In [30]:
# Inspect the poster_path column.
metaData.loc[:, 'poster_path']

0        /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
1        /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
2        /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg
3        /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg
4        /e64sOI48hQXyru7naBFyssKFxVd.jpg
5        /zMyfPUelumio3tiDKPffaUpsQTD.jpg
6        /jQh15y5YB7bWz1NtffNZmRw0s9D.jpg
7        /sGO5Qa55p7wTu7FJcX4H4xIVKvS.jpg
8        /eoWvKD60lT95Ss1MYNgVExpo5iU.jpg
9        /5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg
10       /lymPNGLZgPHuqM29rKMGV46ANij.jpg
11       /xve4cgfYItnOhtzLYoTwTVy5FGr.jpg
12       /gV5PCAVCPNxlOLFM1bKk50EqLXO.jpg
13       /cICkmCEiXRhvZmbuAlsA5D9B2rK.jpg
14       /odM9973kIv9hcjfHPp6g6BlyTIJ.jpg
15       /xo517ibXBDdYQY81j0WIG7BVcWq.jpg
16       /lA9HTy84Bb6ZwNeyoZKobcMdpMc.jpg
17       /eQs5hh9rxrk1m4xHsIz1w11Ngqb.jpg
18       /wRlGnJhEzcxBjvWtvbjhDSU1cIY.jpg
19       /jSozzzVOR2kfXgTUuGnbgG2yRFi.jpg
20       /vWtDUUgQAsVyvRW4mE75LBgVm2e.jpg
21       /80czeJGSoik22fhtUM9WzyjUU4r.jpg
22       /xAx5MP7Dg4y85pyS7atX6eWk4Qd.jpg
23       /1uRKsxOCtgz0xVqs9l4hYtp4

In [31]:
# Movies without a poster get the placeholder string 'NA'.
metaData['poster_path'] = metaData['poster_path'].fillna(value='NA')

In [32]:
# Re-count missing values per column after filling poster_path.
metaData.isnull().sum()

adult                      0
belongs_to_collection      0
budget                     0
genres                     0
homepage                   0
id                         0
imdb_id                    0
original_language          0
original_title             0
overview                   0
popularity                 0
poster_path                0
production_companies       0
production_countries       0
release_date              81
revenue                    0
runtime                  255
spoken_languages           0
status                    81
tagline                    0
title                      0
video                      0
vote_average               0
vote_count                 0
dtype: int64

In [33]:
# Preview the first five rows after the cleaning steps so far.
metaData.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,{},65000000,"[Adventure, Fantasy, Family]",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602.0,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,{},16000000,"[Comedy, Drama, Romance]",,31357.0,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862.0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [34]:
# Fill the remaining gaps. release_date and status are text columns and get the
# 'NA' placeholder. runtime is numeric (float64), so it is filled with a float
# zero: filling it with the string '0.0' would turn the column into a mix of
# floats and strings.
metaData['release_date'] = metaData['release_date'].fillna('NA')
metaData['runtime'] = metaData['runtime'].fillna(0.0)
metaData['status'] = metaData['status'].fillna('NA')

In [35]:
# Final check: count the missing values left in each column.
metaData.isnull().sum()

adult                    0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
id                       0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
revenue                  0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
video                    0
vote_average             0
vote_count               0
dtype: int64

In [36]:
# (rows, columns) of the metadata frame after cleaning.
metaData.shape

(45443, 24)

In [45]:
# Persist the cleaned metadata as UTF-8 CSV. The row index is written as the
# first (unnamed) column because index defaults to True.
# NOTE(review): pandas.read_csv treats 'NA' and '' as missing by default, so the
# placeholders filled in above read back as NaN unless the reader passes
# keep_default_na=False -- confirm how downstream code loads this file.
metaData.to_csv(path_or_buf='test_data/metaDataPreprocessed.csv', encoding='utf-8')