# Movie Data ETL Pipeline - Transform Wikipedia Data

### Dependencies

In [2]:
%matplotlib inline

import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Kaggle data

Download `movies_metadata.csv` and `ratings.csv` from the TMDB's movie dataset at the link below. Move both files into the `data/raw/` directory.

Source: https://www.kaggle.com/rounakbanik/the-movies-dataset

In [4]:
# Path to data directory, relative to the notebook's working directory.
# Raw input files are read from the `raw/` sub-directory beneath it.
data_path = '../data/'

In [10]:
# Kaggle movie metadata
# low_memory=False makes pandas read the whole file before inferring dtypes
kmovies_df = pd.read_csv(data_path + 'raw/movies_metadata.csv', low_memory=False)

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output
kmovies_df.info()
kmovies_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [11]:
# Kaggle ratings data
kratings_df = pd.read_csv(data_path + 'raw/ratings.csv')

# Force the non-null counts to be shown for this very large frame.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0. info() prints its own report and
# returns None, so it is not wrapped in print().
kratings_df.info(show_counts=True)
kratings_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Non-Null Count     Dtype  
---  ------     --------------     -----  
 0   userId     26024289 non-null  int64  
 1   movieId    26024289 non-null  int64  
 2   rating     26024289 non-null  float64
 3   timestamp  26024289 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


### Wikipedia data

This data was scraped from the movie pages on [Wikipedia](https://en.wikipedia.org/).

In [12]:
# Wikipedia movie data, read directly into a dataframe
wmovies_df = pd.read_json(data_path + 'raw/wikipedia.movies.json')

# info() prints its report itself and returns None; print() would add a stray "None"
wmovies_df.info()
wmovies_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7311 entries, 0 to 7310
Columns: 193 entries, url to Polish
dtypes: float64(5), object(188)
memory usage: 10.8+ MB
None


Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Predecessor,Founders,Area served,Products,Services,Russian,Hebrew,Revenue,Operating income,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990.0,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990.0,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,,,,,,,,,,


In [21]:
# Print every column name in alphabetical order
print(sorted(wmovies_df.columns))

['Actor control', 'Adaptation by', 'Alias', 'Alma mater', 'Also known as', 'Animation by', 'Arabic', 'Area', 'Area served', 'Artist(s)', 'Attraction type', 'Audio format', 'Author', 'Based on', 'Biographical data', 'Bopomofo', 'Born', 'Box office', 'Budget', 'Camera setup', 'Cantonese', 'Characters', 'Children', 'Chinese', 'Cinematography', 'Closing date', 'Color process', 'Comics', 'Composer(s)', 'Coordinates', 'Country', 'Country of origin', 'Cover artist', 'Created by', 'Date premiered', 'Designer(s)', 'Developed by', 'Developer(s)', 'Dewey Decimal', 'Died', 'Directed by', 'Director', 'Distributed by', 'Distributor', 'Divisions', 'Duration', 'Edited by', 'Editor(s)', 'Ending theme', 'Engine', 'Engine(s)', 'Executive producer(s)', 'Family', 'Fate', 'Film(s)', 'Followed by', 'Format(s)', 'Formerly', 'Founded', 'Founder', 'Founders', 'French', 'Full name', 'Gender', 'Genre', 'Genre(s)', 'Genres', 'Gwoyeu Romatzyh', 'Hangul', 'Hanyu Pinyin', 'Headquarters', 'Hebrew', 'Height', 'Hepburn'

Since the Wikipedia data is a lot messier than the Kaggle data, reading it in as a Pandas dataframe resulted in 193 columns and a lot of null values. Instead, we'll read this data in as JSON and clean it up before converting it to a dataframe.

In [13]:
# Wikipedia movie data, loaded as a list of raw records (one dict per page).
# The encoding is given explicitly: the file contains non-ASCII titles
# (Arabic, Chinese, Hebrew, ...) and open() would otherwise fall back to the
# platform's locale encoding, which is not UTF-8 everywhere.
with open(data_path + 'raw/wikipedia.movies.json', 'r', encoding='utf-8') as f:
    wmovies = json.load(f)

print('Number of records:', len(wmovies))
print('Sample record:')
wmovies[0]

Number of records: 7311
Sample record:


{'url': 'https://en.wikipedia.org/wiki/The_Adventures_of_Ford_Fairlane',
 'year': 1990,
 'imdb_link': 'https://www.imdb.com/title/tt0098987/',
 'title': 'The Adventures of Ford Fairlane',
 'Directed by': 'Renny Harlin',
 'Produced by': ['Steve Perry', 'Joel Silver'],
 'Screenplay by': ['David Arnott', 'James Cappe', 'Daniel Waters'],
 'Story by': ['David Arnott', 'James Cappe'],
 'Based on': ['Characters', 'by Rex Weiner'],
 'Starring': ['Andrew Dice Clay',
  'Wayne Newton',
  'Priscilla Presley',
  'Lauren Holly',
  'Morris Day',
  'Robert Englund',
  "Ed O'Neill"],
 'Narrated by': 'Andrew "Dice" Clay',
 'Music by': ['Cliff Eidelman', 'Yello'],
 'Cinematography': 'Oliver Wood',
 'Edited by': 'Michael Tronick',
 'Productioncompany ': 'Silver Pictures',
 'Distributed by': '20th Century Fox',
 'Release date': ['July 11, 1990', '(', '1990-07-11', ')'],
 'Running time': '102 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$20 million',
 'Box office': '$21.4 milli

### Filter for movies

There seem to be TV shows mixed into the data. To filter for movies, records must have a value for:
1. `imdb_link`
2. `Directed by`/`Director`
3. `Duration`/`Length`/`Running time`

Records should also NOT have a value for:
1. `No. of seasons`
2. `No. of episodes`

In [17]:
# Filter out tv shows.
# A record is kept when it has at least one key from every required group
# and none of the keys that only TV shows carry.
required_key_groups = [
    ('imdb_link',),
    ('Directed by', 'Director'),
    ('Duration', 'Length', 'Running time'),
]
tv_show_keys = ('No. of seasons', 'No. of episodes')

wmovies_filtered = [
    movie for movie in wmovies
    if all(any(key in movie for key in group) for group in required_key_groups)
    and not any(key in movie for key in tv_show_keys)
]
len(wmovies_filtered)

6936

In [22]:
# Build a dataframe from the filtered records and report its dimensions
wmovies_filtered_df = pd.DataFrame(data=wmovies_filtered)
n_rows, n_cols = wmovies_filtered_df.shape
print((n_rows, n_cols))

# Alphabetically sorted column names, reused by the cells below
wmovies_cols = wmovies_filtered_df.columns.sort_values()
wmovies_cols

(6936, 75)


Index(['Adaptation by', 'Also known as', 'Animation by', 'Arabic',
       'Audio format', 'Based on', 'Box office', 'Budget', 'Cantonese',
       'Chinese', 'Cinematography', 'Color process', 'Composer(s)', 'Country',
       'Country of origin', 'Created by', 'Directed by', 'Director',
       'Distributed by', 'Distributor', 'Edited by', 'Editor(s)',
       'Executive producer(s)', 'Followed by', 'French', 'Genre', 'Hangul',
       'Hebrew', 'Hepburn', 'Japanese', 'Label', 'Language', 'Length',
       'Literally', 'Mandarin', 'McCune–Reischauer', 'Music by', 'Narrated by',
       'Original language(s)', 'Original network', 'Original release',
       'Original title', 'Picture format', 'Polish', 'Preceded by',
       'Produced by', 'Producer', 'Producer(s)', 'Production company(s)',
       'Production location(s)', 'Productioncompanies ', 'Productioncompany ',
       'Recorded', 'Release date', 'Released', 'Revised Romanization',
       'Romanized', 'Running time', 'Russian', 'Screen st

### Clean columns

Just by filtering out the TV shows, the data has been reduced to 75 columns. Let's take a look at a sample value for each column to get a better understanding of the data. Since there are a lot of missing values in most of the columns, the sample values will not all correspond to the same record.

In [23]:
# Show one example value per column
for col in wmovies_cols:
    # First entry of the column once the missing values are dropped
    non_null_values = wmovies_filtered_df[col].dropna()
    print(col, ':', non_null_values.iloc[0])

Adaptation by : ['John L. Balderston', 'Paul Perez', 'Daniel Moore']
Also known as : Detonator II: Night Watch
Animation by : ['Andreas Deja', 'Gary Dunn', 'Deboissy Sylvain']
Arabic : قضية رقم ٢٣
Audio format : Stereo
Based on : ['Characters', 'by Rex Weiner']
Box office : $21.4 million
Budget : $20 million
Cantonese : ['Jip', '6', 'Man', '6', 'Saam', '1']
Chinese : 摇滚藏獒
Cinematography : Oliver Wood
Color process : Technicolor
Composer(s) : Richard Bellis
Country : United States
Country of origin : United States
Created by : ['John William Corrington', '(novel)']
Directed by : Renny Harlin
Director : Mark "Aldo" Miceli
Distributed by : 20th Century Fox
Distributor : NBC
Edited by : Michael Tronick
Editor(s) : ['Christopher Cooke', 'James Galloway']
Executive producer(s) : Rich Melcombe
Followed by : See below
French : Le Cinquième Élément
Genre : Thriller
Hangul : 원더풀 데이즈
Hebrew : פוֹקְסטְרוֹט
Hepburn : Omoide no Mānī
Japanese : 思い出のマーニー
Label : ['Warner Music Vision', 'Warner-Reprise

We will be addressing a few things here:
1. There are a lot of columns holding alternate titles for the movies so we're going to group all of these together in the JSON data.
2. There are also a lot of redundant columns giving the same information, such as `Directed by` and `Director`. We'll be grouping these together as well.
3. The column names are inconsistent, so we will rename them for consistency.

In [34]:
# Keys holding alternate titles
# (raw record keys whose values are alternate or foreign-language titles;
# `clean_movie` moves them under a single 'alternate_titles' key)
title_keys = ['Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French', 
              'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally', 'Mandarin', 
              'McCune–Reischauer', 'Original title', 'Polish', 'Revised Romanization', 
              'Romanized', 'Russian', 'Simplified', 'Traditional', 'Yiddish']

# Key rename pairs (old name: new name)
# Several old names intentionally map to the same new name (for example both
# 'Directed by' and 'Director' become 'director') so that redundant columns
# are consolidated. The last four entries map already-clean keys to themselves.
keys_to_rename = {
    'Adaptation by': 'writers',
    'Animation by': 'animators',
    'Audio format': 'audio_format',
    'Based on': 'based_on',
    'Box office': 'box_office',
    'Budget': 'budget',
    'Cinematography': 'cinematographers',
    'Color process': 'color_process',
    'Composer(s)': 'composers',
    'Country': 'country',
    'Country of origin': 'country',
    'Created by': 'creators',
    'Directed by': 'director',
    'Director': 'director',
    'Distributed by': 'distributor',
    'Distributor': 'distributor',
    'Edited by': 'editors',
    'Editor(s)': 'editors',
    'Executive producer(s)': 'executive_producers',
    'Followed by': 'sequel',
    'Genre': 'genre',
    'Label': 'label',
    'Language': 'languages',
    'Length': 'duration',
    'Music by': 'composers',
    'Narrated by': 'narrator',
    'Original language(s)': 'languages', 
    'Original network': 'network', 
    'Original release': 'release_date',
    'Picture format': 'picture_format',
    'Preceded by': 'prequel',
    'Produced by': 'producers',
    'Producer': 'producers',
    'Producer(s)': 'producers',
    'Production company(s)': 'production_companies',
    'Production location(s)': 'production_locations',
    'Productioncompanies ': 'production_companies',
    'Productioncompany ': 'production_companies',
    'Recorded': 'recorded',
    'Release date': 'release_date',
    'Released': 'release_date',
    'Running time': 'duration',
    'Screen story by': 'writers',
    'Screenplay by': 'writers',
    'Starring': 'stars',
    'Story by': 'writers',
    'Suggested by': 'suggestors',
    'Theme music composer': 'composers',
    'Venue': 'venue',
    'Voices of': 'voicers',
    'Written by': 'writers',
    'imdb_link': 'imdb_link', 
    'title': 'title',
    'url': 'url', 
    'year': 'year'
}

# Number of alternate-title keys and number of rename pairs
len(title_keys), len(keys_to_rename)

(20, 55)

In [35]:
def clean_movie(movie_dict, title_keys=title_keys, keys_to_rename=keys_to_rename):
    
    """
    Clean movie dictionary with the following steps:
        [1] combine all alternate titles into a single key
        [2] rename keys for consistency and to consolidate similar columns into 1
    
    When several old keys of one record map to the same new key (e.g. both
    'Screenplay by' and 'Story by' map to 'writers'), their values are merged
    into one list without duplicates instead of the later key silently
    overwriting the earlier one. If the merge leaves a single distinct value,
    that value is stored on its own rather than in a one-element list.
    
    Args:
        [1] movie_dict (dict) - record to clean (not modified; a copy is cleaned)
        [2] title_keys (list[str]) - names of keys with the movie's alternate titles
        [3] keys_to_rename (dict{str: str}) - mapping of old key name to new key name
    
    Returns:
        (Dict) clean movie dictionary
    """
    
    def as_list(value):
        # Raw values are either a single item or a list of items
        return list(value) if isinstance(value, (list, tuple)) else [value]
    
    # Work on a shallow copy so the caller's record is left untouched
    cleaned = dict(movie_dict)
    
    # Move every alternate-title key into its own dict (snake_case key names)
    alt_titles = dict()
    for key in title_keys:
        if key in cleaned:
            alt_titles[key.lower().replace(' ', '_')] = cleaned.pop(key)
            
    # Add new key for alternate titles only when at least one was found
    if alt_titles:
        cleaned['alternate_titles'] = alt_titles
        
    # Rename keys, merging values whenever the target key is already present
    for old, new in keys_to_rename.items():
        if old not in cleaned:
            continue
        
        # Pop first so that identity pairs (old == new) never merge with themselves
        value = cleaned.pop(old)
        
        if new in cleaned:
            merged = as_list(cleaned[new])
            for item in as_list(value):
                if item not in merged:
                    merged.append(item)
            value = merged[0] if len(merged) == 1 else merged
            
        cleaned[new] = value
        
    return cleaned


# Test function
clean_movie(wmovies_filtered[849])

{'alternate_titles': {'mandarin': 'Xǐyàn', 'traditional': '喜宴'},
 'box_office': '$23.6 million',
 'budget': '$1 million',
 'cinematographers': 'Jong Lin',
 'country': ['Taiwan', 'United States'],
 'director': 'Ang Lee',
 'distributor': 'The Samuel Goldwyn Company',
 'editors': 'Tim Squyres',
 'languages': ['Mandarin Chinese', 'English'],
 'composers': 'Mader',
 'producers': ['Ang Lee', 'Ted Hope', 'James Schamus'],
 'production_companies': 'Good Machine',
 'release_date': ['4 August 1993', '(', '1993-08-04', ')', '(United States)'],
 'duration': '106 minutes',
 'stars': ['Ah-Leh Gua',
  'Sihung Lung',
  'May Chin',
  'Winston Chao',
  'Mitchell Lichtenstein'],
 'writers': ['Ang Lee', 'Neil Peng', 'James Schamus'],
 'imdb_link': 'https://www.imdb.com/title/tt0107156/',
 'title': 'The Wedding Banquet',
 'url': 'https://en.wikipedia.org/wiki/The_Wedding_Banquet',
 'year': 1993}

In [36]:
# Run every filtered record through clean_movie
wmovies_clean = list(map(clean_movie, wmovies_filtered))

# Build the dataframe from the cleaned records and report its dimensions
wmovies_clean_df = pd.DataFrame(data=wmovies_clean)
print(wmovies_clean_df.shape)

# Column names in alphabetical order
wmovies_clean_df.columns.sort_values()

(6936, 38)


Index(['alternate_titles', 'animators', 'audio_format', 'based_on',
       'box_office', 'budget', 'cinematographers', 'color_process',
       'composers', 'country', 'creators', 'director', 'distributor',
       'duration', 'editors', 'executive_producers', 'genre', 'imdb_link',
       'label', 'languages', 'narrator', 'network', 'picture_format',
       'prequel', 'producers', 'production_companies', 'production_locations',
       'recorded', 'release_date', 'sequel', 'stars', 'suggestors', 'title',
       'url', 'venue', 'voicers', 'writers', 'year'],
      dtype='object')

### Duplicated rows

In [15]:
# Make a copy of the df
# NOTE(review): `wiki_group_titles_df` is not defined by any cell above in this
# notebook, and the preceding section produces `wmovies_clean_df` with
# snake_case columns (e.g. 'imdb_link') rather than 'IMDB link'. This cell will
# raise NameError on Restart & Run All - confirm which frame is intended.
wiki_drop_dups_df = wiki_group_titles_df.copy()

# Inspect IMDB links
# (.loc slicing is label-based and inclusive, so this shows rows 0 through 5)
wiki_drop_dups_df.loc[:5, 'IMDB link']

0    https://www.imdb.com/title/tt0098987/
1    https://www.imdb.com/title/tt0098994/
2    https://www.imdb.com/title/tt0099005/
3    https://www.imdb.com/title/tt0099012/
4    https://www.imdb.com/title/tt0099018/
5    https://www.imdb.com/title/tt0099026/
Name: IMDB link, dtype: object

In [16]:
# Extract IMDB ID from the link
# IDs are "tt" followed by at least seven digits; newer titles have eight, so
# the pattern accepts seven or more instead of truncating after the seventh.
# expand=False returns a Series, which is what a single column assignment expects.
wiki_drop_dups_df['ID'] = wiki_drop_dups_df['IMDB link'].str.extract(r'(tt\d{7,})', expand=False)
wiki_drop_dups_df.head(2)

Unnamed: 0,Based on,Release date,Country,Budget,Box office,Cinematographer(s),Director,Distributor,Editor(s),Language(s),...,Suggestor(s),Alternate Titles,Released,Recorded,Venue,Label,Producer,Color process,Animator(s),ID
0,"[Characters, by Rex Weiner]","[July 11, 1990, (, 1990-07-11, )]",United States,$20 million,$21.4 million,Oliver Wood,Renny Harlin,20th Century Fox,Michael Tronick,English,...,,,,,,,,,,tt0098987
1,"[the novel, After Dark, My Sweet, by, Jim Thom...","[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",United States,$6 million,$2.7 million,Mark Plummer,James Foley,Avenue Pictures,Howard E. Smith,English,...,,,,,,,,,,tt0098994


In [17]:
# Count the rows whose ID repeats an earlier row
print(wiki_drop_dups_df.duplicated(subset=['ID']).sum())

# Show a few duplicated records side by side (every occurrence is flagged)
shares_an_id = wiki_drop_dups_df.duplicated(subset=['ID'], keep=False)
wiki_drop_dups_df[shares_an_id].sort_values('ID').head(4)

42


Unnamed: 0,Based on,Release date,Country,Budget,Box office,Cinematographer(s),Director,Distributor,Editor(s),Language(s),...,Suggestor(s),Alternate Titles,Released,Recorded,Venue,Label,Producer,Color process,Animator(s),ID
23,"[Characters, by, H. P. Lovecraft]","[September 8, 1990, (, 1990-09-08, ), (, TIFF,...",United States,,,Rick Fichter,Brian Yuzna,50th Street Films,Peter Teschner,English,...,,,,,,,,,,tt0099180
273,"[Characters, by, H. P. Lovecraft]","[September 8, 1990, (, 1990-09-08, ), (, TIFF,...",United States,,,Rick Fichter,Brian Yuzna,50th Street Films,Peter Teschner,English,...,,,,,,,,,,tt0099180
199,,"[3 October 1990, (, 1990-10-03, ), (Norway), 1...","[Norway, Sweden, United States]","[60 million, Norwegian Kroner]",$15.1 million,Erling Thurmann-Andersen,Nils Gaup,Buena Vista Pictures,Nils Pagh Andersen,English,...,,,,,,,,,,tt0099816
611,,"[3 October 1990, (, 1990-10-03, ), (Norway), 1...","[Norway, Sweden, United States]","[60 million, Norwegian Kroner]",$15.1 million,Erling Thurmann-Andersen,Nils Gaup,Buena Vista Pictures,Nils Pagh Andersen,English,...,,,,,,,,,,tt0099816


In [18]:
# Keep only the first record for each ID
wiki_drop_dups_df = wiki_drop_dups_df.drop_duplicates(subset=['ID'])
wiki_drop_dups_df.shape

(6894, 41)

### Columns with mostly missing values

In [19]:
# Fraction of missing values per column
cols_missing = wiki_drop_dups_df.isna().mean()

# Columns where at least 70% of the values are missing
cols_missing70 = cols_missing.loc[cols_missing.ge(0.7)]
cols_missing70

Narrator                  0.959240
Genre                     0.984624
Network                   0.982594
Executive producer(s)     0.986365
Production location(s)    0.993328
Picture format            0.991007
Audio format              0.991297
Sequel                    0.998695
Voicer(s)                 0.999710
Creator(s)                0.998549
Prequel                   0.998549
Suggestor(s)              0.999855
Alternate Titles          0.996954
Released                  0.999710
Recorded                  0.999710
Venue                     0.999855
Label                     0.999710
Producer                  0.999710
Color process             0.999855
Animator(s)               0.999710
dtype: float64

In [20]:
# Remove the mostly-empty columns, then review what is left
wiki_drop_cols_df = wiki_drop_dups_df.drop(columns=cols_missing70.index)
print(wiki_drop_cols_df.shape)

# Fraction of missing values in each remaining column
wiki_drop_cols_df.isna().mean()

(6894, 21)


Based on                 0.684798
Release date             0.003916
Country                  0.031912
Budget                   0.315202
Box office               0.208297
Cinematographer(s)       0.093560
Director                 0.000000
Distributor              0.043951
Editor(s)                0.070351
Language(s)              0.010299
Composer(s)              0.067740
Producer(s)              0.025529
Production company(s)    0.231651
Duration                 0.000000
Writer(s)                0.026690
Stars                    0.023934
IMDB link                0.000000
Title                    0.000000
URL                      0.000000
Year                     0.000000
ID                       0.000000
dtype: float64

### Column data types

In [21]:
# Inspect columns
# info() prints its report itself and returns None; print() would add a stray "None"
wiki_drop_cols_df.info()
wiki_drop_cols_df.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6894 entries, 0 to 6935
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Based on               2173 non-null   object
 1   Release date           6867 non-null   object
 2   Country                6674 non-null   object
 3   Budget                 4721 non-null   object
 4   Box office             5458 non-null   object
 5   Cinematographer(s)     6249 non-null   object
 6   Director               6894 non-null   object
 7   Distributor            6591 non-null   object
 8   Editor(s)              6409 non-null   object
 9   Language(s)            6823 non-null   object
 10  Composer(s)            6427 non-null   object
 11  Producer(s)            6718 non-null   object
 12  Production company(s)  5297 non-null   object
 13  Duration               6894 non-null   object
 14  Writer(s)              6710 non-null   object
 15  Stars                

Unnamed: 0,Based on,Release date,Country,Budget,Box office,Cinematographer(s),Director,Distributor,Editor(s),Language(s),...,Producer(s),Production company(s),Duration,Writer(s),Stars,IMDB link,Title,URL,Year,ID
0,"[Characters, by Rex Weiner]","[July 11, 1990, (, 1990-07-11, )]",United States,$20 million,$21.4 million,Oliver Wood,Renny Harlin,20th Century Fox,Michael Tronick,English,...,"[Steve Perry, Joel Silver]",Silver Pictures,102 minutes,"[David Arnott, James Cappe]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,tt0098987
1,"[the novel, After Dark, My Sweet, by, Jim Thom...","[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",United States,$6 million,$2.7 million,Mark Plummer,James Foley,Avenue Pictures,Howard E. Smith,English,...,"[Ric Kidney, Robert Redlin]",Avenue Pictures,114 minutes,"[James Foley, Robert Redlin]","[Jason Patric, Rachel Ward, Bruce Dern, George...",https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,tt0098994


In [22]:
# Columns that need to be converted (column name: target type)
# NOTE(review): no later cell in view reads this mapping; it appears to serve
# as a checklist for the conversion sections that follow - confirm.
cols_to_recast = {
    'Release date': 'datetime',
    'Budget': 'numeric',
    'Box office': 'numeric',
    'Duration': 'numeric'
}

### Convert release date to datetime type

In [23]:
def list_to_str(obj):
    
    '''
    Convert a list of strings to a single space-separated string
    
    Strings, floats (the NaN pandas uses for missing values, including numpy
    float scalars) and None are returned unchanged. An element that is itself
    a list of strings is joined in place, so one level of nesting is flattened.
    '''
    
    # Scalars pass through untouched
    if obj is None or isinstance(obj, (float, str)):
        return obj
    
    parts = []
    for element in obj:
        # Join nested lists so the outer join only ever sees strings
        parts.append(element if isinstance(element, str) else ' '.join(element))
        
    return ' '.join(parts)
    
#     # If object is a list, join list elements
#     if  type(obj) not in [float, str]:
#         s = ''
#         for e in obj:
#             if isinstance(e, str): # if element is string
#                 s += e + ' '
#             else: # if element is another list
#                 s += ' '.join(e) + ' '
#         return s
#     return obj


# Flatten list-valued release dates into plain strings
release_date_raw = wiki_drop_cols_df['Release date']
date_to_str = release_date_raw.apply(list_to_str)
print(len(date_to_str))

6894


In [24]:
# def standardize_date(d):
    
#     ''' Standardize a date string into the format YYYY-MM-DD '''
    
#     months = {
#         'January': '01',
#         'February': '02',
#         'March': '03',
#         'April': '04',
#         'May': '05',
#         'June': '06',
#         'July': '07',
#         'August': '08',
#         'September': '09',
#         'October': '10',
#         'November': '11',
#         'December': '12'
#     }
    
#     # Initialize date variables
#     YYYY = MM = DD = None
    
#     # For format: (Month) (DD), (YYYY)
#     match = re.search(r'([JFMASOND][a-z]{2,8}) (\d\d?), (\d{4})', d)
#     if match:
#         DD = match.group(2)
#         DD = '0' + DD if len(DD) == 1 else DD
#         MM = months[match.group(1)]
#         YYYY = match.group(3)

#     # For format: (DD) (Month) (YYYY)
#     match = re.search(r'(\d\d?) ([JFMASOND][a-z]{2,8}) (\d{4})', d)
#     if match:
#         DD = match.group(1)
#         DD = '0' + DD if len(DD) == 1 else DD
#         MM = months[match.group(2)]
#         YYYY = match.group(3)
        
#     # For format: (Month),? (YYYY)
#     match = re.search(r'([JFMASOND][a-z]{2,8}),? (\d{4})', d)
#     if match:
#         MM = months[match.group(1)]
#         YYYY = match.group(2)
       
#     # Standard format: YYYY-MM-DD
#     date = d if not YYYY else YYYY + '-' + MM 
#     return date if not DD else date + '-' + DD
    

# def extract_date(s):
    
#     '''
#     Extract a date from the release date strings
#     Note: In a range, select the left limit
#         Ex. December 15 - 20, 1999 becomes December 15, 1999
#         Ex. 1990 - 1992 becomes 1990
#     '''
    
#     # Null check
#     if pd.isnull(s):
#         return s
    
#     # Select left limit of ranges
#     s = re.sub(r' [-–—] \d\d?', '', s.strip()) # include hyphen, en dash, em dash
    
#     # For format: (DD)? (Month) (DD)?,? (YYYY)
#     formats = [r'(?:\d\d? )?[JFMASOND][a-z]{2,8}(?: \d\d?)?,? \d{4}']
    
#     # For format: (YYYY)-(MM)-(DD)
#     formats.append(r'\d{4}(?:\D\d\d?\D\d\d?)?')
    
#     for f in formats:
#         match = re.search(f, s)
#         if match:
#             return standardize_date(match.group(0))


# # Test functions
# dates_extracted, dates_not_extracted = [], []
# for s in date_to_str.tolist():
#     d = extract_date(s)
#     if d is None:
#         dates_not_extracted.append(s)
#     else:
#         dates_extracted.append(d)
# len(dates_extracted), len(dates_not_extracted)

In [25]:
# # Convert to datetime
# date_convert_to_dt = pd.to_datetime(date_to_str.apply(extract_date))
# date_convert_to_dt

In [26]:
# Keep the lower limit of date ranges by deleting the separator (hyphen,
# en dash or em dash, surrounded by spaces) plus the one or two digits after it
date_range_tail = r' [-–—] \d\d?'
date_stripped = date_to_str.str.strip()
date_replace_range = date_stripped.str.replace(date_range_tail, '', regex=True)

In [26]:
# Date formats
# format 1: optional day, month name, optional day, optional comma, 4-digit year
# format 2: 4-digit year, optionally followed by two separator-delimited numbers
date_format1 = r'(?:\d\d? )?[a-z]{3,9}(?: \d\d?)?,? \d{4}'
date_format2 = r'\d{4}(?:\D\d\d?\D\d\d?)?'

# List the non-null release dates that match neither format
date_non_null = date_replace_range.dropna()
date_contains1 = date_non_null.str.contains(date_format1, flags=re.IGNORECASE)
date_contains2 = date_non_null.str.contains(date_format2, flags=re.IGNORECASE)
date_non_null[~(date_contains1 | date_contains2)]

Series([], Name: Release date, dtype: object)

In [27]:
# Pull the first substring matching either date format out of each string
date_pattern = f'({date_format1}|{date_format2})'
date_clean_str = date_replace_range.str.extract(date_pattern, flags=re.IGNORECASE)[0]

# Parse the extracted strings into datetimes and count the missing results
date_to_dt = pd.to_datetime(date_clean_str, infer_datetime_format=True)
date_to_dt.isna().sum()

27

In [28]:
# New frame with the release date column swapped for its datetime version
wiki_recast_date_df = wiki_drop_cols_df.assign(**{'Release date': date_to_dt})
wiki_recast_date_df['Release date']

0      1990-07-11
1      1990-05-17
2      1990-08-10
3      1990-12-25
4      1990-12-19
          ...    
6931   2018-12-25
6932   2018-12-11
6933   2018-11-08
6934   2018-08-31
6935   2018-12-28
Name: Release date, Length: 6894, dtype: datetime64[ns]

### Convert budget to numeric

In [29]:
# Convert lists in budget to strings
budget_to_str = wiki_recast_date_df['Budget'].apply(list_to_str)

# Clean string and select lower limit of amount ranges:
#   1. drop bracketed notes made of word characters/whitespace, e.g. "[ 1 ]".
#      `\[[\w\s]*\]` matches exactly the same text as the previous
#      `\[\s*(?:\w+\s*)*\]` but has no nested quantifiers, so it cannot
#      backtrack exponentially on an unclosed "[".
#   2. drop the upper bound of a range: a dash, optional space/"$", and digits.
budget_replace_range = (
    budget_to_str.str.strip()
    .str.replace(r'\[[\w\s]*\]', '', regex=True)
    .str.replace(r'[-–—]\s?\$?\d+', '', regex=True)
)

In [30]:
# Budget formats
# format 1: amount followed by "mil", e.g. "$20 million"
# format 2: amount written out with comma-separated thousands, e.g. "$1,234,567"
budget_format1 = r'\$?\s?\d{1,3}(?:\.\d+)?\s*mil'
budget_format2 = r'\$?\s?\d{1,3}(?:,\d{3})+'

# Collect the distinct non-null values that match neither format
budget_non_null = budget_replace_range.dropna()
budget_contains1 = budget_non_null.str.contains(budget_format1, flags=re.IGNORECASE)
budget_contains2 = budget_non_null.str.contains(budget_format2, flags=re.IGNORECASE)
budget_to_na = budget_non_null[~(budget_contains1 | budget_contains2)].unique().tolist()
budget_to_na

['Unknown', 'HBO', '$218.32', 'N/A', '19 crore', '3.5 crore']

In [31]:
# Replace above values with NaN
# mask() blanks every entry found in budget_to_na in one vectorised step and
# returns a new Series, so there is no in-place mutation and no reliance on
# the `np.NaN` alias (removed in NumPy 2.0).
budget_replace_vals = budget_replace_range.mask(budget_replace_range.isin(budget_to_na))

# Verify that no remaining value fails both formats. The format masks were
# computed before the replacement, so they are aligned to the remaining rows
# explicitly instead of relying on implicit boolean reindexing.
budget_remaining = budget_replace_vals.dropna()
budget_unmatched = ~budget_contains1 & ~budget_contains2
budget_remaining[budget_unmatched.reindex(budget_remaining.index, fill_value=False)]

Series([], Name: Budget, dtype: object)

In [32]:
# Pull the first substring matching either budget format out of each string
budget_pattern = f'({budget_format1}|{budget_format2})'
budget_clean_str = budget_replace_vals.str.extract(budget_pattern, flags=re.IGNORECASE)[0]

In [33]:
def parse_budget(s):
    
    '''
    Turn a cleaned budget string into a float
    
    Floats (i.e. missing values) are handed back as they are. Amounts that
    contain "mil" are scaled by one million; anything else is read literally.
    '''
    
    # Missing values arrive as floats and need no parsing
    if isinstance(s, float):
        return s
    
    # Strip dollar signs, whitespace and commas, then normalise the case
    cleaned = re.sub(r'[\$\s,]', '', s).lower()
    
    # Plain number: no magnitude word to expand
    if 'mil' not in cleaned:
        return float(cleaned)
    
    return float(cleaned.replace('mil', '')) * 1e6


# Parse every extracted budget string and count the missing results
budget_to_float = budget_clean_str.apply(parse_budget)
budget_to_float.isna().sum()

2182

In [34]:
# New frame with the budget column swapped for its numeric version
wiki_recast_budget_df = wiki_recast_date_df.assign(Budget=budget_to_float)
wiki_recast_budget_df['Budget']

0       20000000.0
1        6000000.0
2       35000000.0
3       12000000.0
4       25000000.0
           ...    
6931    42000000.0
6932    60000000.0
6933    20000000.0
6934     9000000.0
6935           NaN
Name: Budget, Length: 6894, dtype: float64

### Convert box office to numeric

In [35]:
# Flatten list-valued box office entries into plain strings
boxoffice_raw = wiki_recast_budget_df['Box office']
boxoffice_to_str = boxoffice_raw.apply(list_to_str)

In [36]:
# Box office formats
# format 1: amount followed by a magnitude letter (k, m or b), e.g. "$21.4 million"
# format 2: amount written out in groups of three digits, e.g. "$57,718,089"
boxoffice_format1 = r'\$?\s?\d{1,3}(?:\.\d+)?\s*[kmb]'
boxoffice_format2 = r'\$?\s?\d{1,3}(?:[\s\.,]?\d{3})+\$?'

# Collect the distinct non-null values that match neither format
boxoffice_non_null = boxoffice_to_str.dropna()
boxoffice_contains1 = boxoffice_non_null.str.contains(boxoffice_format1, flags=re.IGNORECASE)
boxoffice_contains2 = boxoffice_non_null.str.contains(boxoffice_format2, flags=re.IGNORECASE)
boxoffice_to_na = boxoffice_non_null[~(boxoffice_contains1 | boxoffice_contains2)].unique().tolist()
boxoffice_to_na

['N/A',
 '$309',
 'TBA',
 '$20-30',
 '£2.56',
 'Unknown',
 '$588',
 'less than $372',
 '8 crore']

In [37]:
# Replace above values with NaN
# mask() blanks every entry found in boxoffice_to_na in one vectorised step
# and returns a new Series, so there is no in-place mutation and no reliance
# on the `np.NaN` alias (removed in NumPy 2.0).
boxoffice_replace_vals = boxoffice_to_str.mask(boxoffice_to_str.isin(boxoffice_to_na))

# Verify that no remaining value fails both formats. The format masks were
# computed before the replacement, so they are aligned to the remaining rows
# explicitly instead of relying on implicit boolean reindexing.
boxoffice_remaining = boxoffice_replace_vals.dropna()
boxoffice_unmatched = ~boxoffice_contains1 & ~boxoffice_contains2
boxoffice_remaining[boxoffice_unmatched.reindex(boxoffice_remaining.index, fill_value=False)]

Series([], Name: Box office, dtype: object)

In [38]:
# Pull the first substring matching either box office format out of each string
boxoffice_pattern = f'({boxoffice_format1}|{boxoffice_format2})'
boxoffice_clean_str = boxoffice_replace_vals.str.extract(boxoffice_pattern, flags=re.IGNORECASE)[0]

In [39]:
def parse_boxoffice(s):

    ''' Turn an extracted box office string into a float amount.

    Float inputs (the NaN placeholders for missing values) are returned
    unchanged. A k / m / b magnitude letter scales the number by 1e3 / 1e6 /
    1e9; without one, dots are treated as thousands separators.
    '''

    # Missing values arrive as floats; hand them straight back
    if isinstance(s, float):
        return s

    # Strip $, whitespace and commas, and normalise the case
    cleaned = re.sub(r'[\$\s,]', '', s).lower()

    # Magnitude letters, tested in the order k, m, b
    for suffix, scale in (('k', 1e3), ('m', 1e6), ('b', 1e9)):
        if suffix in cleaned:
            return float(cleaned.replace(suffix, '')) * scale

    # No magnitude letter: any dots left are digit-group separators
    return float(cleaned.replace('.', ''))


# Convert box office to float type, then count the values left missing
boxoffice_to_float = boxoffice_clean_str.apply(parse_boxoffice)
boxoffice_to_float.isna().sum()

1445

In [40]:
# Build a new frame whose Box office column holds the parsed float values
wiki_recast_boxoffice_df = wiki_recast_budget_df.assign(**{'Box office': boxoffice_to_float})
wiki_recast_boxoffice_df.loc[:, 'Box office']

0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
6931    41900000.0
6932    76100000.0
6933    38400000.0
6934     5500000.0
6935           NaN
Name: Box office, Length: 6894, dtype: float64

### Convert duration to numeric

In [41]:
# Pass every duration entry through list_to_str so list values become strings
duration_to_str = wiki_recast_budget_df.loc[:, 'Duration'].apply(list_to_str)

# Trim surrounding whitespace, remove single-digit bracketed markers such as "[1]",
# and cut off the upper bound of ranges so only the lower limit remains
duration_replace_range = (
    duration_to_str
    .str.strip()
    .str.replace(r'\[\d\]', '', regex=True)
    .str.replace(r'[-–—]\s?\d+', '', regex=True)
)
# To inspect the distinct cleaned values: duration_replace_range.unique().tolist()

In [42]:
# Duration formats
#   format1: optional single-digit hour part, then 1-3 digits of minutes followed by "m"
#   format2: a single-digit hour part on its own, or two 1-2 digit numbers joined by ":"
duration_format1 = r'(?:\d\s*[Hh]o?u?r?s?\s*)?\d{1,3}\s*[Mm]'
duration_format2 = r'\d\s*[Hh]o?u?r?s?|\d{1,2}\s*\:\s*\d{1,2}'

# For every non-null entry, flag which of the two formats it contains
duration_contains1 = duration_replace_range.dropna().str.contains(duration_format1, flags=re.IGNORECASE)
duration_contains2 = duration_replace_range.dropna().str.contains(duration_format2, flags=re.IGNORECASE)

# Distinct values that contain neither format
duration_to_na = duration_replace_range.dropna().loc[~(duration_contains1 | duration_contains2)].unique().tolist()
duration_to_na

['varies', 'minutes']

In [43]:
# Replace above values with NaN.
# One vectorised mask stands in for the per-value replace(..., inplace=True) loop and
# no longer needs the np.NaN alias, which NumPy 2.0 removed.
duration_replace_vals = duration_replace_range.mask(duration_replace_range.isin(duration_to_na))

# Sanity check: no remaining non-null value should fall outside both formats
duration_replace_vals.dropna()[~duration_contains1 & ~duration_contains2]

Series([], Name: Duration, dtype: object)

In [44]:
# Capture the first substring matching either format; the single capture group
# comes back as the only column of the extract result
duration_clean_str = duration_replace_vals.str.extract(
    f'({duration_format1}|{duration_format2})',
    flags=re.IGNORECASE,
).iloc[:, 0]
# To inspect the distinct extracted values: duration_clean_str.unique().tolist()

In [45]:
def parse_duration(s):

    ''' Turn an extracted duration string into a number of minutes.

    Float inputs (the NaN placeholders for missing values) are returned
    unchanged. Strings with an hour part ("1h 30 m", "2 hours") yield
    hours * 60 plus any minutes that follow; anything else is read as a
    plain minute count.
    '''

    # Missing values arrive as floats; hand them straight back
    if isinstance(s, float):
        return s

    # Drop a ":dd" part, then every "m" and all whitespace
    compact = re.sub(r'\:\s*\d{1,2}', '', s)
    compact = re.sub(r'm|\s*', '', compact, flags=re.IGNORECASE)

    hour_match = re.search(r'(\d)(ho?u?r?s?)(\d\d?)?', compact, flags=re.IGNORECASE)

    # No hour marker: what is left is the minute count
    if hour_match is None:
        return int(compact)

    # Hour marker present: one hour digit, optionally followed by up to two minute digits
    hours, _, minutes = hour_match.groups()
    return int(hours) * 60 + (int(minutes) if minutes else 0)


# Run parse_duration over every extracted duration string, then count the values left missing
duration_to_int = duration_clean_str.apply(parse_duration)
duration_to_int.isna().sum()

2

In [46]:
# Replace the Duration column with the parsed values
wiki_recast_duration_df = wiki_recast_boxoffice_df.copy()
wiki_recast_duration_df['Duration'] = duration_to_int

# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
wiki_recast_duration_df.info()
wiki_recast_duration_df.head(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6894 entries, 0 to 6935
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Based on               2173 non-null   object        
 1   Release date           6867 non-null   datetime64[ns]
 2   Country                6674 non-null   object        
 3   Budget                 4712 non-null   float64       
 4   Box office             5449 non-null   float64       
 5   Cinematographer(s)     6249 non-null   object        
 6   Director               6894 non-null   object        
 7   Distributor            6591 non-null   object        
 8   Editor(s)              6409 non-null   object        
 9   Language(s)            6823 non-null   object        
 10  Composer(s)            6427 non-null   object        
 11  Producer(s)            6718 non-null   object        
 12  Production company(s)  5297 non-null   object        
 13  Dur

Unnamed: 0,Based on,Release date,Country,Budget,Box office,Cinematographer(s),Director,Distributor,Editor(s),Language(s),...,Producer(s),Production company(s),Duration,Writer(s),Stars,IMDB link,Title,URL,Year,ID
0,"[Characters, by Rex Weiner]",1990-07-11,United States,20000000.0,21400000.0,Oliver Wood,Renny Harlin,20th Century Fox,Michael Tronick,English,...,"[Steve Perry, Joel Silver]",Silver Pictures,102.0,"[David Arnott, James Cappe]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,tt0098987
1,"[the novel, After Dark, My Sweet, by, Jim Thom...",1990-05-17,United States,6000000.0,2700000.0,Mark Plummer,James Foley,Avenue Pictures,Howard E. Smith,English,...,"[Ric Kidney, Robert Redlin]",Avenue Pictures,114.0,"[James Foley, Robert Redlin]","[Jason Patric, Rachel Ward, Bruce Dern, George...",https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,tt0098994


In [47]:
# Final Wikipedia table: an independent copy of the last recast frame
wiki_movies_df = wiki_recast_duration_df.copy(deep=True)
wiki_movies_df.shape

(6894, 21)

### Inspect Kaggle data

In [48]:
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
kaggle_movies.info()
kaggle_movies.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


### Duplicate rows

In [49]:
# Count rows whose imdb_id repeats an earlier row, then preview all members of the repeated groups
print(kaggle_movies.duplicated(subset='imdb_id').sum())
kaggle_movies.loc[kaggle_movies.duplicated(subset='imdb_id', keep=False)].sort_values(by='imdb_id').head()

48


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
16167,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,99080,tt0022537,en,The Viking,"Originally called White Thunder, American prod...",...,1931-06-21,0.0,70.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Actually produced during the Great Newfoundlan...,The Viking,False,0.0,0.0
38871,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,99080,tt0022537,en,The Viking,"Originally called White Thunder, American prod...",...,1931-06-21,0.0,70.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Actually produced during the Great Newfoundlan...,The Viking,False,0.0,0.0


In [50]:
# Number of non-null IMDB IDs that do not contain "tt" followed by seven digits
kaggle_movies['imdb_id'].notna().sum() - kaggle_movies['imdb_id'].dropna().str.contains(r'tt\d{7}').sum()

3

In [51]:
# Replace IMDB ID 0 with NaN.
# The result is assigned back to the column rather than calling replace(inplace=True)
# on the column selection: that chained in-place form does not update the frame when
# pandas copy-on-write is active. np.nan is used because NumPy 2.0 removed np.NaN.
kaggle_replace_id0_df = kaggle_movies.copy()
kaggle_replace_id0_df['imdb_id'] = kaggle_replace_id0_df['imdb_id'].replace('0', np.nan)

# Re-count the non-null IDs that still do not look like "tt" + seven digits
kaggle_replace_id0_df['imdb_id'].count() - kaggle_replace_id0_df['imdb_id'].dropna().str.contains(r'tt\d{7}').sum()

0

In [52]:
# Keep only the first row seen for each imdb_id, then confirm that no repeats remain
kaggle_drop_dups_df = kaggle_replace_id0_df.drop_duplicates(subset='imdb_id', keep='first')
kaggle_drop_dups_df.duplicated(subset='imdb_id').sum()

0

### Columns with mostly missing values

In [53]:
# Fraction of missing values in each column
kaggle_drop_dups_df.isna().mean()

adult                    0.000000
belongs_to_collection    0.901226
budget                   0.000000
genres                   0.000000
homepage                 0.828853
id                       0.000000
imdb_id                  0.000022
original_language        0.000242
original_title           0.000000
overview                 0.020961
popularity               0.000066
poster_path              0.008389
production_companies     0.000066
production_countries     0.000066
release_date             0.001850
revenue                  0.000066
runtime                  0.005681
spoken_languages         0.000066
status                   0.001850
tagline                  0.550873
title                    0.000066
video                    0.000066
vote_average             0.000066
vote_count               0.000066
dtype: float64

In [54]:
# Drop the two columns that are more than 70% missing
kaggle_drop_cols_df = kaggle_drop_dups_df.drop(columns=['belongs_to_collection', 'homepage'])
kaggle_drop_cols_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45417 entries, 0 to 45465
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 45417 non-null  object 
 1   budget                45417 non-null  object 
 2   genres                45417 non-null  object 
 3   id                    45417 non-null  object 
 4   imdb_id               45416 non-null  object 
 5   original_language     45406 non-null  object 
 6   original_title        45417 non-null  object 
 7   overview              44465 non-null  object 
 8   popularity            45414 non-null  object 
 9   poster_path           45036 non-null  object 
 10  production_companies  45414 non-null  object 
 11  production_countries  45414 non-null  object 
 12  release_date          45333 non-null  object 
 13  revenue               45414 non-null  float64
 14  runtime               45159 non-null  float64
 15  spoken_languages   

### Recast columns

In [55]:
# Preview the first two rows of the columns from position 9 onward
kaggle_drop_cols_df.iloc[:2, 9:]

Unnamed: 0,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [56]:
# Target type for each column that still needs recasting
cols_to_recast = dict(
    adult='boolean',
    budget='numeric',
    id='numeric',
    popularity='numeric',
    release_date='datetime',
    video='boolean',
)

### Convert adult and video to boolean type

In [57]:
# Frequency of each distinct value in the adult column
kaggle_drop_cols_df.loc[:, 'adult'].value_counts()

False    45408
True         9
Name: adult, dtype: int64

In [58]:
# Frequency of each distinct value in the video column
kaggle_drop_cols_df.loc[:, 'video'].value_counts()

False    45321
True        93
Name: video, dtype: int64

In [59]:
# Keep the rows whose adult value is not the string "True", then drop both flag columns
kaggle_drop_bool_df = (
    kaggle_drop_cols_df
    .query('adult != "True"')
    .drop(columns=['adult', 'video'])
    .copy()
)
kaggle_drop_bool_df.shape

(45408, 20)

### Convert release date to datetime type

In [60]:
# Build a new frame whose release_date column is parsed into datetimes
kaggle_recast_date_df = kaggle_drop_bool_df.assign(
    release_date=pd.to_datetime(kaggle_drop_bool_df['release_date'])
)
kaggle_recast_date_df.loc[:, 'release_date']

0       1995-10-30
1       1995-12-15
2       1995-12-22
3       1995-12-22
4       1995-02-10
           ...    
45461          NaT
45462   2011-11-17
45463   2003-08-01
45464   1917-10-21
45465   2017-06-09
Name: release_date, Length: 45408, dtype: datetime64[ns]

### Convert budget, ID, and popularity to numeric

In [61]:
# Recast budget and ID to integer and popularity to float in one pass
kaggle_recast_numeric_df = kaggle_recast_date_df.astype({
    'budget': int,
    'id': int,
    'popularity': float,
})
kaggle_recast_numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45408 entries, 0 to 45465
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   budget                45408 non-null  int64         
 1   genres                45408 non-null  object        
 2   id                    45408 non-null  int64         
 3   imdb_id               45407 non-null  object        
 4   original_language     45397 non-null  object        
 5   original_title        45408 non-null  object        
 6   overview              44456 non-null  object        
 7   popularity            45405 non-null  float64       
 8   poster_path           45027 non-null  object        
 9   production_companies  45405 non-null  object        
 10  production_countries  45405 non-null  object        
 11  release_date          45325 non-null  datetime64[ns]
 12  revenue               45405 non-null  float64       
 13  runtime         

In [62]:
# Final Kaggle metadata table: an independent copy of the last recast frame
kaggle_movies_df = kaggle_recast_numeric_df.copy(deep=True)
kaggle_movies_df.shape

(45408, 20)

### Inspect ratings data

In [63]:
# show_counts=True asks info() to include the non-null counts; it is the current
# name of the deprecated null_counts keyword. info() prints its own report and
# returns None, so it is called directly instead of through print(), which
# would add a stray "None" line to the output.
kaggle_ratings.info(show_counts=True)
kaggle_ratings.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Non-Null Count     Dtype  
---  ------     --------------     -----  
 0   userId     26024289 non-null  int64  
 1   movieId    26024289 non-null  int64  
 2   rating     26024289 non-null  float64
 3   timestamp  26024289 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


### Save data

In [66]:
# Write the Wikipedia table to disk, then read it back to confirm the round trip
wiki_movies_df.to_pickle(data_path + 'wiki_movies.pkl')
pd.read_pickle(data_path + 'wiki_movies.pkl').iloc[:2]

Unnamed: 0,Based on,Release date,Country,Budget,Box office,Cinematographer(s),Director,Distributor,Editor(s),Language(s),...,Producer(s),Production company(s),Duration,Writer(s),Stars,IMDB link,Title,URL,Year,ID
0,"[Characters, by Rex Weiner]",1990-07-11,United States,20000000.0,21400000.0,Oliver Wood,Renny Harlin,20th Century Fox,Michael Tronick,English,...,"[Steve Perry, Joel Silver]",Silver Pictures,102.0,"[David Arnott, James Cappe]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,tt0098987
1,"[the novel, After Dark, My Sweet, by, Jim Thom...",1990-05-17,United States,6000000.0,2700000.0,Mark Plummer,James Foley,Avenue Pictures,Howard E. Smith,English,...,"[Ric Kidney, Robert Redlin]",Avenue Pictures,114.0,"[James Foley, Robert Redlin]","[Jason Patric, Rachel Ward, Bruce Dern, George...",https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,tt0098994


In [67]:
# Write the Kaggle metadata table to disk, then read it back to confirm the round trip
kaggle_movies_df.to_pickle(data_path + 'kaggle_movies.pkl')
pd.read_pickle(data_path + 'kaggle_movies.pkl').iloc[:2]

Unnamed: 0,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
