In [1309]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import datetime
from datetime import timedelta, date
import random
from dateutil.relativedelta import relativedelta
import pickle
import operator

chromedriver = "/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [408]:
df_titles = pd.read_csv('list.csv', delimiter = ',', skipinitialspace = True)

In [4]:
df_titles.head()

Unnamed: 0,title,year
0,The Breadwinner,2017
1,The Mountain Between Us,2017
2,The Sense of an Ending,2017
3,The House of Tomorrow,2017
4,The Dark Tower,2017


In [1332]:
# import functions to scrape data from IMDB
from scrape_imdb import *

In [799]:
driver = webdriver.Chrome(chromedriver)
driver.get('http://www.google.com/')
time.sleep(10); 

In [612]:
dict_imdb = {}
for i in range(len(df_titles)):
    title = df_titles['title'][i]
    year = df_titles['year'][i]
    dict_imdb[title] = get_data_imdb(title,year)

In [615]:
dict_imdb

{'A Christmas Carol': ['A Christmas Carol',
  '2009',
  '2009-11-06',
  'PG',
  '6.8',
  200000000,
  '96',
  'Walt Disney Pictures',
  ['Animation', 'Drama', 'Family'],
  223175116.67],
 "A Dog's Purpose": ["A Dog's Purpose",
  '2017',
  '2017-01-27',
  'PG',
  '6.9',
  22000000,
  '100',
  'Amblin Entertainment',
  ['Adventure', 'Comedy', 'Drama'],
  nan],
 'A Hologram for the King': ['A Hologram for the King',
  '2016',
  '2016-04-28',
  'R',
  '6.1',
  30000000,
  '98',
  '',
  ['Comedy', 'Drama', 'Romance'],
  10278844.44],
 'A Long Way Down': ['A Long Way Down',
  '2014',
  '2014-06-05',
  'R',
  '6.4',
  nan,
  '96',
  'Wildgaze Films',
  ['Comedy', 'Drama'],
  nan],
 'A Monster Calls': ['A Monster Calls',
  '2016',
  '2017-01-06',
  'PG-13',
  '7.5',
  43000000,
  '108',
  'Apaches Entertainment',
  ['Animation', 'Drama', 'Fantasy'],
  nan],
 'A Walk Among the Tombstones': ['A Walk Among the Tombstones',
  '2014',
  '2014-09-19',
  'R',
  '6.5',
  28000000,
  '114',
  '',
  ['C

In [1470]:
df_imdb = pd.DataFrame.from_dict(dict_imdb, orient='index')

In [1471]:
df_imdb.columns = ['title_imdb','year','release_date','mpaa_rating','user_rating','budget','runtime','production_co','genre','avg_dir_gross']

In [1282]:
df_imdb.head()

Unnamed: 0,title_imdb,year,release_date,mpaa_rating,user_rating,budget,runtime,production_co,genre,avg_dir_gross
The Vow,The Vow,2012,2012-02-10,PG-13,6.8,30000000.0,104,Screen Gems,"[Drama, Romance]",144849200.0
The Lorax,The Lorax,2012,2012-03-02,PG,6.4,70000000.0,86,Universal Pictures,"[Animation, Adventure, Comedy]",343539900.0
The Hunger Games,The Hunger Games,2012,2012-03-23,PG-13,7.2,78000000.0,142,Lionsgate,"[Adventure, Sci-Fi, Thriller]",187713900.0
The Lucky One,The Lucky One,2012,2012-04-20,PG-13,6.5,25000000.0,101,Warner Bro,"[Drama, Romance]",37493930.0
Abraham Lincoln: Vampire Hunter,Abraham Lincoln: Vampire Hunter,2012,2012-06-22,R,5.9,69000000.0,105,Abraham Productions,"[Action, Fantasy, Horror]",49260560.0


In [1472]:
# create a set of genres since a set contains unique elements
genre_set = set({})
# loop through all the genre lists in the dataframe and add unique elements to the set 
for genre_list in df_imdb['genre']:
    genre_set = genre_set.union(set(genre_list))
genre_set

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Short',
 'Thriller',
 'War',
 'Western'}

In [1473]:
# loop through each genre value from the set to create a new col in the df and set values to T/F where the genre is in the film's genre list
for genre_val in genre_set:
    df_imdb[genre_val.lower()] = df_imdb['genre'].apply(lambda x: genre_val in x)

In [1474]:
# biography is a genre that should not be included in this dataset 
df_imdb = df_imdb[df_imdb['biography'] == False]
# drop biography, short, genre columns as they are no longer needed - only interested in fictional and full length films
# reset index and drop the new index column since it is a duplicate
df_imdb.reset_index(inplace=True)
df_imdb.rename(columns={"sci-fi":"scifi",'index':'title'},inplace=True)
df_imdb.drop(columns=['biography','genre','short'],inplace=True)
# music and musical are dupes so they should be combined
df_imdb['musical'] = ((df_imdb['musical']) | (df_imdb['music']))
df_imdb.drop(columns=['music'],inplace=True)

In [1475]:
# check where nulls are
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 27 columns):
title            219 non-null object
title_imdb       219 non-null object
year             219 non-null object
release_date     217 non-null object
mpaa_rating      215 non-null object
user_rating      217 non-null object
budget           181 non-null float64
runtime          218 non-null object
production_co    217 non-null object
avg_dir_gross    145 non-null float64
family           219 non-null bool
war              219 non-null bool
thriller         219 non-null bool
mystery          219 non-null bool
western          219 non-null bool
adventure        219 non-null bool
crime            219 non-null bool
drama            219 non-null bool
comedy           219 non-null bool
romance          219 non-null bool
fantasy          219 non-null bool
horror           219 non-null bool
history          219 non-null bool
action           219 non-null bool
musical          219 non-null boo

In [1476]:
# keep the records where there is a user rating and a release date
df_imdb = df_imdb[df_imdb['user_rating'].notnull()]
df_imdb = df_imdb[df_imdb['release_date'].notnull()]

In [1477]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 0 to 218
Data columns (total 27 columns):
title            216 non-null object
title_imdb       216 non-null object
year             216 non-null object
release_date     216 non-null object
mpaa_rating      214 non-null object
user_rating      216 non-null object
budget           181 non-null float64
runtime          216 non-null object
production_co    215 non-null object
avg_dir_gross    144 non-null float64
family           216 non-null bool
war              216 non-null bool
thriller         216 non-null bool
mystery          216 non-null bool
western          216 non-null bool
adventure        216 non-null bool
crime            216 non-null bool
drama            216 non-null bool
comedy           216 non-null bool
romance          216 non-null bool
fantasy          216 non-null bool
horror           216 non-null bool
history          216 non-null bool
action           216 non-null bool
musical          216 non-null boo

In [1478]:
# pickle the dataframe as a saved checkpoint
df_imdb.to_pickle('df_imdb.pkl')

In [1479]:
# find out where the other nulls are and whether they're valid or a result of scraping
df_imdb[df_imdb[['title_imdb','year','release_date','mpaa_rating','budget','runtime','production_co']].isnull().any(axis=1)][['title_imdb','year','release_date','mpaa_rating','user_rating','budget','runtime','production_co']]

Unnamed: 0,title_imdb,year,release_date,mpaa_rating,user_rating,budget,runtime,production_co
10,Being Flynn,2012,2012-04-19,R,6.4,,102,Focus Features
12,Anna Karenina,2012,2012-09-07,R,6.6,,129,Universal Pictures
13,Bel Ami,2012,2012-03-09,R,5.4,,102,Redwave Films
15,The Wind in the Willows,2006,2006-12-18,,6.5,,99,Box TV
16,Coriolanus,2011,2012-01-20,R,6.2,,123,Hermetof Pictures
18,Jane Eyre,2011,2011-04-22,PG-13,7.4,,120,Focus Features
28,Horrid Henry: The Movie,2011,2013-01-11,PG,3.7,,93,Vertigo Films
30,Radio Free Albemuth,2010,2014-06-27,R,5.7,3600000.0,111,
33,Salmon Fishing in the Yemen,2011,2012-04-20,PG-13,6.8,,107,UK Film Council
36,An Invisible Sign,2010,2010-10-07,PG-13,5.4,,96,


In [1587]:
# fill in some holes manually
df_imdb.set_value(30,'production_co','Open Pictures')
df_imdb.set_value(36,'production_co','J2 Pictures')
df_imdb.set_value(15,'mpaa_rating','PG')
df_imdb.set_value(172,'mpaa_rating','R')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


Unnamed: 0,title,title_imdb,year,release_date,mpaa_rating,user_rating,budget,runtime,production_co,avg_dir_gross,...,drama,comedy,romance,fantasy,horror,history,action,musical,animation,scifi
0,The Vow,The Vow,2012,2012-02-10,PG-13,6.8,30000000.0,104,Screen Gems,1.448492e+08,...,True,False,True,False,False,False,False,False,False,False
1,The Lorax,The Lorax,2012,2012-03-02,PG,6.4,70000000.0,86,Universal Pictures,3.435399e+08,...,False,True,False,False,False,False,False,False,True,False
2,The Hunger Games,The Hunger Games,2012,2012-03-23,PG-13,7.2,78000000.0,142,Lionsgate,1.877139e+08,...,False,False,False,False,False,False,False,False,False,True
3,The Lucky One,The Lucky One,2012,2012-04-20,PG-13,6.5,25000000.0,101,Warner Bro,3.749393e+07,...,True,False,True,False,False,False,False,False,False,False
4,Abraham Lincoln: Vampire Hunter,Abraham Lincoln: Vampire Hunter,2012,2012-06-22,R,5.9,69000000.0,105,Abraham Productions,4.926056e+07,...,False,False,False,True,True,False,True,False,False,False
5,Cloud Atlas,Cloud Atlas,2012,2012-10-26,R,7.5,102000000.0,172,Cloud Atlas Productions,1.027884e+07,...,True,False,False,False,False,False,True,False,False,False
6,Les Miserables,Les Misérables,2012,2012-12-25,PG-13,7.6,61000000.0,158,Working Title Films,8.636660e+07,...,True,False,False,False,False,True,False,True,False,False
7,Life of Pi,Life of Pi,2012,2012-11-21,PG,7.9,120000000.0,127,Fox,6.348579e+07,...,True,False,False,True,False,False,False,False,False,False
8,On the Road,On the Road,2012,2012-05-23,R,6.1,25000000.0,124,M,,...,True,False,True,False,False,False,False,False,False,False
9,Cosmopolis,Cosmopolis,2012,2012-05-25,R,5.0,20500000.0,109,Alfama Films,,...,True,False,False,True,False,False,False,False,False,False


In [1588]:
df_imdb[(df_imdb['production_co']=='') | (df_imdb['mpaa_rating']=='')]

Unnamed: 0,title,title_imdb,year,release_date,mpaa_rating,user_rating,budget,runtime,production_co,avg_dir_gross,...,drama,comedy,romance,fantasy,horror,history,action,musical,animation,scifi
58,The Romantics,The Romantics,2010,2010-05-26,PG-13,5.1,4500000.0,95,,,...,True,True,True,False,False,False,False,False,False,False
114,The Life Before Her Eyes,The Life Before Her Eyes,2007,2008-09-05,R,6.4,8000000.0,90,,,...,True,False,False,True,False,False,False,False,False,False
115,Miracle at St. Anna,Miracle at St. Anna,2008,2008-09-26,R,6.0,45000000.0,160,,33919790.0,...,True,False,False,False,False,False,True,False,False,False
119,What Just Happened?,What Just Happened,2008,2008-10-31,R,5.7,25000000.0,104,,79925700.0,...,True,True,False,False,False,False,False,False,False,False
143,This is Where I Leave You,This Is Where I Leave You,2014,2014-09-19,R,6.6,19800000.0,103,,133736700.0,...,True,True,False,False,False,False,False,False,False,False
147,A Walk Among the Tombstones,A Walk Among the Tombstones,2014,2014-09-19,R,6.5,28000000.0,114,,17868250.0,...,True,False,False,False,False,False,False,False,False,False
176,The Circle,The Circle,2017,2017-04-28,PG-13,5.3,18000000.0,110,,6564940.0,...,True,False,False,False,False,False,False,False,False,True
207,A Hologram for the King,A Hologram for the King,2016,2016-04-28,R,6.1,30000000.0,98,,10278840.0,...,True,True,True,False,False,False,False,False,False,False


In [1589]:
# fill in some holes manually
df_imdb.set_value(58,'production_co','10th Hole Productions')
df_imdb.set_value(114,'production_co','2929 Productions')
df_imdb.set_value(115,'production_co','40 Acres & A Mule Filmworks')
df_imdb.set_value(119,'production_co','2929 Productions')
df_imdb.set_value(143,'production_co','Warner Bro')
df_imdb.set_value(147,'production_co','1984 Private Defense Contractors')
df_imdb.set_value(176,'production_co','1978 Films')
df_imdb.set_value(207,'production_co','X-Filme Creative Pool')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':


Unnamed: 0,title,title_imdb,year,release_date,mpaa_rating,user_rating,budget,runtime,production_co,avg_dir_gross,...,drama,comedy,romance,fantasy,horror,history,action,musical,animation,scifi
0,The Vow,The Vow,2012,2012-02-10,PG-13,6.8,30000000.0,104,Screen Gems,1.448492e+08,...,True,False,True,False,False,False,False,False,False,False
1,The Lorax,The Lorax,2012,2012-03-02,PG,6.4,70000000.0,86,Universal Pictures,3.435399e+08,...,False,True,False,False,False,False,False,False,True,False
2,The Hunger Games,The Hunger Games,2012,2012-03-23,PG-13,7.2,78000000.0,142,Lionsgate,1.877139e+08,...,False,False,False,False,False,False,False,False,False,True
3,The Lucky One,The Lucky One,2012,2012-04-20,PG-13,6.5,25000000.0,101,Warner Bro,3.749393e+07,...,True,False,True,False,False,False,False,False,False,False
4,Abraham Lincoln: Vampire Hunter,Abraham Lincoln: Vampire Hunter,2012,2012-06-22,R,5.9,69000000.0,105,Abraham Productions,4.926056e+07,...,False,False,False,True,True,False,True,False,False,False
5,Cloud Atlas,Cloud Atlas,2012,2012-10-26,R,7.5,102000000.0,172,Cloud Atlas Productions,1.027884e+07,...,True,False,False,False,False,False,True,False,False,False
6,Les Miserables,Les Misérables,2012,2012-12-25,PG-13,7.6,61000000.0,158,Working Title Films,8.636660e+07,...,True,False,False,False,False,True,False,True,False,False
7,Life of Pi,Life of Pi,2012,2012-11-21,PG,7.9,120000000.0,127,Fox,6.348579e+07,...,True,False,False,True,False,False,False,False,False,False
8,On the Road,On the Road,2012,2012-05-23,R,6.1,25000000.0,124,M,,...,True,False,True,False,False,False,False,False,False,False
9,Cosmopolis,Cosmopolis,2012,2012-05-25,R,5.0,20500000.0,109,Alfama Films,,...,True,False,False,True,False,False,False,False,False,False


In [1590]:
df_imdb['user_rating'] = pd.to_numeric(df_imdb['user_rating'], downcast = 'float')
df_imdb['runtime'] = pd.to_numeric(df_imdb['runtime'], downcast = 'float')
df_imdb['release_date'] = pd.to_datetime(df_imdb['release_date'], format = '%Y-%m-%d', errors='ignore')

In [1591]:
# the nulls are significant for avg_dir_gross so they will be set to 0 and a boolean variable added to represent whether the directors are well-known
df_imdb.loc[(df_imdb['avg_dir_gross'].isnull()),'avg_dir_gross'] = 0
df_imdb['dir_accomplished'] = (df_imdb['avg_dir_gross']>0)

In [1592]:
df_imdb.reset_index(inplace=True,drop=True)

In [1543]:
df_imdb.head()

Unnamed: 0,title,title_imdb,year,release_date,mpaa_rating,user_rating,budget,runtime,production_co,avg_dir_gross,...,comedy,romance,fantasy,horror,history,action,musical,animation,scifi,dir_accomplished
0,The Vow,The Vow,2012,2012-02-10,PG-13,6.8,30000000.0,104.0,Screen Gems,144849200.0,...,False,True,False,False,False,False,False,False,False,True
1,The Lorax,The Lorax,2012,2012-03-02,PG,6.4,70000000.0,86.0,Universal Pictures,343539900.0,...,True,False,False,False,False,False,False,True,False,True
2,The Hunger Games,The Hunger Games,2012,2012-03-23,PG-13,7.2,78000000.0,142.0,Lionsgate,187713900.0,...,False,False,False,False,False,False,False,False,True,True
3,The Lucky One,The Lucky One,2012,2012-04-20,PG-13,6.5,25000000.0,101.0,Warner Bro,37493930.0,...,False,True,False,False,False,False,False,False,False,True
4,Abraham Lincoln: Vampire Hunter,Abraham Lincoln: Vampire Hunter,2012,2012-06-22,R,5.9,69000000.0,105.0,Abraham Productions,49260560.0,...,False,False,True,True,False,True,False,False,False,True


In [1593]:
# let's look at production company values
df_imdb.groupby(['production_co']).size().reset_index(name='Freq').sort_values(by=['Freq'],ascending=False)

Unnamed: 0,production_co,Freq
119,Warner Bro,12
111,Twentieth Century Fox,9
56,Focus Features,8
114,Universal Pictures,8
47,DreamWorks,7
105,Summit Entertainment,6
86,New Line Cinema,6
118,Walt Disney Pictures,6
57,Fox,5
39,Columbia Pictures,5


Looks like there are some different variations of the same production companies. These will need to be standardized.

In [1594]:
df_imdb.loc[df_imdb['production_co'].str.contains('Fox'), 'production_co'] = 'Fox'
df_imdb.loc[df_imdb['production_co'].str.contains('Warner'), 'production_co'] = 'Warner'
df_imdb.loc[df_imdb['production_co'].str.contains('Disney'), 'production_co'] = 'Disney'
df_imdb.loc[df_imdb['production_co'].str.contains('Universal'), 'production_co'] = 'Universal'
df_imdb.loc[df_imdb['production_co'].str.contains('DreamWorks'), 'production_co'] = 'DreamWorks'

In [1595]:
# tried making this an iterable or function but the regex wouldn't search properly
regex = r'Film$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Films$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Picture$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Pictures$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Feature$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Features$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Production$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Productions$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Media$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'Entertainment$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))
regex = r'See more$'
df_imdb['production_co'] = df_imdb['production_co'].apply(lambda x: re.sub(regex,'',x.strip()))

In [1596]:
# now let's look at production company after standardizing
df_company_freq = df_imdb.groupby(['production_co']).size().reset_index(name='Freq').sort_values(by=['Freq'],ascending=False)
df_company_freq

Unnamed: 0,production_co,Freq
55,Fox,21
111,Warner,12
46,DreamWorks,9
54,Focus,8
107,Universal,8
80,New Line Cinema,6
45,Disney,6
99,Summit,6
85,Paramount,5
37,Columbia,5


There are still so many unique production company values. Let's see if we can quantify these by market share.

In [2]:
driver = webdriver.Chrome(chromedriver)
driver.get('http://www.boxofficemojo.com/studio/?view=parent&view2=yearly&yr=2017')

In [8]:
df_company = pd.read_html(driver.current_url, header=0, parse_dates=True)[2]
df_company.columns = ['rank','company','market_share','total_gross','movies_tracked','2017_movies']
df_company['market_share'] = df_company['market_share'].apply(lambda x: x[:-1]).astype(float)
df_company[['company','market_share']].to_csv('company.csv',sep=',',index=False)

In [71]:
driver.get('http://www.boxofficemojo.com/studio/?view=company&view2=yearly&yr=2017')

In [79]:
df_company2 = pd.read_html(driver.current_url, header=0, parse_dates=True)[2]
df_company2.columns = ['rank','company','market_share','total_gross','movies_tracked','2017_movies']
df_company2['market_share'] = df_company2['market_share'].apply(lambda x: x[:-1]).astype(float)
df_company2[['company','market_share']].to_csv('company2.csv',sep=',',index=False)

In [1584]:
df_company_trim = pd.read_csv('company.csv',sep=',')

In [81]:
df_company_trim.head()

Unnamed: 0,company,market_share
0,Buena Vista,21.8
1,Columbia,9.6
2,Focus,1.2
3,Disney,21.8
4,Warner,18.4


In [1597]:
share_list = []
for val in df_imdb['production_co']:
    match = 0
    counter = 0
    max_counter = len(df_company_trim['company'])
    while (match==0) and (counter<max_counter):
        if df_company_trim['company'][counter].find(val) > -1:
            match = 1
        else:
            counter +=1
    try:
        share_list.append(df_company_trim['market_share'][counter])
    except:
        share_list.append(np.nan)

In [1598]:
df_imdb['market_share'] = share_list

In [1599]:
df_imdb[['production_co','market_share']].head()

Unnamed: 0,production_co,market_share
0,Screen Gems,
1,Universal,15.0
2,Lionsgate,8.0
3,Warner,18.4
4,Abraham,


In [1600]:
df_imdb.loc[df_imdb['market_share'].isnull(), 'market_share'] = 0

In [1601]:
# now production_co can be dropped
df_imdb.drop(columns=['production_co'],inplace=True)

In [1602]:
# make dummy variable for mpaa_rating
df_mpaa = pd.get_dummies(df_imdb['mpaa_rating'], prefix='mpaa', drop_first=True)

In [1603]:
df_imdb = df_imdb.join(df_mpaa)

In [1604]:
df_imdb.rename(columns={'mpaa_PG':'mpaa_pg','mpaa_PG-13':'mpaa_pg13','mpaa_R':'mpaa_r'},inplace=True)
df_imdb.drop(columns=['mpaa_rating','budget','release_date'],inplace=True)

In [1605]:
df_imdb.head()

Unnamed: 0,title,title_imdb,year,user_rating,runtime,avg_dir_gross,family,war,thriller,mystery,...,history,action,musical,animation,scifi,dir_accomplished,market_share,mpaa_pg,mpaa_pg13,mpaa_r
0,The Vow,The Vow,2012,6.8,104.0,144849200.0,False,False,False,False,...,False,False,False,False,False,True,0.0,0,1,0
1,The Lorax,The Lorax,2012,6.4,86.0,343539900.0,False,False,False,False,...,False,False,False,True,False,True,15.0,1,0,0
2,The Hunger Games,The Hunger Games,2012,7.2,142.0,187713900.0,False,False,True,False,...,False,False,False,False,True,True,8.0,0,1,0
3,The Lucky One,The Lucky One,2012,6.5,101.0,37493930.0,False,False,False,False,...,False,False,False,False,False,True,18.4,0,1,0
4,Abraham Lincoln: Vampire Hunter,Abraham Lincoln: Vampire Hunter,2012,5.9,105.0,49260560.0,False,False,False,False,...,False,True,False,False,False,True,0.0,0,0,1


In [1606]:
# pickle the dataframe as a saved checkpoint
df_imdb.to_pickle('df_imdb_clean.pkl')