# Data Science Boot Camp Project 1

## (1) Reading Input Data from Files

In [5]:
import numpy as np
import pandas as pd

In [2]:
#imports
import os
import json
import pyprind

#load_mojo_data reads every Box Office Mojo JSON data file
#and returns a list of dictionaries,
#one dictionary per movie
def load_mojo_data(baseDirectory = r"C:\Work\Training\DSBootCamp\Git\ct16_cap1_ds5\project_1"):
    """
    Load the Box Office Mojo records stored as JSON files.

    Every file whose name ends in ".json" inside
    <baseDirectory>/data/boxofficemojo is parsed. Files whose top-level JSON
    value is not an object are skipped. Any expected column missing from a
    record is added with the value np.nan, so all records share those keys.

    Input:  baseDirectory - project root that contains data/boxofficemojo.
                            The default is a raw string so the backslashes
                            can never be read as escape sequences.
    Output: List of Dictionary of Movie (one dictionary per kept file, in
            sorted file-name order).
    Raises: any exception hit while listing, opening or parsing is reported
            with print and then re-raised with its original traceback.
    """
    try:
        # Local import: the cell defining this function does not import numpy itself.
        import numpy as np
        print("Start Loading MOJO DATA:")
        data_dir = os.path.join(baseDirectory, 'data', 'boxofficemojo')

        # Keys that every returned record is guaranteed to contain.
        expected_columns = ("alt_title", "director", "domestic_gross", "mojo_slug",
                            "opening_per_theater", "opening_weekend_take",
                            "production_budget", "release_date_limited",
                            "release_date_wide", "title", "widest_release",
                            "worldwide_gross", "year")

        # endswith() rejects names such as "x.json.bak"; sorting makes the
        # result order independent of the order os.listdir happens to return.
        file_names = sorted(name for name in os.listdir(data_dir)
                            if name.endswith(".json"))
        movies = []
        bar = pyprind.ProgBar(len(file_names), bar_char='X')

        for position, file_name in enumerate(file_names, 1):
            with open(os.path.join(data_dir, file_name), 'r') as source:
                movie = json.load(source)

            if isinstance(movie, dict):
                for column in expected_columns:
                    movie.setdefault(column, np.nan)
                movies.append(movie)

            # Advance the bar for skipped files too, so it always reaches 100%.
            bar.update(item_id = position)

        print("Finish Loading MOJO DATA:")
        return movies
    except Exception as ex:
        print("Error loading MOJO data: " + " ".join([str(x) for x in ex.args]))
        # Bare raise keeps the original traceback.
        raise

def load_critic_data(baseDirectory = r"C:\Work\Training\DSBootCamp\Git\ct16_cap1_ds5\project_1"):
    """
    Load the Metacritic records stored as JSON files.

    Every file whose name ends in ".json" inside
    <baseDirectory>/data/metacritic is parsed. Files whose top-level JSON
    value is not an object are skipped. Any expected column missing from a
    record is added with the value np.nan, so all records share those keys.

    Input:  baseDirectory - project root that contains data/metacritic.
                            The default is a raw string so the backslashes
                            can never be read as escape sequences.
    Output: List of Dictionary of Movie (one dictionary per kept file, in
            sorted file-name order).
    Raises: any exception hit while listing, opening or parsing is reported
            with print and then re-raised with its original traceback.
    """
    # Local import: the cell defining this function does not import numpy itself.
    import numpy as np
    try:
        print("Start Loading CRITIC DATA:")
        data_dir = os.path.join(baseDirectory, 'data', 'metacritic')

        # Keys that every returned record is guaranteed to contain.
        expected_columns = ("complete", "director", "genre", "metacritic_page",
                            "metascore", "num_critic_reviews", "num_user_ratings",
                            "num_user_reviews", "rating", "release_date",
                            "runtime_minutes", "studio", "title", "user_score",
                            "year")

        # endswith() rejects names such as "x.json.bak"; sorting makes the
        # result order independent of the order os.listdir happens to return.
        file_names = sorted(name for name in os.listdir(data_dir)
                            if name.endswith(".json"))
        critics = []
        bar = pyprind.ProgBar(len(file_names), bar_char='X')

        for position, file_name in enumerate(file_names, 1):
            with open(os.path.join(data_dir, file_name), 'r') as source:
                critic = json.load(source)

            if isinstance(critic, dict):
                for column in expected_columns:
                    critic.setdefault(column, np.nan)
                critics.append(critic)

            # Advance the bar for skipped files too, so it always reaches 100%.
            bar.update(item_id = position)

        print("Finish Loading CRITIC DATA:")
        return critics
    except Exception as ex:
        print("Error loading CRITIC data: " + " ".join([str(x) for x in ex.args]))
        # Bare raise keeps the original traceback.
        raise

In [6]:
# Read the Box Office Mojo records and build one DataFrame row per record.
movie_dicts = load_mojo_data()
movie_df_raw = pd.DataFrame(data=movie_dicts)

# Preview the first rows, then report (rows, columns).
print(movie_df_raw.head(n=5))
print(movie_df_raw.shape)

0%                          100%
[XX                            ] | ETA[sec]: 1.942 | Item ID: 249

Start Loading MOJO DATA:


[XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX] | ETA[sec]: 0.000 | Item ID: 3728

Finish Loading MOJO DATA:
                           alt_title         director  domestic_gross  \
0                 10,000 B.C. (2008)  Roland Emmerich      94784201.0   
1              102 Dalmatians (2000)             None      66957026.0   
2  10 Things I Hate About You (1999)             None      38178166.0   
3                    10 Years (2012)     Jamie Linden        203373.0   
4                  11 Flowers (2013)             None          9213.0   

               mojo_slug  opening_per_theater  opening_weekend_take  \
0                10000bc              10518.0            35867488.0   
1          102dalmatians               7353.0            19883351.0   
2  10thingsihateaboutyou               3668.0             8330681.0   
3                10years               7569.0               22707.0   
4              11flowers               1758.0                3516.0   

   production_budget release_date_limited release_date_wide  \
0        105000000.0                 None    


Total time elapsed: 1.669 sec


In [13]:
# Dimensions of the raw movie frame, followed by the missing-value count per column.
print(movie_df_raw.shape)
print(movie_df_raw.isnull().sum(axis=0))

(3728, 13)
alt_title                  0
director                1556
domestic_gross            75
mojo_slug                  0
opening_per_theater      180
opening_weekend_take     180
production_budget       2137
release_date_limited    3435
release_date_wide         10
title                      0
widest_release           152
worldwide_gross          907
year                       7
dtype: int64


In [7]:
# Read the Metacritic records and build one DataFrame row per record.
critic_dicts = load_critic_data()
critic_df_raw = pd.DataFrame(data=critic_dicts)

# Preview the first rows, then report (rows, columns).
print(critic_df_raw.head(n=5))
print(critic_df_raw.shape)

0%                          100%
[                              ]

Start Loading CRITIC DATA:


[XXXXXXXXXXXXXXXXXXXXXXXXXXXXX ] | ETA[sec]: 0.744 | Item ID: 4610

Finish Loading CRITIC DATA:
  complete         director                        genre  \
0     True       Gil Junger            [Comedy, Romance]   
1     True     Jamie Linden     [Drama, Comedy, Romance]   
2     True  Roland Emmerich  [Adventure, Drama, Fantasy]   
3     True    Stephen Herek  [Adventure, Comedy, Family]   
4     True       Kevin Lima             [Comedy, Family]   

                     metacritic_page  metascore num_critic_reviews  \
0  /movie/10-things-i-hate-about-you       70.0     [18, 7, 1, 26]   
1                    /movie/10-years       61.0     [7, 11, 0, 18]   
2                    /movie/10000-bc       34.0     [4, 8, 17, 29]   
3              /movie/101-dalmatians       49.0      [8, 9, 3, 20]   
4              /movie/102-dalmatians       35.0     [7, 5, 12, 24]   

   num_user_ratings   num_user_reviews rating release_date  runtime_minutes  \
0             175.0     [27, 1, 1, 29]  PG-13   1999-03-31             97.0   
1              12.0       [3, 2,

In [12]:
# Dimensions of the raw critic frame, followed by the missing-value count per column.
print(critic_df_raw.shape)
print(critic_df_raw.isnull().sum(axis=0))

(4765, 16)
complete                 2
director                 2
genre                    2
metacritic_page          2
metascore              177
num_critic_reviews       2
num_user_ratings       284
num_user_reviews         2
rating                   2
release_date             2
runtime_minutes         97
studio                  40
title                    2
unable to retrieve    4763
user_score               2
year                     2
dtype: int64


In [17]:
# Keep only the movies present in both sources, matched on (year, title).
rawDataDF = pd.merge(movie_df_raw, critic_df_raw, how="inner", on=["year", "title"])

In [18]:
# Preview the first rows of the merged frame.
print(rawDataDF.head(n=5))

                           alt_title    director_x  domestic_gross  \
0  10 Things I Hate About You (1999)          None      38178166.0   
1                    10 Years (2012)  Jamie Linden        203373.0   
2               The 11th Hour (2007)          None        707343.0   
3                   127 Hours (2010)   Danny Boyle      18335230.0   
4                   12 Rounds (2009)  Renny Harlin      12234694.0   

               mojo_slug  opening_per_theater  opening_weekend_take  \
0  10thingsihateaboutyou               3668.0             8330681.0   
1                10years               7569.0               22707.0   
2               11thhour              15213.0               60853.0   
3               127hours               2333.0             2136801.0   
4               12rounds               2286.0             5329240.0   

   production_budget release_date_limited release_date_wide  \
0         30000000.0                 None        1999-03-31   
1                NaN      

In [19]:
# Dimensions of the merged frame.
print(rawDataDF.shape)

# Missing-value count per column after the merge.
print(rawDataDF.isnull().sum(axis=0))

(2381, 27)
alt_title                  0
director_x               866
domestic_gross            22
mojo_slug                  0
opening_per_theater       47
opening_weekend_take      47
production_budget       1204
release_date_limited    2178
release_date_wide          2
title                      0
widest_release            38
worldwide_gross          329
year                       0
complete                   0
director_y                 0
genre                      0
metacritic_page            0
metascore                 41
num_critic_reviews         0
num_user_ratings          82
num_user_reviews           0
rating                     0
release_date               0
runtime_minutes           41
studio                     8
unable to retrieve      2381
user_score                 0
dtype: int64


In [21]:
import pprint
# Distinct director names taken from the critic-side column of the merged frame.
DirectorList = rawDataDF["director_y"].unique()

# Print one name per line.
for director in DirectorList:
    print(director)

Gil Junger
Jamie Linden
Leila Conners
Danny Boyle
Renny Harlin
Gary Winick
Daniel Stamm
Géla Babluani
Mikael Håfström
Richard Donner
Burr Steers
Roland Emmerich
Kar Wai Wong
Morgan Neville
Robert Luketic
Jon Lucas
Alejandro González Iñárritu
Chris Miller
Spike Lee
Anne Fletcher
Betty Thomas
Juan Carlos Fresnadillo
Julie Delpy
John Singleton
Baltasar Kormákur
Zack Snyder
Demian Lichtenstein
Alexis Lloyd
David Slade
Ruben Fleischer
Claire Denis
Fernando Meirelles
Benoît Jacquot
Brian Helgeland
Bill Ross IV
Carl Rinsch
Marc Webb
Peter Segal
Michael Apted
Rick Bieber
Blayne Weaver
Yann Demange
Jon Avnet
Curtis Hanson
Joel Schumacher
Shane Acker
Aharon Keshales
Chris Weitz
Stephen Elliott
Alexander Payne
Richard Curtis
James Cameron
Kat Coiro
Steve Pink
Julie Taymor
Mike McCoy
Max Mayer
Spike Jonze
Bille Woodruff
George Nolfi
Paul Weitz
Anne Fontaine
Scott Coffey
Greg Mottola
Yael Hersonski
Jill Soloway
Harald Zwart
Alejandro Amenábar
Tobias Lindholm
David Lowery
Wolfgang Petersen
Jieho Lee