In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import re
import cpi

import plotly.offline
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected = True)


In [2]:
# source of the data https://www.kaggle.com/c/tmdb-box-office-prediction/data

train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# train_df.head()


In [3]:
cols = ['original_title', 'title', 'belongs_to_collection', 'genres', 'homepage', 'original_language', 'production_companies', 'runtime', 'release_date', 'budget', 'revenue']
df = train_df[cols]
df.head()


Unnamed: 0,original_title,title,belongs_to_collection,genres,homepage,original_language,production_companies,runtime,release_date,budget,revenue
0,Hot Tub Time Machine 2,Hot Tub Time Machine 2,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...","[{'id': 35, 'name': 'Comedy'}]",,en,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...",93.0,2/20/15,14000000,12314651
1,The Princess Diaries 2: Royal Engagement,The Princess Diaries 2: Royal Engagement,"[{'id': 107674, 'name': 'The Princess Diaries ...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,en,"[{'name': 'Walt Disney Pictures', 'id': 2}]",113.0,8/6/04,40000000,95149435
2,Whiplash,Whiplash,,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,en,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...",105.0,10/10/14,3300000,13092000
3,Kahaani,Kahaani,,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,hi,,122.0,3/9/12,1200000,16000000
4,마린보이,Marine Boy,,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,ko,,118.0,2/5/09,0,3923970


In [4]:
def col_string_extraction(col_name):

    col_name_list = []

    for i in range(df.shape[0]):
        st = df[col_name].iloc[i]
        # some movies have production compnaies as null
        if type(st) != str:
            col_name_list.append(['NaN'])
        
        else:
            subs = re.findall("\{(.*?)\}", str(st))

            col = []
            for j in range(len(subs)):
                extracted = re.findall("\: '(.*?)\'", subs[j])
                # some entries do not fit in the regex above
                if len(extracted) != 0:
                    col.append(extracted[0])
                else:
                    continue

            col_name_list.append(col)
    
    return col_name_list


In [5]:
production_company_list = col_string_extraction('production_companies')
genre_list = col_string_extraction('genres')
collection_list = col_string_extraction('belongs_to_collection')
flat_collection_list = [i for sub in collection_list for i in sub]

df.insert(2, 'genre', genre_list)
df.insert(3, 'collection', flat_collection_list)
df.insert(4, 'production_company', production_company_list)

df = df.drop(['genres', 'production_companies', 'belongs_to_collection'], axis = 1)
df.head()


Unnamed: 0,original_title,title,genre,collection,production_company,homepage,original_language,runtime,release_date,budget,revenue
0,Hot Tub Time Machine 2,Hot Tub Time Machine 2,[Comedy],Hot Tub Time Machine Collection,"[Paramount Pictures, United Artists, Metro-Gol...",,en,93.0,2/20/15,14000000,12314651
1,The Princess Diaries 2: Royal Engagement,The Princess Diaries 2: Royal Engagement,"[Comedy, Drama, Family, Romance]",The Princess Diaries Collection,[Walt Disney Pictures],,en,113.0,8/6/04,40000000,95149435
2,Whiplash,Whiplash,[Drama],,"[Bold Films, Blumhouse Productions, Right of W...",http://sonyclassics.com/whiplash/,en,105.0,10/10/14,3300000,13092000
3,Kahaani,Kahaani,"[Thriller, Drama]",,[NaN],http://kahaanithefilm.com/,hi,122.0,3/9/12,1200000,16000000
4,마린보이,Marine Boy,"[Action, Thriller]",,[NaN],,ko,118.0,2/5/09,0,3923970


In [6]:
# extract release year and store in the following format xxxx

release_year = df['release_date'].apply(lambda x: x.split('/')[-1])

release_year_adj = []

for i in range(len(release_year)):
    if 0 <= int(release_year[i]) <= 20:
        year = str(20) + str(release_year[i])
        release_year_adj.append(year)
    else:
        year = str(19) + str(release_year[i])
        release_year_adj.append(year)
        
df.insert(8, 'release_year', release_year_adj)

# adjustment for revenue 
adjusted_revenue = []
for i in range(df.shape[0]):
    value = cpi.inflate(df['revenue'].iloc[i], int(df['release_year'].iloc[i]), to = 2018)
    adjusted_revenue.append(int(value))
    
df.insert(10, 'adjusted_revenue', adjusted_revenue)
 

sequals - revenue, release_date
language - revenue
production company - revenue, date



### Disney

In [7]:
# movies produced by walt disney 

disney_movies = []

for i in range(df.shape[0]):
    if 'Walt Disney Pictures' in df['production_company'].iloc[i]:
        name = df['original_title'].iloc[i]
        disney_movies.append(name)
        

In [8]:
disney_df = df[df['original_title'].isin(disney_movies)]
disney_df


Unnamed: 0,original_title,title,genre,collection,production_company,homepage,original_language,runtime,release_year,release_date,adjusted_revenue,budget,revenue
1,The Princess Diaries 2: Royal Engagement,The Princess Diaries 2: Royal Engagement,"[Comedy, Drama, Family, Romance]",The Princess Diaries Collection,[Walt Disney Pictures],,en,113.0,2004,8/6/04,126483267,40000000,95149435
8,Muppet Treasure Island,Muppet Treasure Island,"[Action, Comedy, Music, Family, Adventure]",The Muppet Collection,"[Walt Disney Pictures, Jim Henson Productions,...",,en,100.0,1996,2/16/96,54938484,0,34327391
121,Tarzan,Tarzan,"[Adventure, Animation, Drama, Family]",Tarzan (Animation) Collection,[Walt Disney Pictures],http://disneydvd.disney.go.com/tarzanr-special...,en,88.0,1999,6/18/99,675245714,150000000,448000000
155,Return to Never Land,Return to Never Land,"[Adventure, Fantasy, Animation, Family]",Peter Pan Collection,"[Walt Disney Pictures, Walt Disney Television ...",,en,72.0,2002,2/14/02,153347907,20000000,109862682
188,WALL·E,WALL·E,"[Animation, Family]",,"[Walt Disney Pictures, Pixar Animation Studios...",http://disney.go.com/disneypictures/wall-e/,en,98.0,2008,6/22/08,608003870,180000000,521311860
309,Freaky Friday,Freaky Friday,[Comedy],,"[Walt Disney Pictures, Gunn Films, Casual Frid...",,en,97.0,2003,8/3/03,150432652,26000000,110230332
314,Cars 3,Cars 3,"[Family, Comedy, Animation, Adventure]",Cars Collection,"[Walt Disney Pictures, Pixar Animation Studios]",http://cars.disney.com,en,109.0,2017,6/15/17,358722880,175000000,350170057
335,Million Dollar Arm,Million Dollar Arm,[Drama],,[Walt Disney Pictures],http://movies.disney.com/million-dollar-arm,en,124.0,2014,5/9/14,40633081,25000000,38307627
357,The Lion King,The Lion King,"[Family, Animation, Drama]",The Lion King Collection,"[Walt Disney Pictures, Walt Disney Feature Ani...",http://movies.disney.com/the-lion-king,en,89.0,1994,6/23/94,1335580483,45000000,788241776
461,The Game Plan,The Game Plan,"[Comedy, Family]",,"[Walt Disney Pictures, Mayhem Pictures, Monkey...",http://movies.disney.com/the-game-plan,en,110.0,2007,9/28/07,179094633,22000000,147880543


In [9]:
disney_sequel_df = disney_df[disney_df['collection'] != 'NaN']
disney_sequel_df.head()


Unnamed: 0,original_title,title,genre,collection,production_company,homepage,original_language,runtime,release_year,release_date,adjusted_revenue,budget,revenue
1,The Princess Diaries 2: Royal Engagement,The Princess Diaries 2: Royal Engagement,"[Comedy, Drama, Family, Romance]",The Princess Diaries Collection,[Walt Disney Pictures],,en,113.0,2004,8/6/04,126483267,40000000,95149435
8,Muppet Treasure Island,Muppet Treasure Island,"[Action, Comedy, Music, Family, Adventure]",The Muppet Collection,"[Walt Disney Pictures, Jim Henson Productions,...",,en,100.0,1996,2/16/96,54938484,0,34327391
121,Tarzan,Tarzan,"[Adventure, Animation, Drama, Family]",Tarzan (Animation) Collection,[Walt Disney Pictures],http://disneydvd.disney.go.com/tarzanr-special...,en,88.0,1999,6/18/99,675245714,150000000,448000000
155,Return to Never Land,Return to Never Land,"[Adventure, Fantasy, Animation, Family]",Peter Pan Collection,"[Walt Disney Pictures, Walt Disney Television ...",,en,72.0,2002,2/14/02,153347907,20000000,109862682
314,Cars 3,Cars 3,"[Family, Comedy, Animation, Adventure]",Cars Collection,"[Walt Disney Pictures, Pixar Animation Studios]",http://cars.disney.com,en,109.0,2017,6/15/17,358722880,175000000,350170057


In [10]:
def movie_with_sequels(sequel_df):

    # find movie names with collections 
    collection_names = sequel_df[sequel_df.duplicated(subset = 'collection', keep = 'first')]['collection']
    # movie collection df 
    movie_wcollection = sequel_df[sequel_df['collection'].isin(collection_names)]
    
    # find movie names with collections 
    collection_names = sequel_df[sequel_df.duplicated(subset = 'collection', keep = 'first')]['collection']
    # movie collection df 
    movie_wcollection = sequel_df[sequel_df['collection'].isin(collection_names)]

    # find the total number of movies in the movie collection
    num_movies = movie_wcollection.groupby(['collection']).count().reset_index()[['collection', 'title']].rename(columns = {'title': 'num_movies'})

    # add the finding above to the movie collectin df
    movie_wcollection = pd.merge(num_movies, movie_wcollection, on = 'collection', how = 'left')

    collection_df = movie_wcollection.groupby(['collection', 'original_title']).agg({'release_year': 'sum',
                                                                                    'num_movies': 'sum',
                                                                                    'runtime': 'sum',
                                                                                    'budget': 'sum',
                                                                                    'adjusted_revenue': 'sum'})

    return collection_df


In [11]:
disney_sequel_df = movie_with_sequels(disney_sequel_df)
# disney_sequel_df


In [12]:

savanna = ['Dinosaur', 'The Lion King']
castle =  ['Alice in Wonderland', 'Beauty and the Beast']
water =  ['Finding Dory', 'Pirates of the Caribbean: The Curse of the Black Pearl']
jungle = ['Tarzan', 'The Jungle Book 2']

location_list = [savanna, castle, water, jungle]


In [13]:
location_revenue_df = []

for i in range(len(location_list)):
    location_df = pd.DataFrame({})
    
    for j in range(len(location_list[i])):
        
        movie_name = location_list[i][j] 
        one_df = disney_df[disney_df['original_title'] == movie_name]
        location_df = pd.concat([location_df, one_df])
    
    location_revenue_df.append(location_df)
    

In [14]:

data = []

trace = go.Bar(x = ['savanna', 'castle', 'water', 'jungle'],
               y = [location_revenue_df[i]['adjusted_revenue'].sum() for i in range(len(location_revenue_df))])
data.append(trace)
# work on: color of the bars, i.e. blue for water, green for jungle...   

layout = dict(title = 'major sceneries and their revenues',
              xaxis = dict(title = 'major sceneries'),
              yaxis = dict(title = 'revenue (adjusted to the value in 2018)'),
              width = 1000,
              height = 600,
              margin = dict(l = 10, r = 10, b = 200, t = 100, pad = 4))

fig = dict(data = data, layout = layout)

plotly.offline.iplot(fig)



### Sequels

In [15]:
sequel_df = df[df['collection'] != 'NaN']
sequel_df.head()


Unnamed: 0,original_title,title,genre,collection,production_company,homepage,original_language,runtime,release_year,release_date,adjusted_revenue,budget,revenue
0,Hot Tub Time Machine 2,Hot Tub Time Machine 2,[Comedy],Hot Tub Time Machine Collection,"[Paramount Pictures, United Artists, Metro-Gol...",,en,93.0,2015,2/20/15,13046722,14000000,12314651
1,The Princess Diaries 2: Royal Engagement,The Princess Diaries 2: Royal Engagement,"[Comedy, Drama, Family, Romance]",The Princess Diaries Collection,[Walt Disney Pictures],,en,113.0,2004,8/6/04,126483267,40000000,95149435
8,Muppet Treasure Island,Muppet Treasure Island,"[Action, Comedy, Music, Family, Adventure]",The Muppet Collection,"[Walt Disney Pictures, Jim Henson Productions,...",,en,100.0,1996,2/16/96,54938484,0,34327391
10,Rocky,Rocky,[Drama],Rocky Collection,[United Artists],,en,119.0,1976,11/21/76,517373744,1000000,117235147
11,Revenge of the Nerds II: Nerds in Paradise,Revenge of the Nerds II: Nerds in Paradise,[Comedy],Revenge of the Nerds Collection,"[Twentieth Century Fox Film Corporation, Amerc...",,en,98.0,1987,7/10/87,50049057,0,22642033


In [16]:
collection_df = movie_with_sequels(sequel_df)
# collection_df


In [17]:
collection_to_plot = ['Alien Collection', 'Resident Evil Collection', 'Transformers Collection', 'The Fast and the Furious Collection', 'Ice Age Collection']

list_of_df = []

for i in range(len(collection_to_plot)):
    
    one_collection_df = collection_df[collection_df['num_movies'] == 4].loc[collection_to_plot[i]].reset_index()
    one_collection_df = one_collection_df.sort_values('release_year')
    
    list_of_df.append(one_collection_df)
    


In [21]:

data = []

for i in range(len(list_of_df)):
    trace = go.Scatter( x = list_of_df[i]['original_title'],
                        y = list_of_df[i]['adjusted_revenue'],
                        name = collection_to_plot[i],                   
                        mode = 'lines+markers')
    data.append(trace)
    

layout = dict(title = 'movie sequels and their revenues',
              xaxis = dict(title = 'movie name'),
              yaxis = dict(title = 'revenue (adjusted to the value in 2018)'),
              width = 1000,
              height = 600,
              margin = dict(l = 50, r = 10, b = 200, t = 100, pad = 4))

fig = dict(data = data, layout = layout)

plotly.offline.iplot(fig)
plotly.offline.plot(fig, filename = 'movie_sequels_and_revenues')


Your filename `movie_sequels_and_revenues` didn't end with .html. Adding .html to the end of your file.



'file:///Users/yueying.teng/Documents/TMDB/movie_sequels_and_revenues.html'