In [52]:
# Import dependencies
import pandas as pd
import requests
import numpy as np

In [53]:
# Link
url = 'https://www.boxofficemojo.com/year/world/?ref_=bo_nb_di_tab'

# Retrieve html for links
response = requests.get(url)

# Read responses into the table
worldwide_2023 = pd.read_html(response.text)

# Select and display the table
worldwide_2023_df = worldwide_2023[0]

display(worldwide_2023_df.head())

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,%,Foreign,%.1
0,1,Barbie,"$1,441,820,453","$636,220,453",44.1%,"$805,600,000",55.9%
1,2,The Super Mario Bros. Movie,"$1,361,365,341","$574,934,330",42.2%,"$786,431,011",57.8%
2,3,Oppenheimer,"$952,010,450","$326,076,450",34.3%,"$625,934,000",65.7%
3,4,Guardians of the Galaxy Vol. 3,"$845,555,777","$358,995,815",42.5%,"$486,559,962",57.5%
4,5,Fast X,"$704,875,015","$146,126,015",20.7%,"$558,749,000",79.3%


Table - 2023 Worldwide Box Office

In [54]:
worldwide_2023_df.columns

Index(['Rank', 'Release Group', 'Worldwide', 'Domestic', '%', 'Foreign',
       '%.1'],
      dtype='object')

In [55]:
worldwide_2023_df['Rank'].value_counts()

1      1
138    1
128    1
129    1
130    1
      ..
70     1
71     1
72     1
73     1
200    1
Name: Rank, Length: 200, dtype: int64

In [56]:
worldwide_2023_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           200 non-null    int64 
 1   Release Group  200 non-null    object
 2   Worldwide      200 non-null    object
 3   Domestic       200 non-null    object
 4   %              200 non-null    object
 5   Foreign        200 non-null    object
 6   %.1            200 non-null    object
dtypes: int64(1), object(6)
memory usage: 11.1+ KB


In [57]:
# Convert object data types to integer: 'Worldwide' column

worldwide_200 = worldwide_2023_df.copy()

worldwide_200['Worldwide'] = worldwide_200['Worldwide'].str.replace(',', '')
worldwide_200['Worldwide'] = worldwide_200['Worldwide'].str.replace('$', '')
worldwide_200['Worldwide'] = worldwide_200['Worldwide'].astype('int64', copy=True)

worldwide_200['Worldwide'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 200 entries, 0 to 199
Series name: Worldwide
Non-Null Count  Dtype
--------------  -----
200 non-null    int64
dtypes: int64(1)
memory usage: 1.7 KB


  worldwide_200['Worldwide'] = worldwide_200['Worldwide'].str.replace('$', '')


In [58]:
# Convert 'Domestic' & 'Foreign' columns to float, nan values cannot be converted to integer.

# 'Domestic' column
worldwide_200['Domestic'] = worldwide_200['Domestic'].str.replace(',', '')
worldwide_200['Domestic'] = worldwide_200['Domestic'].str.replace('$', '')

domestic_list = worldwide_200['Domestic'].values.tolist()

new_list = []
for i in range(len(domestic_list)):
    if domestic_list[i] == '-':
        new_list.append(np.nan)
    else:
        new_list.append(int(domestic_list[i]))

worldwide_200['Domestic'] = new_list

worldwide_200.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           200 non-null    int64  
 1   Release Group  200 non-null    object 
 2   Worldwide      200 non-null    int64  
 3   Domestic       122 non-null    float64
 4   %              200 non-null    object 
 5   Foreign        200 non-null    object 
 6   %.1            200 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 11.1+ KB


  worldwide_200['Domestic'] = worldwide_200['Domestic'].str.replace('$', '')


In [59]:
# 'Foreign' column
worldwide_200['Foreign'] = worldwide_200['Foreign'].str.replace(',', '')
worldwide_200['Foreign'] = worldwide_200['Foreign'].str.replace('$', '')

foreign_list = worldwide_200['Foreign'].values.tolist()

new_list = []
for i in range(len(foreign_list)):
    if foreign_list[i] == '-':
        new_list.append(np.nan)
    else:
        new_list.append(int(foreign_list[i]))

worldwide_200['Foreign'] = new_list

worldwide_200.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           200 non-null    int64  
 1   Release Group  200 non-null    object 
 2   Worldwide      200 non-null    int64  
 3   Domestic       122 non-null    float64
 4   %              200 non-null    object 
 5   Foreign        194 non-null    float64
 6   %.1            200 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 11.1+ KB


  worldwide_200['Foreign'] = worldwide_200['Foreign'].str.replace('$', '')


In [60]:
# Convert '%' and '%.1' columns to float.

worldwide_200['%'] = pd.to_numeric(worldwide_200['%'].str.replace('%', ''), errors='coerce') / 100
worldwide_200['%.1'] = pd.to_numeric(worldwide_200['%.1'].str.replace('%', ''), errors='coerce') / 100

worldwide_200[['%', '%.1']].head()

Unnamed: 0,%,%.1
0,0.441,0.559
1,0.422,0.578
2,0.343,0.657
3,0.425,0.575
4,0.207,0.793


In [61]:
worldwide_200.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           200 non-null    int64  
 1   Release Group  200 non-null    object 
 2   Worldwide      200 non-null    int64  
 3   Domestic       122 non-null    float64
 4   %              122 non-null    float64
 5   Foreign        194 non-null    float64
 6   %.1            194 non-null    float64
dtypes: float64(4), int64(2), object(1)
memory usage: 11.1+ KB


In [66]:
# Rename '%' and '%.1' columns

worldwide_200.rename(columns={'%': '% of Total', '%.1': '% of Total'}, inplace=True)

worldwide_200.head()

Unnamed: 0,Rank,Release Group,Worldwide,Domestic,% of Total,Foreign,% of Total.1
0,1,Barbie,1441820453,636220453.0,0.441,805600000.0,0.559
1,2,The Super Mario Bros. Movie,1361365341,574934330.0,0.422,786431011.0,0.578
2,3,Oppenheimer,952010450,326076450.0,0.343,625934000.0,0.657
3,4,Guardians of the Galaxy Vol. 3,845555777,358995815.0,0.425,486559962.0,0.575
4,5,Fast X,704875015,146126015.0,0.207,558749000.0,0.793


In [67]:
# Export DataFrame to a CSV file
worldwide_200.to_csv('2023_worldwide_box_office_data.csv', index=False)