# Import Libraries and Load the CSV into a Pandas DataFrame

In [None]:
import pandas as pd

In [None]:
# Read the movie-budget CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='tn.movie_budgets.csv')
df.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


# Data Exploration and Cleaning

In [None]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


## Drop id Column

In [None]:
# Remove the `id` column.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'`
# lets this cell be re-run without a KeyError once the column is already gone.
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


## pd.to_datetime()

In [None]:
# Parse the release-date strings (e.g. "Dec 18, 2009") into datetime64 values.
# An explicit format avoids per-value format guessing and makes any row that
# deviates from the expected layout raise instead of being silently misread.
df['datetime_format'] = pd.to_datetime(df['release_date'], format='%b %d, %Y')
df.head()

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,datetime_format
0,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",2009-12-18
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011-05-20
2,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",2019-06-07
3,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",2015-05-01
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747",2017-12-15


## DataFrame.sort_values()

In [36]:
# Show the rows ordered from newest to oldest release date.
# The sorted frame is only displayed; `df` itself is not reassigned here.
df.sort_values(by='datetime_format', ascending=False)

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,datetime_format
194,"Dec 31, 2020",Moonfall,"$150,000,000",$0,$0,2020-12-31
1205,"Dec 31, 2020",Hannibal the Conqueror,"$50,000,000",$0,$0,2020-12-31
535,"Feb 21, 2020",Call of the Wild,"$82,000,000",$0,$0,2020-02-21
480,"Dec 31, 2019",Army of the Dead,"$90,000,000",$0,$0,2019-12-31
3515,"Dec 31, 2019",Eli,"$11,000,000",$0,$0,2019-12-31
...,...,...,...,...,...,...
5606,"Nov 19, 1925",The Big Parade,"$245,000","$11,000,000","$22,000,000",1925-11-19
5683,"Sep 17, 1920",Over the Hill to the Poorhouse,"$100,000","$3,000,000","$3,000,000",1920-09-17
5614,"Dec 24, 1916","20,000 Leagues Under the Sea","$200,000","$8,000,000","$8,000,000",1916-12-24
5523,"Sep 5, 1916",Intolerance,"$385,907",$0,$0,1916-09-05


## Drop release_date column

In [37]:
# Keep the parsed date (moved to the front) in place of the raw `release_date` string.
# `.copy()` makes the result an independent frame, so the `df[col] = ...`
# assignments in the cleaning cells do not trigger SettingWithCopyWarning.
df = df[['datetime_format', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross']].copy()
df.head()

Unnamed: 0,datetime_format,movie,production_budget,domestic_gross,worldwide_gross
0,2009-12-18,Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,2019-06-07,Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,2015-05-01,Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,2017-12-15,Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


## Convert Numeric Columns to Integer

### Remove Dollar Signs

In [40]:
# Monetary columns are read as strings such as "$425,000,000".
cols_to_clean = ['production_budget', 'domestic_gross', 'worldwide_gross']
for col in cols_to_clean:
    # regex=False: '$' must be treated as a literal character, not as the
    # end-of-string anchor. The vectorized .str accessor also passes missing
    # values through instead of raising like a per-element lambda would.
    df[col] = df[col].str.replace('$', '', regex=False)

In [41]:
# Preview the monetary columns after the dollar-sign removal.
# NOTE(review): the saved output already shows values without commas, which
# suggests it was captured after the comma-removal step had run - re-run the
# notebook top to bottom to refresh it.
df.head()

Unnamed: 0,datetime_format,movie,production_budget,domestic_gross,worldwide_gross
0,2009-12-18,Avatar,425000000,760507625,2776345279
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,2019-06-07,Dark Phoenix,350000000,42762350,149762350
3,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963
4,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


### Remove Commas

In [42]:
for col in cols_to_clean:
    # Strip the thousands separators. Values without a comma (e.g. "0") come
    # back unchanged, so no explicit membership check is needed, and missing
    # values are passed through rather than raising.
    df[col] = df[col].str.replace(',', '', regex=False)

In [43]:
# Preview the monetary columns after the comma removal; they are still strings at this point.
df.head()

Unnamed: 0,datetime_format,movie,production_budget,domestic_gross,worldwide_gross
0,2009-12-18,Avatar,425000000,760507625,2776345279
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,2019-06-07,Dark Phoenix,350000000,42762350,149762350
3,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963
4,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


### Convert Columns to Integer Dtype

In [44]:
for col in cols_to_clean:
    # Use a fixed 64-bit width: plain 'int' can resolve to a 32-bit integer on
    # some platforms (e.g. Windows with NumPy < 2), which cannot hold values
    # above 2,147,483,647 such as the largest worldwide gross in this dataset.
    df[col] = df[col].astype('int64')

In [45]:
# Confirm the dtypes after cleaning: the three monetary columns should now be integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   datetime_format    5782 non-null   datetime64[ns]
 1   movie              5782 non-null   object        
 2   production_budget  5782 non-null   int64         
 3   domestic_gross     5782 non-null   int64         
 4   worldwide_gross    5782 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 226.0+ KB


## Exploring Distributions of Numerical Values with DataFrame.describe()

In [46]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,production_budget,domestic_gross,worldwide_gross
count,5782.0,5782.0,5782.0
mean,31587760.0,41873330.0,91487460.0
std,41812080.0,68240600.0,174720000.0
min,1100.0,0.0,0.0
25%,5000000.0,1429534.0,4125415.0
50%,17000000.0,17225940.0,27984450.0
75%,40000000.0,52348660.0,97645840.0
max,425000000.0,936662200.0,2776345000.0
