# Movies Database Cleaning

In [1]:
import numpy as np
import pandas as pd

In [16]:
# Load the raw movie data from the Excel workbook and preview its first five rows
original_movies_db = pd.read_excel(io='movies.xls')
original_movies_db.head(n=5)

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22 00:00:00,89.0,8.1,Wil Wheaton,299174.0,Stephen King,1986.0
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11 00:00:00,103.0,7.8,Matthew Broderick,264740.0,John Hughes,1986.0
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16 00:00:00,110.0,6.9,Tom Cruise,236909.0,Jim Cash,1986.0
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18 00:00:00,137.0,8.4,Sigourney Weaver,540152.0,James Cameron,1986.0
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01 00:00:00,90.0,6.9,Joey Cramer,36636.0,Mark H. Baker,1986.0


In [17]:
# Work on an independent copy: plain assignment would only bind a second name
# to the same DataFrame, so any in-place change made through movies_db would
# also alter original_movies_db.
movies_db = original_movies_db.copy()

In [18]:
# Narrow the frame in two steps:
#   1. keep rows whose 'country' is exactly 'USA'
#   2. keep rows whose 'budget' differs from 0.0
#      (a missing/NaN budget also compares as "not equal" and is therefore kept)
movies_db = (
    movies_db
    .loc[lambda frame: frame['country'].eq('USA')]
    .loc[lambda frame: frame['budget'].ne(0.0)]
)

# Preview the filtered result
movies_db.head()

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22 00:00:00,89.0,8.1,Wil Wheaton,299174.0,Stephen King,1986.0
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11 00:00:00,103.0,7.8,Matthew Broderick,264740.0,John Hughes,1986.0
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16 00:00:00,110.0,6.9,Tom Cruise,236909.0,Jim Cash,1986.0
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18 00:00:00,137.0,8.4,Sigourney Weaver,540152.0,James Cameron,1986.0
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01 00:00:00,90.0,6.9,Joey Cramer,36636.0,Mark H. Baker,1986.0


In [19]:
# Display the column labels currently present in movies_db

movies_db.columns

Index(['budget', 'company', 'country', 'director', 'genre', 'gross', 'name',
       'rating', 'released', 'runtime', 'score', 'star', 'votes', 'writer',
       'year'],
      dtype='object')

In [20]:
# Keep only the columns needed from here on and give them presentation names.
# - rename() with an explicit mapping ties every new label to its source column,
#   instead of depending on the positional order of the selection list.
# - .copy() makes the result an independent frame, so adding or overwriting
#   columns on it later does not trigger SettingWithCopyWarning.
movies_db = (
    movies_db[['year', 'name', 'budget', 'gross', 'score']]
    .rename(columns={
        'year': 'YEAR',
        'name': 'NAME',
        'budget': 'BUDGET',
        'gross': 'REVENUE',
        'score': 'IMDB SCORE',
    })
    .copy()
)

movies_db

Unnamed: 0,YEAR,NAME,BUDGET,REVENUE,IMDB SCORE
0,1986.0,Stand by Me,8000000.0,52287414.0,8.1
1,1986.0,Ferris Bueller's Day Off,6000000.0,70136369.0,7.8
2,1986.0,Top Gun,15000000.0,179800601.0,6.9
3,1986.0,Aliens,18500000.0,85160248.0,8.4
4,1986.0,Flight of the Navigator,9000000.0,18564613.0,6.9
...,...,...,...,...,...
6807,2016.0,The Darkness,4000000.0,10732841.0,4.4
6809,2016.0,The Hollars,3800000.0,1016872.0,6.5
6811,2016.0,Middle School: The Worst Years of My Life,8500000.0,19985196.0,6.1
6814,2016.0,Risen,20000000.0,36874745.0,6.3


In [21]:
# Add the '% BUDGET/REVENUE' column: BUDGET as a percentage of REVENUE,
# rounded to one decimal place.
# assign() returns a new DataFrame rather than writing into the existing one,
# which avoids SettingWithCopyWarning when movies_db was derived by slicing.
# The label contains spaces and symbols, so it is passed via dict unpacking.
movies_db = movies_db.assign(
    **{'% BUDGET/REVENUE': (movies_db['BUDGET'] / movies_db['REVENUE'] * 100).round(1)}
)

In [31]:
# Cast the 'YEAR' column from float to int.
# assign() replaces the column on a new DataFrame (column order is kept),
# avoiding an in-place write on a frame that may be flagged as a slice.
# Note: astype(int) raises if the column contains missing values.
movies_db = movies_db.assign(YEAR=movies_db['YEAR'].astype(int))
movies_db.head()

Unnamed: 0,YEAR,NAME,BUDGET,REVENUE,IMDB SCORE,% BUDGET/REVENUE
0,1986,Stand by Me,8000000.0,52287414.0,8.1,15.3
1,1986,Ferris Bueller's Day Off,6000000.0,70136369.0,7.8,8.6
2,1986,Top Gun,15000000.0,179800601.0,6.9,8.3
3,1986,Aliens,18500000.0,85160248.0,8.4,21.7
4,1986,Flight of the Navigator,9000000.0,18564613.0,6.9,48.5
