# Movies Database Cleaning

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import fuzzy

In [2]:
original_movies_db = pd.read_excel('movies.xls')

In [3]:
movies_db = original_movies_db

In [4]:
# Filtering to just get 'USA' matches
#movies_db = movies_db[(movies_db['country'] == 'USA')]


# Filtering to just get 'BUDGET' not equal to 0.0
movies_db = movies_db[(movies_db['budget'] != 0.0)]


In [5]:
movies_db.loc[(movies_db['year'].isnull())]

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
199,18000000.0,J&M,,,,,,,,,,,,,
1454,5000000.0,A&M,,,,,,,,,,,,,
1714,12000000.0,A&M,,,,,,,,,,,,,
4831,6000000.0,B&W,,,,,,,,,,,,,


In [6]:
# Exclude nan rows
movies_db['name'] = movies_db['name'].str.replace('&','and').str.replace("'",' ').str.replace('-','').str.replace(',','').str.replace('.','').str.replace(':','').str.strip().str.title()

movies_db = movies_db.dropna()
movies_db.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


budget      0
company     0
country     0
director    0
genre       0
gross       0
name        0
rating      0
released    0
runtime     0
score       0
star        0
votes       0
writer      0
year        0
dtype: int64

In [7]:
# Checking column names
movies_db.columns

Index(['budget', 'company', 'country', 'director', 'genre', 'gross', 'name',
       'rating', 'released', 'runtime', 'score', 'star', 'votes', 'writer',
       'year'],
      dtype='object')

In [8]:
# Selecting the columns we want to keep
movies_db = movies_db[['name', 'budget', 'gross', 'score']]

# Renaming the columns
movies_db.columns = ['TITLE', 'BUDGET', 'REVENUE', 'IMDB SCORE']

movies_db

Unnamed: 0,TITLE,BUDGET,REVENUE,IMDB SCORE
0,Stand By Me,8000000.0,52287414.0,8.1
1,Ferris Bueller S Day Off,6000000.0,70136369.0,7.8
2,Top Gun,15000000.0,179800601.0,6.9
3,Aliens,18500000.0,85160248.0,8.4
4,Flight Of The Navigator,9000000.0,18564613.0,6.9
...,...,...,...,...
6808,Love And Friendship,3000000.0,14013564.0,6.5
6809,The Hollars,3800000.0,1016872.0,6.5
6811,Middle School The Worst Years Of My Life,8500000.0,19985196.0,6.1
6814,Risen,20000000.0,36874745.0,6.3


In [9]:
# Creating the '% BUDGET/REVENUE' column

movies_db['% BUDGET/REVENUE'] = round((movies_db['BUDGET'] / movies_db['REVENUE'] * 100), 1)

In [10]:
# Converting 'YEAR' column to 'int' type

#movies_db['YEAR'] = movies_db['YEAR'].astype('str')
#movies_db['YEAR'] = movies_db['YEAR'].apply(lambda x: x.split('.')[0])
#movies_db['YEAR'] = movies_db['YEAR'].astype('int')

In [11]:
# Changing these 2 columns to be in millions
#movies_db['BUDGET'] = round((movies_db['BUDGET'] / 1000000), 1)
#movies_db['REVENUE'] = round((movies_db['REVENUE'] / 1000000), 1)


# Filtering to just get 'REVENUE' not equal to 0.0
#movies_db = movies_db[(movies_db['REVENUE'] != 0.0)]

In [12]:
# Renaming columns to be in millions
movies_db.columns = ['TITLE', 'BUDGET IN MILLIONS', 'REVENUE IN MILLIONS', 'IMDB SCORE', '% BUDGET/REVENUE']
movies_db.head()

Unnamed: 0,TITLE,BUDGET IN MILLIONS,REVENUE IN MILLIONS,IMDB SCORE,% BUDGET/REVENUE
0,Stand By Me,8000000.0,52287414.0,8.1,15.3
1,Ferris Bueller S Day Off,6000000.0,70136369.0,7.8,8.6
2,Top Gun,15000000.0,179800601.0,6.9,8.3
3,Aliens,18500000.0,85160248.0,8.4,21.7
4,Flight Of The Navigator,9000000.0,18564613.0,6.9,48.5


In [13]:
# Final order
movies_db = movies_db[['TITLE', 'BUDGET IN MILLIONS', 'REVENUE IN MILLIONS', '% BUDGET/REVENUE', 'IMDB SCORE']]


In [14]:
movies_db.isnull().sum()

TITLE                  0
BUDGET IN MILLIONS     0
REVENUE IN MILLIONS    0
% BUDGET/REVENUE       0
IMDB SCORE             0
dtype: int64

### Awards winning movies

In [15]:
url = 'https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films'
html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")

# tag: tbody

table_body = soup.find_all('tbody')[0]
rows = [element.text for element in table_body.find_all('tr')]

clean_rows = [row.strip().replace('\n','  ').split('  ') for row in rows]

colnames = ['TITLE', 'YEAR', 'AWARDS', 'NOMINATIONS']
awards = pd.DataFrame(clean_rows[1:], columns = colnames)
awards

Unnamed: 0,TITLE,YEAR,AWARDS,NOMINATIONS
0,Green Book,2018,3,5
1,Bohemian Rhapsody,2018,4,5
2,Roma,2018,3,10
3,Black Panther,2018,3,7
4,The Favourite,2018,1,10
...,...,...,...,...
1294,The Yankee Doodle Mouse,1943,1,1
1295,The Yearling,1946,2,7
1296,"Yesterday, Today and Tomorrow (Ieri, oggi, dom...",1964,1,1
1297,You Can't Take It with You,1938,2,7


In [16]:
#Clean movie title
awards['TITLE'] = awards['TITLE'].str.replace('&','and').str.replace("'",' ').str.replace('-','').str.replace(',','').str.replace('.','').str.replace(':','').str.strip().str.title()
awards['TITLE'] = awards['TITLE'].apply(lambda x: x.split('(')[0])
awards['TITLE'] = awards['TITLE'].str.strip()
awards['TITLE'].unique()

array(['Green Book', 'Bohemian Rhapsody', 'Roma', ...,
       'Yesterday Today And Tomorrow', 'You Can T Take It With You',
       'Zorba The Greek'], dtype=object)

In [17]:
# Like this it matches the movies' file year range
awards = awards[(awards['YEAR'] <= '2016') & (awards['YEAR'] >= '1986')]
awards['YEAR'] = awards['YEAR'].astype('int')
awards

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,TITLE,YEAR,AWARDS,NOMINATIONS
31,Moonlight,2016,3,8
32,La La Land,2016,6,14
33,Hacksaw Ridge,2016,2,6
34,Manchester By The Sea,2016,2,6
35,Arrival,2016,1,8
...,...,...,...,...
474,Down And Out In America,1986,1,1
475,The Fly,1986,1,1
476,A Greek Tragedy,1986,1,1
477,Precious Images,1986,1,1


In [18]:
# Filter awards by len

awards['AWARDS'] = awards['AWARDS'].apply(lambda x: x.split(' ')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Resetting the index, as it was starting at 31
awards.reset_index()
awards_db = awards[['TITLE', 'AWARDS', 'NOMINATIONS']]
awards_db.dtypes

TITLE          object
AWARDS         object
NOMINATIONS    object
dtype: object

# Merge

In [20]:
all_db = pd.merge(movies_db, awards_db, on = 'TITLE', how = 'outer')
all_db

Unnamed: 0,TITLE,BUDGET IN MILLIONS,REVENUE IN MILLIONS,% BUDGET/REVENUE,IMDB SCORE,AWARDS,NOMINATIONS
0,Stand By Me,8000000.0,52287414.0,15.3,8.1,,
1,Ferris Bueller S Day Off,6000000.0,70136369.0,8.6,7.8,,
2,Top Gun,15000000.0,179800601.0,8.3,6.9,1,4
3,Aliens,18500000.0,85160248.0,21.7,8.4,2,7
4,Flight Of The Navigator,9000000.0,18564613.0,48.5,6.9,,
...,...,...,...,...,...,...,...
4805,The Assault,,,,,1,1
4806,Down And Out In America,,,,,1,1
4807,A Greek Tragedy,,,,,1,1
4808,Precious Images,,,,,1,1


In [21]:
all_db = all_db.sort_values(['TITLE'])
all_db.reset_index()

all_db.isnull().sum()

TITLE                     0
BUDGET IN MILLIONS      185
REVENUE IN MILLIONS     185
% BUDGET/REVENUE        185
IMDB SCORE              185
AWARDS                 4359
NOMINATIONS            4359
dtype: int64

In [22]:
all_db[['AWARDS', 'NOMINATIONS']] = all_db[['AWARDS', 'NOMINATIONS']].fillna(0)
all_db = all_db.reset_index()
all_db.isnull().sum()

index                    0
TITLE                    0
BUDGET IN MILLIONS     185
REVENUE IN MILLIONS    185
% BUDGET/REVENUE       185
IMDB SCORE             185
AWARDS                   0
NOMINATIONS              0
dtype: int64

In [23]:
all_db = all_db.dropna()
all_db.isnull().sum()

index                  0
TITLE                  0
BUDGET IN MILLIONS     0
REVENUE IN MILLIONS    0
% BUDGET/REVENUE       0
IMDB SCORE             0
AWARDS                 0
NOMINATIONS            0
dtype: int64

In [24]:
all_db = all_db[['TITLE', 'BUDGET IN MILLIONS', 'REVENUE IN MILLIONS','% BUDGET/REVENUE', 'IMDB SCORE', 'AWARDS', 'NOMINATIONS']]
all_db.head(50)

Unnamed: 0,TITLE,BUDGET IN MILLIONS,REVENUE IN MILLIONS,% BUDGET/REVENUE,IMDB SCORE,AWARDS,NOMINATIONS
0,10 Cloverfield Lane,15000000.0,71897215.0,20.9,7.2,0,0
1,10 Things I Hate About You,30000000.0,38176108.0,78.6,7.2,0,0
2,10000 Bc,105000000.0,94784201.0,110.8,5.1,0,0
3,101 Dalmatians,75000000.0,136189294.0,55.1,5.7,0,0
4,102 Dalmatians,85000000.0,66957026.0,126.9,4.8,0,0
5,12 Horas Para Sobrevivir,9000000.0,71962800.0,12.5,6.5,0,0
6,12 Rounds,22000000.0,12232937.0,179.8,5.6,0,0
7,12 Years A Slave,20000000.0,56671993.0,35.3,8.1,3,9
8,127 Hours,18000000.0,18335230.0,98.2,7.6,0,0
9,13 Assassins,6000000.0,802524.0,747.6,7.6,0,0


Unnamed: 0,TITLE,BUDGET IN MILLIONS,REVENUE IN MILLIONS,% BUDGET/REVENUE,IMDB SCORE,AWARDS,NOMINATIONS
0,10 Cloverfield Lane,15000000.0,71897215.0,20.9,7.2,0,0
1,10 Things I Hate About You,30000000.0,38176108.0,78.6,7.2,0,0
2,10000 Bc,105000000.0,94784201.0,110.8,5.1,0,0
3,101 Dalmatians,75000000.0,136189294.0,55.1,5.7,0,0
4,102 Dalmatians,85000000.0,66957026.0,126.9,4.8,0,0
...,...,...,...,...,...,...,...
4805,[Rec] 2,5600000.0,27766.0,20168.6,6.6,0,0
4806,[Rec] 3 Genesis,6400000.0,8973.0,71325.1,5.0,0,0
4807,°Three Amigos!,25000000.0,39246734.0,63.7,6.4,0,0
4808,Øsabes Quièn Viene?,25000000.0,2547047.0,981.5,7.2,0,0
