# Movies

In [2]:
# import packages
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import sklearn.cluster as clust
from scipy.spatial.distance import cdist
import os

# to show multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# import data
df = pd.read_excel('movies.xls',sheet_name = None)
df_1950s = df['1900s']
df_2000s = df['2000s']
df_2010s = df['2010s']

In [4]:
# concatenate all into 1 dataframe
# check all have 25 columns before concatenating
df_1950s.shape
df_2000s.shape
df_2010s.shape

(1338, 25)

(2100, 25)

(1604, 25)

In [62]:
df_all = pd.concat([df_1950s, df_2000s, df_2010s]) # concatenate
df_all.shape

(5042, 25)

### Overview of the dataset

In [51]:
df_all.head()

Unnamed: 0,Title,Year,Genres,Language,Country,Content Rating,Duration,Aspect Ratio,Budget,Gross Earnings,...,Facebook Likes - Actor 1,Facebook Likes - Actor 2,Facebook Likes - Actor 3,Facebook Likes - cast Total,Facebook likes - Movie,Facenumber in posters,User Votes,Reviews by Users,Reviews by Crtiics,IMDB Score
0,Intolerance: Love's Struggle Throughout the Ages,1916.0,Drama|History|War,,USA,Not Rated,123.0,1.33,385907.0,,...,436.0,22.0,9.0,481,691,1.0,10718,88.0,69.0,8.0
1,Over the Hill to the Poorhouse,1920.0,Crime|Drama,,USA,,110.0,1.33,100000.0,3000000.0,...,2.0,2.0,0.0,4,0,1.0,5,1.0,1.0,4.8
2,The Big Parade,1925.0,Drama|Romance|War,,USA,Not Rated,151.0,1.33,245000.0,,...,81.0,12.0,6.0,108,226,0.0,4849,45.0,48.0,8.3
3,Metropolis,1927.0,Drama|Sci-Fi,German,Germany,Not Rated,145.0,1.33,6000000.0,26435.0,...,136.0,23.0,18.0,203,12000,1.0,111841,413.0,260.0,8.3
4,Pandora's Box,1929.0,Crime|Drama|Romance,German,Germany,Not Rated,110.0,1.33,,9950.0,...,426.0,20.0,3.0,455,926,1.0,7431,84.0,71.0,8.0


In [52]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5042 entries, 0 to 1603
Data columns (total 25 columns):
Title                          5042 non-null object
Year                           4935 non-null float64
Genres                         5042 non-null object
Language                       5031 non-null object
Country                        5038 non-null object
Content Rating                 4740 non-null object
Duration                       5028 non-null float64
Aspect Ratio                   4714 non-null float64
Budget                         4551 non-null float64
Gross Earnings                 4159 non-null float64
Director                       4938 non-null object
Actor 1                        5035 non-null object
Actor 2                        5029 non-null object
Actor 3                        5020 non-null object
Facebook Likes - Director      4938 non-null float64
Facebook Likes - Actor 1       5035 non-null float64
Facebook Likes - Actor 2       5029 non-null float64
F

### Pre-processing
#### A couple things to check
- are all film titles unique?
- missing values? 
- are all columns in correct data type?
    - year: should be int instead of float
    - all FB likes: should be in int instead of float
    - need to change all object types to category


In [68]:
# change all object types to category
df_all[df_all.select_dtypes(include=['object']).columns] = df_all.select_dtypes(include=['object']).astype('category')
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5042 entries, 0 to 1603
Data columns (total 25 columns):
Title                          5042 non-null category
Year                           4935 non-null float64
Genres                         5042 non-null category
Language                       5031 non-null category
Country                        5038 non-null category
Content Rating                 4740 non-null category
Duration                       5028 non-null float64
Aspect Ratio                   4714 non-null float64
Budget                         4551 non-null float64
Gross Earnings                 4159 non-null float64
Director                       4938 non-null category
Actor 1                        5035 non-null category
Actor 2                        5029 non-null category
Actor 3                        5020 non-null category
Facebook Likes - Director      4938 non-null float64
Facebook Likes - Actor 1       5035 non-null float64
Facebook Likes - Actor 2       5029 

In [70]:
# check all titles are unique
df_all['Title'].value_counts()
df_all[df_all['Title'] == 'King Kong']

Ben-Hur                                      3
Halloween                                    3
Victor Frankenstein                          3
The Fast and the Furious                     3
Pan                                          3
Home                                         3
King Kong                                    3
Jack Reacher                                 2
Side Effects                                 2
The Texas Chain Saw Massacre                 2
The Omen                                     2
Conan the Barbarian                          2
A Nightmare on Elm Street                    2
Footloose                                    2
Unknown                                      2
The Avengers                                 2
Dredd                                        2
RoboCop                                      2
Chasing Liberty                              2
The Alamo                                    2
The Love Letter                              2
The Astronaut

Unnamed: 0,Title,Year,Genres,Language,Country,Content Rating,Duration,Aspect Ratio,Budget,Gross Earnings,...,Facebook Likes - Actor 1,Facebook Likes - Actor 2,Facebook Likes - Actor 3,Facebook Likes - cast Total,Facebook likes - Movie,Facenumber in posters,User Votes,Reviews by Users,Reviews by Crtiics,IMDB Score


In [None]:
df_all['Year'] = df_all['Year'].astype('int64')

In [None]:
# check missing values (NaN)
#df_all.isnull().sum()
# change year into int

### Potential interesting questions to look at
- relationship between 
    - total facebook likes and IMDB score?
    - number of faces in posters and IMDB score?
    - between length of the movie and IMDB score?
    - between number of user votes + number of reviews and IMDB score (assuming the number of votes and reviews indicate notoriety)? Will those with more votes/reviews have polarising IMDB scores (ie. have a bimodal distribution instead of a normal distribution)?
    - between budget, gross earning and IMDB score?
- number of films in each genre 
- relationship between genre and IMDB score



### Assumptions and things to keep in mind
- only using IMDB scores as a metric for popularity 