# Data Collection and Cleaning

### 1. Collecting Movie Data

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns 
from sklearn.linear_model import LogisticRegression
import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import metrics
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

In [3]:
#Read in data
df = pd.read_csv("C:\\Users\\taylo\\Desktop\\GA\\Capstone\\the-movies-dataset\\movies_metadata.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#Checks out column names
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
#Looks at DF head
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
#Gets general sense of DF numbers
df.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


## 2. Data Cleaning

In [7]:
#Renaming Columns
df.rename(columns={'adult': 'Adult'}, inplace=True)
df.rename(columns={'belongs_to_collection': 'Belongs To Collection'}, inplace=True)
df.rename(columns={'budget': 'Budget'}, inplace=True)
df.rename(columns={'genres': 'Genres'}, inplace=True)
df.rename(columns={'homepage': 'Homepage'}, inplace=True)
df.rename(columns={'id': 'ID'}, inplace=True)
df.rename(columns={'imdb_id': 'IMDB_id'}, inplace=True)
df.rename(columns={'original_language': 'Original Language'}, inplace=True)
df.rename(columns={'original_title': 'Original Title'}, inplace=True)
df.rename(columns={'overview': 'Overview'}, inplace=True)
df.rename(columns={'popularity': 'Popularity'}, inplace=True)
df.rename(columns={'poster_path': 'Poster Path'}, inplace=True)
df.rename(columns={'production_companies': 'Production Companies'}, inplace=True)
df.rename(columns={'production_countries': 'Production Countries'}, inplace=True)
df.rename(columns={'release_date': 'Release Date'}, inplace=True)
df.rename(columns={'revenue': 'Revenue'}, inplace=True)
df.rename(columns={'runtime': 'Runtime'}, inplace=True)
df.rename(columns={'spoken_languages': 'Spoken Languages'}, inplace=True)
df.rename(columns={'status': 'Status'}, inplace=True)
df.rename(columns={'tagline': 'Tagline'}, inplace=True)
df.rename(columns={'title': 'Title'}, inplace=True)
df.rename(columns={'video': 'Video'}, inplace=True)
df.rename(columns={'vote_average': 'Vote Average'}, inplace=True)
df.rename(columns={'vote_count': 'Vote Count'}, inplace=True)

In [8]:
#Confirming columns rename
df.columns

Index(['Adult', 'Belongs To Collection', 'Budget', 'Genres', 'Homepage', 'ID',
       'IMDB_id', 'Original Language', 'Original Title', 'Overview',
       'Popularity', 'Poster Path', 'Production Companies',
       'Production Countries', 'Release Date', 'Revenue', 'Runtime',
       'Spoken Languages', 'Status', 'Tagline', 'Title', 'Video',
       'Vote Average', 'Vote Count'],
      dtype='object')

In [10]:
#Checks data type of Release Date
df['Release Date'].dtype

dtype('O')

In [11]:
#Changing Release Date to Release Month
df['Month'] = pd.to_datetime(df['Release Date'],  errors='coerce').dt.month

In [12]:
#Adding in Release Month Column
df['Month'] = df['Month']

In [13]:
#Creating blank column for Season Released
df["Season Released"] = ""
df.head()

Unnamed: 0,Adult,Belongs To Collection,Budget,Genres,Homepage,ID,IMDB_id,Original Language,Original Title,Overview,...,Runtime,Spoken Languages,Status,Tagline,Title,Video,Vote Average,Vote Count,Month,Season Released
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,10.0,
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,12.0,
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,12.0,
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,12.0,
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,2.0,


In [14]:
#Mapping each Release Month to a season released 
df.loc[df.Month == 1.0, 'Season Released'] = '1' 
df.loc[df.Month == 2.0, 'Season Released'] = '1'
df.loc[df.Month == 3.0, 'Season Released'] = '2' 
df.loc[df.Month == 4.0, 'Season Released'] = '2' 
df.loc[df.Month == 5.0, 'Season Released'] = '2' 
df.loc[df.Month == 6.0, 'Season Released'] = '3' 
df.loc[df.Month == 7.0, 'Season Released'] = '3' 
df.loc[df.Month == 8.0, 'Season Released'] = '3' 
df.loc[df.Month == 9.0, 'Season Released'] = '4' 
df.loc[df.Month == 10.0, 'Season Released'] = '4' 
df.loc[df.Month == 11.0, 'Season Released'] = '4' 
df.loc[df.Month == 12.0, 'Season Released'] = '1' 

df.head()

Unnamed: 0,Adult,Belongs To Collection,Budget,Genres,Homepage,ID,IMDB_id,Original Language,Original Title,Overview,...,Runtime,Spoken Languages,Status,Tagline,Title,Video,Vote Average,Vote Count,Month,Season Released
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,10.0,4
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,12.0,1
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,12.0,1
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,12.0,1
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,2.0,1


In [15]:
#Check nulls
df.isnull().sum()

Adult                        0
Belongs To Collection    40972
Budget                       0
Genres                       0
Homepage                 37684
ID                           0
IMDB_id                     17
Original Language           11
Original Title               0
Overview                   954
Popularity                   5
Poster Path                386
Production Companies         3
Production Countries         3
Release Date                87
Revenue                      6
Runtime                    263
Spoken Languages             6
Status                      87
Tagline                  25054
Title                        6
Video                        6
Vote Average                 6
Vote Count                   6
Month                       90
Season Released              0
dtype: int64

In [16]:
df.Popularity.describe()

count     45461
unique    44176
top         0.0
freq         34
Name: Popularity, dtype: object

In [17]:
df['Vote Count'].sort_values()

45465        0.0
23680        0.0
23677        0.0
23674        0.0
23671        0.0
23670        0.0
23663        0.0
23648        0.0
23644        0.0
23642        0.0
23640        0.0
23634        0.0
23600        0.0
23565        0.0
23549        0.0
23548        0.0
42666        0.0
42667        0.0
23681        0.0
23531        0.0
23687        0.0
23703        0.0
23870        0.0
42638        0.0
23842        0.0
23839        0.0
23837        0.0
42640        0.0
23803        0.0
23801        0.0
          ...   
1639      7770.0
26555     7993.0
351       8147.0
7000      8226.0
314       8358.0
19971     8427.0
292       8670.0
25084     8842.0
4863      8892.0
12588     8951.0
20830     8951.0
2458      9079.0
18252     9263.0
26553     9629.0
18244     9634.0
2843      9678.0
23753    10014.0
20051    10297.0
22879    11187.0
26564    11444.0
17818    12000.0
14551    12114.0
12481    12269.0
15480    14075.0
19729        NaN
19730        NaN
29502        NaN
29503        N

In [18]:
#Check data types
df.dtypes

Adult                     object
Belongs To Collection     object
Budget                    object
Genres                    object
Homepage                  object
ID                        object
IMDB_id                   object
Original Language         object
Original Title            object
Overview                  object
Popularity                object
Poster Path               object
Production Companies      object
Production Countries      object
Release Date              object
Revenue                  float64
Runtime                  float64
Spoken Languages          object
Status                    object
Tagline                   object
Title                     object
Video                     object
Vote Average             float64
Vote Count               float64
Month                    float64
Season Released           object
dtype: object

In [19]:
#Finding which columns in dataframe have 0 revenue
df[df.Revenue == 0].count()

Adult                    38052
Belongs To Collection     3007
Budget                   38052
Genres                   38052
Homepage                  5410
ID                       38052
IMDB_id                  38035
Original Language        38041
Original Title           38052
Overview                 37120
Popularity               38052
Poster Path              37671
Production Companies     38052
Production Countries     38052
Release Date             37969
Revenue                  38052
Runtime                  37801
Spoken Languages         38052
Status                   37973
Tagline                  14465
Title                    38052
Video                    38052
Vote Average             38052
Vote Count               38052
Month                    37969
Season Released          38052
dtype: int64

In [20]:
#Setting new dataframe to contain only movies without 0 for Revenue
df = df[df.Revenue != 0]

In [21]:
#Confirming no movies in dataframe have 0 revenue
df[df.Revenue == 0].count()

Adult                    0
Belongs To Collection    0
Budget                   0
Genres                   0
Homepage                 0
ID                       0
IMDB_id                  0
Original Language        0
Original Title           0
Overview                 0
Popularity               0
Poster Path              0
Production Companies     0
Production Countries     0
Release Date             0
Revenue                  0
Runtime                  0
Spoken Languages         0
Status                   0
Tagline                  0
Title                    0
Video                    0
Vote Average             0
Vote Count               0
Month                    0
Season Released          0
dtype: int64

In [22]:
#Checking length of new dataframe
len(df)

7414

In [23]:
#Change Budget columns to floats and confirm change
df['Budget'] = pd.to_numeric(df['Budget'],errors='coerce')
df['Budget'].dtype

dtype('float64')

In [24]:
#Change season released to floats and confirm change
df['Season Released'] = pd.to_numeric(df['Season Released'],errors='coerce')
df['Season Released'].dtype

dtype('float64')

In [25]:
#Change popularity type from object to float
df.Popularity.dtypes
df['Popularity'] = pd.to_numeric(df['Popularity'],errors='coerce')
df.Popularity.dtypes

dtype('float64')

In [26]:
#See format of Genres
eval(df['Genres'][0])

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [27]:
#for each of the rows in Genres, pull all genres and use to run correlation
genres = []
genreslist = []
#Going through df and getting genres
for string in df['Genres']:
    #Getting rid of strings for each list in Genres 
    lst = eval(string)
    #If list has Genre info, create empty list in OMG
    if len(lst) > 0:
        onemoviesgenres = []
        #Going through each dict in list, getting list of genres
        for j in range(len(lst)):
            onegenre= lst[j]["name"]
            onemoviesgenres.append(onegenre)
        genreslist.append(onemoviesgenres)
        
        #Picking out primary genre for each movie
        genre = lst[0]["name"]
        genres.append(genre)
    else:
        genres.append('NA')
        genreslist.append('NA')

In [28]:
#Creates new column for primary genre and all genres
df['Primary Genre'] = genres
df['List of Genres'] = genreslist
df.head()

Unnamed: 0,Adult,Belongs To Collection,Budget,Genres,Homepage,ID,IMDB_id,Original Language,Original Title,Overview,...,Status,Tagline,Title,Video,Vote Average,Vote Count,Month,Season Released,Primary Genre,List of Genres
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,10.0,4.0,Animation,"[Animation, Comedy, Family]"
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,12.0,1.0,Adventure,"[Adventure, Fantasy, Family]"
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,12.0,1.0,Comedy,"[Comedy, Drama, Romance]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,2.0,1.0,Comedy,[Comedy]
5,False,,60000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,12.0,1.0,Action,"[Action, Crime, Drama, Thriller]"


In [None]:
#Going through list of Genres - and print number of each unique value for distribution plot

#then trying to isolate each genre with <50 values to get rid of 


In [29]:
#Creates placeholder for all unique values in Primary genre column
for genre in df['Primary Genre'].unique():
    #creates dataframe that contains all instances where Primary Genre is unique value
    df_for = df.loc[df['Primary Genre'] == genre]
    #prints length of PG and Genre name
    print(len(df_for['Primary Genre']))
    print(genre)

189
Animation
478
Adventure
1570
Comedy
1196
Action
79
Family
39
History
1937
Drama
334
Crime
182
Fantasy
116
Science Fiction
49
Music
403
Horror
188
Documentary
176
Romance
91
Mystery
259
Thriller
50
War
44
Western
23
NA
6
Foreign
2
TV Movie
1
Carousel Productions
1
Aniplex
1
Odyssey Media


In [30]:
#Checks number of unqiue genres for movies in dataframe
df['Primary Genre'].unique()

array(['Animation', 'Adventure', 'Comedy', 'Action', 'Family', 'History',
       'Drama', 'Crime', 'Fantasy', 'Science Fiction', 'Music', 'Horror',
       'Documentary', 'Romance', 'Mystery', 'Thriller', 'War', 'Western',
       'NA', 'Foreign', 'TV Movie', 'Carousel Productions', 'Aniplex',
       'Odyssey Media'], dtype=object)

In [31]:
#Creates placeholder for all unique values in Primary genre column
for genre in df['Primary Genre'].unique():
    #creates dataframe that contains all instances where Primary Genre is unique value
    df_50 = df.loc[df['Primary Genre'] == genre]
    #prints length of PG and Genre name
    if len(df_50['Primary Genre']) <= 50 :
        print(len(df_50['Primary Genre']))
        print(genre)

39
History
49
Music
50
War
44
Western
23
NA
6
Foreign
2
TV Movie
1
Carousel Productions
1
Aniplex
1
Odyssey Media


In [32]:
df.head()

Unnamed: 0,Adult,Belongs To Collection,Budget,Genres,Homepage,ID,IMDB_id,Original Language,Original Title,Overview,...,Status,Tagline,Title,Video,Vote Average,Vote Count,Month,Season Released,Primary Genre,List of Genres
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,10.0,4.0,Animation,"[Animation, Comedy, Family]"
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,12.0,1.0,Adventure,"[Adventure, Fantasy, Family]"
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,12.0,1.0,Comedy,"[Comedy, Drama, Romance]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,2.0,1.0,Comedy,[Comedy]
5,False,,60000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,12.0,1.0,Action,"[Action, Crime, Drama, Thriller]"


In [34]:
#List of genres that have less than 50 observations to remove
genres_remove = ['History', 'Music', 'War', 'Western', 'NA', 'Foreign', 'TV Movie', 'Carousel Productions', 'Aniplex', 'Odyssey Media']

In [35]:
#For loop to set df without removed genres 
for genre in genres_remove:
    df = df[df['Primary Genre'] != genre]

In [36]:
#Confirming genres removed
df['Primary Genre'].unique()

array(['Animation', 'Adventure', 'Comedy', 'Action', 'Family', 'Drama',
       'Crime', 'Fantasy', 'Science Fiction', 'Horror', 'Documentary',
       'Romance', 'Mystery', 'Thriller'], dtype=object)

In [37]:
#Creates dummy variable 
df_pg = pd.get_dummies(df['Primary Genre'])

In [38]:
#DF 
df_1 = pd.concat([df, df_pg], axis=1)
df_1.head()

Unnamed: 0,Adult,Belongs To Collection,Budget,Genres,Homepage,ID,IMDB_id,Original Language,Original Title,Overview,...,Crime,Documentary,Drama,Family,Fantasy,Horror,Mystery,Romance,Science Fiction,Thriller
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,0,0,0,0,0,0,0,0,0,0
1,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0,0,0,0,0,0,0,0,0,0
3,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0,0,0,0,0,0,0,0,0,0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0,0,0,0,0,0,0,0,0,0
5,False,,60000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,0,0,0,0,0,0,0,0,0,0


In [39]:
#Sees data 
df_1.sum()

Adult                FalseFalseFalseFalseFalseFalseFalseFalseFalseF...
Budget                                                     1.64002e+11
Genres               [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
ID                   8628844313571186294990917109087210321408524458...
IMDB_id              tt0114709tt0113497tt0114885tt0113041tt0113277t...
Original Language    enenenenenenenenenenenenenenenenenenenfrenenen...
Original Title       Toy StoryJumanjiWaiting to ExhaleFather of the...
Popularity                                                     61707.2
Revenue                                                    5.01919e+11
Runtime                                                         772712
Video                                                                0
Vote Average                                                   44717.8
Vote Count                                                 4.07672e+06
Month                                                            49195
Season

In [40]:
#Checks names of columns in dataframe
df_1.columns

Index(['Adult', 'Belongs To Collection', 'Budget', 'Genres', 'Homepage', 'ID',
       'IMDB_id', 'Original Language', 'Original Title', 'Overview',
       'Popularity', 'Poster Path', 'Production Companies',
       'Production Countries', 'Release Date', 'Revenue', 'Runtime',
       'Spoken Languages', 'Status', 'Tagline', 'Title', 'Video',
       'Vote Average', 'Vote Count', 'Month', 'Season Released',
       'Primary Genre', 'List of Genres', 'Action', 'Adventure', 'Animation',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Horror', 'Mystery', 'Romance', 'Science Fiction', 'Thriller'],
      dtype='object')

In [41]:
#Confirms dataframe contains only unique Genres 
df_1['Primary Genre'].unique()

array(['Animation', 'Adventure', 'Comedy', 'Action', 'Family', 'Drama',
       'Crime', 'Fantasy', 'Science Fiction', 'Horror', 'Documentary',
       'Romance', 'Mystery', 'Thriller'], dtype=object)

In [42]:
#Confirms List of Genres formatting correctly as an example
df_1['List of Genres'][0]

['Animation', 'Comedy', 'Family']

In [195]:
#Runs describe to get sense of numbers for each columns
df_1.describe()

Unnamed: 0,Budget,Popularity,Revenue,Runtime,Vote Average,Vote Count,Month,Season Released,Action,Adventure,...,Crime,Documentary,Drama,Family,Fantasy,Horror,Mystery,Romance,Science Fiction,Thriller
count,7198.0,7196.0,7196.0,7191.0,7196.0,7196.0,7195.0,7195.0,7198.0,7198.0,...,7198.0,7198.0,7198.0,7198.0,7198.0,7198.0,7198.0,7198.0,7198.0,7198.0
mean,22784340.0,8.575205,69749750.0,107.45543,6.214258,566.525848,6.837387,2.592634,0.166157,0.066407,...,0.046402,0.026118,0.269103,0.010975,0.025285,0.055988,0.012642,0.024451,0.016116,0.035982
std,37155090.0,12.290457,147892900.0,21.158648,1.013983,1113.110457,3.385765,1.134971,0.372248,0.24901,...,0.210368,0.159498,0.443524,0.104194,0.157,0.229914,0.111733,0.154456,0.125929,0.186259
min,0.0,1e-06,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,3.998489,2411482.0,94.0,5.7,42.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8000000.0,7.482195,17104380.0,104.0,6.3,164.0,7.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,29000000.0,10.953738,68394150.0,117.0,6.9,548.0,10.0,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,380000000.0,547.488298,2787965000.0,338.0,10.0,14075.0,12.0,4.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
