Import Basic Libraries

In [21]:
#Lets import the basic libraries
import numpy as np
import pandas as pd

#For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#For Jupyter notebook widgets
import ipywidgets as widgets
from ipywidgets import interact
from ipywidgets import interact_manual

#For interactive shells
from IPython.display import display

#Setting up the chart size and background
plt.rcParams['figure.figsize'] = (16,8)
plt.style.use('fivethirtyeight')


In [22]:
#Lets read the dataset
data = pd.read_csv('movie_metadata.csv')

In [23]:
#Lets check the shape
print(data.shape)

(5043, 28)


In [24]:
#lets check the columnwise info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

In [25]:
#Lets remove unnecessary information from the dataset
#Use the drop function to drop unnecessary columns
data=data.drop(['color', 
                      'director_facebook_likes', 
                      'actor_3_facebook_likes', 
                      'actor_1_facebook_likes', 
                      'cast_total_facebook_likes', 
                      'actor_2_facebook_likes',  
                      'facenumber_in_poster', 
                      'content_rating', 
                      'country', 
                      'movie_imdb_link', 
                      'aspect_ratio',
                      'plot_keywords',],
                              axis=1)
data.columns  

Index(['director_name', 'num_critic_for_reviews', 'duration', 'actor_2_name',
       'gross', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users',
       'actor_3_name', 'num_user_for_reviews', 'language', 'budget',
       'title_year', 'imdb_score', 'movie_facebook_likes'],
      dtype='object')

Missing Value Imputation

In [26]:
#Lets check the rows having high percentage of missing values in the dataset

round(100*(data.isnull().sum()/len(data.index)), 2)

director_name              2.06
num_critic_for_reviews     0.99
duration                   0.30
actor_2_name               0.26
gross                     17.53
genres                     0.00
actor_1_name               0.14
movie_title                0.00
num_voted_users            0.00
actor_3_name               0.46
num_user_for_reviews       0.42
language                   0.24
budget                     9.76
title_year                 2.14
imdb_score                 0.00
movie_facebook_likes       0.00
dtype: float64

In [27]:
#since 'gross' and 'budget' columns have large number of NaN values, dropall the rows with NaNs at this column using the 'isnan'
# function of NumPy alongwith a negation '~'

data = data [~np.isnan(data['gross'])]
data = data [~np.isnan(data['budget'])]

#we have the remove all the rows where the leading actor's name is missing 
# =data [~np.isnan(data['actor_1_name'])]

#Now lets again  check the missing values column wise
data.isnull().sum()

director_name              0
num_critic_for_reviews     1
duration                   1
actor_2_name               5
gross                      0
genres                     0
actor_1_name               3
movie_title                0
num_voted_users            0
actor_3_name              10
num_user_for_reviews       0
language                   3
budget                     0
title_year                 0
imdb_score                 0
movie_facebook_likes       0
dtype: int64

In [28]:
# The rows for which the sum of NULL is less than two aare restrained 

data = data[data.isnull().sum(axis=1)<=2]
data.isnull().sum()

director_name             0
num_critic_for_reviews    1
duration                  1
actor_2_name              2
gross                     0
genres                    0
actor_1_name              0
movie_title               0
num_voted_users           0
actor_3_name              7
num_user_for_reviews      0
language                  3
budget                    0
title_year                0
imdb_score                0
movie_facebook_likes      0
dtype: int64

In [36]:
#Lets imput the missing values 

#using mean for numerical columns
data['num_critic_for_reviews'].fillna(data['num_critic_for_reviews'].mean(), inplace = True)
data['duration'].fillna(data ['duration'].mean(), inplace = True)

#using mode for categorical column 
data['language'].fillna(data['language'].mode()[0],inplace = True)

#As we know that we cannot use the statistical values for imputing the missing values of actor names, so we will replace the  
#actor names with "Unknown Actor"

data['actor_2_name'].fillna('Unknown Actor', inplace = True )
data['actor_3_name'].fillna('Unknown Actor', inplace = True )

#As we imputed all the missing values lets check the no of total misiing values in the dataset
data.isnull().sum().sum()


0

Featured Engineering

In [34]:
#Lets convert the gross and budget from $ to million $ to make our analysis easier

data['gross']= data['gross']/1000000
data['budget']=data['budget']/1000000

In [35]:
#Lets create a profit column using the budget and gross 

data['profit']= data['gross']-data['budget']

# lets also check the name of Top 10 Profitable Movies
data[['Profit','movie_title']].sort_values(by = 'Profit', ascending  = False).head(10) 

KeyError: "['Profit'] not in index"