Data Acquisition and Loading

In [None]:
# Importing all the necessary libraries needed for the project
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Path to the IMDb movies CSV file (a relative path, resolved against the working directory)
file_path = "IMDbMovies.csv"

In [None]:
# Read the CSV file at `file_path` into the raw DataFrame `df`
df = pd.read_csv(file_path)

Data Exploration and Cleaning

In [None]:
# Preview the first 5 rows of the raw DataFrame (head() defaults to 5 rows)
df.head()

In [None]:
# Preview the last 5 rows of the raw DataFrame (tail() defaults to 5 rows)
df.tail()

In [None]:
# Concise summary of the raw DataFrame: column names, non-null counts and dtypes
df.info()

In [None]:
# Keep only movies released after 2014 (i.e. 2015 onwards).
# Rows whose 'Release Year' is NaN fail the comparison and are dropped too.
df = df.loc[df['Release Year'].gt(2014)]

In [None]:
# Re-check the summary (row count, non-null counts, dtypes) after the release-year filter
df.info()

In [None]:
# Remove fully duplicated rows.
# Reassigning (instead of inplace=True) avoids mutating a frame that was
# produced by a boolean row selection, which can emit SettingWithCopyWarning,
# and keeps the cell safe to re-run.
df = df.drop_duplicates()

In [None]:
# Remove rows where 'Budget' is missing (NaN).
# .copy() makes df_cleaned an independent DataFrame, so the column
# assignments and in-place renames applied to it later do not act on a
# selection of `df` (which would emit SettingWithCopyWarning).
df_cleaned = df.dropna(subset=['Budget']).copy()

In [None]:
# Preview the first 5 rows of df_cleaned (rows with a missing 'Budget' removed)
df_cleaned.head()

In [None]:
# Rename the raw columns to their labelled names in a single pass.
# rename() returns a new DataFrame, so the column assignments below operate on
# an independent object rather than on a row selection of `df`
# (this avoids SettingWithCopyWarning).
df_cleaned = df_cleaned.rename(columns={
    'Rating': 'Rating (Out of 10)',
    'Number of Ratings': 'Number of Ratings (in thousands)',
    'Budget': 'Budget (in millions)',
    'Gross in US & Canada': 'Gross in US & Canada (in millions)',
    'Gross worldwide': 'Gross worldwide (in millions)',
    'Opening Weekend Gross in US & Canada': 'Opening Weekend Gross in US & Canada (in millions)',
})

# Strip the literal '/10' suffix from the rating text.
# regex=False is spelled out because the default of `regex` differs between
# pandas versions; '/10' is a plain substring, not a pattern.
df_cleaned['Rating (Out of 10)'] = df_cleaned['Rating (Out of 10)'].str.replace('/10', '', regex=False)

# Keep only the digit characters of the count / money columns.
# NOTE(review): r'\D' removes every non-digit, including decimal points and
# any unit suffix, and no rescaling is applied in this cell -- confirm that the
# raw text format makes the resulting integers match the column labels.
df_cleaned['Number of Ratings (in thousands)'] = df_cleaned['Number of Ratings (in thousands)'].str.replace(r'\D', '', regex=True)
df_cleaned['Gross in US & Canada (in millions)'] = df_cleaned['Gross in US & Canada (in millions)'].str.replace(r'\D', '', regex=True)

# Show the first two rows to check the renamed / cleaned columns
df_cleaned.head(2)

In [None]:
# Cast 'Release Year', 'Gross in US & Canada (in millions)' and
# 'Number of Ratings (in thousands)' to pandas' nullable integer dtype (Int64),
# which can hold missing values alongside integers.
df_cleaned = df_cleaned.astype(
    dict.fromkeys(
        [
            'Release Year',
            'Gross in US & Canada (in millions)',
            'Number of Ratings (in thousands)',
        ],
        'Int64',
    )
)

In [None]:
# Custom function to convert Rating (Out of 10) column to float data type
def convert_to_float(rating):
    """Convert a rating value such as ``'7.5/10'`` or ``'7.5'`` to a float.

    Parameters
    ----------
    rating : str, float, int or None
        The raw rating. Floats (including NaN) are returned unchanged; any
        other value is converted to text and the part before the first '/'
        is parsed as a number.

    Returns
    -------
    float
        The numeric rating, or NaN when the value cannot be parsed
        (for example ``None`` or non-numeric text).
    """
    # Floats, including the NaN pandas uses for missing values, pass through as-is
    if isinstance(rating, float):
        return rating
    try:
        # str() lets non-string scalars (ints, None, pd.NA) follow the same
        # path as text instead of failing on the missing .split attribute
        return float(str(rating).split('/')[0])
    except ValueError:
        # Anything that is not a parsable number becomes NaN
        return float('nan')

# Run convert_to_float over every value of 'Rating (Out of 10)' so the column
# holds floats instead of text
df_cleaned['Rating (Out of 10)'] = df_cleaned['Rating (Out of 10)'].map(convert_to_float)

Feature Engineering

In [None]:
# Keep only the digit characters in the budget and worldwide-gross text,
# dropping currency symbols, separators and letters.
# NOTE(review): the currency marker is discarded and no rescaling happens in
# this cell -- confirm all amounts share one currency and that the
# '(in millions)' label matches the stored magnitude.
df_cleaned = df_cleaned.assign(**{
    'Budget (in millions)': df_cleaned['Budget (in millions)'].str.replace(r'\D', '', regex=True),
    'Gross worldwide (in millions)': df_cleaned['Gross worldwide (in millions)'].str.replace(r'\D', '', regex=True),
})
df_cleaned.head(2)

In [None]:
# Cast the digit-only budget and worldwide-gross columns to the nullable
# integer dtype (Int64) so they can be used in arithmetic
df_cleaned = df_cleaned.astype(
    dict.fromkeys(
        ['Budget (in millions)', 'Gross worldwide (in millions)'],
        'Int64',
    )
)

In [None]:
# Derive the 'Revenue' feature as worldwide gross minus budget.
# The result is missing wherever either operand is missing.
df_cleaned['Revenue'] = df_cleaned['Gross worldwide (in millions)'].sub(df_cleaned['Budget (in millions)'])

In [None]:
# Give the new feature the same labelled naming as the other money columns,
# then print the summary of the cleaned DataFrame
df_cleaned = df_cleaned.rename(columns={'Revenue': 'Revenue (in millions)'})
df_cleaned.info()

Answering Specific Questions

In [None]:
# Which release year had the highest average number of ratings (votes) per movie?
highest_avg_rating_by_year = (
    df_cleaned
    .groupby('Release Year')['Number of Ratings (in thousands)']
    .mean()
    .idxmax()
)
highest_avg_rating_by_year

In [None]:
# Which release year had the highest average revenue per movie?
highest_avg_revenue_by_year = (
    df_cleaned
    .groupby('Release Year')['Revenue (in millions)']
    .mean()
    .idxmax()
)
highest_avg_revenue_by_year

In [None]:
# What is the average rating for each director?
# Produces one mean 'Rating (Out of 10)' per distinct 'Director' value.
avg_rating_per_director = (
    df_cleaned
    .groupby('Director')['Rating (Out of 10)']
    .mean()
)
avg_rating_per_director

In [None]:
# Does rating affect revenue?
# NOTE(review): the columns correlated below are revenue and the *number of
# ratings*, not 'Rating (Out of 10)' -- confirm which one the question means.

# Restrict the frame to the two columns being compared
data_for_heatmap = df_cleaned.loc[:, ['Revenue (in millions)', 'Number of Ratings (in thousands)']]

# Pairwise correlation between the two columns
correlation_matrix = data_for_heatmap.corr()

# Draw the correlation matrix as an annotated heat map centred on zero
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Heat Map of Correlation between Revenue and Number of Ratings')
plt.show()

In [None]:
# Save the cleaned DataFrame to 'cleaned_imdbmovies_data.csv'; index=False leaves the row index out of the file
df_cleaned.to_csv('cleaned_imdbmovies_data.csv', index=False)

Hypothesis Testing


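Hypothesis (H1): Action movies have a higher mean revenue than Drama movies. Null hypothesis (H0): the mean revenue of Action movies is not higher than that of Drama movies. The two groups are compared with an independent two-sample t-test at the 0.05 significance level.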

In [None]:
# Import the necessary library to perform the test
import scipy.stats as stats

In [None]:
# Select Action and Drama movies with a case-insensitive substring match on 'Main Genres'.
# na=False treats rows with a missing 'Main Genres' value as non-matches; without it
# the mask contains NaN and boolean indexing raises a ValueError.
# .copy() gives each subset its own data, so later column assignments on the
# subsets do not emit SettingWithCopyWarning.
# NOTE: a movie whose genres include both 'Action' and 'Drama' ends up in both subsets.
action_movies = df_cleaned[df_cleaned['Main Genres'].str.contains('Action', case=False, na=False)].copy()
drama_movies = df_cleaned[df_cleaned['Main Genres'].str.contains('Drama', case=False, na=False)].copy()

In [None]:
# Convert the revenue column to float64 so missing values become NaN for the
# statistical test. assign() returns a new DataFrame instead of writing into a
# filtered selection of df_cleaned, which avoids SettingWithCopyWarning.
action_movies = action_movies.assign(**{'Revenue (in millions)': action_movies['Revenue (in millions)'].astype('float64')})
drama_movies = drama_movies.assign(**{'Revenue (in millions)': drama_movies['Revenue (in millions)'].astype('float64')})

In [None]:
# Exclude movies whose revenue is unknown from the comparison.
# Replacing a missing revenue with 0 would invent observations and pull both
# group means towards zero, distorting the t-test; dropping the rows keeps
# only measured values.
action_movies = action_movies.dropna(subset=['Revenue (in millions)'])
drama_movies = drama_movies.dropna(subset=['Revenue (in millions)'])

In [None]:
# Perform an independent two-sample t-test on the revenue of the two genres.
# alternative='greater' tests the directional claim "Action mean > Drama mean"
# (the default is a two-sided test, which only detects a difference in either
# direction). Requires SciPy >= 1.6.
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups have the same variance.
t_statistic, p_value = stats.ttest_ind(
    action_movies['Revenue (in millions)'],
    drama_movies['Revenue (in millions)'],
    equal_var=False,
    alternative='greater',
)

In [None]:
# Print the t-statistic and the p-value returned by the t-test
print("T-statistic:", t_statistic)
print("P-value:", p_value)

In [None]:
# Interpret the results at the 0.05 significance level.
# A small p-value alone does not establish the direction of the difference;
# the t-statistic must also be positive (Action mean above Drama mean) before
# concluding that action movies earn more. Checking the sign keeps the
# conclusion correct whether the p-value is two-sided or one-sided.
if p_value < 0.05 and t_statistic > 0:
    print("We reject the null hypothesis and conclude that action movies have higher revenues than drama movies.")
else:
    print("We fail to reject the null hypothesis and cannot conclude that action movies have higher revenues than drama movies.")