# Feature Engineering

This notebook contains feature engineering on two datasets: one where I merged data from The Numbers with TMDB movie info, and one where I merged the same data from The Numbers with IMDB movie info.

In [2]:
import ast

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline



## TN + TMDB

In [18]:
# Load the merged The Numbers + TMDB table and print its column summary
# (column names, non-null counts and dtypes)
df = pd.read_csv('../data/processed/tn_tmdb_merged.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1900 non-null   int64 
 1   release_date       1900 non-null   object
 2   title              1900 non-null   object
 3   production_budget  1900 non-null   object
 4   domestic_gross     1900 non-null   object
 5   worldwide_gross    1900 non-null   object
 6   year               1900 non-null   int64 
 7   month              1900 non-null   int64 
 8   day                1900 non-null   int64 
 9   genre_ids          1900 non-null   object
 10  original_language  1900 non-null   object
dtypes: int64(4), object(7)
memory usage: 163.4+ KB


In [19]:
# Preview the first rows of the freshly loaded table
df.head()

Unnamed: 0,id,release_date,title,production_budget,domestic_gross,worldwide_gross,year,month,day,genre_ids,original_language
0,1,2009-12-18,Avatar,"$425,000,000","$760,507,625","$2,776,345,279",2009,12,18,"[28, 12, 14, 878]",en
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,5,20,"[12, 28, 14]",en
2,4,2015-05-01,Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",2015,5,1,"[28, 12, 878]",en
3,7,2018-04-27,Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",2018,4,27,"[12, 28, 14]",en
4,9,2017-11-17,Justice League,"$300,000,000","$229,024,295","$655,945,209",2017,11,17,"[28, 12, 14, 878]",en


In [20]:
# Check the Python type of the first 'genre_ids' value: the CSV stores the
# list of ids as text, so it has to be parsed before it can be mapped
type(df['genre_ids'][0])

str

In [21]:
# Before I can proceed, I need to convert all the numerical genre_ids to string genre names
# Lookup table: numeric genre id -> human-readable genre name
genre_dict = {
    28: 'Action',
    12: 'Adventure',
    16: 'Animation',
    35: 'Comedy',
    80: 'Crime',
    99: 'Documentary',
    18: 'Drama',
    10751: 'Family',
    14: 'Fantasy',
    36: 'History',
    27: 'Horror',
    10402: 'Music',
    9648: 'Mystery',
    10749: 'Romance',
    878: 'Science Fiction',
    10770: 'TV Movie',
    53: 'Thriller',
    10752: 'War',
    37: 'Western'
}

# Function to map the genre_ids to genre names
def map_genre_ids_to_names(genre_ids):
    """Translate a stringified list of genre ids into a list of genre names.

    Parameters
    ----------
    genre_ids : str
        Text form of a Python list of integer ids, e.g. "[28, 12, 14]".

    Returns
    -------
    list of str
        One name per id, in the same order; ids that are not present in
        ``genre_dict`` are reported as 'Unknown'.

    Raises
    ------
    ValueError, SyntaxError
        If ``genre_ids`` is not a valid Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # arbitrary code that happens to be stored in the CSV.
    return [genre_dict.get(genre_id, 'Unknown') for genre_id in ast.literal_eval(genre_ids)]

# Build the new 'genre_names' column by translating every 'genre_ids' entry
df['genre_names'] = df['genre_ids'].map(map_genre_ids_to_names)

In [22]:
# Removing the square brackets from genre_names
# Only join values that are still lists: re-running this cell on an
# already-joined string would otherwise split it into single characters.
df['genre_names'] = df['genre_names'].apply(
    lambda names: names if isinstance(names, str) else ', '.join(names)
)
df['genre_names'].head()

0    Action, Adventure, Fantasy, Science Fiction
1                     Adventure, Action, Fantasy
2             Action, Adventure, Science Fiction
3                     Adventure, Action, Fantasy
4    Action, Adventure, Fantasy, Science Fiction
Name: genre_names, dtype: object

In [23]:
# 'genre_ids' has been replaced by 'genre_names', and 'id' is not used as a feature
df.drop(['genre_ids', 'id'], axis=1, inplace=True)

In [24]:
# Preview the table after swapping 'genre_ids'/'id' for 'genre_names'
df.head()

Unnamed: 0,release_date,title,production_budget,domestic_gross,worldwide_gross,year,month,day,original_language,genre_names
0,2009-12-18,Avatar,"$425,000,000","$760,507,625","$2,776,345,279",2009,12,18,en,"Action, Adventure, Fantasy, Science Fiction"
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,5,20,en,"Adventure, Action, Fantasy"
2,2015-05-01,Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",2015,5,1,en,"Action, Adventure, Science Fiction"
3,2018-04-27,Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",2018,4,27,en,"Adventure, Action, Fantasy"
4,2017-11-17,Justice League,"$300,000,000","$229,024,295","$655,945,209",2017,11,17,en,"Action, Adventure, Fantasy, Science Fiction"


In [25]:
# domestic_gross is included in worldwide_gross, so it can be dropped
df.drop('domestic_gross', axis=1, inplace=True)

In [16]:
# Function to clean and convert to integer
def clean_and_convert_to_int(value):
    """Turn a money string such as "$425,000,000" into a plain integer.

    Missing values (anything ``pd.isna`` reports as NA) are mapped to 0;
    every other value is stringified, stripped of commas and dollar signs,
    and passed to ``int``.
    """
    if not pd.isna(value):
        digits = str(value)
        # Strip the thousands separators first, then the currency symbol
        for symbol in (',', '$'):
            digits = digits.replace(symbol, '')
        return int(digits)
    # Missing value: fall back to 0
    return 0

In [28]:
# Turn the "$1,234"-style text in both money columns into integers
for money_col in ('worldwide_gross', 'production_budget'):
    df[money_col] = df[money_col].apply(clean_and_convert_to_int)


In [30]:
# ROI = (worldwide gross - budget) / budget, i.e. profit as a multiple of the budget
df['ROI'] = df['worldwide_gross'].sub(df['production_budget']).div(df['production_budget'])
df.head()

Unnamed: 0,release_date,title,production_budget,worldwide_gross,year,month,day,original_language,genre_names,ROI
0,2009-12-18,Avatar,425000000,2776345279,2009,12,18,en,"Action, Adventure, Fantasy, Science Fiction",5.532577
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,1045663875,2011,5,20,en,"Adventure, Action, Fantasy",1.546673
2,2015-05-01,Avengers: Age of Ultron,330600000,1403013963,2015,5,1,en,"Action, Adventure, Science Fiction",3.243841
3,2018-04-27,Avengers: Infinity War,300000000,2048134200,2018,4,27,en,"Adventure, Action, Fantasy",5.827114
4,2017-11-17,Justice League,300000000,655945209,2017,11,17,en,"Action, Adventure, Fantasy, Science Fiction",1.186484


In [26]:
# One 0/1 indicator column per genre found in the ', '-separated genre_names
genre_dummies = df['genre_names'].str.get_dummies(', ')
genre_dummies.head()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [31]:
# Append the genre indicator columns to df (genre_names is dropped afterwards)
df = pd.concat(objs=[df, genre_dummies], axis='columns')

In [33]:
# The text column is redundant now that each genre has its own indicator column
df.drop('genre_names', axis=1, inplace=True)

In [36]:
# Give the date-part columns a 'premiere_' prefix
df.rename({'year': 'premiere_year', 'month': 'premiere_month'}, axis='columns', inplace=True)

In [37]:
# Splitting movie release month into different buckets using dummy variables
# ('premiere_month' is replaced by one 'premiere_month_<n>' column per month value)
df = pd.get_dummies(df, columns=['premiere_month'])

In [40]:
# Inspect columns and dtypes after adding the genre and month indicator columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   release_date       1900 non-null   object 
 1   title              1900 non-null   object 
 2   production_budget  1900 non-null   int64  
 3   worldwide_gross    1900 non-null   int64  
 4   premiere_year      1900 non-null   int64  
 5   day                1900 non-null   int64  
 6   original_language  1900 non-null   object 
 7   ROI                1900 non-null   float64
 8   Action             1900 non-null   int64  
 9   Adventure          1900 non-null   int64  
 10  Animation          1900 non-null   int64  
 11  Comedy             1900 non-null   int64  
 12  Crime              1900 non-null   int64  
 13  Documentary        1900 non-null   int64  
 14  Drama              1900 non-null   int64  
 15  Family             1900 non-null   int64  
 16  Fantasy            1900 

In [43]:
# Convert the boolean values in premiere month columns to integers
# The dummy columns are selected by their shared prefix instead of a
# hand-written list of twelve names, so the cell does not raise a KeyError
# when a month is absent from the data.
month_cols = [col for col in df.columns if str(col).startswith('premiere_month_')]
df[month_cols] = df[month_cols].astype(int)

In [47]:
# Export to CSV for use in EDA and regression
# index=False keeps the row index out of the written file
df.to_csv('../data/final/tn_tmdb_final.csv', index=False)

## TN + IMDB

In [3]:
# Load the merged The Numbers + IMDB table and print its column summary
tn_imdb = pd.read_csv('../data/processed/tn_imdb_merged.csv')
tn_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312 entries, 0 to 2311
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2312 non-null   int64  
 1   release_date       2312 non-null   object 
 2   title              2312 non-null   object 
 3   production_budget  2312 non-null   object 
 4   domestic_gross     2312 non-null   object 
 5   worldwide_gross    2312 non-null   object 
 6   runtime_minutes    2138 non-null   float64
 7   genres             2287 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 144.6+ KB


#### Converting 'release_date' into datetime objects because I forgot to do this earlier for this particular table

In [6]:
# Three-letter English month abbreviation -> zero-padded two-digit month number
month_mapping = {
    abbreviation: f"{number:02d}"
    for number, abbreviation in enumerate(
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        start=1,
    )
}

In [8]:
# Swap any month abbreviation found in a date string for its two-digit number
def replace_month_name(date_string):
    """Return ``date_string`` with every key of ``month_mapping`` replaced by its value."""
    converted = date_string
    for abbreviation in month_mapping:
        converted = converted.replace(abbreviation, month_mapping[abbreviation])
    return converted

In [9]:
# Rewrite every 'release_date' string so the month is numeric instead of abbreviated
tn_imdb['release_date'] = tn_imdb['release_date'].map(replace_month_name)

In [10]:
# And finally, convert the 'release_date' column into datetime
# The format uses a numeric month (%m), i.e. '<month number> <day>, <year>',
# so the month abbreviations must already have been replaced by numbers.
tn_imdb['release_date'] = pd.to_datetime(tn_imdb['release_date'], format='%m %d, %Y')

In [12]:
# Derive 'premiere_year', 'premiere_month' and 'premiere_day' from the datetime 'release_date'
for part in ('year', 'month', 'day'):
    tn_imdb[f'premiere_{part}'] = getattr(tn_imdb['release_date'].dt, part)

In [13]:
# Confirm 'release_date' is now datetime and the three premiere_* columns exist
tn_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312 entries, 0 to 2311
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 2312 non-null   int64         
 1   release_date       2312 non-null   datetime64[ns]
 2   title              2312 non-null   object        
 3   production_budget  2312 non-null   object        
 4   domestic_gross     2312 non-null   object        
 5   worldwide_gross    2312 non-null   object        
 6   runtime_minutes    2138 non-null   float64       
 7   genres             2287 non-null   object        
 8   premiere_year      2312 non-null   int32         
 9   premiere_month     2312 non-null   int32         
 10  premiere_day       2312 non-null   int32         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(1), object(5)
memory usage: 171.7+ KB


In [14]:
# 'release_date' is now covered by the premiere_* columns; 'id' and 'domestic_gross' are not used
tn_imdb.drop(['id', 'domestic_gross', 'release_date'], axis=1, inplace=True)

In [17]:
# As before, turn the "$1,234"-style text in both money columns into integers
for money_col in ('worldwide_gross', 'production_budget'):
    tn_imdb[money_col] = tn_imdb[money_col].apply(clean_and_convert_to_int)

In [19]:
# ROI = (worldwide gross - budget) / budget, i.e. profit as a multiple of the budget
tn_imdb['ROI'] = tn_imdb['worldwide_gross'].sub(tn_imdb['production_budget']).div(tn_imdb['production_budget'])

In [20]:
# Preview the table with the new ROI column
# NOTE(review): the first displayed row pairs 'Avatar' with runtime 93.0 and
# genre 'Horror', which suggests the upstream title merge may have matched a
# different film with the same title; verify the merge step.
tn_imdb.head()

Unnamed: 0,title,production_budget,worldwide_gross,runtime_minutes,genres,premiere_year,premiere_month,premiere_day,ROI
0,Avatar,425000000,2776345279,93.0,Horror,2009,12,18,5.532577
1,Pirates of the Caribbean: On Stranger Tides,410600000,1045663875,136.0,"Action,Adventure,Fantasy",2011,5,20,1.546673
2,Dark Phoenix,350000000,149762350,113.0,"Action,Adventure,Sci-Fi",2019,6,7,-0.572108
3,Avengers: Age of Ultron,330600000,1403013963,141.0,"Action,Adventure,Sci-Fi",2015,5,1,3.243841
4,Avengers: Infinity War,300000000,2048134200,149.0,"Action,Adventure,Sci-Fi",2018,4,27,5.827114


In [21]:
# One 0/1 indicator column per genre found in the comma-separated 'genres' text
# NOTE(review): 'genres' has missing values (2287 of 2312 non-null); confirm
# those rows are meant to end up with all-zero genre indicators.
more_dummies = tn_imdb['genres'].str.get_dummies(sep=',')
more_dummies.head()

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [23]:
# Same as in the TN + TMDB section: append the genre indicators, then drop the text column
tn_imdb = pd.concat(objs=[tn_imdb, more_dummies], axis='columns').drop(columns=['genres'])

In [25]:
# Replace 'premiere_month' with one 'premiere_month_<n>' indicator column per month value
tn_imdb = pd.get_dummies(tn_imdb, columns=['premiere_month'])

In [27]:
# Convert the boolean values in premiere month columns to integers
# The dummy columns are selected by their shared prefix instead of a
# hand-written list of twelve names, so the cell does not raise a KeyError
# when a month is absent from the data.
month_cols = [col for col in tn_imdb.columns if str(col).startswith('premiere_month_')]
tn_imdb[month_cols] = tn_imdb[month_cols].astype(int)

In [28]:
# Export the engineered TN + IMDB table; index=False keeps the row index out of the file
tn_imdb.to_csv('../data/final/tn_imdb_final.csv', index=False)