## Transform "month" Column

The month, stored as an integer value, does not convey meaningful information on its own; thus, in this notebook we will manually "one-hot" encode it into categorical 1s and 0s.

In [22]:
import pandas as pd
import os

In [23]:
# Load the cleaned movie dataset from the shared resources folder (relative path)
path = os.path.join(
    '..', '..', 'resources', 'cleaned_data', 'movies_complete_cleaned.csv'
)
raw_df = pd.read_csv(filepath_or_buffer=path)
raw_df.head()

Unnamed: 0,name,production,director,runtime,released,year,month,country_kaggle,country_omdb,star_kaggle,...,plot,awards,score_imdb,votes_imdb,score_metacritic,budget,genre_kaggle,gross,genres_omdb,rating
0,Doctor Strange,Marvel Studios,Scott Derrickson,115,2016-11-04,2016,11,USA,USA,Benedict Cumberbatch,...,"Marvel's ""Doctor Strange"" follows the story of...",Nominated for 1 Oscar. Another 19 wins & 67 no...,7.5,348307,72.0,165000000,Action,232641920,"Action, Adventure, Fantasy, Sci-Fi",PG-13
1,Sleight,Diablo Entertainment (II),J.D. Dillard,89,2017-04-28,2016,4,USA,USA,Jacob Latimore,...,A young street magician (Jacob Latimore) is le...,3 nominations.,5.9,4012,62.0,250000,Action,3986245,"Crime, Drama, Sci-Fi",R
2,Silence,Cappa Defina Productions,Martin Scorsese,161,2017-01-13,2016,1,USA,"USA, UK, Taiwan, Japan, Mexico, Italy",Andrew Garfield,...,The story of two Catholic missionaries (Andrew...,Nominated for 1 Oscar. Another 6 wins & 56 nom...,7.2,61798,79.0,46000000,Adventure,7100177,"Drama, History",R
3,Manchester by the Sea,Amazon Studios,Kenneth Lonergan,137,2016-12-16,2016,12,USA,USA,Casey Affleck,...,"Lee Chandler is a brooding, irritable loner wh...",Won 2 Oscars. Another 127 wins & 263 nominations.,7.9,159673,96.0,8500000,Drama,47695371,Drama,R
4,Dirty Grandpa,Lionsgate,Dan Mazer,102,2016-01-22,2016,1,USA,"United States, United Kingdom",Robert De Niro,...,"Jason Kelly, the grandson of Dick Kelly, loses...",2 wins & 11 nominations,6.0,82289,21.0,27500000,Comedy,35593113,Comedy,R


In [24]:
# Keep just the movie title and its release month; every other column is
# irrelevant to the month encoding done in this notebook.
df = raw_df.loc[:, ['name', 'month']]
df.head(2)

Unnamed: 0,name,month
0,Doctor Strange,11
1,Sleight,4


In [25]:
# Count missing values per column (isnull is an alias of isna)
df.isnull().sum()

name     0
month    0
dtype: int64

In [26]:
# Transform: manually one-hot encode the month column.
# For every distinct month value, add a 0/1 indicator column whose name is the
# month number as a string (e.g. '11'). Columns are created in order of first
# appearance in the data, and each one is stored as int64.
df_months = df.copy()

# Vectorised: one whole-column comparison per distinct month value instead of a
# Python-level pass over every row with scalar .loc writes. Unlike label-based
# `.loc[index, col] += 1`, this also cannot over-count when the index contains
# duplicate labels.
for month in df['month'].unique():
    df_months[str(month)] = (df['month'] == month).astype('int64')

print('---------------')
print('Mapping completed.')

df_months.head()

Counting row #0...
Counting row #1000...
Counting row #2000...
Counting row #3000...
Counting row #4000...
Counting row #5000...
Counting row #6000...
---------------
Mapping completed.


Unnamed: 0,name,month,11,4,1,12,8,3,7,9,10,5,2,6
0,Doctor Strange,11,1,0,0,0,0,0,0,0,0,0,0,0
1,Sleight,4,0,1,0,0,0,0,0,0,0,0,0,0
2,Silence,1,0,0,1,0,0,0,0,0,0,0,0,0
3,Manchester by the Sea,12,0,0,0,1,0,0,0,0,0,0,0,0
4,Dirty Grandpa,1,0,0,1,0,0,0,0,0,0,0,0,0


In [27]:
# Sanity check that no movie had a month recorded more than once:
# every one-hot column should have a minimum of 0 and a maximum of 1.
check_dup = df_months.describe().loc[['min', 'max']].T
# The first row is skipped via [1:] (it is the summary of the original 'month'
# column, not of a one-hot column).
print(check_dup[1:].value_counts())
check_dup[1:]

min  max
0.0  1.0    12
dtype: int64


Unnamed: 0,min,max
11,0.0,1.0
4,0.0,1.0
1,0.0,1.0
12,0.0,1.0
8,0.0,1.0
3,0.0,1.0
7,0.0,1.0
9,0.0,1.0
10,0.0,1.0
5,0.0,1.0


In [28]:
# Check dtypes: confirm the one-hot month columns are integer-typed
df_months.dtypes

name     object
month     int64
11        int64
4         int64
1         int64
12        int64
8         int64
3         int64
7         int64
9         int64
10        int64
5         int64
2         int64
6         int64
dtype: object

In [29]:
# Rearrange columns order based on ascending months
reordered_df = df_months.copy()
reordered_df = reordered_df[['name', 'month',
                             '1', '2', '3', '4', '5', '6',
                             '7', '8', '9', '10', '11', '12',
                            ]]
reordered_df.head()

Unnamed: 0,name,month,1,2,3,4,5,6,7,8,9,10,11,12
0,Doctor Strange,11,0,0,0,0,0,0,0,0,0,0,1,0
1,Sleight,4,0,0,0,1,0,0,0,0,0,0,0,0
2,Silence,1,1,0,0,0,0,0,0,0,0,0,0,0
3,Manchester by the Sea,12,0,0,0,0,0,0,0,0,0,0,0,1
4,Dirty Grandpa,1,1,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# Replace the numeric column labels ('1'..'12') with three-letter month names.
month_abbreviations = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
abbre_df = reordered_df.rename(columns={
    str(number): abbreviation
    for number, abbreviation in enumerate(month_abbreviations, start=1)
})
abbre_df

Unnamed: 0,name,month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Doctor Strange,11,0,0,0,0,0,0,0,0,0,0,1,0
1,Sleight,4,0,0,0,1,0,0,0,0,0,0,0,0
2,Silence,1,1,0,0,0,0,0,0,0,0,0,0,0
3,Manchester by the Sea,12,0,0,0,0,0,0,0,0,0,0,0,1
4,Dirty Grandpa,1,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6251,Hoosiers,2,0,1,0,0,0,0,0,0,0,0,0,0
6252,Off Beat,4,0,0,0,1,0,0,0,0,0,0,0,0
6253,Big Trouble in Little China,7,0,0,0,0,0,0,1,0,0,0,0,0
6254,Biggles: Adventures in Time,1,1,0,0,0,0,0,0,0,0,0,0,0


In [34]:
# To give the month inputs more meaning, also group them into four seasonal
# indicator columns: spring, summer, fall, and winter. Because each row has a
# single month flagged, summing three month columns yields a 0/1 season flag.
#
# NOTE(review): this grouping (Feb-Apr, May-Jul, Aug-Oct, Nov-Jan) is shifted one
# month earlier than the usual Northern-Hemisphere meteorological seasons
# (Mar-May, Jun-Aug, Sep-Nov, Dec-Feb) -- confirm this is intentional.
final_df = abbre_df.copy()

season_months = {
    'spring': ('Feb', 'Mar', 'Apr'),
    'summer': ('May', 'Jun', 'Jul'),
    'fall': ('Aug', 'Sep', 'Oct'),
    'winter': ('Nov', 'Dec', 'Jan'),
}
for season, (first, second, third) in season_months.items():
    final_df[season] = final_df[first] + final_df[second] + final_df[third]

final_df

Unnamed: 0,name,month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,spring,summer,fall,winter
0,Doctor Strange,11,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,Sleight,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,Silence,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Manchester by the Sea,12,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
4,Dirty Grandpa,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6251,Hoosiers,2,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6252,Off Beat,4,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
6253,Big Trouble in Little China,7,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
6254,Biggles: Adventures in Time,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [35]:
# Write the encoded table (names, month, one-hot months, seasons) to the
# cleaned-data folder; the DataFrame index is not written to the file.
path = os.path.join(
    '..', '..', 'resources', 'cleaned_data', 'parsed_months_table.csv'
)
final_df.to_csv(path_or_buf=path, index=False)