## Feature Engineering
#### Import packages

In [1]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

#### Read dataset

In [2]:
# Locate the CSV folder relative to the current working directory.
dataset_folder = os.getcwd() + '/../dataset/'

train_path = dataset_folder + 'train.csv'
test_path = dataset_folder + 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Stack train and test into one frame with a fresh 0..n-1 index, then
# restore the column order of the training file.
dataset = pd.concat([train, test], ignore_index=True)[train.columns]

In [3]:
# Display the (rows, columns) shape of each frame as a single tuple.
tuple(frame.shape for frame in (train, test, dataset))

((3000, 23), (4398, 22), (7398, 23))

#### Fill in 'revenue', 'budget' and 'runtime' from an external data source
The values are taken from the kernel uploaded by <b>Enric Rovira</b>.

In [4]:
# Manual revenue overrides, keyed by movie id.
revenue_overrides = {
    16: 192864,
    313: 12000000,
    451: 12000000,
    1865: 25000000,
    2491: 6800000,
}
for movie_id, revenue in revenue_overrides.items():
    dataset.loc[dataset['id'] == movie_id, 'revenue'] = revenue

In [5]:
# Manual budget overrides, keyed by movie id.
budget_overrides = {
    90: 30000000,
    118: 60000000,
    149: 18000000,
    464: 20000000,
    470: 13000000,
    513: 930000,
    797: 8000000,
    819: 90000000,
    850: 90000000,
    1112: 7500000,
    1131: 4300000,
    1359: 10000000,
    1542: 1500000,
    # NOTE(review): this value was previously assigned to id 1542 a second
    # time, which silently discarded the 1500000 entry above. The duplicate sat
    # between 1542 and 1571 in an ascending list, so it is presumably meant for
    # id 1570 -- confirm against the external source of these values.
    1570: 15800000,
    1571: 4000000,
    1714: 46000000,
    1721: 17500000,
    2268: 17500000,
    2602: 31000000,
    2612: 15000000,
    2696: 10000000,
    2801: 10000000,
    3889: 15000000,
    6733: 5000000,
    3197: 8000000,
    6683: 50000000,
    5704: 4300000,
    6109: 281756,
    7242: 10000000,
    7021: 17540562,
    5591: 4000000,
    4282: 20000000,
}
for movie_id, budget in budget_overrides.items():
    dataset.loc[dataset['id'] == movie_id, 'budget'] = budget

In [6]:
# Manual runtime overrides, keyed by movie id.
runtime_overrides = {
    391: 86,
    592: 90,
    925: 95,
    978: 93,
    1256: 92,
    1542: 93,
    1875: 86,
    2151: 108,
    2499: 108,
    2646: 98,
    2786: 111,
    2866: 96,
    4074: 103,
    4222: 93,
    4431: 100,
    5520: 86,
    5845: 83,
    5849: 140,
    6210: 104,
    6804: 145,
    7321: 87,
}
for movie_id, runtime in runtime_overrides.items():
    dataset.loc[dataset['id'] == movie_id, 'runtime'] = runtime

#### Feature 'release_date'

In [7]:
# Rows without a release date get a fixed placeholder. The strings are split
# below as month/day/year, so this placeholder reads as month 05, day 01.
dataset.loc[dataset.release_date.isnull(), 'release_date'] = '05/01/2000'

# Split the "month/day/year" strings into integer components. A raw string is
# used so that `\S` is a regex class rather than an invalid string escape.
date_parts = dataset.release_date.str.extract(
    r'(?P<month>\S+)/(?P<day>\S+)/(?P<year>\S+)', expand=True).astype(np.int16)
dataset['release_year'] = date_parts['year']
dataset['release_month'] = date_parts['month']
dataset['release_day'] = date_parts['day']

# Expand two-digit years: 21..99 become 1921..1999, 0..20 become 2000..2020.
# Years that are already four digits match neither condition and are kept.
dataset.loc[(21 <= dataset.release_year) & (dataset.release_year <= 99), 'release_year'] += 1900
dataset.loc[dataset.release_year < 21, 'release_year'] += 2000

# Assemble the timestamp from explicit year/month/day columns. Parsing a
# "day-month-year" string without a format lets pandas read ambiguous dates
# (day <= 12) month-first, which corrupts the weekday and quarter features.
# The components are widened to int64 before assembly.
dataset['release_date'] = pd.to_datetime(pd.DataFrame({
    'year': dataset.release_year.astype('int64'),
    'month': dataset.release_month.astype('int64'),
    'day': dataset.release_day.astype('int64'),
}))

# Weekday is shifted to 1..7 (Monday = 1); quarter is 1..4.
dataset['release_weekday'] = dataset.release_date.dt.weekday + 1
dataset['release_quarter'] = dataset.release_date.dt.quarter

In [8]:
# Encode each calendar component as a (sine, cosine) pair, using the divisor
# given for it below as the cycle length.
cycle_lengths = (('weekday', 7), ('day', 31), ('month', 12), ('quarter', 4))
for part, cycle in cycle_lengths:
    column = 'release_' + part
    angle = 2 * np.pi * dataset[column] / cycle
    dataset[column + '_sine'] = np.sin(angle)
    dataset[column + '_cosine'] = np.cos(angle)

#### Feature 'original_language'

In [9]:
def merge_small_category(x, group_name, threshold, value_counts):
    """Map a rare category value to a shared group label.

    Parameters
    ----------
    x : hashable
        The category value to test.
    group_name : object
        Label returned when `x` is considered rare.
    threshold : number
        Minimum count required for `x` to be kept as-is.
    value_counts : mapping with a ``.get`` method (dict or pandas Series)
        Occurrence count for each category value.

    Returns
    -------
    `x` if its count is at least `threshold`, otherwise `group_name`.
    Values absent from `value_counts` (for example NaN, which
    ``Series.value_counts`` omits by default) are treated as having a count
    of zero instead of raising ``KeyError``.
    """
    if value_counts.get(x, 0) < threshold:
        return group_name
    return x

# Count how often each language code occurs across the combined data.
language_count = dataset['original_language'].value_counts()

# Languages seen fewer than 10 times are folded into a single bucket.
dataset['original_language'] = dataset['original_language'].apply(
    lambda code: merge_small_category(code, 'original_language_others', 10, language_count))

# Append one indicator column per remaining language value.
language_dummies = dataset['original_language'].str.get_dummies()
dataset = pd.concat([dataset, language_dummies], axis=1)

#### Feature 'genres', 'production_companies', 'production_countries' and 'spoken_languages'

In [10]:
threshold = 10

# Control character used to join names before one-hot encoding. Joining and
# splitting on ',' would break any name that itself contains a comma.
name_separator = '\x1f'


def _extract_names(raw):
    """Return the sorted 'name' values from a stringified list of dicts.

    Missing cells yield an empty list. The text is parsed with
    ``ast.literal_eval`` rather than ``eval`` so that file contents can only
    be read as Python literals and are never executed as code.
    """
    if pd.isnull(raw):
        return []
    return sorted(str(entry['name']) for entry in ast.literal_eval(raw))


for feature in ['genres', 'production_companies', 'production_countries', 'spoken_languages']:
    name_lists = dataset[feature].apply(_extract_names)
    # The column itself keeps the comma-joined form it had before.
    dataset[feature] = name_lists.apply(','.join)
    # One indicator column per distinct name; keep only names whose total
    # count exceeds the threshold.
    tmp = name_lists.apply(name_separator.join).str.get_dummies(sep=name_separator)
    tmp = tmp.loc[:, tmp.sum() > threshold]
    dataset = pd.concat([dataset, tmp], axis=1)

#### Normalize numeric features

In [11]:
# A single scaler instance is refitted on each column in turn.
scaler = MinMaxScaler()
for feature in ['runtime', 'budget', 'popularity', 'release_year']:
    column = dataset[feature]
    # Replace missing entries with the median of the observed values.
    column = column.fillna(np.nanmedian(column))
    # The scaler expects a 2-D array, so pass the column as shape (n, 1).
    dataset[feature] = scaler.fit_transform(column.values[:, np.newaxis])

#### Generate features

In [12]:
# Binary flags: 1 when the source column has a value, 0 when it is null.
presence_flags = (
    ('has_collection', 'belongs_to_collection'),
    ('has_homepage', 'homepage'),
    ('has_tagline', 'tagline'),
)
for flag, source in presence_flags:
    dataset[flag] = dataset[source].notnull().astype('int64')

#### Drop features not used

In [13]:
# Columns that are not passed on to the processed output.
unused_columns = [
    'id', 'belongs_to_collection', 'genres', 'homepage', 'imdb_id',
    'original_language', 'original_title', 'overview', 'poster_path',
    'production_companies', 'production_countries', 'release_date',
    'spoken_languages', 'status', 'tagline', 'title', 'Keywords', 'cast',
    'crew', 'release_day', 'release_month', 'release_weekday',
    'release_quarter',
]
dataset = dataset.drop(columns=unused_columns)

#### Write processed dataset to file

In [14]:
# `dataset` was built as train rows followed by test rows, so split it back
# at the size of the training file instead of a hard-coded row number.
n_train = len(train)
processed_train = dataset.iloc[:n_train]

# `drop` returns a new frame; the result must be kept, otherwise the
# 'revenue' target column is still written to the test file.
processed_test = dataset.iloc[n_train:].drop(columns=['revenue'])

processed_train.to_csv(dataset_folder + 'processed_train.csv')
processed_test.to_csv(dataset_folder + 'processed_test.csv')