In [239]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = 8, 5

from importlib import reload

import plotting
import toolbox as tb
import feature_creation
import ensembling

# reload modules
tb = reload(tb)
plotting = reload(plotting)
feature_creation = reload(feature_creation)
ensembling = reload(ensembling)

from plotting import Plotter
from feature_creation import YNormal, featurize, my_get_dummies

# ignore warnings
import warnings
warnings.filterwarnings(action="ignore")

Load data.

In [3]:
train_data = pd.read_csv('dataset/train.csv')
validation = pd.read_csv('dataset/validation.csv')
test_data = pd.read_csv('dataset/test.csv')

train_data.shape, validation.shape, test_data.shape

((2250, 24), (750, 24), (4398, 22))

In [4]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,750,751,,30000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0332047,en,Fever Pitch,When relaxed and charming Ben Wrightman meets ...,...,4/6/05,103.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Comedy About The Game of Love.,Fever Pitch,"[{'id': 1480, 'name': 'baseball'}, {'id': 3419...","[{'cast_id': 1, 'character': 'Lindsey Meeks', ...","[{'credit_id': '52fe443f9251416c7502df99', 'de...",50451307
1,751,752,"[{'id': 5039, 'name': 'Rambo Collection', 'pos...",50000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",http://www.rambofilm.com/,tt0462499,en,Rambo,When governments fail to act on behalf of capt...,...,1/24/08,92.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Heroes never die... They just reload.,Rambo,"[{'id': 440, 'name': 'missionary'}, {'id': 126...","[{'cast_id': 12, 'character': 'John Rambo', 'c...","[{'credit_id': '55c8e0ff92514177c9000085', 'de...",113244290
2,752,753,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 99, 'nam...",,tt0175844,en,Lenny Bruce: Swear to Tell the Truth,"In 1948, Lenny Bruce was just another comic wh...",...,10/21/98,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A documentary about the comic who invented con...,Lenny Bruce: Swear to Tell the Truth,"[{'id': 3485, 'name': 'comedian'}, {'id': 5565...","[{'cast_id': 1, 'character': 'Himself / Narrat...","[{'credit_id': '52fe4a19c3a36847f81b92ff', 'de...",21350
3,753,754,"[{'id': 59586, 'name': 'The Blue Lagoon collec...",4500000,"[{'id': 10749, 'name': 'Romance'}, {'id': 12, ...",,tt0080453,en,The Blue Lagoon,Two small children and a ship's cook survive a...,...,7/5/80,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A sensuous story of natural love.,The Blue Lagoon,"[{'id': 566, 'name': 'sexual identity'}, {'id'...","[{'cast_id': 7, 'character': 'Emmeline', 'cred...","[{'credit_id': '52fe4418c3a36847f8081fa3', 'de...",58853106
4,754,755,,6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.paramountvantage.com/virginsuicides...,tt0159097,en,The Virgin Suicides,A group of male friends become obsessed with f...,...,4/21/99,97.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Beautiful, mysterious, haunting, invariably fa...",The Virgin Suicides,"[{'id': 933, 'name': 'michigan'}, {'id': 1228,...","[{'cast_id': 17, 'character': 'Mr. Lisbon', 'c...","[{'credit_id': '52fe42f8c3a36847f80308d5', 'de...",10409377


In [5]:
train_data.columns

Index(['Unnamed: 0', 'id', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

Make a copy of *train_data* and *test_data* for not modifing them.

In [6]:
train_df = train_data.copy()
test_df = test_data.copy()

# checking the ids
np.any((id(train_df) == id(train_data)) and (id(test_df) == id(test_data)))

False

**Featurize** function for building features.

In [240]:
#valid_y: targets of validation
valid_y = validation["revenue"]

#train_feat: featurize's train data without target
#valid_feat: featurize's validation without target
#train_y: targets of train data?
train_feat, valid_feat, train_y = featurize(train_df, validation)

all_train = pd.read_csv('kaggleDataset/train.csv')
#all_train_feat: featurized train_data + validation
#test_feat: featurized test_data
#target: train_y + valid_y
all_train_feat, test_feat, target = featurize(all_train, test_df)

In [241]:
train_feat.columns

Index(['genders_0_crew', 'genders_1_crew', 'genders_2_crew', 'is_from_coll',
       'not_from_coll', 'budget', 'popularity', 'runtime', 'is_tagline',
       'no_tagline', 'is_en', 'not_en', 'is_homepage', 'no_homepage',
       'is_date', 'not_date', 'is_same_title', 'not_same_title',
       'genres_feature_len', 'cast_feature_len', 'crew_len',
       'crew_name_Avy Kaufman', 'depar_name_Avy Kaufman',
       'crew_name_Steven Spielberg', 'depar_name_Steven Spielberg',
       'crew_name_Luc Besson', 'depar_name_Luc Besson',
       'crew_name_Deborah Aquila', 'depar_name_Deborah Aquila',
       'crew_name_James Newton Howard', 'depar_name_James Newton Howard',
       'crew_name_James Horner', 'depar_name_James Horner',
       'crew_name_Tricia Wood', 'depar_name_Tricia Wood',
       'crew_name_Francine Maisler', 'depar_name_Francine Maisler',
       'crew_name_Jerry Goldsmith', 'depar_name_Jerry Goldsmith',
       'crew_name_Kerry Barden', 'depar_name_Kerry Barden',
       'crew_name_Mary

In [242]:
train_feat.head(10)

Unnamed: 0,genders_0_crew,genders_1_crew,genders_2_crew,is_from_coll,not_from_coll,budget,popularity,runtime,is_tagline,no_tagline,...,Français,Deutsch,Warner Bros.,Universal Pictures,Paramount Pictures,United States of America,United Kingdom,Germany,Canada,Steven Spielberg
0,3,7,16,1,0,0.367497,-0.142863,-0.219963,1,0,...,0,0,0,0,0,1,0,1,0,0
1,7,1,24,1,0,1.109922,0.119984,-0.718172,1,0,...,0,0,0,0,0,1,0,1,0,0
2,3,0,0,1,0,-1.875672,-0.669556,-0.355838,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,5,1,0,-1.33492,0.257378,-0.174671,1,0,...,0,0,0,0,0,1,0,0,0,0
4,3,8,8,1,0,-1.182908,0.168714,-0.491713,1,0,...,0,0,0,0,0,1,0,0,0,0
5,1,0,2,1,0,-1.875672,-0.505809,-0.537005,1,0,...,0,0,0,0,0,1,0,0,0,0
6,13,0,5,1,0,-1.875672,-0.187922,-0.219963,1,0,...,0,0,0,0,0,0,0,0,0,0
7,5,3,12,1,0,1.873843,-0.103511,-0.899339,1,0,...,0,0,0,0,0,1,0,0,0,0
8,5,3,3,1,0,-1.875672,-0.193815,-0.40113,0,1,...,1,0,0,0,0,0,0,0,0,0
9,34,4,20,1,0,0.025157,0.0942893,0.731164,1,0,...,0,0,0,0,0,1,0,0,0,0


In [243]:
X_train, y_train = train_feat.values, train_y.values.ravel()
X_valid, y_valid = valid_feat.values, valid_y.values.ravel()

# Train models and measure validation.

Choose the best model with root mean squared log error function and predict *test* data.

## xgboost

In [244]:
import xgboost as xgb

params = {
    'objective': 'reg:linear', 
    'eta': 0.01, 
    'max_depth': 6, 
    'subsample': 0.6, 
    'colsample_bytree': 0.7,  
    'eval_metric': 'rmse', 
    'seed': 127, 
    'silent': True,
}

record = dict()

xgboost = xgb.train(
        params, 
        xgb.DMatrix(X_train, y_train), 
        100000,
        [(xgb.DMatrix(X_train, y_train), 'train'), (xgb.DMatrix(X_valid, y_valid), 'valid')], 
        verbose_eval=False,
        early_stopping_rounds=500,
        callbacks = [xgb.callback.record_evaluation(record)])

best_score_index = np.argmin(np.array(record['valid']['rmse']))
xgboost, record['valid']['rmse'][best_score_index]

(<xgboost.core.Booster at 0x7f3b00b10048>, 152379584.0)

In [245]:
xgboost_pred = xgboost.predict(xgb.DMatrix(X_valid))
tb.root_mean_squared_log_error(np.exp(xgboost_pred), y_valid)

(2.270579724802664, 2.270579724802664)

## CatBoostRegressor

In [321]:
from catboost import CatBoostRegressor

cat = CatBoostRegressor(iterations=100000,
                        learning_rate=0.004,
                        depth=5,
                        eval_metric='RMSE',
                        bagging_temperature = 0.2,
                        metric_period = None,
                        early_stopping_rounds=200,
                        )                                    

cat, cat_score = tb.cross_validation(cat, X_train, y_train, folds=3,
                                        eval_set=(X_valid, y_valid), 
                                        use_best_model=True, 
                                        verbose=False)

cat, cat_score

(<catboost.core.CatBoostRegressor at 0x7f3acafdc7f0>, 0.057625972092129815)

In [322]:
cat_pred = cat.predict(X_valid)
tb.root_mean_squared_log_error(np.exp(cat_pred), y_valid)

(2.2970188554481084, 2.2970188554481084)

## RidgeCV

In [265]:
from sklearn.linear_model import RidgeCV

best_ridge_reg_model, ridge_reg_loss = tb.cross_validation(RidgeCV(), X_train, y_train)

best_ridge_reg_model, ridge_reg_loss

In [186]:
ridge_reg_pred = best_ridge_reg_model.predict(X_valid)
tb.root_mean_squared_log_error(np.exp(ridge_reg_pred), y_valid)

(2.493854931722519, 2.493854931722519)

## RandomForestRegressor

In [251]:
from sklearn.ensemble import RandomForestRegressor

best_random_forest, forest_loss = tb.cross_validation(RandomForestRegressor(10), 
                                                      X_train, y_train)

best_random_forest, forest_loss

In [188]:
forest_pred = best_random_forest.predict(X_valid)
tb.root_mean_squared_log_error(np.exp(forest_pred), y_valid)

(2.4715818937095277, 2.4715818937095277)

## Stacking

In [323]:
xgb_test_pred = xgboost.predict(xgb.DMatrix(test_feat.values))
cat_test_pred = cat.predict(test_feat.values)

In [324]:
train_stack = np.vstack((xgboost_pred, cat_pred)).T
test_stack = np.vstack((xgb_test_pred, cat_test_pred)).T

In [325]:
model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0), 
                scoring='neg_mean_squared_error', cv=5)
model.fit(train_stack, y_valid)

RidgeCV(alphas=array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]), cv=5,
    fit_intercept=True, gcv_mode=None, normalize=False,
    scoring='neg_mean_squared_error', store_cv_values=False)

In [339]:
stacking_prediction = model.predict(test_stack)
stacking_prediction

array([ 1.60869190e+07, -1.30344902e+06,  2.99708023e+07, ...,
        1.14621141e+08,  5.65141705e+07,  1.08084195e+07])