In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import gc
import time
from pandas.core.common import SettingWithCopyWarning
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# notebook settings
warnings.simplefilter('error', SettingWithCopyWarning)
gc.enable()
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# set path
PATH = '../../data/ga_revenue/'

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# LightGBM Grid Search

In [12]:
def get_folds(df = None, n_splits = 5):
    
    # Get sorted unique visitors
    unique_vis = np.array(sorted(df['fullVisitorId'].unique()))

    # Get folds
    folds = GroupKFold(n_splits=n_splits)
    fold_ids = []
    ids = np.arange(df.shape[0])
    for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
        fold_ids.append(
            [
                ids[df['fullVisitorId'].isin(unique_vis[trn_vis])],
                ids[df['fullVisitorId'].isin(unique_vis[val_vis])]
            ]
        )

    return fold_ids

## Params to Search

In [9]:
# ordered in which to test first
params_to_search = {
    'n_estimators': [10000, 20000],
    'max_depth': [5, 15, 25],
    'num_leaves': [25, 54, 75],
    'min_child_samples': [20, 36, 50]
}

## Load & Process Data

In [4]:
df = pd.read_feather(PATH+'feathers/main&poly_float32')

train = deepcopy(df[df['dataset'] == 'train'])
test = deepcopy(df[df['dataset'] == 'test'])
del df

# drop dataset identifier and reset index
train.drop('dataset', axis = 1, inplace = True)
test.drop('dataset', axis = 1, inplace = True)
train.reset_index(drop = True, inplace = True)
test.reset_index(drop = True, inplace = True)

y_reg = train['totals.transactionRevenue'].fillna(0)
del train['totals.transactionRevenue']

if 'totals.transactionRevenue' in test.columns:
    del test['totals.transactionRevenue']

In [6]:
categorical_features = [
    
    'channelGrouping', 'visitNumber', 'device.browser', 'device.deviceCategory',
    'device.isMobile', 'device.operatingSystem', 'geoNetwork.city', 'geoNetwork.continent',
    'geoNetwork.country', 'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
    'geoNetwork.subContinent', 'trafficSource.adContent', 'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.gclId', 'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign', 'trafficSource.keyword', 'trafficSource.medium', 'trafficSource.referralPath',
    'trafficSource.source', 'visitStartTimeLOCALYear', 'visitStartTimeLOCALMonth', 'visitStartTimeLOCALWeek',
    'visitStartTimeLOCALDay', 'visitStartTimeLOCALDayofweek', 'visitStartTimeLOCALDayofyear', 'visitStartTimeLOCALIs_month_end',
    'visitStartTimeLOCALIs_month_start', 'visitStartTimeLOCALIs_quarter_end', 'visitStartTimeLOCALIs_quarter_start',
    'visitStartTimeLOCALIs_year_end', 'visitStartTimeLOCALIs_year_start', 'visitStartTimeLOCALHourofday',
    'return_visit', 'landing', 'bounce', 'user_has_purchased_before', 'browser_os', 'browser_device', 'os_device',
    'channel_device', 'channel_domain', 'city_country_mismatch', 'is_holiday'
    
]

contin_features = [
    
    'totals.hits', 'totals.pageviews', 'days_after_holiday', 'days_before_holiday', 'holiday_anticipation',
    'holiday_gap', 'days_since_last_visit', 'days_since_first_visit'
    
]

excluded_features = [
    
    'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 'visitId',
    'visitStartTime', 'visitStartTimeLOCAL', 'visitStartTimeLOCALElapsed'

]

# set remaining cols as continuous
additional_contin_vars = deepcopy([c for c in train.columns
                                   if c not in categorical_features + contin_features + excluded_features
                                  ])

# add the additional engineered cols to continuous
contin_features += additional_contin_vars

In [7]:
for f in categorical_features:
    train[f], indexer = pd.factorize(train[f])
    if test[f].dtype == bool:
        test[f] = indexer.get_indexer(deepcopy(test[f].astype(int)))
    else:
        test[f] = indexer.get_indexer(test[f])

train_features = [_f for _f in train.columns if _f not in excluded_features]
X_train, y_train = train[train_features], y_reg

## Build & Train Model

In [10]:
n_estimators = 20000

params = {
    
    'num_leaves': 54,
    'max_depth': 15,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

In [13]:
folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.66083
[200]	valid_0's rmse: 1.58186
[300]	valid_0's rmse: 1.56004
[400]	valid_0's rmse: 1.55223
[500]	valid_0's rmse: 1.54807
[600]	valid_0's rmse: 1.54727
[700]	valid_0's rmse: 1.54521
[800]	valid_0's rmse: 1.54288
[900]	valid_0's rmse: 1.54124
[1000]	valid_0's rmse: 1.54053
[1100]	valid_0's rmse: 1.53977
Early stopping, best iteration is:
[1128]	valid_0's rmse: 1.53963
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.63556
[200]	valid_0's rmse: 1.56116
[300]	valid_0's rmse: 1.54112
[400]	valid_0's rmse: 1.53308
[500]	valid_0's rmse: 1.53019
[600]	valid_0's rmse: 1.52883
[700]	valid_0's rmse: 1.52746
[800]	valid_0's rmse: 1.52617
[900]	valid_0's rmse: 1.52554
[1000]	valid_0's rmse: 1.52461
[1100]	valid_0's rmse: 1.52365
[1200]	valid_0's rmse: 1.52285
[1300]	valid_0's rmse: 1.52259
[1400]	valid_0's rmse: 1.5224
Earl

18.40464367718644

In [14]:
n_estimators = 10000

params = {
    
    'num_leaves': 54,
    'max_depth': 15,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

In [15]:
folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.66083
[200]	valid_0's rmse: 1.58186
[300]	valid_0's rmse: 1.56004
[400]	valid_0's rmse: 1.55223
[500]	valid_0's rmse: 1.54807
[600]	valid_0's rmse: 1.54727
[700]	valid_0's rmse: 1.54521
[800]	valid_0's rmse: 1.54288
[900]	valid_0's rmse: 1.54124
[1000]	valid_0's rmse: 1.54053
[1100]	valid_0's rmse: 1.53977
Early stopping, best iteration is:
[1128]	valid_0's rmse: 1.53963
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.63556
[200]	valid_0's rmse: 1.56116
[300]	valid_0's rmse: 1.54112
[400]	valid_0's rmse: 1.53308
[500]	valid_0's rmse: 1.53019
[600]	valid_0's rmse: 1.52883
[700]	valid_0's rmse: 1.52746
[800]	valid_0's rmse: 1.52617
[900]	valid_0's rmse: 1.52554
[1000]	valid_0's rmse: 1.52461
[1100]	valid_0's rmse: 1.52365
[1200]	valid_0's rmse: 1.52285
[1300]	valid_0's rmse: 1.52259
[1400]	valid_0's rmse: 1.5224
Earl

18.660173492797995

In [16]:
n_estimators = 10000

params = {
    
    'num_leaves': 54,
    'max_depth': 5,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

In [17]:
folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.68121
[200]	valid_0's rmse: 1.60424
[300]	valid_0's rmse: 1.58064
[400]	valid_0's rmse: 1.57074
[500]	valid_0's rmse: 1.56517
[600]	valid_0's rmse: 1.5606
[700]	valid_0's rmse: 1.55693
[800]	valid_0's rmse: 1.55362
[900]	valid_0's rmse: 1.55073
[1000]	valid_0's rmse: 1.54851
[1100]	valid_0's rmse: 1.54651
[1200]	valid_0's rmse: 1.54495
[1300]	valid_0's rmse: 1.54378
[1400]	valid_0's rmse: 1.54278
[1500]	valid_0's rmse: 1.54192
[1600]	valid_0's rmse: 1.54143
[1700]	valid_0's rmse: 1.54103
[1800]	valid_0's rmse: 1.54071
[1900]	valid_0's rmse: 1.54053
[2000]	valid_0's rmse: 1.54025
[2100]	valid_0's rmse: 1.54008
[2200]	valid_0's rmse: 1.54001
Early stopping, best iteration is:
[2207]	valid_0's rmse: 1.53997
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.6563
[200]	valid_0's rmse: 1.58331
[300]	valid_0's rmse: 1.56155

18.5522348439846

In [18]:
n_estimators = 10000

params = {
    
    'num_leaves': 54,
    'max_depth': 25,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

In [19]:
folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.66083
[200]	valid_0's rmse: 1.58207
[300]	valid_0's rmse: 1.55992
[400]	valid_0's rmse: 1.55176
[500]	valid_0's rmse: 1.54764
[600]	valid_0's rmse: 1.54626
[700]	valid_0's rmse: 1.5444
[800]	valid_0's rmse: 1.54226
[900]	valid_0's rmse: 1.54049
[1000]	valid_0's rmse: 1.53918
[1100]	valid_0's rmse: 1.53829
[1200]	valid_0's rmse: 1.53776
[1300]	valid_0's rmse: 1.53695
[1400]	valid_0's rmse: 1.53653
[1500]	valid_0's rmse: 1.5359
[1600]	valid_0's rmse: 1.53575
[1700]	valid_0's rmse: 1.53525
Early stopping, best iteration is:
[1678]	valid_0's rmse: 1.53524
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.63556
[200]	valid_0's rmse: 1.56102
[300]	valid_0's rmse: 1.54142
[400]	valid_0's rmse: 1.53387
[500]	valid_0's rmse: 1.53056
[600]	valid_0's rmse: 1.52934
[700]	valid_0's rmse: 1.52841
[800]	valid_0's rmse: 1.5271
[900]

18.66783627573961

In [20]:
n_estimators = 20000

params = {
    
    'num_leaves': 54,
    'max_depth': 5,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

In [21]:
folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.68121
[200]	valid_0's rmse: 1.60424
[300]	valid_0's rmse: 1.58064
[400]	valid_0's rmse: 1.57074
[500]	valid_0's rmse: 1.56517
[600]	valid_0's rmse: 1.5606
[700]	valid_0's rmse: 1.55693
[800]	valid_0's rmse: 1.55362
[900]	valid_0's rmse: 1.55073
[1000]	valid_0's rmse: 1.54851
[1100]	valid_0's rmse: 1.54651
[1200]	valid_0's rmse: 1.54495
[1300]	valid_0's rmse: 1.54378
[1400]	valid_0's rmse: 1.54278
[1500]	valid_0's rmse: 1.54192
[1600]	valid_0's rmse: 1.54143
[1700]	valid_0's rmse: 1.54103
[1800]	valid_0's rmse: 1.54071
[1900]	valid_0's rmse: 1.54053
[2000]	valid_0's rmse: 1.54025
[2100]	valid_0's rmse: 1.54008
[2200]	valid_0's rmse: 1.54001
Early stopping, best iteration is:
[2207]	valid_0's rmse: 1.53997
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.6563
[200]	valid_0's rmse: 1.58331
[300]	valid_0's rmse: 1.56155

18.504558851058256

In [22]:
n_estimators = 20000

params = {
    
    'num_leaves': 54,
    'max_depth': 25,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

In [23]:
folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.66083
[200]	valid_0's rmse: 1.58207
[300]	valid_0's rmse: 1.55992
[400]	valid_0's rmse: 1.55176
[500]	valid_0's rmse: 1.54764
[600]	valid_0's rmse: 1.54626
[700]	valid_0's rmse: 1.5444
[800]	valid_0's rmse: 1.54226
[900]	valid_0's rmse: 1.54049
[1000]	valid_0's rmse: 1.53918
[1100]	valid_0's rmse: 1.53829
[1200]	valid_0's rmse: 1.53776
[1300]	valid_0's rmse: 1.53695
[1400]	valid_0's rmse: 1.53653
[1500]	valid_0's rmse: 1.5359
[1600]	valid_0's rmse: 1.53575
[1700]	valid_0's rmse: 1.53525
Early stopping, best iteration is:
[1678]	valid_0's rmse: 1.53524
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.63556
[200]	valid_0's rmse: 1.56102
[300]	valid_0's rmse: 1.54142
[400]	valid_0's rmse: 1.53387
[500]	valid_0's rmse: 1.53056
[600]	valid_0's rmse: 1.52934
[700]	valid_0's rmse: 1.52841
[800]	valid_0's rmse: 1.5271
[900]

18.66783627573961

In [24]:
gs = {
    '20000est_15md_54nl': [1.53963, 1.52233, 1.5173, 1.51455, 1.55163],
    '10000est_15md_54nl': [1.53963, 1.52233, 1.5173, 1.51455, 1.55163],
    '10000est_5md_54nl': [1.53997, 1.53174, 1.52698, 1.52035, 1.55603],
    '10000est_25md_54nl': [1.53524, 1.52283, 1.51882, 1.51569, 1.55264],
    '20000est_5md_54nl': [1.53997, 1.53174, 1.52698, 1.52035, 1.55603],
    '20000est_25md_54nl': [1.53524, 1.52283, 1.51882, 1.51569, 1.55264],
}

In [27]:
for k, v in gs.items(): print('{}:'.format(k), np.array(v).mean())

20000est_15md: 1.529088
10000est_15md: 1.529088
10000est_5md: 1.5350139999999999
10000est_25md: 1.529044
20000est_5md: 1.5350139999999999
20000est_25md: 1.529044


In [None]:
# ordered in which to test first
params_to_search = {
    'num_leaves': [25, 54, 75],
    'min_child_samples': [20, 36, 50]
}

In [28]:
n_estimators = 10000

params = {
    
    'num_leaves': 25,
    'max_depth': 25,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.6768
[200]	valid_0's rmse: 1.5969
[300]	valid_0's rmse: 1.57293
[400]	valid_0's rmse: 1.56427
[500]	valid_0's rmse: 1.55988
[600]	valid_0's rmse: 1.55738
[700]	valid_0's rmse: 1.55557
[800]	valid_0's rmse: 1.5519
[900]	valid_0's rmse: 1.54956
[1000]	valid_0's rmse: 1.5475
[1100]	valid_0's rmse: 1.54582
[1200]	valid_0's rmse: 1.54425
[1300]	valid_0's rmse: 1.54273
[1400]	valid_0's rmse: 1.5418
[1500]	valid_0's rmse: 1.54139
[1600]	valid_0's rmse: 1.54096
[1700]	valid_0's rmse: 1.54059
[1800]	valid_0's rmse: 1.54021
[1900]	valid_0's rmse: 1.53983
[2000]	valid_0's rmse: 1.53957
[2100]	valid_0's rmse: 1.53949
Early stopping, best iteration is:
[2071]	valid_0's rmse: 1.53939
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.65062
[200]	valid_0's rmse: 1.57602
[300]	valid_0's rmse: 1.55357
[400]	valid_0's rmse: 1.54532
[50

18.613512748322357

In [29]:
n_estimators = 10000

params = {
    
    'num_leaves': 75,
    'max_depth': 25,
    'min_child_samples': 36,
    
    'objective':'regression',
    'metric':'rmse',
    'verbose': 1,
    'learning_rate': 0.01,
    'bagging_fraction': 0.99,
    'feature_fraction': 0.99,
    "random_state":42,
    "bagging_seed" : 42,
    "verbosity" : -1,
    "bagging_frequency" : 5,
    'lambda_l2': 0.5,
    'lambda_l1': 0.5
    
}

folds = get_folds(df = train, n_splits = 5)

train_features = [_f for _f in train.columns if _f not in excluded_features]

importances = pd.DataFrame()
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])

for fold_, (trn_, val_) in enumerate(folds):
    
    print("Fold:",fold_)
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]
    
    # init & fit
    print('Fitting model...')
    reg = lgb.LGBMRegressor(**params, n_estimators = n_estimators)
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    
    # feature importance
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)
    
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

Fold: 0
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.6557
[200]	valid_0's rmse: 1.57828
[300]	valid_0's rmse: 1.55684
[400]	valid_0's rmse: 1.54941
[500]	valid_0's rmse: 1.5453
[600]	valid_0's rmse: 1.54333
[700]	valid_0's rmse: 1.54098
[800]	valid_0's rmse: 1.53943
[900]	valid_0's rmse: 1.53848
Early stopping, best iteration is:
[921]	valid_0's rmse: 1.53835
Fold: 1
Fitting model...
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.63155
[200]	valid_0's rmse: 1.5574
[300]	valid_0's rmse: 1.53903
[400]	valid_0's rmse: 1.53289
[500]	valid_0's rmse: 1.52944
[600]	valid_0's rmse: 1.52778
[700]	valid_0's rmse: 1.52705
[800]	valid_0's rmse: 1.52633
[900]	valid_0's rmse: 1.52523
[1000]	valid_0's rmse: 1.5243
[1100]	valid_0's rmse: 1.52337
[1200]	valid_0's rmse: 1.52315
[1300]	valid_0's rmse: 1.52281
Early stopping, best iteration is:
[1346]	valid_0's rmse: 1.52255
Fold: 2
Fitting model...
Training u

18.6920437666202

In [34]:
gs = {
    '10000est_25md_54nl': [1.53524, 1.52283, 1.51882, 1.51569, 1.55264],
    '10000est_25md_25nl': [1.53939, 1.52524, 1.52115, 1.51599, 1.55375],
    '10000est_25md_75nl': [1.53835, 1.52255, 1.51728, 1.51628, 1.55158],
}

In [35]:
for k, v in gs.items(): print('{}:'.format(k), np.array(v).mean())

10000est_25md_54nl: 1.529044
10000est_25md_25nl: 1.531104
10000est_25md_75nl: 1.529208
