In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import lightgbm as lgb
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool
import subprocess
import matplotlib.pyplot as plt
import os
import time
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold
from sklearn.model_selection import GroupKFold
from features import get_features
from report import report
from report import get_feature_importance
import operator

 


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Load Data

In [2]:
df_train = pd.read_csv("input/parsed_train.csv", dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv("input/parsed_test.csv", dtype={'fullVisitorId': 'str'})

  interactivity=interactivity, compiler=compiler, result=result)


### Feature Engineering

In [3]:
df_train = get_features(df_train)
df_test = get_features(df_test)

'fea_date_time'  21227.36 ms
'fea_format'  1887.81 ms
'fea_device'  2647.82 ms
'fea_totals'  2931.98 ms
'fea_geo_network'  1363.06 ms
'fea_traffic_source'  3061.27 ms
{'hour_unique_user_count', 'source_country', 'count_hits_per_network_domain', 'mean_hits_per_hour', 'var_hits_per_hour', 'sum_hits_per_day', 'month_unique_s_count', 'hour', 'sum_pageviews_per_network_domain', 'mean_hits_per_network_domain', 'campaign_medium', 'medium_hits_max', 'sum_hits_per_hour', 'mean_hits_per_day', 'sum_hits_per_network_domain', 'weekday_unique_user_count', 'user_hour_min', 'weekofmonth', 'weekofyear', 'user_hour_max', 'day_unique_s_count', 'max_hits_per_day', 'max_hits_per_hour', 'month_unique_user_count', 'mean_pageviews_per_network_domain', 'medium_hits_mean', 'medium_hits_min', 'month', 'weekday', 'var_hits_per_day', 'user_hour_mean', 'min_hits_per_day', 'weekday_unique_s_count', 'totals_pageviews_hit_rate', 'browser_category', 'count_pageviews_per_network_domain', 'medium_hits_sum', 'day_unique_u

In [4]:
# df_test['totals_transactionRevenue'] = 0

# train_size = df_train.shape[0]
# df_merge = pd.concat([df_train, df_test])
# df_merge = get_features(df_merge)
                      
# df_train = df_merge[:train_size]
# df_test = df_merge[train_size:]
# df_test.drop('totals_transactionRevenue', axis=1, inplace=True)

### Split X and y, Remove not_used_cols

In [5]:
target_col = 'totals_transactionRevenue'
df_train_y = df_train[target_col].astype(float)
df_train_y.fillna(0, inplace=True)
df_train_y = np.log1p(df_train_y)

In [6]:
not_used_cols = [
#     "visitNumber", 
    "date", "fullVisitorId", "sessionId", "visitId",
    "visitStartTime", 'trafficSource_referralPath',
    'trafficSource_campaignCode', target_col,
    'trafficSource_adwordsClickInfo.gclId',
    'trafficSource_adContent',
    'trafficSource_adwordsClickInfo.slot',
    'trafficSource_adwordsClickInfo.gclId',
    'trafficSource_adwordsClickInfo.adNetworkType',
    
    'trafficSource_campaign',
    'socialEngagementType',
    'device_browserSize',
    'device_browserVersion',
    'device_flashVersion',
    'device_language',
    'device_mobileDeviceBranding',
    'device_mobileDeviceInfo',
    'device_mobileDeviceMarketingName',
    'device_mobileDeviceModel',
    'device_mobileInputSelector',
    'device_operatingSystemVersion',
    'device_screenColors',
    'device_screenResolution',
    'geoNetwork_cityId',
    'geoNetwork_latitude',
    'geoNetwork_longitude',
    'geoNetwork_networkLocation',
    'totals_visits',
    'trafficSource_adwordsClickInfo.criteriaParameters',
    'trafficSource_adwordsClickInfo.isVideoAd',
    'trafficSource_adwordsClickInfo.page',
]

df_train_idx = df_train[['fullVisitorId', target_col]]
df_train_X = df_train.drop(not_used_cols, axis=1)

df_test_idx = df_test[['fullVisitorId']]
del_cols = list(set(df_test.columns) & set(not_used_cols))
df_test_X = df_test.drop(del_cols, axis=1)

### Label Encoding

In [7]:
categorical_feature = list(df_train_X.select_dtypes(exclude=np.number).columns)
print(categorical_feature)

df_train_X[categorical_feature].fillna('NA_NULL', inplace=True)
df_train_X.fillna(0, inplace=True)

df_test_X[categorical_feature].fillna('NA_NULL', inplace=True)
df_test_X.fillna(0, inplace=True)

df_merge = pd.concat([df_train_X[categorical_feature], df_test_X[categorical_feature]])
train_size = df_train_X.shape[0]
for c in categorical_feature:
    st = time.time()
    
    labels, _ = pd.factorize(df_merge[c].values.astype('str'))
    
    df_train_X[c] = labels[:train_size]
    df_test_X[c] = labels[train_size:]
    print(c, time.time() - st)

['channelGrouping', 'device_browser', 'device_deviceCategory', 'device_isMobile', 'device_operatingSystem', 'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country', 'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region', 'geoNetwork_subContinent', 'trafficSource_isTrueDirect', 'trafficSource_keyword', 'trafficSource_medium', 'trafficSource_source', 'browser_category', 'browser_operatingSystem', 'source_country', 'campaign_medium']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


channelGrouping 1.0430803298950195
device_browser 1.2723309993743896
device_deviceCategory 0.9302587509155273
device_isMobile 0.9015848636627197
device_operatingSystem 0.9350080490112305
geoNetwork_city 1.2669692039489746
geoNetwork_continent 0.9648580551147461
geoNetwork_country 1.0812816619873047
geoNetwork_metro 1.18623685836792
geoNetwork_networkDomain 1.176877737045288
geoNetwork_region 1.0024158954620361
geoNetwork_subContinent 0.849539041519165
trafficSource_isTrueDirect 0.9462931156158447
trafficSource_keyword 2.077270984649658
trafficSource_medium 0.6414670944213867
trafficSource_source 0.9077229499816895
browser_category 0.9566898345947266
browser_operatingSystem 0.9681520462036133
source_country 1.0780439376831055
campaign_medium 0.9078779220581055


### Training with CV and TS

In [8]:
N_SPLITS = 5
feature_name = list(df_train_X.columns)

fold_cols = []
prediction_train = np.zeros(df_train_X.shape[0])
prediction = np.zeros(df_test_X.shape[0])

In [14]:
def modeling(fold_col, X_train, X_valid, y_train, y_valid):
    train = lgb.Dataset(
        X_train.values,
        label=y_train.values,
        feature_name=feature_name,
        categorical_feature=categorical_feature)

    valid = lgb.Dataset(
        X_valid.values,
        label=y_valid.values,
        feature_name=feature_name,
        categorical_feature=categorical_feature)

    params = {
        "objective": "regression",
        "metric": "rmse",
        "max_depth": 8,
        "min_child_samples": 20,
        "reg_alpha": 1,
        "reg_lambda": 1,
        "num_leaves": 257,
        "learning_rate": 0.01,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "subsample_freq ": 5,
    }
    #     params = {
    #     "objective": "regression",
    #     "metric": "rmse",
    #     "num_leaves": 30,
    #     "min_child_samples": 100,
    #     "learning_rate": 0.1,
    #     "bagging_fraction": 0.7,
    #     "feature_fraction": 0.5,
    #     "bagging_frequency": 5,
    #     "bagging_seed": 2018,
    #     "verbosity": -1
    # }

    # param = {
    #     "objective": "regression",
    #     "metric": "rmse",
    #     "max_depth": 8,
    #     "min_child_samples": 21,
    #     "reg_alpha": 1,
    #     "reg_lambda": 1,
    #     "num_leaves": 257,
    #     "learning_rate": 0.01,
    #     "subsample": 0.82,
    #     "colsample_bytree": 0.84,
    #     "verbosity": -1
    # }
    evals_result = {}
    train_params = {
        'params': params,
        'train_set': train,
        'valid_sets': [train, valid],
        'valid_names': ['train', 'valid'],
        'early_stopping_rounds': 100,  #50
        'num_boost_round': 500,  #500
        'verbose_eval': 25,
        'feval': None,
        'evals_result': evals_result,
        #     'categorical_feature': 'auto',
    }

    model = lgb.train(**train_params)

    global prediction_train, prediction
    print('predict train set')
    pred_train = model.predict(df_train_X, num_iteration=model.best_iteration)
    prediction_train += pred_train
    df_fold_train = df_train_idx.copy()
    df_fold_train[fold_col] = pred_train

    print('predict test set')
    pred = model.predict(df_test_X, num_iteration=model.best_iteration)
    prediction += pred
    df_fold_test = df_test_idx.copy()
    df_fold_test[fold_col] = pred

    # report
    msg = [fold_col, str(df_train_X.columns), str(train_params)]
    submit_file_path = report(df_fold_train, df_fold_test, pred_train, pred, msg, model)
    print(submit_file_path)

    return model

### KFold

In [15]:
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold_n, (train_index, test_index) in enumerate(folds.split(df_train_X)):
    fold_col = 'KFold_{0:02d}'.format(fold_n)
    fold_cols.append(fold_col)
    print('Fold:', fold_col)

    X_train, X_valid = df_train_X.iloc[train_index], df_train_X.iloc[test_index]
    y_train, y_valid = df_train_y.iloc[train_index], df_train_y.iloc[test_index]
    
    model = modeling(fold_col, X_train, X_valid, y_train, y_valid)


Fold: KFold_00




Training until validation scores don't improve for 100 rounds.
[25]	train's rmse: 1.86511	valid's rmse: 1.86287
[50]	train's rmse: 1.76682	valid's rmse: 1.77749
[75]	train's rmse: 1.69477	valid's rmse: 1.71916
[100]	train's rmse: 1.64406	valid's rmse: 1.68107
[125]	train's rmse: 1.60681	valid's rmse: 1.65605
[150]	train's rmse: 1.57708	valid's rmse: 1.63933
[175]	train's rmse: 1.55371	valid's rmse: 1.62817
[200]	train's rmse: 1.53477	valid's rmse: 1.62089
[225]	train's rmse: 1.51931	valid's rmse: 1.61599
[250]	train's rmse: 1.50548	valid's rmse: 1.61244
[275]	train's rmse: 1.49341	valid's rmse: 1.60972
[300]	train's rmse: 1.48353	valid's rmse: 1.60818
[325]	train's rmse: 1.47535	valid's rmse: 1.60689
[350]	train's rmse: 1.46737	valid's rmse: 1.60629
[375]	train's rmse: 1.46134	valid's rmse: 1.60577
[400]	train's rmse: 1.45625	valid's rmse: 1.60558
[425]	train's rmse: 1.45254	valid's rmse: 1.6056
[450]	train's rmse: 1.44918	valid's rmse: 1.60581
[475]	train's rmse: 1.44627	valid's rmse:

[450]	train's rmse: 1.45113	valid's rmse: 1.6028
[475]	train's rmse: 1.44804	valid's rmse: 1.60272
[500]	train's rmse: 1.44572	valid's rmse: 1.603
Did not meet early stopping. Best iteration is:
[500]	train's rmse: 1.44572	valid's rmse: 1.603
predict train set
predict test set
<feature_importance>
0 ('totals_pageviews_hit_rate', 8255)
1 ('totals_pageviews', 6911)
2 ('visitNumber', 4538)
3 ('totals_hits', 4393)
4 ('weekofyear', 4196)
5 ('geoNetwork_city', 3905)
6 ('geoNetwork_networkDomain', 3102)
7 ('source_country', 2805)
8 ('user_hour_mean', 2338)
9 ('browser_operatingSystem', 1693)
10 ('hour_unique_user_count', 1638)
11 ('user_hour_max', 1627)
12 ('mean_hits_per_day', 1567)
13 ('month', 1534)
14 ('user_hour_min', 1523)
15 ('max_hits_per_day', 1484)
16 ('var_hits_per_day', 1451)
17 ('day', 1436)
18 ('month_unique_user_count', 1383)
19 ('mean_hour_per_browser_operatingSystem', 1358)
20 ('day_unique_user_count', 1286)
21 ('max_hits_per_hour', 1202)
22 ('mean_hits_per_hour', 1131)
23 ('

raw_train: results/2018-09-26T224437__T1.450_V1.578_K/reg_train_2018-09-26T224437.csv
raw_test: results/2018-09-26T224437__T1.450_V1.578_K/reg_test_2018-09-26T224437.csv
submit: results/2018-09-26T224437__T1.450_V1.578_K/aiden_2018-09-26T224437.csv.tar.gz
cp -f reg_lgbm.ipynb results/2018-09-26T224437__T1.450_V1.578_K/reg_lgbm.ipynb
    
results/2018-09-26T224437__T1.450_V1.578_K/aiden_2018-09-26T224437.csv.tar.gz


### StratifiedKFold

In [16]:
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=7)
y_categorized = np.digitize(df_train_y.values, bins=np.arange(2,24,2))

for fold_n, (train_index, test_index) in enumerate(folds.split(df_train_X, y_categorized)):
    fold_col = 'StratifiedKFold_{0:02d}'.format(fold_n)
    fold_cols.append(fold_col)
    print('Fold:', fold_col)

    X_train, X_valid = df_train_X.iloc[train_index], df_train_X.iloc[test_index]
    y_train, y_valid = df_train_y.iloc[train_index], df_train_y.iloc[test_index]
    
    model = modeling(fold_col, X_train, X_valid, y_train, y_valid)



Fold: StratifiedKFold_00




Training until validation scores don't improve for 100 rounds.
[25]	train's rmse: 1.86416	valid's rmse: 1.87081
[50]	train's rmse: 1.7667	valid's rmse: 1.78175
[75]	train's rmse: 1.69559	valid's rmse: 1.72043
[100]	train's rmse: 1.64523	valid's rmse: 1.68051
[125]	train's rmse: 1.60769	valid's rmse: 1.65385
[150]	train's rmse: 1.57808	valid's rmse: 1.63531
[175]	train's rmse: 1.55476	valid's rmse: 1.62269
[200]	train's rmse: 1.53595	valid's rmse: 1.61414
[225]	train's rmse: 1.51993	valid's rmse: 1.60843
[250]	train's rmse: 1.50648	valid's rmse: 1.60396
[275]	train's rmse: 1.4941	valid's rmse: 1.60111
[300]	train's rmse: 1.48433	valid's rmse: 1.59932
[325]	train's rmse: 1.47565	valid's rmse: 1.59812
[350]	train's rmse: 1.46774	valid's rmse: 1.59706
[375]	train's rmse: 1.46113	valid's rmse: 1.59635
[400]	train's rmse: 1.45532	valid's rmse: 1.59589
[425]	train's rmse: 1.45029	valid's rmse: 1.59564
[450]	train's rmse: 1.44658	valid's rmse: 1.59537
[475]	train's rmse: 1.4443	valid's rmse: 1

[450]	train's rmse: 1.4505	valid's rmse: 1.60185
[475]	train's rmse: 1.44731	valid's rmse: 1.60158
[500]	train's rmse: 1.44474	valid's rmse: 1.60163
Did not meet early stopping. Best iteration is:
[500]	train's rmse: 1.44474	valid's rmse: 1.60163
predict train set
predict test set
<feature_importance>
0 ('totals_pageviews_hit_rate', 8202)
1 ('totals_pageviews', 7065)
2 ('totals_hits', 4570)
3 ('visitNumber', 4296)
4 ('weekofyear', 3994)
5 ('geoNetwork_city', 3981)
6 ('geoNetwork_networkDomain', 3180)
7 ('source_country', 2786)
8 ('user_hour_mean', 2400)
9 ('browser_operatingSystem', 1758)
10 ('user_hour_max', 1757)
11 ('month', 1584)
12 ('user_hour_min', 1572)
13 ('mean_hits_per_day', 1538)
14 ('hour_unique_user_count', 1509)
15 ('month_unique_user_count', 1479)
16 ('var_hits_per_day', 1420)
17 ('max_hits_per_day', 1387)
18 ('day', 1365)
19 ('max_hits_per_hour', 1297)
20 ('day_unique_user_count', 1282)
21 ('mean_hour_per_browser_operatingSystem', 1218)
22 ('mean_hits_per_hour', 1183)
2

raw_train: results/2018-09-26T225106__T1.447_V1.596_K/reg_train_2018-09-26T225106.csv
raw_test: results/2018-09-26T225106__T1.447_V1.596_K/reg_test_2018-09-26T225106.csv
submit: results/2018-09-26T225106__T1.447_V1.596_K/aiden_2018-09-26T225106.csv.tar.gz
cp -f reg_lgbm.ipynb results/2018-09-26T225106__T1.447_V1.596_K/reg_lgbm.ipynb
    
results/2018-09-26T225106__T1.447_V1.596_K/aiden_2018-09-26T225106.csv.tar.gz


### GroupKFold

In [17]:
groups = df_train['date'].astype(str).str[:6].values
folds = GroupKFold(n_splits=len(np.unique(groups)))

for fold_n, (train_index, test_index) in enumerate(folds.split(df_train_X, df_train_y, groups)):
    fold_col = 'GroupKFold_{0:02d}'.format(fold_n)
    fold_cols.append(fold_col)
    print('Fold:', fold_col)

    X_train, X_valid = df_train_X.iloc[train_index], df_train_X.iloc[test_index]
    y_train, y_valid = df_train_y.iloc[train_index], df_train_y.iloc[test_index]
    
    model = modeling(fold_col, X_train, X_valid, y_train, y_valid)

Fold: GroupKFold_00




Training until validation scores don't improve for 100 rounds.
[25]	train's rmse: 1.91099	valid's rmse: 1.50199
[50]	train's rmse: 1.81034	valid's rmse: 1.43628
[75]	train's rmse: 1.73763	valid's rmse: 1.39247
[100]	train's rmse: 1.68688	valid's rmse: 1.36383
[125]	train's rmse: 1.64953	valid's rmse: 1.34556
[150]	train's rmse: 1.62007	valid's rmse: 1.333
[175]	train's rmse: 1.59698	valid's rmse: 1.32434
[200]	train's rmse: 1.57849	valid's rmse: 1.31887
[225]	train's rmse: 1.56329	valid's rmse: 1.31529
[250]	train's rmse: 1.55015	valid's rmse: 1.31286
[275]	train's rmse: 1.53892	valid's rmse: 1.31156
[300]	train's rmse: 1.52953	valid's rmse: 1.31103
[325]	train's rmse: 1.52164	valid's rmse: 1.31064
[350]	train's rmse: 1.51445	valid's rmse: 1.31037
[375]	train's rmse: 1.50796	valid's rmse: 1.31034
[400]	train's rmse: 1.5029	valid's rmse: 1.31039
[425]	train's rmse: 1.49859	valid's rmse: 1.3107
[450]	train's rmse: 1.4953	valid's rmse: 1.31097
Early stopping, best iteration is:
[357]	trai

predict train set
predict test set
<feature_importance>
0 ('totals_pageviews_hit_rate', 8242)
1 ('totals_pageviews', 7170)
2 ('totals_hits', 4627)
3 ('visitNumber', 4553)
4 ('weekofyear', 4245)
5 ('geoNetwork_city', 4198)
6 ('geoNetwork_networkDomain', 3299)
7 ('source_country', 2962)
8 ('user_hour_mean', 2558)
9 ('user_hour_max', 1947)
10 ('browser_operatingSystem', 1783)
11 ('user_hour_min', 1626)
12 ('month', 1620)
13 ('day', 1610)
14 ('hour_unique_user_count', 1549)
15 ('mean_hits_per_day', 1457)
16 ('var_hits_per_day', 1356)
17 ('day_unique_user_count', 1306)
18 ('max_hits_per_day', 1300)
19 ('max_hits_per_hour', 1280)
20 ('mean_hour_per_browser_operatingSystem', 1278)
21 ('month_unique_user_count', 1191)
22 ('mean_hits_per_hour', 1113)
23 ('mean_pageviews_per_network_domain', 1106)
24 ('sum_pageviews_per_network_domain', 1095)
25 ('hour', 1084)
26 ('trafficSource_isTrueDirect', 1012)
27 ('sum_hits_per_hour', 1006)
28 ('sum_hits_per_day', 954)
29 ('weekday', 871)
30 ('day_unique_s

raw_train: results/2018-09-26T225716__T1.451_V1.684_K/reg_train_2018-09-26T225716.csv
raw_test: results/2018-09-26T225716__T1.451_V1.684_K/reg_test_2018-09-26T225716.csv
submit: results/2018-09-26T225716__T1.451_V1.684_K/aiden_2018-09-26T225716.csv.tar.gz
cp -f reg_lgbm.ipynb results/2018-09-26T225716__T1.451_V1.684_K/reg_lgbm.ipynb
    
results/2018-09-26T225716__T1.451_V1.684_K/aiden_2018-09-26T225716.csv.tar.gz
Fold: GroupKFold_05
Training until validation scores don't improve for 100 rounds.
[25]	train's rmse: 1.86417	valid's rmse: 1.83767
[50]	train's rmse: 1.76471	valid's rmse: 1.76154
[75]	train's rmse: 1.69348	valid's rmse: 1.71731
[100]	train's rmse: 1.64337	valid's rmse: 1.69493
[125]	train's rmse: 1.60691	valid's rmse: 1.68481
[150]	train's rmse: 1.57815	valid's rmse: 1.68172
[175]	train's rmse: 1.55542	valid's rmse: 1.68225
[200]	train's rmse: 1.5371	valid's rmse: 1.68408
[225]	train's rmse: 1.52272	valid's rmse: 1.68681
Early stopping, best iteration is:
[148]	train's rmse

predict train set
predict test set
<feature_importance>
0 ('totals_pageviews_hit_rate', 8349)
1 ('totals_pageviews', 7217)
2 ('totals_hits', 4588)
3 ('visitNumber', 4539)
4 ('geoNetwork_city', 4253)
5 ('weekofyear', 3995)
6 ('geoNetwork_networkDomain', 3327)
7 ('source_country', 2973)
8 ('user_hour_mean', 2466)
9 ('user_hour_max', 1838)
10 ('browser_operatingSystem', 1837)
11 ('hour_unique_user_count', 1705)
12 ('user_hour_min', 1692)
13 ('mean_hits_per_day', 1636)
14 ('month_unique_user_count', 1628)
15 ('var_hits_per_day', 1504)
16 ('day', 1465)
17 ('month', 1459)
18 ('max_hits_per_day', 1342)
19 ('max_hits_per_hour', 1246)
20 ('mean_hour_per_browser_operatingSystem', 1194)
21 ('day_unique_user_count', 1164)
22 ('hour', 1123)
23 ('mean_pageviews_per_network_domain', 1107)
24 ('sum_hits_per_hour', 1054)
25 ('sum_pageviews_per_network_domain', 1041)
26 ('sum_hits_per_day', 1039)
27 ('trafficSource_isTrueDirect', 1021)
28 ('mean_hits_per_hour', 975)
29 ('weekday', 925)
30 ('day_unique_s

raw_train: results/2018-09-26T230304__T1.518_V1.506_K/reg_train_2018-09-26T230304.csv
raw_test: results/2018-09-26T230304__T1.518_V1.506_K/reg_test_2018-09-26T230304.csv
submit: results/2018-09-26T230304__T1.518_V1.506_K/aiden_2018-09-26T230304.csv.tar.gz
cp -f reg_lgbm.ipynb results/2018-09-26T230304__T1.518_V1.506_K/reg_lgbm.ipynb
    
results/2018-09-26T230304__T1.518_V1.506_K/aiden_2018-09-26T230304.csv.tar.gz
Fold: GroupKFold_10
Training until validation scores don't improve for 100 rounds.
[25]	train's rmse: 1.85409	valid's rmse: 2.01207
[50]	train's rmse: 1.75795	valid's rmse: 1.91402
[75]	train's rmse: 1.68867	valid's rmse: 1.84518
[100]	train's rmse: 1.63954	valid's rmse: 1.79861
[125]	train's rmse: 1.60341	valid's rmse: 1.76738
[150]	train's rmse: 1.57509	valid's rmse: 1.74454
[175]	train's rmse: 1.55332	valid's rmse: 1.72907
[200]	train's rmse: 1.53588	valid's rmse: 1.71878
[225]	train's rmse: 1.52137	valid's rmse: 1.71215
[250]	train's rmse: 1.50864	valid's rmse: 1.70711
[2

[225]	train's rmse: 1.53131	valid's rmse: 1.92551
[250]	train's rmse: 1.51878	valid's rmse: 1.92257
[275]	train's rmse: 1.50781	valid's rmse: 1.92056
[300]	train's rmse: 1.49909	valid's rmse: 1.91835
[325]	train's rmse: 1.49142	valid's rmse: 1.91549
[350]	train's rmse: 1.48475	valid's rmse: 1.91567
[375]	train's rmse: 1.47876	valid's rmse: 1.91513
[400]	train's rmse: 1.47366	valid's rmse: 1.91511
[425]	train's rmse: 1.4692	valid's rmse: 1.9151
[450]	train's rmse: 1.46609	valid's rmse: 1.91473
[475]	train's rmse: 1.46327	valid's rmse: 1.91463
[500]	train's rmse: 1.46095	valid's rmse: 1.91364
Did not meet early stopping. Best iteration is:
[500]	train's rmse: 1.46095	valid's rmse: 1.91364
predict train set
predict test set
<feature_importance>
0 ('totals_pageviews_hit_rate', 8703)
1 ('totals_pageviews', 7228)
2 ('totals_hits', 4758)
3 ('visitNumber', 4664)
4 ('weekofyear', 4346)
5 ('geoNetwork_city', 4318)
6 ('geoNetwork_networkDomain', 3524)
7 ('source_country', 2958)
8 ('user_hour_mean

### Submit

In [24]:
df_train_idx['y_true'] = df_train_y.values
df_train_idx['y_pred_sum'] = prediction_train
df_train_idx['y_pred_avg'] = prediction_train / 13
df_train_idx['y_diff'] = df_train_idx['y_true'] - df_train_idx['y_pred_avg']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
df_train_idx[df_train_idx['y_true'] > 0]

Unnamed: 0,fullVisitorId,totals_transactionRevenue,y_pred,y_pred_sum,y_pred_avg,y_true
752,6194193421514403509,37860000.0,76.324816,76.324816,5.871140,17.449406
753,5327166854580374902,306670000.0,85.517484,85.517484,6.578268,19.541283
799,8885051388942907862,68030000.0,174.511709,174.511709,13.423978,18.035459
802,0185467632009737931,26250000.0,68.420434,68.420434,5.263110,17.083177
859,3244885836845029978,574150000.0,168.051073,168.051073,12.927006,20.168401
866,3351538799616866750,8380000.0,55.299643,55.299643,4.253819,15.941359
893,1776658355119092313,395730000.0,47.535508,47.535508,3.656578,19.796243
910,770431600902969839,24080000.0,134.129131,134.129131,10.317625,16.996892
922,7147112211830167925,35480000.0,35.014300,35.014300,2.693408,17.384480
925,6664733704830724714,35080000.0,106.130502,106.130502,8.163885,17.373142


In [36]:
df_train_X[df_train['fullVisitorId']=='8007300615127214527']

Unnamed: 0,channelGrouping,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,...,count_pageviews_per_network_domain,mean_pageviews_per_network_domain,sum_hits_per_network_domain,count_hits_per_network_domain,mean_hits_per_network_domain,campaign_medium,medium_hits_mean,medium_hits_max,medium_hits_min,medium_hits_sum
1000,0,1.386294,0,0,0,1,47,3,33,3,...,244881,1.413671,362835.303771,244881,1.48168,0,1.318723,6.216606,0.693147,503173.377428
62193,0,0.693147,0,0,0,1,47,3,33,3,...,244881,1.413671,362835.303771,244881,1.48168,0,1.318723,6.216606,0.693147,503173.377428
188632,0,1.098612,0,0,0,1,47,3,33,3,...,244881,1.413671,362835.303771,244881,1.48168,0,1.318723,6.216606,0.693147,503173.377428
407085,0,1.609438,0,0,0,1,47,3,33,3,...,244881,1.413671,362835.303771,244881,1.48168,0,1.318723,6.216606,0.693147,503173.377428


In [None]:
from sklearn.metrics import mean_squared_error

df_g = df_train_idx.groupby('fullVisitorId').agg({'y_true':'sum', 'y_pred_avg':'sum'})

In [31]:
mse = mean_squared_error(df_g['y_true'], df_g['y_pred_avg'])
np.sqrt(mse)

2.315754261394708

### to Kaggle with MSG

In [None]:
# cmd = """kaggle competitions submit -c ga-customer-revenue-prediction -f {file_path} -m "{msg}"
# """.format(**{'file_path': submit_file_path, 'msg': ','.join(msg)[:500]})

cmd = """kaggle competitions submit -c ga-customer-revenue-prediction -f {file_path} -m "{msg}"
""".format(**{'file_path': submit_file_path, 'msg': 'msg'})

print(cmd)
subprocess.call(cmd, shell=True)