In [29]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import lightgbm as lgb
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool
import subprocess
import matplotlib.pyplot as plt
import os
import time
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold
from sklearn.model_selection import GroupKFold
from features import get_features
from report import report
from report import get_feature_importance
import operator

 


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [48]:
df_train = pd.read_csv("input/parsed_train.csv", dtype={'fullVisitorId': 'str', 'trafficSource_adwordsClickInfo.page':'str'})
df_test = pd.read_csv("input/parsed_test.csv", dtype={'fullVisitorId': 'str', 'trafficSource_adwordsClickInfo.page':'str'})


const_cols = ['socialEngagementType',
 'device_browserSize',
 'device_browserVersion',
 'device_flashVersion',
 'device_language',
 'device_mobileDeviceBranding',
 'device_mobileDeviceInfo',
 'device_mobileDeviceMarketingName',
 'device_mobileDeviceModel',
 'device_mobileInputSelector',
 'device_operatingSystemVersion',
 'device_screenColors',
 'device_screenResolution',
 'geoNetwork_cityId',
 'geoNetwork_latitude',
 'geoNetwork_longitude',
 'geoNetwork_networkLocation',
 'totals_visits',
 'trafficSource_adwordsClickInfo.criteriaParameters']

df_train.drop(const_cols, axis=1, inplace=True)
df_test.drop(const_cols, axis=1, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


### Feature Engineering

In [63]:
df_train = get_features(df_train)
df_test = get_features(df_test)

'fea_date_time'  21788.77 ms
'fea_format'  1211.72 ms
'fea_device'  1843.65 ms
'fea_totals'  2193.58 ms
'fea_geo_network'  1349.52 ms
'fea_traffic_source'  2272.41 ms
{'count_pageviews_per_network_domain', 'mean_hits_per_network_domain', 'mean_hits_per_day', 'hour', 'weekday', 'campaign_medium', 'medium_hits_max', 'source_country', 'day', 'mean_hits_per_hour', 'min_hits_per_hour', 'user_hour_min', 'mean_hour_per_browser_operatingSystem', 'day_unique_user_count', 'mean_pageviews_per_network_domain', 'user_hour_max', 'hour_unique_s_count', 'sum_hits_per_network_domain', 'count_hits_per_network_domain', 'weekday_unique_user_count', 'var_hits_per_hour', 'totals_pageviews_hit_rate', 'month_unique_s_count', 'var_hits_per_day', 'medium_hits_min', 'sum_hits_per_day', 'weekofmonth', 'weekday_unique_s_count', 'max_hits_per_day', 'min_hits_per_day', 'browser_category', 'medium_hits_sum', 'max_hits_per_hour', 'month_unique_user_count', 'hour_unique_user_count', 'medium_hits_mean', 'weekofyear', 'u

In [50]:
# df_test['totals_transactionRevenue'] = 0

# train_size = df_train.shape[0]
# df_merge = pd.concat([df_train, df_test])
# df_merge = get_features(df_merge)
                      
# df_train = df_merge[:train_size]
# df_test = df_merge[train_size:]
# df_test.drop('totals_transactionRevenue', axis=1, inplace=True)

### Split X and y, Remove not_used_cols

In [51]:
target_col = 'totals_transactionRevenue'
df_train_y = df_train[target_col].astype(float)
df_train_y.fillna(0, inplace=True)
df_train_y = np.log1p(df_train_y)

In [58]:
not_used_cols = [
#     "visitNumber", 
    "date", "fullVisitorId", "sessionId", "visitId",
#     "visitStartTime",
    'trafficSource_referralPath',
    'trafficSource_campaignCode', target_col,

#     'trafficSource_adwordsClickInfo.gclId',
#     'trafficSource_adContent',
#     'trafficSource_adwordsClickInfo.slot',
#     'trafficSource_adwordsClickInfo.gclId',
#     'trafficSource_adwordsClickInfo.adNetworkType',
    
#     'trafficSource_campaign',
#     'socialEngagementType',
#     'device_browserSize',
#     'device_browserVersion',
#     'device_flashVersion',
#     'device_language',
#     'device_mobileDeviceBranding',
#     'device_mobileDeviceInfo',
#     'device_mobileDeviceMarketingName',
#     'device_mobileDeviceModel',
#     'device_mobileInputSelector',
#     'device_operatingSystemVersion',
#     'device_screenColors',
#     'device_screenResolution',
#     'geoNetwork_cityId',
#     'geoNetwork_latitude',
#     'geoNetwork_longitude',
#     'geoNetwork_networkLocation',
#     'totals_visits',
#     'trafficSource_adwordsClickInfo.criteriaParameters',
#     'trafficSource_adwordsClickInfo.isVideoAd',
#     'trafficSource_adwordsClickInfo.page',
]

df_train_idx = df_train[['fullVisitorId', target_col]]
df_train_X = df_train.drop(not_used_cols, axis=1)

df_test_idx = df_test[['fullVisitorId']]
del_cols = list(set(df_test.columns) & set(not_used_cols))
df_test_X = df_test.drop(del_cols, axis=1)

### Label Encoding

In [59]:
categorical_feature = list(df_train_X.select_dtypes(exclude=np.number).columns)
print(categorical_feature)

# df_train_X[categorical_feature].fillna('NA_NULL', inplace=True)
# df_train_X.fillna(0, inplace=True)

# df_test_X[categorical_feature].fillna('NA_NULL', inplace=True)
# df_test_X.fillna(0, inplace=True)

# for col in categorical_feature:
#     print(col)
#     lbl = LabelEncoder()
#     lbl.fit(list(df_train_X[col].values.astype('str')) + list(df_test_X[col].values.astype('str')))
#     df_train_X[col] = lbl.transform(list(df_train_X[col].values.astype('str')))
#     df_test_X[col] = lbl.transform(list(df_test_X[col].values.astype('str')))

df_merge = pd.concat([df_train_X[categorical_feature], df_test_X[categorical_feature]])
train_size = df_train_X.shape[0]
for c in categorical_feature:
    st = time.time()
    
    labels, _ = pd.factorize(df_merge[c].values.astype('str'))
    
    df_train_X[c] = labels[:train_size]
    df_test_X[c] = labels[train_size:]
    print(c, time.time() - st)

['channelGrouping', 'device_browser', 'device_deviceCategory', 'device_isMobile', 'device_operatingSystem', 'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country', 'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region', 'geoNetwork_subContinent', 'trafficSource_adContent', 'trafficSource_adwordsClickInfo.adNetworkType', 'trafficSource_adwordsClickInfo.gclId', 'trafficSource_adwordsClickInfo.isVideoAd', 'trafficSource_adwordsClickInfo.page', 'trafficSource_adwordsClickInfo.slot', 'trafficSource_campaign', 'trafficSource_isTrueDirect', 'trafficSource_keyword', 'trafficSource_medium', 'trafficSource_source']
channelGrouping 1.1314396858215332
device_browser 1.2942581176757812
device_deviceCategory 1.0111286640167236
device_isMobile 0.8050539493560791
device_operatingSystem 0.9678449630737305
geoNetwork_city 1.097965955734253
geoNetwork_continent 0.7887811660766602
geoNetwork_country 0.9889791011810303
geoNetwork_metro 1.0761537551879883
geoNetwork_networkDomain 1.164

### Training with CV and TS

In [60]:
N_SPLITS = 5
feature_name = list(df_train_X.columns)

fold_cols = []
prediction_train = np.zeros(df_train_X.shape[0])
prediction = np.zeros(df_test_X.shape[0])

In [61]:
def modeling(fold_col, X_train, X_valid, y_train, y_valid):
    print('X_train:', X_train.shape, 'X_valid:', X_valid.shape, 'Rate:', X_valid.shape[0] / X_train.shape[0])
    if X_valid.shape[0] / X_train.shape[0] > 1.0:
        print('No Execution')
        return
    
    train = lgb.Dataset(
        X_train.values,
        label=y_train.values,
        feature_name=feature_name,
        categorical_feature=categorical_feature)

    valid = lgb.Dataset(
        X_valid.values,
        label=y_valid.values,
        feature_name=feature_name,
        categorical_feature=categorical_feature)

    params = {
        "objective": "regression",
        "metric": "rmse",
        "max_depth": 8,
        "min_child_samples": 20,
        "reg_alpha": 1,
        "reg_lambda": 1,
        "num_leaves": 257,
        "learning_rate": 0.01,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "subsample_freq ": 5,
    }
    #     params = {
    #     "objective": "regression",
    #     "metric": "rmse",
    #     "num_leaves": 30,
    #     "min_child_samples": 100,
    #     "learning_rate": 0.1,
    #     "bagging_fraction": 0.7,
    #     "feature_fraction": 0.5,
    #     "bagging_frequency": 5,
    #     "bagging_seed": 2018,
    #     "verbosity": -1
    # }

    # param = {
    #     "objective": "regression",
    #     "metric": "rmse",
    #     "max_depth": 8,
    #     "min_child_samples": 21,
    #     "reg_alpha": 1,
    #     "reg_lambda": 1,
    #     "num_leaves": 257,
    #     "learning_rate": 0.01,
    #     "subsample": 0.82,
    #     "colsample_bytree": 0.84,
    #     "verbosity": -1
    # }
    
    params = {
        "objective" : "regression",
        "metric" : "rmse", 
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 1989,
        "verbosity" : -1,
        'seed': 1989
    }
    
    evals_result = {}
    train_params = {
        'params': params,
        'train_set': train,
        'valid_sets': [train, valid],
        'valid_names': ['train', 'valid'],
        'early_stopping_rounds': 500,  #50
        'num_boost_round': 10000,  #500
        'verbose_eval': 25,
        'feval': None,
        'evals_result': evals_result,
        #     'categorical_feature': 'auto',
    }

    model = lgb.train(**train_params)

    global prediction_train, prediction
    print('predict train set')
    pred_train = model.predict(df_train_X, num_iteration=model.best_iteration)
    prediction_train += pred_train
    df_fold_train = df_train_idx.copy()
    df_fold_train[fold_col] = pred_train

    print('predict test set')
    pred = model.predict(df_test_X, num_iteration=model.best_iteration)
    prediction += pred
    df_fold_test = df_test_idx.copy()
    df_fold_test[fold_col] = pred

    # report
    msg = [fold_col, str(df_train_X.columns), str(train_params)]
    submit_file_path = report(df_fold_train, df_fold_test, pred_train, pred, msg, model)
    print(submit_file_path)

    return model

### GroupKFold

In [62]:
groups = df_train['date'].apply(lambda x: 1 if x > 20170531 else 0)
folds = GroupKFold(n_splits=2)

for fold_n, (train_index, test_index) in enumerate(folds.split(df_train_X, df_train_y, groups)):
    fold_col = 'GroupKFold_{0:02d}'.format(fold_n)
    fold_cols.append(fold_col)
    print('Fold:', fold_col)

    X_train, X_valid = df_train_X.iloc[train_index], df_train_X.iloc[test_index]
    y_train, y_valid = df_train_y.iloc[train_index], df_train_y.iloc[test_index]
    
    model = modeling(fold_col, X_train, X_valid, y_train, y_valid)

Fold: GroupKFold_00
X_train: (137946, 29) X_valid: (765707, 29) Rate: 5.550773491076218
No Execution
Fold: GroupKFold_01
X_train: (765707, 29) X_valid: (137946, 29) Rate: 0.18015507237102443




Training until validation scores don't improve for 500 rounds.
[25]	train's rmse: 1.62667	valid's rmse: 1.73084
[50]	train's rmse: 1.58857	valid's rmse: 1.70542
[75]	train's rmse: 1.57189	valid's rmse: 1.70031
[100]	train's rmse: 1.55546	valid's rmse: 1.69783
[125]	train's rmse: 1.54261	valid's rmse: 1.69649
[150]	train's rmse: 1.53273	valid's rmse: 1.6951
[175]	train's rmse: 1.52525	valid's rmse: 1.69516
[200]	train's rmse: 1.51839	valid's rmse: 1.69472
[225]	train's rmse: 1.51253	valid's rmse: 1.69515
[250]	train's rmse: 1.50665	valid's rmse: 1.69619
[275]	train's rmse: 1.50202	valid's rmse: 1.69626
[300]	train's rmse: 1.49637	valid's rmse: 1.69594
[325]	train's rmse: 1.49062	valid's rmse: 1.69578
[350]	train's rmse: 1.48668	valid's rmse: 1.69644
[375]	train's rmse: 1.48196	valid's rmse: 1.69652
[400]	train's rmse: 1.47682	valid's rmse: 1.69742
[425]	train's rmse: 1.47315	valid's rmse: 1.69779
[450]	train's rmse: 1.46955	valid's rmse: 1.69812
[475]	train's rmse: 1.46657	valid's rmse:

### Submit

In [11]:
df_train_idx['y_true'] = df_train_y.values
df_train_idx['y_pred_sum'] = prediction_train
df_train_idx['y_pred_avg'] = prediction_train / 13
df_train_idx['y_diff'] = df_train_idx['y_true'] - df_train_idx['y_pred_avg']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [12]:
df_train_idx[df_train_idx['y_true'] > 0]

Unnamed: 0,fullVisitorId,totals_transactionRevenue,y_true,y_pred_sum,y_pred_avg,y_diff
752,6194193421514403509,37860000.0,17.449406,1.621772,0.124752,17.324654
753,5327166854580374902,306670000.0,19.541283,3.071386,0.236260,19.305022
799,8885051388942907862,68030000.0,18.035459,8.251632,0.634741,17.400718
802,0185467632009737931,26250000.0,17.083177,6.187256,0.475943,16.607234
859,3244885836845029978,574150000.0,20.168401,13.237015,1.018232,19.150169
866,3351538799616866750,8380000.0,15.941359,9.543988,0.734153,15.207206
893,1776658355119092313,395730000.0,19.796243,6.214708,0.478054,19.318188
910,770431600902969839,24080000.0,16.996892,12.081975,0.929383,16.067510
922,7147112211830167925,35480000.0,17.384480,1.074680,0.082668,17.301812
925,6664733704830724714,35080000.0,17.373142,10.884931,0.837302,16.535839


In [13]:
df_train_X[df_train['fullVisitorId']=='8007300615127214527']

Unnamed: 0,channelGrouping,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,...,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_source
1000,0,3,0,0,0,1,47,3,33,3,...,22,10,0.0,34,0.0,30.0,1,0,0,0
62193,0,1,0,0,0,1,47,3,33,3,...,22,10,0.0,3,1.0,3.0,0,0,0,0
188632,0,2,0,0,0,1,47,3,33,3,...,22,10,0.0,13,0.0,8.0,1,0,0,0
407085,0,4,0,0,0,1,47,3,33,3,...,22,10,0.0,6,0.0,6.0,1,0,0,0


In [14]:
from sklearn.metrics import mean_squared_error

df_g = df_train_idx.groupby('fullVisitorId').agg({'y_true':'sum', 'y_pred_avg':'sum'})

In [15]:
mse = mean_squared_error(df_g['y_true'], df_g['y_pred_avg'])
np.sqrt(mse)

2.6908648336960663

### to Kaggle with MSG

In [16]:
# cmd = """kaggle competitions submit -c ga-customer-revenue-prediction -f {file_path} -m "{msg}"
# """.format(**{'file_path': submit_file_path, 'msg': ','.join(msg)[:500]})

cmd = """kaggle competitions submit -c ga-customer-revenue-prediction -f {file_path} -m "{msg}"
""".format(**{'file_path': submit_file_path, 'msg': 'msg'})

print(cmd)
subprocess.call(cmd, shell=True)

NameError: name 'submit_file_path' is not defined