In [1]:
%matplotlib inline
import lightgbm as lgb
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool
import subprocess
import matplotlib.pyplot as plt
import os
import time
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold
from features import get_features

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Load Data

In [2]:
df_train = pd.read_csv("input/parsed_train.csv", dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv("input/parsed_test.csv", dtype={'fullVisitorId': 'str'})

  interactivity=interactivity, compiler=compiler, result=result)


### Split X and y, Remove not_used_cols

In [3]:
target_col = 'totals_transactionRevenue'
df_train_y = df_train[target_col].astype(float)
df_train_y.fillna(0, inplace=True)
df_train_y = np.log1p(df_train_y)

In [4]:
not_used_cols = [
    "visitNumber", "date", "fullVisitorId", "sessionId", "visitId",
    "visitStartTime", 'trafficSource_referralPath',
    'trafficSource_campaignCode', target_col,
]
df_train_idx = df_train[['fullVisitorId']]
df_train_X = df_train.drop(not_used_cols, axis=1)

df_test_idx = df_test[['fullVisitorId']]
del_cols = list(set(df_test.columns) & set(not_used_cols))
df_test_X = df_test.drop(del_cols, axis=1)

### Label Encoding

In [5]:
categorical_feature = list(df_train_X.select_dtypes(exclude=np.number).columns)
# print(categorical_feature)

df_train_X[categorical_feature].fillna('NA_NULL', inplace=True)
df_train_X.fillna(0, inplace=True)

df_test_X[categorical_feature].fillna('NA_NULL', inplace=True)
df_test_X.fillna(0, inplace=True)

for c in categorical_feature:
    st = time.time()
    lbl = LabelEncoder()
    lbl.fit(
        list(df_train_X[c].values.astype('str')) +
        list(df_test_X[c].values.astype('str')))
    df_train_X[c] = lbl.transform(list(df_train_X[c].values.astype('str')))
    df_test_X[c] = lbl.transform(list(df_test_X[c].values.astype('str')))
    print(c, time.time() - st)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


channelGrouping 5.4396071434021
socialEngagementType 6.248616933822632
device_browser 8.595309972763062
device_browserSize 6.87174391746521
device_browserVersion 7.259109020233154
device_deviceCategory 4.693512916564941
device_flashVersion 7.017986059188843
device_isMobile 4.323090314865112
device_language 6.987238168716431
device_mobileDeviceBranding 6.812239170074463
device_mobileDeviceInfo 6.790982961654663
device_mobileDeviceMarketingName 6.757170915603638
device_mobileDeviceModel 6.677722930908203
device_mobileInputSelector 6.5731000900268555
device_operatingSystem 5.097864866256714
device_operatingSystemVersion 6.586817979812622
device_screenColors 6.508098602294922
device_screenResolution 6.413700819015503
geoNetwork_city 6.688896894454956
geoNetwork_cityId 6.519043207168579
geoNetwork_continent 4.476720094680786
geoNetwork_country 5.459803104400635
geoNetwork_latitude 6.239119291305542
geoNetwork_longitude 6.264742374420166
geoNetwork_metro 8.410763025283813
geoNetwork_networkD

### Training no val set

In [None]:
prediction = np.zeros(df_test_X.shape[0])
prediction_train = np.zeros(df_train_X.shape[0])

X_train, X_valid = df_train_X, df_train_X
y_train, y_valid = df_train_y, df_train_y

feature_name = list(df_train_X.columns)

train = lgb.Dataset(
    X_train.values,
    label=y_train.values,
    feature_name=feature_name,
    categorical_feature=categorical_feature)

valid = lgb.Dataset(
    X_valid.values,
    label=y_valid.values,
    feature_name=feature_name,
    categorical_feature=categorical_feature)

params = {
    "objective": "regression",
    "metric": "rmse",
    "max_depth": 8,
    "min_child_samples": 20,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "num_leaves": 257,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "subsample_freq ": 5,
}
evals_result = {}
train_params = {
    'params': params,
    'train_set': train,
    'valid_sets': [train, valid],
    'valid_names': ['train', 'valid'],
    'early_stopping_rounds': 100,  #50
    'num_boost_round': 300,  #500
    'verbose_eval': 25,
    'feval': None,
    'evals_result':evals_result,
    #     'categorical_feature': 'auto',
}


model = lgb.train(**train_params)    

### Prediction

In [7]:
print('predict test set')
y_pred = model.predict(df_test_X, num_iteration=model.best_iteration)

print('predict train set')
y_pred_train = model.predict(df_train_X, num_iteration=model.best_iteration)


predict test set
predict train set


### Report

In [8]:
# Init result configurations
RESULT_PATH = 'results'

time_tag = datetime.datetime.now().strftime('%Y-%m-%dT%H%M%S')
rmse_tag = 'T{0:.3f}_V{0:.3f}_K'.format(model.best_score['train']['rmse'], model.best_score['valid']['rmse'])
result_path = os.path.join(RESULT_PATH, '{}__{}'.format(time_tag, rmse_tag))

os.makedirs(result_path, exist_ok=True)
# Create train set raw result file
df_res = df_train_idx.copy()
df_res['y_true'] = df_train_y.values
df_res['y_pred'] = y_pred_train
file_name = 'reg_train_{}.csv'.format(time_tag)
df_res.to_csv(os.path.join(result_path, file_name), index=False)
print('raw_train:', os.path.join(result_path, file_name))

# Create train set raw result file
df_res = df_test_idx.copy()
df_res['y_pred'] = y_pred
file_name = 'reg_test_{}.csv'.format(time_tag)
df_res.to_csv(os.path.join(result_path, file_name), index=False)
print('raw_test:', os.path.join(result_path, file_name))

# Create submit file
df_test_idx['PredictedLogRevenue'] = 0
df_test_idx['PredictedLogRevenue'] = y_pred
df_test_idx["PredictedLogRevenue"] = df_test_idx["PredictedLogRevenue"].apply(lambda x : 0.0 if x < 0 else x)
df_test_idx["PredictedLogRevenue"] = df_test_idx["PredictedLogRevenue"].fillna(0.0)
df_test_idx["PredictedLogRevenue"] = np.expm1(df_test_idx["PredictedLogRevenue"])

df_submit = df_test_idx[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum().reset_index()
df_submit["PredictedLogRevenue"] = np.log1p(df_submit["PredictedLogRevenue"])

file_name = 'aiden_{}.csv.tar.gz'.format(time_tag)
df_submit.to_csv(os.path.join(result_path, file_name), index=False, compression='gzip')
print('submit:', os.path.join(result_path, file_name))

# Write MSG
msg = []
msg.append(str(df_train_X.columns))
# del train_params['evals_result']
# del train_params['valid_sets']
msg.append(str(train_params))

with open(os.path.join(result_path, 'result.log'), 'w') as f:
    f.write('\n'.join(msg))

# Copy notebook to results for history
cmd = """cp -f {notebook_name} {result_path}/{notebook_name}
""".format(**{'notebook_name': 'reg_lgbm.ipynb', 'result_path': result_path})
print(cmd)
subprocess.call(cmd, shell=True)


raw_train: results/2018-09-25T220946__T1.584_V1.584_K/reg_train_2018-09-25T220946.csv
raw_test: results/2018-09-25T220946__T1.584_V1.584_K/reg_test_2018-09-25T220946.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

submit: results/2018-09-25T220946__T1.584_V1.584_K/aiden_2018-09-25T220946.csv.tar.gz
cp -f reg_lgbm.ipynb results/2018-09-25T220946__T1.584_V1.584_K/reg_lgbm.ipynb



0

In [None]:
df_submit

### Feature Importance

In [None]:
fig, ax = plt.subplots(figsize=(15,15))
lgb.plot_importance(model, ax=ax, max_num_features=30)  
plt.savefig(os.path.join(result_path, 'feature_importance.jpg'))

### to Kaggle with MSG

In [None]:
# cmd = """kaggle competitions submit -c ga-customer-revenue-prediction -f {file_path} -m "{msg}"
# """.format(**{'file_path': os.path.join(result_path, file_name), 'msg': ','.join(msg)[:500]})
# print(cmd)
# subprocess.call(cmd, shell=True)