In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import lightgbm as lgb
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool
import subprocess
import matplotlib.pyplot as plt
import os
import time
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold
from sklearn.model_selection import GroupKFold
from features import get_features
from features import encode_label
from features import encode_mean_k_fold
from features import encode_frequency

from report import report
from report import get_feature_importance
from report import submit_to_kaggle
import operator
from tqdm import tqdm


def drop_cols(df, cols):
    drop_cols = [c for c in df.columns if c in cols]
    df.drop(drop_cols, axis=1, inplace=True)
    return df

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### Load Data

In [2]:
%%time
df_train = pd.read_pickle("input/train.pickle")
df_test = pd.read_pickle("input/test.pickle")

CPU times: user 8.79 s, sys: 3.67 s, total: 12.5 s
Wall time: 12.7 s


#### Const Cols

In [3]:
const_cols = [
    'socialEngagementType', 'device_browserSize', 'device_browserVersion',
    'device_flashVersion', 'device_language', 'device_mobileDeviceBranding',
    'device_mobileDeviceInfo', 'device_mobileDeviceMarketingName',
    'device_mobileDeviceModel', 'device_mobileInputSelector',
    'device_operatingSystemVersion', 'device_screenColors',
    'device_screenResolution', 'geoNetwork_cityId', 'geoNetwork_latitude',
    'geoNetwork_longitude', 'geoNetwork_networkLocation', 'totals_visits',
    'trafficSource_adwordsClickInfo.criteriaParameters',
    'trafficSource_campaignCode'
]

df_train = drop_cols(df_train, const_cols)
df_test = drop_cols(df_test, const_cols)

#### Cat Cols

In [4]:
categorical_feature = [
    'trafficSource_adwordsClickInfo.isVideoAd',
    'device_isMobile',
    "channelGrouping",
    "device_browser",
    "device_deviceCategory",
    "device_operatingSystem",
    "geoNetwork_city",
    "geoNetwork_continent",
    "geoNetwork_country",
    "geoNetwork_metro",
    "geoNetwork_networkDomain",
    "geoNetwork_region",
    "geoNetwork_subContinent",
    "trafficSource_adContent",
    "trafficSource_adwordsClickInfo.adNetworkType",
    "trafficSource_adwordsClickInfo.gclId",
    "trafficSource_adwordsClickInfo.page",
    "trafficSource_adwordsClickInfo.slot",
    "trafficSource_campaign",
    "trafficSource_keyword",
    "trafficSource_medium",
    "trafficSource_referralPath",
    "trafficSource_source",
    'trafficSource_isTrueDirect',
]

for col in categorical_feature:
    df_train[col] = df_train[col].astype(str)
    df_test[col] = df_test[col].astype(str)

df_train[categorical_feature].fillna('NA_NULL', inplace=True)
df_test[categorical_feature].fillna('NA_NULL', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


#### Num Cols

In [5]:
num_cols = [
    "totals_hits", "totals_pageviews", "visitNumber", "visitStartTime",
    'totals_bounces', 'totals_newVisits'
]

for col in num_cols:
    df_train[col] = df_train[col].astype(float)
    df_test[col] = df_test[col].astype(float)

df_train["totals_transactionRevenue"] = df_train[
    "totals_transactionRevenue"].astype('float')

In [6]:
print('df_train.shape:', df_train.shape)
print('df_test.shape:', df_test.shape)
df_train.info()

df_train.shape: (903653, 35)
df_test.shape: (804684, 34)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 35 columns):
channelGrouping                                 903653 non-null object
date                                            903653 non-null int64
fullVisitorId                                   903653 non-null object
sessionId                                       903653 non-null object
visitId                                         903653 non-null int64
visitNumber                                     903653 non-null float64
visitStartTime                                  903653 non-null float64
device_browser                                  903653 non-null object
device_deviceCategory                           903653 non-null object
device_isMobile                                 903653 non-null object
device_operatingSystem                          903653 non-null object
geoNetwork_city                                 903

In [7]:
%%time
df_train.to_pickle("input/train_lv01.pickle")
df_test.to_pickle("input/test_lv01.pickle")

CPU times: user 11.6 s, sys: 1.71 s, total: 13.3 s
Wall time: 13.5 s
