# 0. Load Dataset

In [2]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc



In [3]:
# Compact unsigned-integer dtypes for the click-log columns.
# 'click_id' is listed for completeness but is not among the columns loaded below.
# NOTE(review): 'uint8' caps 'device' at 255 — confirm every device id in the data fits.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint8',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

print('loading train data...')
# Only the listed columns are read; click_time is parsed into datetimes while loading.
train_df = pd.read_csv(
    '/Users/mlx/Downloads/Document/talking_data/train_sample.csv',
    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'],
    parse_dates=['click_time'],
    dtype=dtypes,
)
print('loading finished.')

loading train data...
loading finished.


# 1. Feature Engineering

In [4]:
# Accumulates the names of engineered feature columns; calNextOrPrevClick appends to it.
predictors=[]

## 1.1 Extracting time information

In [5]:
# Convert click_time once and derive every calendar component from that single series.
click_ts = pd.to_datetime(train_df.click_time)
for part in ('second', 'minute', 'hour', 'day'):
    # Each component is small enough (< 256) to be stored as uint8.
    train_df[part] = getattr(click_ts.dt, part).astype('uint8')
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,second,minute,hour,day
0,87540,12,1,13,497,2017-11-07 09:30:38,0,38,30,9,7
1,105560,25,1,17,259,2017-11-07 13:40:27,0,27,40,13,7
2,101424,12,1,19,212,2017-11-07 18:05:24,0,24,5,18,7
3,94584,13,1,13,477,2017-11-07 04:58:08,0,8,58,4,7
4,68413,12,1,1,178,2017-11-09 09:00:09,0,9,0,9,9


## 1.2 Get Next/Prev Click

In [25]:
def calNextOrPrevClick(df, agg_suffix, agg_type='float32'):
    """Add "seconds to the next / previous click" features to ``df``.

    For every key combination in ``GROUP_BY_NEXT_CLICKS`` a column named
    ``<keys joined by '_'>_<agg_suffix>`` is added. It holds the elapsed time,
    in seconds, between a row's ``click_time`` and the chronologically adjacent
    click that shares the same key values. Rows without such a neighbour
    (first/last click of a group) get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ip``, ``app``, ``device``, ``os``, ``channel`` and a
        datetime ``click_time`` column. Rows may be in any order. The frame is
        modified in place.
    agg_suffix : {'nextClick', 'prevClick'}
        Which neighbour to measure against; also used as the column suffix.
    agg_type : str or dtype, default 'float32'
        dtype the new columns are cast to.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.

    Raises
    ------
    ValueError
        If ``agg_suffix`` is not one of the two supported values.

    Side effects
    ------------
    Each new column name is appended to the module-level ``predictors`` list
    (only once, so re-running the cell does not create duplicates).
    """
    # Fail early: otherwise a column name would be registered in `predictors`
    # without the column ever being created.
    if agg_suffix not in ('nextClick', 'prevClick'):
        raise ValueError(
            f"agg_suffix must be 'nextClick' or 'prevClick', got {agg_suffix!r}")

    print(f">> \nExtracting {agg_suffix} time calculation features...\n")

    GROUP_BY_NEXT_CLICKS = [
    {'groupby': ['ip', 'app', 'device', 'os', 'channel']},
    {'groupby': ['ip', 'os', 'device']},
    {'groupby': ['ip', 'os', 'device', 'app']}
    ]

    # Union of all grouping keys, in first-seen order.
    key_columns = []
    for spec in GROUP_BY_NEXT_CLICKS:
        for column in spec['groupby']:
            if column not in key_columns:
                key_columns.append(column)

    # The group-wise shift only means "next/previous click" when rows are in
    # chronological order, so work on a time-sorted copy. The index is reset to
    # plain positions first so results can be mapped back to the original row
    # order even if df's own index is not unique. mergesort is stable, so an
    # already-sorted frame keeps exactly its existing order.
    ordered = (df[key_columns + ['click_time']]
               .reset_index(drop=True)
               .sort_values('click_time', kind='mergesort'))

    # Calculate the time to the adjacent click for each group
    for spec in GROUP_BY_NEXT_CLICKS:

        # Name of new feature
        new_feature = '{}_{}'.format('_'.join(spec['groupby']), agg_suffix)

        # Run calculation
        print(f">> Grouping by {spec['groupby']}, and saving time to {agg_suffix} in: {new_feature}")
        grouped_time = ordered.groupby(spec['groupby']).click_time
        if agg_suffix == "nextClick":
            delta = grouped_time.shift(-1) - ordered.click_time
        else:
            delta = ordered.click_time - grouped_time.shift(+1)

        # total_seconds() keeps whole days in the gap (.dt.seconds would drop
        # them); sort_index() restores the original row order and .values
        # assigns by position.
        df[new_feature] = delta.sort_index().dt.total_seconds().astype(agg_type).values

        if new_feature not in predictors:
            predictors.append(new_feature)
        gc.collect()
    return df


In [26]:
# Adds one "<keys>_prevClick" column per key combination; the function mutates train_df and returns it.
train_df=calNextOrPrevClick(train_df, 'prevClick')

>> 
Extracting prevClick time calculation features...

>> Grouping by ['ip', 'app', 'device', 'os', 'channel'], and saving time to prevClick in: ip_app_device_os_channel_prevClick
>> Grouping by ['ip', 'os', 'device'], and saving time to prevClick in: ip_os_device_prevClick
>> Grouping by ['ip', 'os', 'device', 'app'], and saving time to prevClick in: ip_os_device_app_prevClick


In [22]:
# Preview only the first rows instead of rendering the whole frame.
train_df.head(10)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,ip_app_device_os_channel_prevClick,ip_os_device_prevClick,ip_os_device_app_prevClick
0,87540,12,1,13,497,2017-11-07 09:30:38,0,,,
1,105560,25,1,17,259,2017-11-07 13:40:27,0,,,
2,101424,12,1,19,212,2017-11-07 18:05:24,0,,,
3,94584,13,1,13,477,2017-11-07 04:58:08,0,,,
4,68413,12,1,1,178,2017-11-09 09:00:09,0,,,
5,93663,3,1,17,115,2017-11-09 01:22:13,0,,,
6,17059,1,1,17,135,2017-11-09 01:17:58,0,,,
7,121505,9,1,25,442,2017-11-07 10:01:53,0,,,
8,192967,2,2,22,364,2017-11-08 09:35:17,0,,,
9,143636,3,1,19,135,2017-11-08 12:35:26,0,,,


# 2. Model Training

In [None]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    """Train a LightGBM model on ``dtrain`` while monitoring ``dvalid`` (no cross-validation).

    Parameters
    ----------
    params : dict or None
        Overrides merged on top of the default LightGBM parameters below.
        ``None`` means "use the defaults unchanged".
    dtrain, dvalid : pandas.DataFrame
        Training and validation frames; both must contain the ``predictors``
        columns and the ``target`` column.
    predictors : list of str
        Feature column names, also passed to LightGBM as ``feature_name``.
    target : str
        Name of the label column.
    objective, metrics : str
        Stored as the ``objective`` / ``metric`` LightGBM parameters.
        ``metrics`` is also used as the key for the final report line.
    feval, early_stopping_rounds, num_boost_round, verbose_eval
        Forwarded unchanged to ``lgb.train``.
    categorical_features
        Forwarded to ``lgb.Dataset`` as ``categorical_feature``.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.2,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequency of subsample, <=0 means not enabled
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # Minimum gain required to perform a split (min_gain_to_split)
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }

    # Caller-supplied settings take precedence over the defaults above.
    lgb_params.update(params or {})

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    # Filled by lgb.train with the per-iteration metric history of each validation set.
    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train','valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,  # honour the caller's logging interval
                     feval=feval)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)
    # best_iteration is 1-based, the recorded history is 0-based.
    print(metrics+":", evals_results['valid'][metrics][bst1.best_iteration-1])

    return bst1

In [2]:
import pandas as pd
# Loads the complete train.csv with pandas' default dtypes (no dtype/usecols narrowing here).
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
whole_train_df = pd.read_csv('/Users/mlx/Downloads/Document/talking_data/train.csv')

In [5]:
# sample() already returns the drawn rows in random order, so no extra shuffle is needed;
# a fixed seed makes the 1M-row subset reproducible across kernel restarts.
one_m_entries = whole_train_df.sample(n=1000000, random_state=42)

In [6]:
# Write the sampled rows to disk; index=False leaves the DataFrame index out of the file.
one_m_entries.to_csv('/Users/mlx/Downloads/Document/talking_data/train_1m.csv', index=False)