In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc



In [2]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target',
                      objective='binary', metrics='auc', feval=None,
                      early_stopping_rounds=20, num_boost_round=3000,
                      verbose_eval=10, categorical_features=None):
    """Train a LightGBM booster against a single hold-out set (no cross-validation).

    Args:
        params: dict of LightGBM parameters laid over the defaults defined
            below; ``None`` is treated as "no overrides".
        dtrain: DataFrame holding the training rows.
        dvalid: DataFrame holding the validation rows.
        predictors: list of feature column names read from both frames.
        target: name of the label column in both frames.
        objective: value for the LightGBM ``objective`` parameter.
        metrics: value for the LightGBM ``metric`` parameter. It is also used
            as the key into the recorded evaluation history and concatenated
            with a string for the report, so it has to be a single string.
        feval: optional custom evaluation function forwarded to ``lgb.train``.
        early_stopping_rounds: forwarded to ``lgb.train``.
        num_boost_round: forwarded to ``lgb.train``.
        verbose_eval: forwarded to ``lgb.train``.
        categorical_features: forwarded to ``lgb.Dataset`` as
            ``categorical_feature``.

    Returns:
        The object returned by ``lgb.train``.

    NOTE(review): ``evals_result``, ``early_stopping_rounds`` and
    ``verbose_eval`` are passed to ``lgb.train`` as keyword arguments, which
    is the older LightGBM calling convention -- confirm against the installed
    LightGBM version before upgrading.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        # 'is_unbalance': 'true',    # left disabled (originally noted as replaced with scale_pos_weight)
        'num_leaves': 31,            # keep below 2^max_depth when max_depth is set
        'max_depth': -1,             # -1 means no limit
        'min_child_samples': 20,     # minimum number of rows in a leaf (min_data_in_leaf)
        'max_bin': 255,              # number of bins feature values are bucketed into
        'subsample': 0.6,            # row subsample ratio
        'subsample_freq': 0,         # subsampling frequency; <= 0 disables it
        'colsample_bytree': 0.3,     # column subsample ratio used when building each tree
        'min_child_weight': 5,       # minimum sum of instance weight (hessian) in a leaf
        'subsample_for_bin': 200000, # number of rows sampled to construct the bins
        'min_split_gain': 0,         # minimum gain required to make a split
        'reg_alpha': 0,              # L1 regularization term
        'reg_lambda': 0,             # L2 regularization term
        'nthread': 4,
        'verbose': 0,
        # 'metrics' is an alias of 'metric' in LightGBM, so it is set once
        # above instead of under both names.
    }

    # Caller-supplied values take precedence over the defaults above.
    lgb_params.update(params or {})

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(
        dtrain[predictors].values,
        label=dtrain[target].values,
        feature_name=predictors,
        categorical_feature=categorical_features,
    )

    xgvalid = lgb.Dataset(
        dvalid[predictors].values,
        label=dvalid[target].values,
        feature_name=predictors,
        categorical_feature=categorical_features,
    )

    # Filled in by lgb.train with the per-iteration metric values.
    evals_results = {}

    bst1 = lgb.train(
        lgb_params,
        xgtrain,
        valid_sets=[xgtrain, xgvalid],
        valid_names=['train', 'valid'],
        evals_result=evals_results,      # was a misspelled, undefined name
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,       # honour the argument instead of a fixed 10
        feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    # best_iteration is used as a 1-based count, hence the -1 when indexing.
    print(metrics + ":", evals_results['valid'][metrics][n_estimators-1])

    return bst1
    

In [3]:
# Directory prefix for the input CSVs; file names are appended to it directly,
# so the trailing slash is required.
path = '../input/'

In [4]:
# Compact unsigned integer dtypes for the CSV columns, keyed by column name.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [19]:
print('loading train data...')
# Keep the header (row 0), skip file rows 1..144903890, then read at most
# 40,000,000 rows of the listed columns.
train_df = pd.read_csv(
    path + "train.csv",
    skiprows=range(1, 144903891),
    nrows=40000000,
    dtype=dtypes,
    usecols=[
        'ip',
        'app',
        'device',
        'os',
        'channel',
        'click_time',
        'is_attributed',
    ],
)

loading train data...


In [20]:
print('loading test data...')
# The test file is read in full; it carries click_id in place of is_attributed.
test_df = pd.read_csv(
    path + "test.csv",
    dtype=dtypes,
    usecols=[
        'ip',
        'app',
        'device',
        'os',
        'channel',
        'click_time',
        'click_id',
    ],
)

loading test data...


In [21]:
# Remember how many rows are training rows so the stacked frame can be split
# back apart by position later on.
len_train = len(train_df)
# Stack the test rows under the training rows so the features are computed
# over both. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
train_df = pd.concat([train_df, test_df])

In [9]:
# Preview the first rows of the frame.
train_df.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os
0,15,111,,2017-11-09 04:03:08,1,33924,0.0,19
1,3,280,,2017-11-09 04:03:08,1,37383,0.0,13
2,15,245,,2017-11-09 04:03:08,1,122294,0.0,10
3,9,145,,2017-11-09 04:03:08,1,73258,0.0,25
4,15,430,,2017-11-09 04:03:08,1,73347,0.0,13


In [10]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [22]:
# The test rows have been stacked into train_df, so the separate frame is
# dropped and a garbage collection pass is requested.
del test_df
gc.collect()

689

In [23]:
print('Extracting new features...')
# Parse the click_time strings once and derive both features from the result,
# rather than running pd.to_datetime over the whole column twice.
click_dt = pd.to_datetime(train_df.click_time)
train_df['hour'] = click_dt.dt.hour.astype('uint8')
train_df['day'] = click_dt.dt.day.astype('uint8')
# The parsed timestamps are only an intermediate; release them.
del click_dt

gc.collect()

Extracting new features...


14

In [25]:
print('grouping by ip-day-hour combination...')
# ip_tcount: number of rows per (ip, day, hour); 'channel' is just the column
# being counted. Only the column is renamed: the merge below joins on the key
# columns, so converting gp's row labels to strings (index=str) was unused work.
gp = (
    train_df[['ip', 'day', 'hour', 'channel']]
    .groupby(by=['ip', 'day', 'hour'])[['channel']]
    .count()
    .reset_index()
    .rename(columns={'channel': 'ip_tcount'})
)
train_df = train_df.merge(gp, on=['ip', 'day', 'hour'], how='left')
del gp
gc.collect()

grouping by ip-day-hour combination...


133

In [26]:
print('grouping by ip-app combination...')
# ip_app_count: number of rows per (ip, app). Only the column is renamed: the
# merge below joins on the key columns, so converting gp's row labels to
# strings (index=str) was unused work.
gp = (
    train_df[['ip', 'app', 'channel']]
    .groupby(by=['ip', 'app'])[['channel']]
    .count()
    .reset_index()
    .rename(columns={'channel': 'ip_app_count'})
)
train_df = train_df.merge(gp, on=['ip', 'app'], how='left')
del gp
gc.collect()

grouping by ip-app combination...


101

In [27]:
print('grouping by ip-app-os combination...')
# ip_app_os_count: number of rows per (ip, app, os). Only the column is
# renamed: the merge below joins on the key columns, so converting gp's row
# labels to strings (index=str) was unused work.
gp = (
    train_df[['ip', 'app', 'os', 'channel']]
    .groupby(by=['ip', 'app', 'os'])[['channel']]
    .count()
    .reset_index()
    .rename(columns={'channel': 'ip_app_os_count'})
)
train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()

grouping by ip-app-os combination...


117

In [28]:
# Adding features with var and mean hour (inspired from nuhsikander's script)
print('grouping by : ip_day_chl_var_hour')
# NOTE: despite its name, 'ip_tchan_count' holds the variance of 'hour' within
# each (ip, day, channel) group, not a count. The name is kept because the
# predictor list refers to it. Groups with a single row get NaN.
# Only the column is renamed: the merge below joins on the key columns, so
# converting gp's row labels to strings (index=str) was unused work.
gp = (
    train_df[['ip', 'day', 'hour', 'channel']]
    .groupby(by=['ip', 'day', 'channel'])[['hour']]
    .var()
    .reset_index()
    .rename(columns={'hour': 'ip_tchan_count'})
)
train_df = train_df.merge(gp, on=['ip', 'day', 'channel'], how='left')
del gp
gc.collect()

grouping by : ip_day_chl_var_hour


117

In [29]:
print('grouping by : ip_app_os_var_hour')
# ip_app_os_var: variance of 'hour' within each (ip, app, os) group. Only the
# column is renamed: the merge below joins on the key columns, so converting
# gp's row labels to strings (index=str) was unused work.
gp = (
    train_df[['ip', 'app', 'os', 'hour']]
    .groupby(by=['ip', 'app', 'os'])[['hour']]
    .var()
    .reset_index()
    .rename(columns={'hour': 'ip_app_os_var'})
)
train_df = train_df.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()

grouping by : ip_app_os_var_hour


116

In [30]:
print('grouping by : ip_app_channel_var_day')
# ip_app_channel_var_day: variance of 'day' within each (ip, app, channel)
# group. Only the column is renamed: the merge below joins on the key columns,
# so converting gp's row labels to strings (index=str) was unused work.
gp = (
    train_df[['ip', 'app', 'channel', 'day']]
    .groupby(by=['ip', 'app', 'channel'])[['day']]
    .var()
    .reset_index()
    .rename(columns={'day': 'ip_app_channel_var_day'})
)
train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
del gp
gc.collect()

grouping by : ip_app_channel_var_day


116

In [31]:
print('grouping by : ip_app_chl_mean_hour')
# ip_app_channel_mean_hour: mean of 'hour' within each (ip, app, channel)
# group. Only the column is renamed: the merge below joins on the key columns,
# so converting gp's row labels to strings (index=str) was unused work.
gp = (
    train_df[['ip', 'app', 'channel', 'hour']]
    .groupby(by=['ip', 'app', 'channel'])[['hour']]
    .mean()
    .reset_index()
    .rename(columns={'hour': 'ip_app_channel_mean_hour'})
)
print("merging...")
train_df = train_df.merge(gp, on=['ip', 'app', 'channel'], how='left')
del gp
gc.collect()

grouping by : ip_app_chl_mean_hour
merging...


116

In [32]:
print("var and data type: ")
train_df.info()
# Shrink the int64 count columns to the smallest unsigned integer dtype that
# can hold their values. A fixed astype('uint16') silently wraps any count
# above 65535, which would corrupt the feature for very active groups.
train_df['ip_tcount'] = pd.to_numeric(train_df['ip_tcount'], downcast='unsigned')
train_df['ip_app_count'] = pd.to_numeric(train_df['ip_app_count'], downcast='unsigned')
train_df['ip_app_os_count'] = pd.to_numeric(train_df['ip_app_os_count'], downcast='unsigned')

var and data type: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58790469 entries, 0 to 58790468
Data columns (total 17 columns):
app                         uint16
channel                     uint16
click_id                    float64
click_time                  object
device                      uint16
ip                          uint32
is_attributed               float64
os                          uint16
hour                        uint8
day                         uint8
ip_tcount                   int64
ip_app_count                int64
ip_app_os_count             int64
ip_tchan_count              float64
ip_app_os_var               float64
ip_app_channel_var_day      float64
ip_app_channel_mean_hour    float64
dtypes: float64(6), int64(3), object(1), uint16(4), uint32(1), uint8(2)
memory usage: 5.6+ GB


In [33]:
# Split the stacked frame back apart by position:
#   rows from len_train onwards        -> test
#   the last 2,500,000 training rows   -> validation
#   everything before those            -> training
test_df = train_df.iloc[len_train:]
val_df = train_df.iloc[len_train - 2500000:len_train]
train_df = train_df.iloc[:len_train - 2500000]

In [34]:
# Report the row count of each partition produced by the split.
print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size: ", len(test_df) )

train size:  37500000
valid size:  2500000
test size:  18790469


In [35]:
# Label column.
target = 'is_attributed'

# Feature columns, in the order they are handed to the model.
predictors = [
    # raw columns and time parts
    'app',
    'device',
    'os',
    'channel',
    'hour',
    'day',
    # group-level aggregates
    'ip_tcount',
    'ip_tchan_count',
    'ip_app_count',
    'ip_app_os_count',
    'ip_app_os_var',
    'ip_app_channel_var_day',
    'ip_app_channel_mean_hour',
]

# Subset of the predictors declared as categorical.
categorical = [
    'app',
    'device',
    'os',
    'channel',
    'hour',
    'day',
]

In [36]:
# Submission frame holding the click ids of the test rows, cast to integers
# (click_id is a float column in the stacked frame).
sub = pd.DataFrame({'click_id': test_df['click_id'].astype('int')})
gc.collect()

45

In [None]:
# Overrides laid on top of the defaults inside lgb_modelfit_nocv.
# 'is_unbalance' stays unset (originally noted as replaced with the
# scale_pos_weight argument).
params = {
    'learning_rate': 0.15,
    # Tree shape: 7 leaves = 2^max_depth - 1 for a depth limit of 3
    # (a max_depth of -1 would mean no limit).
    'num_leaves': 7,
    'max_depth': 3,
    # Minimum number of rows required in a leaf (min_data_in_leaf).
    'min_child_samples': 100,
    # Number of bins the feature values are bucketed into.
    'max_bin': 100,
}

In [None]:
print("Training...")
start_time = time.time()