In [None]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
import gc
from sklearn.model_selection import train_test_split
from contextlib import contextmanager
@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    On success it prints ``[name] done in N s``. If the block raises, it
    prints ``[name] failed after N s`` and re-raises the exception unchanged,
    so the elapsed time is not lost when a long step dies part-way through.

    Parameters
    ----------
    name : str
        Label printed in front of the timing.
    """
    t0 = time.time()
    try:
        yield
    except BaseException:
        # Report the elapsed time for failed/interrupted steps too, then propagate.
        print(f'[{name}] failed after {time.time() - t0:.0f} s')
        raise
    print(f'[{name}] done in {time.time() - t0:.0f} s')

# Directory prefix that the CSV file names are appended to.
path = '../input/'

# Compact per-column dtypes handed to pd.read_csv to keep the frames small.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Columns to read from each file: both share the click attributes; the train
# list ends with the label and the test list ends with the submission id.
reading_cols_train = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
reading_cols_test = reading_cols_train[:-1] + ['click_id']

# Features

In [None]:
# Label column.
target = 'is_attributed'
# Columns passed to LightGBM as categorical features.
categoricals = ['app', 'device', 'os', 'channel', 'hour', 'day']
# Model input columns. This is a separate list from `categoricals`; the count
# helpers below append the name of every column they create to it.
predictors = [*categoricals, 'next_click']

def df_add_counts(df, cols):
    """Add a column holding, for each row, how many rows share its ``cols`` values.

    The new column is named ``"<col1>_<col2>_..._count"`` and is written to
    ``df`` in place. Its name is registered in the module-level ``predictors``
    list exactly once, so calling this for both the train and the test frame
    (or re-running a cell) does not duplicate entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; ``cols`` must hold non-negative integers.
    cols : list of str
        Columns whose value combination is counted.
    """
    feature = "_".join(cols) + '_count'
    if len(df) == 0:
        # max() of an empty array raises; an empty frame simply gets an empty column.
        df[feature] = np.array([], dtype=np.intp)
    else:
        arr_slice = df[cols].values
        # Widen before adding 1: in a narrow unsigned dtype (e.g. uint8 == 255)
        # `max + 1` would wrap around to 0 and produce invalid dimensions.
        dims = arr_slice.max(0).astype(np.int64) + 1
        # Collapse each row's tuple of values into a single integer key.
        flat_keys = np.ravel_multi_index(arr_slice.T, dims)
        _, unqtags, counts = np.unique(flat_keys, return_inverse=True, return_counts=True)
        # Broadcast each key's count back onto every row carrying that key.
        df[feature] = counts[unqtags]
    if feature not in predictors:
        predictors.append(feature)

def df_add_cum_counts(df, cols):
    """Add a running count of earlier rows that share the same ``cols`` values.

    The new column is named ``"<col1>_<col2>_..._cum_count"`` and is written to
    ``df`` in place; within each group the first row gets 0, the next 1, etc.
    Its name is registered in the module-level ``predictors`` list exactly
    once, so calling this for both the train and the test frame does not
    duplicate entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    cols : list of str
        Columns defining the groups.
    """
    feature = "_".join(cols) + '_cum_count'
    df[feature] = df.groupby(cols).cumcount()
    if feature not in predictors:
        predictors.append(feature)
    
def df_transform(df):
    """Add all engineered feature columns to ``df`` in place.

    Steps:
      1. Reset the index and parse ``click_time``; derive ``day`` and ``hour``.
      2. Add the group-count and cumulative-count columns via
         ``df_add_counts`` / ``df_add_cum_counts``.
      3. Add ``next_click``: seconds until the next row (in frame order) with
         the same (ip, app, device, os). The last row of each group has no
         successor and gets ``3000000000 - <its epoch seconds>``.
      4. Drop ``click_time``.

    The groups for ``next_click`` are matched exactly. The previous
    implementation bucketed them with ``hash(str) % 2**26``, which let
    unrelated groups collide and, because Python salts string hashes per
    process, made the collisions differ from run to run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ip``, ``app``, ``device``, ``os``, ``channel`` and
        ``click_time``. Modified in place; nothing is returned.
    """
    # Sentinel "next click" timestamp used when a group has no later click.
    no_next_click = 3000000000
    group_cols = ['ip', 'app', 'device', 'os']

    df.reset_index(drop=True, inplace=True)
    with timer("Adding counts"):
        df['click_time'] = pd.to_datetime(df['click_time'])
        dt = df['click_time'].dt
        df['day'] = dt.day.astype('uint8')
        df['hour'] = dt.hour.astype('uint8')
        del dt
        for count_cols in (['ip', 'day', 'hour'],
                           ['ip', 'app'],
                           ['ip', 'app', 'os'],
                           ['ip', 'device'],
                           ['app', 'channel'],
                           ['ip', 'device', 'os'],
                           ['ip', 'device', 'os', 'app']):
            df_add_counts(df, count_cols)
        df_add_cum_counts(df, ['ip', 'device', 'os'])
        df_add_cum_counts(df, ['ip', 'device', 'os', 'app'])
    with timer("Adding next click times"):
        # Whole seconds since the Unix epoch, independent of the datetime
        # resolution pandas picked when parsing.
        epochtime = (df['click_time'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
        # Timestamp of the following row in the same group (NaN for the last one).
        following = epochtime.groupby([df[col] for col in group_cols], sort=False).shift(-1)
        # Values are far below 2**53, so the float round-trip is exact.
        df['next_click'] = (following.fillna(no_next_click) - epochtime).astype(np.int64)
        df.drop(['click_time'], axis=1, inplace=True)

# Model

In [None]:
# Training-loop settings (used as defaults by lgb_modelfit_nocv).
num_boost_round = 2000        # upper bound on boosting iterations
early_stopping_rounds = 50    # stop when the validation metric has not improved for this many rounds
verbose_eval = 10             # print evaluation results every N rounds

# LightGBM hyper-parameters.
params = {
    'learning_rate': 0.1,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 7,  # Max leaves per tree; must not exceed 2^max_depth
    'max_depth': 4,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data points needed in a child (min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bins for feature values
    'subsample': 0.7,  # Subsample ratio of the training instances
    'subsample_freq': 1,  # Frequency of subsampling; <= 0 disables it
    'colsample_bytree': 0.7,  # Subsample ratio of columns when constructing each tree
    'min_child_weight': 0,  # Minimum sum of instance weight (hessian) needed in a child (leaf)
    'min_split_gain': 0,  # Minimum gain required to make a split (min_gain_to_split)
    'scale_pos_weight': 99.7,  # because training data is extremely unbalanced

    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'subsample_for_bin': 200000,  # Number of samples used for constructing bins
    'reg_alpha': 0,  # L1 regularization term on weights
    'reg_lambda': 0,  # L2 regularization term on weights
    'nthread': 4,
    'verbose': 0,
}

def lgb_modelfit_nocv(xgtrain, xgvalid, predictors,feval=None, 
                      early_stopping_rounds=early_stopping_rounds, 
                      num_boost_round=num_boost_round, 
                      verbose_eval=verbose_eval):
    """Train a LightGBM booster with the module-level ``params`` and early stopping.

    Both datasets are evaluated every round under the names ``'train'`` and
    ``'valid'``.

    LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose_eval`` /
    ``evals_result`` keyword arguments of ``lgb.train``; when the callback API
    is available it is used instead, otherwise the legacy keywords are passed.

    Parameters
    ----------
    xgtrain, xgvalid : lgb.Dataset
        Training and validation datasets.
    predictors : list of str
        Unused; kept so existing callers keep working.
    feval : callable, optional
        Custom evaluation function forwarded to ``lgb.train``.
    early_stopping_rounds : int
        Rounds without improvement before training stops (<= 0 or None disables).
    num_boost_round : int
        Maximum number of boosting rounds.
    verbose_eval : int or bool
        Evaluation-log period in rounds (True means every round, falsy disables).

    Returns
    -------
    lgb.Booster
        The trained model.
    """
    train_kwargs = dict(valid_sets=[xgtrain, xgvalid],
                        valid_names=['train', 'valid'],
                        num_boost_round=num_boost_round,
                        feval=feval)

    if hasattr(lgb, 'log_evaluation'):
        # Callback API (LightGBM >= 3.3; the only option from 4.0 on).
        callbacks = []
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            callbacks.append(lgb.early_stopping(early_stopping_rounds,
                                                verbose=bool(verbose_eval)))
        if verbose_eval is True:
            callbacks.append(lgb.log_evaluation(period=1))
        elif verbose_eval and verbose_eval > 0:
            callbacks.append(lgb.log_evaluation(period=verbose_eval))
        train_kwargs['callbacks'] = callbacks
    else:
        # Older LightGBM: same behavior through the legacy keyword arguments.
        train_kwargs['early_stopping_rounds'] = early_stopping_rounds
        train_kwargs['verbose_eval'] = verbose_eval

    return lgb.train(params, xgtrain, **train_kwargs)

# Train

In [None]:
# Flip to True to iterate quickly on a small sample.
TESTING = False

# Number of training rows to load.
NROWS = 1000000 if TESTING else 40000000

# NOTE(review): presumably the number of data rows in train.csv — confirm against the file.
TOTAL_ROWS = 184903890
SKIP_ROWS = TOTAL_ROWS - NROWS

print('loading train data...')
# Skip file rows 1..SKIP_ROWS (row 0, the header, is kept) so that only the
# last NROWS rows of the file are read.
train = pd.read_csv(
    path+"train.csv",
    usecols=reading_cols_train,
    dtype=dtypes,
    skiprows=range(1, SKIP_ROWS + 1),
    nrows=NROWS,
)

In [None]:
print('creating new features...')
# Adds the feature columns to `train` in place.
df_transform(train)
print('predictors : \n - ', '\n - '.join(predictors), sep='')

print('\ntrain_test_split...')
# Hold out a random 5% of the rows for validation (fixed seed for repeatability).
train_df, val_df = train_test_split(train, random_state=0, test_size=0.05)
# The full frame is no longer needed once it has been split.
del train
gc.collect()
print("train size: ", train_df.shape[0])
print("valid size: ", val_df.shape[0])

In [None]:
print('creating LGB dataset...')
# Both datasets share the same feature names and categorical columns.
dataset_options = {'feature_name': predictors, 'categorical_feature': categoricals}
xgtrain = lgb.Dataset(train_df[predictors].values,
                      label=train_df[target].values,
                      **dataset_options)
xgvalid = lgb.Dataset(val_df[predictors].values,
                      label=val_df[target].values,
                      **dataset_options)
# Drop the source frames (and the temporary options dict) before training.
del train_df, val_df, dataset_options
gc.collect()

In [None]:
print("Training...")
# Fit the booster using the default round / early-stopping / logging settings.
bst = lgb_modelfit_nocv(xgtrain=xgtrain, xgvalid=xgvalid, predictors=predictors)

In [None]:
# The datasets are not needed after training; release them before loading the test set.
del xgtrain, xgvalid
gc.collect()

# Predict

In [None]:
print('loading test data and creating new features...')
test = pd.read_csv(path+"test.csv", dtype=dtypes, usecols=reading_cols_test)
df_transform(test)

sub = pd.DataFrame()
sub['click_id'] = test['click_id'].astype('int')

print("predicting...")
# Feed the booster exactly the columns it was trained on, in training order.
# `predictors` is a mutable global that the feature helpers append to, so by
# this point it may list columns more than once; the model's own feature
# names are the reliable source.
sub['is_attributed'] = bst.predict(test[bst.feature_name()])
del test
gc.collect()

print("writing...")
sub.to_csv('sub_lightGBM_py.csv',index=False)
print("done...")