In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc

In [2]:
gc.collect()

0

In [3]:
from contextlib import contextmanager

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

### Functions for Features

In [4]:
def df_add_counts(df, cols):
    arr_slice = df[cols].values
    unq, unqtags, counts = np.unique(np.ravel_multi_index(arr_slice.T, arr_slice.max(axis=0) + 1),
                                     return_inverse=True, return_counts=True)
    df["_".join(cols)+'_count'] = counts[unqtags]

In [5]:
def make_count_features(df):
    with timer("add count features"):
        df['click_time']= pd.to_datetime(df['click_time'])
        dt= df['click_time'].dt
        df['day'] = dt.day.astype('uint8')
        df['hour'] = dt.hour.astype('uint8')
        df['minute'] = dt.minute.astype('uint8')
        del(dt)
        
        df_add_counts(df, ['ip'])
        df_add_counts(df, ['ip','day','hour','minute'])
        df_add_counts(df, ['os','device'])
        df_add_counts(df, ['os','app','channel'])
        
        df_add_counts(df, ['ip', 'day', 'hour'])
        df_add_counts(df, ['ip', 'app'])
        df_add_counts(df, ['ip', 'app', 'os'])
        df_add_counts(df, ['ip', 'device'])
        df_add_counts(df, ['app', 'channel'])

In [6]:
def make_click_features(df):
    D= 2**26
    df['category'] = (df['ip'].astype(str) + "_" + df['app'].astype(str) + "_" + df['device'].astype(str) \
                      + "_" + df['os'].astype(str)).apply(hash) % D
    
    df['epochtime']= df['click_time'].astype(np.int64) // 10 ** 9
        
    with timer("Adding next click times"):
        
        click_buffer= np.full(D, 3000000000, dtype=np.uint32)
        next_clicks= []
        for category, time in zip(reversed(df['category'].values), reversed(df['epochtime'].values)):
            next_clicks.append(click_buffer[category]-time)
            click_buffer[category]= time
        del(click_buffer)
        df['next_click']= list(reversed(next_clicks))
   
    with timer("Adding previous click times"):
        
        click_buffer= np.zeros(D, dtype=np.uint32)
        prev_clicks= []
        for category, time in zip(df['category'].values, df['epochtime'].values):
            prev_clicks.append(time-click_buffer[category])
            click_buffer[category]= time
        del(click_buffer)
        df['prev_click']= prev_clicks
        

In [1]:
hash('da_rou_mao')

613739358056957992

In [7]:
def log_bin_feature(df, features):
    with timer("Log-binning features"):
        for fea in features: 
            df[fea]= np.log2(1 + df[fea].values).astype(int)

### Read Data

In [8]:
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'int8'
}

to_read = ['ip','app','device','os','channel','click_time','is_attributed'] 

In [9]:
train_df_tmp = pd.read_csv('./data/train.csv.zip', parse_dates=['click_time'], 
                           usecols=to_read, dtype=dtypes, chunksize = 10**6)

In [10]:
train_df = pd.concat(train_df_tmp)

In [13]:
del train_df_tmp
gc.collect()

0

In [14]:
test_df_tmp = pd.read_csv('./data/test.csv.zip', parse_dates=['click_time'], dtype=dtypes, chunksize = 10**6)

In [15]:
test_df = pd.concat(test_df_tmp)

In [16]:
del test_df_tmp
gc.collect()

126

In [17]:
num_train = train_df.shape[0]
y = train_df.is_attributed.values

In [18]:
train_df.shape

(184903890, 7)

In [19]:
test_df.shape

(18790469, 7)

Concat Train & Test

In [20]:
common_column = ['ip','app','device','os','channel','click_time']
concat_df = pd.concat([train_df[common_column],test_df[common_column]])

In [21]:
del train_df, test_df
gc.collect()

67

In [22]:
make_count_features(concat_df)

[add count features] done in 1215 s


In [23]:
make_click_features(concat_df)

[Adding next click times] done in 685 s
[Adding previous click times] done in 225 s


In [24]:
gc.collect()

35

In [25]:
concat_df.dtypes

ip                                  uint32
app                                 uint16
device                              uint16
os                                  uint16
channel                             uint16
click_time                  datetime64[ns]
day                                  uint8
hour                                 uint8
minute                               uint8
ip_count                             int64
ip_day_hour_minute_count             int64
os_device_count                      int64
os_app_channel_count                 int64
ip_day_hour_count                    int64
ip_app_count                         int64
ip_app_os_count                      int64
ip_device_count                      int64
app_channel_count                    int64
category                             int64
epochtime                            int64
next_click                           int64
prev_click                           int64
dtype: object

In [26]:
features = ['ip_count',
            'ip_day_hour_minute_count',
            'os_device_count',
            'os_app_channel_count',
            'ip_day_hour_count',
            'ip_app_count',
            'ip_app_os_count',
            'ip_device_count',
            'app_channel_count',
            'next_click',
            'prev_click']

In [27]:
log_bin_feature(concat_df, features)

[Log-binning features] done in 134 s


Save concat_df to csv

In [37]:
col_to_write = [col for col in list(concat_df) if col not in ['click_time','category','epochtime']]

In [40]:
concat_df.to_csv('tmp/concat_df.csv.gz', chunksize=10**6, compression='gzip', index=False, 
                 columns = col_to_write)

features = ['ip_count',
            'ip_day_hour_minute_count',
            'os_device_count',
            'os_app_channel_count',
            'ip_day_hour_count',
            'ip_app_count',
            'ip_app_os_count',
            'ip_device_count',
            'app_channel_count',
            'next_click',
            'prev_click']

log_bin_feature(concat_df, features)

### Model

In [None]:
target = "is_attributed"
categorical_features = ['ip','app','os','channel','device']
predictors = list(set(concat_df.columns)-set([target])-set(['click_time','category','epochtime']))

In [None]:
lgbtrain = lgb.Dataset(concat_df.iloc[:num_train][predictors].values, label=y,
                      feature_name=predictors,
                      categorical_feature=categorical_features
                      )

In [None]:
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'subsample_for_bin': 200000,  # Number of samples for constructing bin
    'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
    'reg_alpha': 0,  # L1 regularization term on weights
    'reg_lambda': 0,  # L2 regularization term on weights
    'nthread': 4,
    'verbose': 0,
    'metric':'auc',     
 
    'learning_rate': 0.15,
    'num_leaves': 7,  # 2^max_depth - 1
    'max_depth': 3,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight':99
}

In [None]:
evals_results = {}
num_boost_round = 200
early_stopping_rounds = 30

booster = lgb.train(
     lgb_params, 
     lgbtrain, 
     valid_sets=[lgbtrain], 
     valid_names=['train'], 
     evals_result=evals_results, 
     num_boost_round=num_boost_round,
     early_stopping_rounds=early_stopping_rounds,
     verbose_eval=1
    )

In [None]:
def batch_pred(model, df, batch_size=10**5):
    
    N_batch = int(df.shape[0]/batch_size)
    pred = []
    
    for i in range(N_batch+1):
        if i%100 == 0 & i>0:
            print("finished batch %d/%d" %(i, N_batch))
        
        batch_X = df.iloc[i*batch_size : (i+1)*batch_size] 
        batch_pred = model.predict(batch_X.values)
        pred.extend(list(batch_pred))
    
    return pred

In [None]:
preditions = batch_pred(booster, concat_df.iloc[num_train: ][predictors], batch_size=10**5)

In [None]:
len(preditions)

In [None]:
concat_df.shape

### Sub

In [None]:
sub = pd.read_csv('data/sample_submission.csv.zip')

In [None]:
sub.shape

In [None]:
sub['is_attributed'] =  preditions