In [2]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import lightgbm as lgb
import gc

In [3]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                      feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10,
                      categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        # 'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'metric': metrics
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10,
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics + ":", evals_results['valid'][metrics][n_estimators - 1])

    return bst1


In [67]:
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32'
}
print('loading train data...')
train_df = pd.read_csv('/Users/mlx/Downloads/Document/talking_data/train_sample.csv', dtype=dtypes,
                       usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

loading train data...


In [68]:
print('Extracting new time-related features...')
train_df['second'] = pd.to_datetime(train_df.click_time).dt.second.astype('uint8')
train_df['minute'] = pd.to_datetime(train_df.click_time).dt.minute.astype('uint8')
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.click_time).dt.day.astype('uint8')
train_df['month'] = pd.to_datetime(train_df.click_time).dt.month.astype('uint8')

Extracting new time-related features...


In [69]:
total = len(train_df)
r_size = total // 4
val_df = train_df[:r_size]
train_df = train_df[r_size:]

In [76]:
target = 'is_attributed'
predictors = ['ip', 'app', 'device', 'os', 'channel', 'second', 'minute','hour', 'day', 'month']
categorical = ['ip', 'app', 'device', 'os', 'channel', 'second', 'minute','hour', 'day', 'month']
params = {
    'learning_rate': 0.15,
    # 'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 12,  # 2^max_depth - 1
    'max_depth': 4,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight': 99  # because training data is extremely unbalanced
}

bst = lgb_modelfit_nocv(params,
                        train_df,
                        val_df,
                        predictors,
                        target,
                        objective='binary',
                        metrics='auc',
                        early_stopping_rounds=100,
                        verbose_eval=True,
                        num_boost_round=500,
                        categorical_features=categorical)

preparing validation datasets
Training until validation scores don't improve for 100 rounds.
[10]	train's auc: 0.993087	valid's auc: 0.966497
[20]	train's auc: 0.997953	valid's auc: 0.95894




[30]	train's auc: 0.998722	valid's auc: 0.953376
[40]	train's auc: 0.999206	valid's auc: 0.953392
[50]	train's auc: 0.999489	valid's auc: 0.940056
[60]	train's auc: 0.999701	valid's auc: 0.934006
[70]	train's auc: 0.999801	valid's auc: 0.928214
[80]	train's auc: 0.999902	valid's auc: 0.923562
[90]	train's auc: 0.999965	valid's auc: 0.914762
[100]	train's auc: 0.999985	valid's auc: 0.90927
[110]	train's auc: 0.999997	valid's auc: 0.905816
Early stopping, best iteration is:
[10]	train's auc: 0.993087	valid's auc: 0.966497

Model Report
n_estimators :  10
auc: 0.966497107426


In [77]:
test_df = pd.read_csv('/Users/mlx/Downloads/Document/talking_data/test.csv', dtype=dtypes,
                      usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])

In [79]:
test_df['second'] = pd.to_datetime(test_df.click_time).dt.second.astype('uint8')
test_df['minute'] = pd.to_datetime(test_df.click_time).dt.minute.astype('uint8')
test_df['hour'] = pd.to_datetime(test_df.click_time).dt.hour.astype('uint8')
test_df['day'] = pd.to_datetime(test_df.click_time).dt.day.astype('uint8')
test_df['month'] = pd.to_datetime(test_df.click_time).dt.month.astype('uint8')
test_df['click_id'] = test_df['click_id'].astype('int')

In [80]:
test_df['is_attributed'] = bst.predict(test_df[predictors])
test_df.to_csv('baseline_balanced99.csv', index=False)

In [82]:
test_df[['click_id', 'is_attributed']].to_csv('baseline_balanced99.csv', index=False)

In [83]:
train_df_2 = pd.read_csv('/Users/mlx/Downloads/Document/talking_data/train_sample.csv', dtype=dtypes,
                       usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

In [84]:
r_size = 10
sample_idx = random.sample(range(100), r_size)

<pandas.core.indexing._iLocIndexer at 0x1a1056ea58>