In [1]:
import gc
import sys
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [3]:
# Load the pre-computed training and validation feature tables, then report
# the wall-clock load time and the size of the training frame.
t1 = time.time()
trainset = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_countAndmean_index_all_shuffle_0405.csv')
print('loading train done!')
valset = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/val_countAndmean_index_all_shuffle_0405.csv')
# This is the validation split; the test features are loaded in a later cell.
print('loading val done!')


t2 = time.time()
print('training loading done! Time: {}'.format(t2 - t1))
# sys.getsizeof returns bytes; dividing by 1024 ** 3 reports GiB.
print('size is: {}'.format(sys.getsizeof(trainset) / 1024 ** 3))
print('length is {}'.format(len(trainset)))

loading train done!
loading test done!
training loading done! Time: 203.82083177566528
size is: 12.740492917597294
length is 45000000


# Specify Categorical Columns

In [4]:
# Columns that are handed to LightGBM as categorical features.
categorical_col = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'minute', 'second']
# Name of the label column.
target = 'is_attributed'
# Every column except the label is a model feature. Keep the DataFrame's own
# column order: a set difference yields an order that can change from one
# kernel session to the next (string hash randomization), which can change
# which columns `feature_fraction` samples and makes runs non-reproducible.
feature_cols = [col for col in trainset.columns if col != target]

In [5]:
# Label vectors for the training and validation splits.
y_train = trainset[target].values
y_val = valset[target].values

# Wrap features and labels in LightGBM datasets. The validation set names the
# training set as its `reference` so that both are binned consistently.
lgb_train = lgb.Dataset(trainset[feature_cols], y_train, categorical_feature=categorical_col)
lgb_val = lgb.Dataset(valset[feature_cols], y_val, categorical_feature=categorical_col,
                      reference=lgb_train)

# Number of negative (label == 0) rows in the training labels.
zeros = int((y_train == 0).sum())
# NOTE(review): despite its name, this is the negative-class *percentage*
# (0-100), not the usual n_negative / n_positive ratio. It is passed to
# LightGBM as `scale_pos_weight` in the training cell; the value is kept
# unchanged here -- confirm that this weighting is intentional.
scale_pos_weight = zeros / len(y_train) * 100
gc.collect()

print('train negative rate: {}'.format(scale_pos_weight))
print('val negative rate: {}'.format(int((y_val == 0).sum()) / len(y_val) * 100))

train negative rate: 99.72957555555556
val negative rate: 99.73499


# Train LightGBM

In [6]:
# LightGBM configuration for the binary click-attribution model.
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 2000,  # upper bound on boosting rounds; early stopping may end training sooner
        'learning_rate': 0.1,
        'num_leaves': 11,
        'num_threads': 4, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': 5, # caps tree depth at 5 (a value <= 0 would mean no limit); limits over-fitting
        'min_data_in_leaf': 200,  # minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.6, # select 60% of the features before training each tree: speeds up training / deals with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round': 50,  # stop once validation scores have not improved for 50 rounds
        'bagging_fraction': 0.9, # randomly select part of the data without resampling
        'bagging_freq': 1, # frequency for bagging, 0 means disable bagging. k means bagging at every k-th iteration; requires bagging_fraction to be set as well
        'bagging_seed': 1,
        #'max_bin': 255,
        'verbose': 0,
        'scale_pos_weight': scale_pos_weight,  # weight applied to the positive class; computed from the training labels earlier in the notebook
        'metric' : ['binary_logloss', 'auc']
    }

# Train against the validation set, logging the metrics every 10 rounds.
# `valid_sets` is passed in its documented list form.
model = lgb.train(params, train_set=lgb_train, valid_sets=[lgb_val], verbose_eval=10)



Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.969273	valid_0's binary_logloss: 0.23867
[20]	valid_0's auc: 0.970535	valid_0's binary_logloss: 0.119948
[30]	valid_0's auc: 0.973288	valid_0's binary_logloss: 0.0807544
[40]	valid_0's auc: 0.974276	valid_0's binary_logloss: 0.0669475
[50]	valid_0's auc: 0.974622	valid_0's binary_logloss: 0.0619284
[60]	valid_0's auc: 0.975342	valid_0's binary_logloss: 0.0596413
[70]	valid_0's auc: 0.975904	valid_0's binary_logloss: 0.058458
[80]	valid_0's auc: 0.976171	valid_0's binary_logloss: 0.0577492
[90]	valid_0's auc: 0.976406	valid_0's binary_logloss: 0.0573921
[100]	valid_0's auc: 0.976679	valid_0's binary_logloss: 0.0569997
[110]	valid_0's auc: 0.976793	valid_0's binary_logloss: 0.0567822
[120]	valid_0's auc: 0.976835	valid_0's binary_logloss: 0.0565162
[130]	valid_0's auc: 0.976941	valid_0's binary_logloss: 0.0563142
[140]	valid_0's auc: 0.977001	valid_0's binary_logloss: 0.0561102
[150]	valid_0's auc: 0.9771

In [7]:
# Score the validation split with the trained model and print its ROC AUC.
pred_val = model.predict(valset[feature_cols])
print(roc_auc_score(y_val, pred_val))

0.97746189202


In [8]:
# Rank the features by the model's importance scores and save the ranking.
importance_values = model.feature_importance()
# Check the lengths up front: pd.Series would otherwise raise its own generic
# length-mismatch error before this clearer message could ever be shown.
if len(importance_values) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(importance_values), len(feature_cols)))
importance = pd.Series(importance_values, index=feature_cols)
importance = importance.sort_values(ascending=False)
importance.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/output/importance__countAndmean75m_0405_index_all_shuffle_11leaf.csv')

In [9]:
# Display the feature-importance Series (sorted in descending order above).
importance

ip                           904
channel                      385
minute                       255
second                       234
app_os_channel_mean          212
app                          175
os                           117
app_device_os_mean           108
ip_mean                       96
ip_app_os_count               82
ip_count                      74
ip_device_count               73
ip_device_hour_count          46
ip_day_count                  45
hour                          40
ip_app_mean                   39
app_channel_hour_mean         37
ip_app_device_mean            37
device_os_hour_mean           31
ip_os_hour_count              30
ip_second_count               25
ip_app_count                  25
app_os_hour_mean              24
ip_app_hour_count             24
ip_minute_count               22
app_channel_day_count         18
app_channel_count             18
device                        13
ip_hour_count                 13
device_minute_mean            11
ip_day_hou

In [10]:
# Load the engineered test features, score them, and write the submission.
df_test = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_countAndmean_index_all_shuffle_0405.csv')
print('loading test done!')

# The raw test file supplies the click_id column for the submission.
# NOTE(review): ids and predictions are paired by row position, so this
# assumes the feature file keeps the row order of test.csv -- TODO confirm
# (the feature file name mentions "shuffle").
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
click_ids = df_test_raw['click_id']

# Restrict the test frame to the model's features, in training column order.
df_test = df_test[feature_cols]
test_scores = model.predict(df_test)
df_sub = pd.DataFrame({'click_id': click_ids, 'is_attributed': test_scores})
print('predicting done!')

# Write the gzip-compressed submission without the index column.
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/train_countAndmean_0405_index_all_shuffle_11leaf.csv.gz', compression='gzip', index=False)


loading test done!
predicting done!


In [22]:
# Spot-check the ip/day/hour count feature on the first rows (labels 0..20)
# of the training frame.
# NOTE(review): this uses `trainset`, the only training frame defined in the
# visible notebook -- confirm it is the intended frame. The recorded output
# shows `day` entirely empty, so presumably that column is missing or
# unpopulated; verify before relying on it.
a = ['ip_day_hour_count', 'ip', 'day', 'hour']
trainset.loc[:20, a]

Unnamed: 0,ip_day_hour_count,ip,day,hour
0,43,143414,,12
1,18,173096,,12
2,42,8210,,12
3,19,5746,,12
4,65,31475,,12
5,11,251465,,12
6,164,163593,,12
7,158,58288,,12
8,134,27038,,12
9,414,67682,,12
