In [2]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
import sys

In [38]:
t1 = time.time()
df_train = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_lasttimediff_combine5_0409.csv')
# df_test = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_fold_last_in_12_count.csv')
t2 = time.time()
print('training loading done! Time: {}'.format(t2 - t1))
print('size is: {}'.format(sys.getsizeof(df_train) / 1024 ** 3))
print('length is {}'.format(len(df_train)))

training loading done! Time: 138.6672875881195
size is: 16.01874838024378
length is 50000000


# Specify Categorical Columns

In [39]:

# remove minute and second
categorical_col = [ 'device', 'app', 'os', 'channel', 'hour',  ]
target = 'is_attributed'
feature_cols = list(set(df_train.columns) - set([target]) - set(['minute', 'second', 'ip', 'intesthh' , 'timestamp']))
print(feature_cols)

['os', 'channel_lasttimediff', 'ip_device_lasttimediff', 'ip_lasttimediff', 'hour', 'app', 'ip_app_device_lasttimediff', 'device_channel_lasttimediff', 'app_os_lasttimediff', 'app_device_os_channel_lasttimediff', 'app_os_channel_lasttimediff', 'ip_app_os_lasttimediff', 'ip_os_channel_lasttimediff', 'ip_device_channel_lasttimediff', 'ip_app_channel_lasttimediff', 'app_device_channel_lasttimediff', 'ip_app_device_channel_lasttimediff', 'ip_device_os_channel_lasttimediff', 'device_lasttimediff', 'ip_device_os_lasttimediff', 'ip_app_os_channel_lasttimediff', 'device_os_lasttimediff', 'ip_app_lasttimediff', 'app_device_os_lasttimediff', 'app_channel_lasttimediff', 'ip_channel_lasttimediff', 'os_lasttimediff', 'device_os_channel_lasttimediff', 'ip_app_device_os_channel_lasttimediff', 'app_lasttimediff', 'app_device_lasttimediff', 'channel', 'ip_app_device_os_lasttimediff', 'day', 'os_channel_lasttimediff', 'device', 'ip_os_lasttimediff']


# Get Validation

In [40]:
from sklearn.model_selection import train_test_split
trainset, valset = train_test_split(df_train,test_size=0.2, random_state=42)
print(sys.getsizeof(trainset)/ 1024 **3)
print(len(trainset))

13.11302187293768
40000000


In [41]:
y_train = trainset[target].values
y_val = valset[target].values

lgb_train = lgb.Dataset(trainset[feature_cols], y_train, categorical_feature = categorical_col)
lgb_val = lgb.Dataset(valset[feature_cols], y_val, categorical_feature = categorical_col)

zeros = len(y_train[y_train == 0])
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train) * 100
import gc
# del df_train
gc.collect()

print(scale_pos_weight)

99.7439675


# Train Lightgbm

In [42]:
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 2000,
        'learning_rate': 0.1,
        'num_leaves': 31,
        'num_threads': 4, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': -1, # no limit. This is used to deal with over-fitting when #data is small.
        'min_data_in_leaf': 100,  #minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.7, #For example, if set to 0.8, will select 80% features before training each tree.  speed up training / deal with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round':50,
        'bagging_fraction': 0.9, #Randomly select part of data without resampling
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration. to enable bagging, bagging_fraction should be set as well
        'bagging_seed': 1,
        #'max_bin': 255,
        'verbose': 0,
        'scale_pos_weight': scale_pos_weight,
        'metric' : ['binary_logloss', 'auc']
    }

model = lgb.train(params, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)



Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.963879	valid_0's binary_logloss: 0.245545
[20]	valid_0's auc: 0.967525	valid_0's binary_logloss: 0.125871
[30]	valid_0's auc: 0.969651	valid_0's binary_logloss: 0.0847902
[40]	valid_0's auc: 0.971455	valid_0's binary_logloss: 0.0700751
[50]	valid_0's auc: 0.973014	valid_0's binary_logloss: 0.0644466
[60]	valid_0's auc: 0.97399	valid_0's binary_logloss: 0.061994
[70]	valid_0's auc: 0.97453	valid_0's binary_logloss: 0.060772
[80]	valid_0's auc: 0.974913	valid_0's binary_logloss: 0.0601626
[90]	valid_0's auc: 0.975114	valid_0's binary_logloss: 0.0597081
[100]	valid_0's auc: 0.975322	valid_0's binary_logloss: 0.0593972
[110]	valid_0's auc: 0.975404	valid_0's binary_logloss: 0.0592188
[120]	valid_0's auc: 0.975484	valid_0's binary_logloss: 0.0591099
[130]	valid_0's auc: 0.975495	valid_0's binary_logloss: 0.0589709
[140]	valid_0's auc: 0.97546	valid_0's binary_logloss: 0.059049
[150]	valid_0's auc: 0.975454	v

In [43]:
from sklearn.metrics import roc_auc_score
pred_val = model.predict(valset[feature_cols])
print(roc_auc_score(y_val,pred_val))

0.975508659871


In [44]:
importance = pd.Series(model.feature_importance(), index=feature_cols)
importance = importance.sort_values(ascending=False)
if len(model.feature_importance()) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(model.feature_importance()), len(feature_cols)))
importance.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/output/importance_train_lasttimediff_0405_combine5_61.csv')

In [18]:
importance

channel                                        748
app                                            462
os                                             399
hour                                           222
ip_time2previousclick                          193
device                                          92
ip_app_time2previousclick                       89
ip_device_time2previousclick                    87
ip_channel_time2previousclick                   87
ip_app_device_time2previousclick                83
ip_device_os_time2previousclick                 68
ip_app_device_os_time2previousclick             67
ip_os_time2previousclick                        66
ip_app_os_time2previousclick                    62
ip_app_channel_time2previousclick               61
ip_device_channel_time2previousclick            53
ip_app_os_channel_time2previousclick            52
app_channel_time2previousclick                  50
ip_app_device_channel_time2previousclick        38
app_os_channel_time2previouscli

In [25]:
aa = model.predict(valset[feature_cols])

In [16]:
df_test = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_fold_last_in_12_count.csv')


In [18]:
df_test = df_test[list(valset[feature_cols].columns)]
bb = model.predict(df_test)

In [19]:
list(set(valset[feature_cols].columns) - set(df_test.columns))

[]

In [20]:
bb

array([ 0.00341477,  0.00191355,  0.0016563 , ...,  0.9424411 ,
        0.00297018,  0.67816511])

In [33]:
list(valset[feature_cols].columns)

['ip_os_second_count',
 'app_day_second_count',
 'os_day_minute_count',
 'ip_channel_hour_count',
 'os_day_hour_count',
 'day_hour_count',
 'os_day_second_count',
 'app_count',
 'device_os_day_count',
 'day_second_count',
 'app_os_hour_count',
 'channel',
 'channel_minute_count',
 'ip_app_count',
 'day_count',
 'channel_day_count',
 'app_day_minute_count',
 'ip_minute_count',
 'app_hour_minute_count',
 'ip_os_count',
 'os_channel_count',
 'ip_device_minute_count',
 'os_count',
 'app_minute_count',
 'device_day_minute_count',
 'minute',
 'channel_day_second_count',
 'ip_device_hour_count',
 'ip_app_minute_count',
 'second',
 'hour',
 'device_os_count',
 'ip_device_day_count',
 'ip_os_minute_count',
 'ip_channel_day_count',
 'ip_device_count',
 'device_channel_count',
 'channel_second_count',
 'ip_os_channel_count',
 'ip_channel_minute_count',
 'app_second_count',
 'os_hour_minute_count',
 'minute_second_count',
 'ip_device_channel_count',
 'app_device_os_count',
 'channel_hour_second_co

In [18]:
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/train_fold_last_in_12csv.gz', compression='gzip', index=False)

In [21]:
# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')

df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = bb

In [22]:
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/train_fold_6_in_6_last_count_0403_2_noscale.csv.gz', compression='gzip', index=False)

In [46]:
valset.head(1)

Unnamed: 0,ip,app,device,os,channel,day,hour,timestamp,minute,second,...,ip_app_device_count,ip_app_os_count,ip_app_channel_count,ip_device_os_count,ip_device_channel_count,ip_os_channel_count,app_device_os_count,app_device_channel_count,app_os_channel_count,device_os_channel_count
21638813,2348,15,1,13,245,8,23,1510182584,9,44,...,608.0,190.0,222.0,1963.0,313.0,96.0,2829042.0,4553892.0,1062250.0,1579048.0


In [45]:
df_test.head(1)

Unnamed: 0,ip,app,device,os,channel,day,hour,timestamp,minute,second,...,ip_app_device_count,ip_app_os_count,ip_app_channel_count,ip_device_os_count,ip_device_channel_count,ip_os_channel_count,app_device_os_count,app_device_channel_count,app_os_channel_count,device_os_channel_count
0,5744,9,1,3,107,10,4,1510286400,0,0,...,64.0,1.0,10.0,2.0,34.0,0.0,195701.0,504264.0,11217.0,174692.0
