In [2]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
import sys

In [3]:
t1 = time.time()
df_train = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_last6.5k.csv')
df_test = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_last6.5k.csv')
t2 = time.time()
print('training loading done! Time: {}'.format(t2 - t1))
print('size is: {}'.format(sys.getsizeof(df_train) / 1024 ** 3))
print('length is {}'.format(len(df_train)))

training loading done! Time: 182.99911332130432
size is: 12.107193566858768
length is 65000000


# Specify Categorical Columns

In [4]:
# categorical_col = ['app','os', 'channel', 'ip']
categorical_col = ['app','os', 'channel']
target = 'is_attributed'
feature_cols = list(set(df_train.columns) - set([target]))

In [5]:
feature_cols

['app',
 'second',
 'channel',
 'os',
 'device_minute_second_mean',
 'app_os_hour_mean',
 'hour_minute_second_mean',
 'hour',
 'ip_app_mean',
 'ip_channel_count',
 'ip_os_hour_count',
 'device_minute_mean',
 'minute',
 'ip_mean',
 'app_channel_hour_mean',
 'ip_minute_count',
 'ip_app_device_mean',
 'day',
 'app_os_channel_mean',
 'ip',
 'ip_channel_mean',
 'timestamp',
 'device',
 'ip_second_mean']

# Get Validation

In [6]:
from sklearn.model_selection import train_test_split
trainset, valset = train_test_split(df_train,test_size=0.1, random_state=31)
print(sys.getsizeof(trainset)/ 1024 **3)

11.332333110272884


In [29]:
y_train = trainset[target].values
y_val = valset[target].values

lgb_train = lgb.Dataset(trainset[feature_cols], y_train, categorical_feature = categorical_col)
lgb_val = lgb.Dataset(valset[feature_cols], y_val, categorical_feature = categorical_col)

zeros = len(y_train[y_train == 0])
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
import gc
# del df_train
gc.collect()

71

# Train Lightgbm

In [30]:
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 2000,
        'learning_rate': 0.05,
        'num_leaves': 31,
        'num_threads': 16, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': -1, # no limit. This is used to deal with over-fitting when #data is small.
        'min_data_in_leaf': 100,  #minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.85, #For example, if set to 0.8, will select 80% features before training each tree.  speed up training / deal with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round':50,
        'bagging_fraction': 0.8, #Randomly select part of data without resampling
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration. to enable bagging, bagging_fraction should be set as well
        'bagging_seed': 1,
        #'max_bin': 255,
        'verbose': 0,
        'scale_pos_weight': 99,
        'metric' : ['binary_logloss', 'auc']
    }

model = lgb.train(params, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)



Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.963692	valid_0's binary_logloss: 0.386506
[20]	valid_0's auc: 0.966248	valid_0's binary_logloss: 0.242008
[30]	valid_0's auc: 0.967445	valid_0's binary_logloss: 0.165128
[40]	valid_0's auc: 0.968135	valid_0's binary_logloss: 0.121927
[50]	valid_0's auc: 0.968654	valid_0's binary_logloss: 0.0969461
[60]	valid_0's auc: 0.969147	valid_0's binary_logloss: 0.0823357
[70]	valid_0's auc: 0.969762	valid_0's binary_logloss: 0.0736235
[80]	valid_0's auc: 0.970306	valid_0's binary_logloss: 0.0683908
[90]	valid_0's auc: 0.970704	valid_0's binary_logloss: 0.0652209
[100]	valid_0's auc: 0.97108	valid_0's binary_logloss: 0.0632824
[110]	valid_0's auc: 0.97147	valid_0's binary_logloss: 0.0619961
[120]	valid_0's auc: 0.971776	valid_0's binary_logloss: 0.0612041
[130]	valid_0's auc: 0.972014	valid_0's binary_logloss: 0.0606777
[140]	valid_0's auc: 0.972162	valid_0's binary_logloss: 0.0603617
[150]	valid_0's auc: 0.972262

In [31]:
from sklearn.metrics import roc_auc_score
pred_val = model.predict(valset[feature_cols])
print(roc_auc_score(y_val,pred_val))

0.97280073085


In [32]:
importance = pd.Series(model.feature_importance(), index=feature_cols)
importance = importance.sort_values(ascending=False)
if len(model.feature_importance()) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(model.feature_importance()), len(feature_cols)))
importance.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/output/importance_6500k_col17_scale99.csv')

In [33]:
importance

channel                      2130
os                           1237
app                          1118
timestamp                     624
ip_minute_count               582
app_os_channel_mean           551
ip                            455
ip_mean                       405
app_os_hour_mean              326
ip_channel_count              319
app_channel_hour_mean         293
ip_os_hour_count              251
device_minute_second_mean     229
device_minute_mean            229
ip_app_device_mean            209
hour_minute_second_mean       204
ip_app_mean                   182
second                        166
hour                          159
minute                        159
ip_second_mean                136
ip_channel_mean                67
device                         49
day                             0
dtype: int64

In [34]:
# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
df_test = df_test[list(valset[feature_cols].columns)]
df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = model.predict(df_test)
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/train_all_6500k_col17_scale99.csv.gz', compression='gzip', index=False)



In [36]:
len(df_sub[df_sub['is_attributed'] > 0.5])

309225

In [28]:
a1 = 1 - 4078/len(df_sub)
print(a1)

a2 = 1 - 379661/len(df_sub)
print(a2)

a3 = 1- y_train.sum()/len(y_train)
print(a3)

0.9997829750816757
0.9797950226787846
0.997596649573
