In [1]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
import sys

In [2]:
t1 = time.time()
df_train = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_countall_count_last75m.csv')
# df_test = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_fold_last_in_12_count.csv')
t2 = time.time()
print('training loading done! Time: {}'.format(t2 - t1))
print('size is: {}'.format(sys.getsizeof(df_train) / 1024 ** 3))
print('length is {}'.format(len(df_train)))

training loading done! Time: 96.94060564041138
size is: 13.411045171320438
length is 75000000


# Specify Categorical Columns

In [3]:
categorical_col = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'minute', 'second']
target = 'is_attributed'
feature_cols = list(set(df_train.columns) - set([target]))

# Get Validation -- Last 10%

In [4]:
ratio = 0.1
length = len(df_train)
front = int( (1-ratio) * length)
trainset = df_train.iloc[:front]
valset = df_train.iloc[front:]

print(sys.getsizeof(trainset)/ 1024 **3)
print(len(trainset))

12.06994066387415
67500000


In [23]:
y_train = trainset[target].values
y_val = valset[target].values

lgb_train = lgb.Dataset(trainset[feature_cols], y_train, categorical_feature = categorical_col)
lgb_val = lgb.Dataset(valset[feature_cols], y_val, categorical_feature = categorical_col)

zeros = len(y_train[y_train == 0])
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train) * 100
import gc
# del df_train
gc.collect()

print(scale_pos_weight)

99.7607674074074


# Train Lightgbm

In [24]:
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 2000,
        'learning_rate': 0.1,
        'num_leaves': 11,
        'num_threads': 4, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': -1, # no limit. This is used to deal with over-fitting when #data is small.
        'min_data_in_leaf': 200,  #minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.7, #For example, if set to 0.8, will select 80% features before training each tree.  speed up training / deal with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round':50,
        'bagging_fraction': 0.8, #Randomly select part of data without resampling
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration. to enable bagging, bagging_fraction should be set as well
        'bagging_seed': 1,
        #'max_bin': 255,
        'verbose': 0,
        'scale_pos_weight': scale_pos_weight,
        'metric' : ['binary_logloss', 'auc']
    }

model = lgb.train(params, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)



Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.968459	valid_0's binary_logloss: 0.241012
[20]	valid_0's auc: 0.97257	valid_0's binary_logloss: 0.121443
[30]	valid_0's auc: 0.975028	valid_0's binary_logloss: 0.0811393
[40]	valid_0's auc: 0.975913	valid_0's binary_logloss: 0.0652999
[50]	valid_0's auc: 0.977195	valid_0's binary_logloss: 0.0574961
[60]	valid_0's auc: 0.978366	valid_0's binary_logloss: 0.0519851
[70]	valid_0's auc: 0.979177	valid_0's binary_logloss: 0.0485991
[80]	valid_0's auc: 0.97965	valid_0's binary_logloss: 0.0466998
[90]	valid_0's auc: 0.98011	valid_0's binary_logloss: 0.0456764
[100]	valid_0's auc: 0.980356	valid_0's binary_logloss: 0.0448638
[110]	valid_0's auc: 0.980717	valid_0's binary_logloss: 0.0438212
[120]	valid_0's auc: 0.98091	valid_0's binary_logloss: 0.0430821
[130]	valid_0's auc: 0.981118	valid_0's binary_logloss: 0.042459
[140]	valid_0's auc: 0.981239	valid_0's binary_logloss: 0.0419911
[150]	valid_0's auc: 0.981261	

In [25]:
from sklearn.metrics import roc_auc_score
pred_val = model.predict(valset[feature_cols])
print(roc_auc_score(y_val,pred_val))

0.981776095035


In [30]:
importance = pd.Series(model.feature_importance(), index=feature_cols)
importance = importance.sort_values(ascending=False)
if len(model.feature_importance()) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(model.feature_importance()), len(feature_cols)))
importance.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/output/importance_just_count_0404.csv')

In [29]:
importance

ip                       1214
channel                   401
app                       218
minute                    191
second                    178
os                        161
hour                       99
ip_app_os_count            65
ip_count                   63
app_channel_count          52
ip_device_count            50
ip_os_hour_count           41
device                     38
ip_app_hour_count          37
ip_day_count               34
ip_app_count               28
ip_minute_count            23
ip_second_count            17
ip_hour_count              15
app_channel_day_count      11
ip_day_hour_count           9
ip_device_hour_count        9
ip_channel_count            6
dtype: int64

In [13]:
df_test = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_countall_count_last75m.csv')

# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
df_test = df_test[list(valset[feature_cols].columns)]
df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = model.predict(df_test)
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/train_count_75m_count.csv.gz', compression='gzip', index=False)


In [22]:
a = ['ip_day_hour_count', 'ip', 'day', 'hour']
df_train.loc[:20,a ]

Unnamed: 0,ip_day_hour_count,ip,day,hour
0,43,143414,,12
1,18,173096,,12
2,42,8210,,12
3,19,5746,,12
4,65,31475,,12
5,11,251465,,12
6,164,163593,,12
7,158,58288,,12
8,134,27038,,12
9,414,67682,,12
