In [2]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
import sys

In [3]:
# Read the pre-computed train and test feature tables, timing the two reads together.
train_csv = '/home/kai/data/kaggle/talkingdata/wl/data/features/train_all_5000k_17cols.csv'
test_csv = '/home/kai/data/kaggle/talkingdata/wl/data/features/test_all_5000k_17cols.csv'

t1 = time.time()
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
t2 = time.time()

# sys.getsizeof reports bytes; dividing by 1024 ** 3 expresses the size in GiB.
load_seconds = t2 - t1
train_gib = sys.getsizeof(df_train) / 1024 ** 3
train_rows = len(df_train)

print('training loading done! Time: {}'.format(load_seconds))
print('size is: {}'.format(train_gib))
print('length is {}'.format(train_rows))

training loading done! Time: 116.29419326782227
size is: 6.705522634088993
length is 50000000


# Specify Categorical Columns

In [4]:
# Columns handed to LightGBM as categorical features.
# Alternative that also includes 'ip', kept for reference:
# categorical_col = ['app','os', 'channel', 'ip']
categorical_col = ['app','os', 'channel']

# Name of the label column.
target = 'is_attributed'

# Every column except the label, in the CSV's own column order.
# A set difference would yield an arbitrary order that can change between
# kernel sessions (string hashing is randomized), which can make
# feature_fraction column sampling and the saved importance table
# non-reproducible.
feature_cols = [col for col in df_train.columns if col != target]

In [5]:
# Display the list of feature column names.
feature_cols

['device_minute_mean',
 'ip_channel_mean',
 'app_channel_hour_mean',
 'app_os_hour_mean',
 'ip_app_mean',
 'ip_channel_count',
 'ip_mean',
 'ip_os_hour_count',
 'app',
 'ip_second_mean',
 'hour_minute_second_mean',
 'channel',
 'ip_minute_count',
 'device_minute_second_mean',
 'app_os_channel_mean',
 'os',
 'ip_app_device_mean']

# Create Validation Split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out a random 10% of the rows for validation; the fixed seed keeps the split repeatable.
trainset, valset = train_test_split(df_train, random_state=31, test_size=0.1)

# Size of the training portion in GiB.
trainset_gib = sys.getsizeof(trainset) / 1024 ** 3
print(trainset_gib)

6.370246432721615


In [19]:
import gc

# Label vectors for the training and validation splits.
y_train = trainset[target].values
y_val = valset[target].values

# Wrap features and labels in LightGBM datasets, flagging the categorical columns.
lgb_train = lgb.Dataset(trainset[feature_cols], y_train, categorical_feature=categorical_col)
lgb_val = lgb.Dataset(valset[feature_cols], y_val, categorical_feature=categorical_col)

# Ratio of label-0 rows to label-1 rows in the training split; used as
# LightGBM's scale_pos_weight in the training parameters.
zeros = len(y_train[y_train == 0])
ones = len(y_train[y_train == 1])
scale_pos_weight = zeros / ones

# del df_train
gc.collect()

210

# Train LightGBM

In [20]:
# Task, booster and hardware settings.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'num_rounds': 2000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    # Best speed: set to the number of real CPU cores, which is vCPU / 2.
    'num_threads': 16,
    'device': 'cpu',
}

# Over-fitting controls, row/column sub-sampling and early stopping.
params.update({
    # -1 means no depth limit; limiting depth helps against over-fitting when the data is small.
    'max_depth': -1,
    # Minimal number of rows in one leaf; can be used to deal with over-fitting.
    'min_data_in_leaf': 100,
    # Fraction of features selected before training each tree (speeds up training / reduces over-fitting).
    'feature_fraction': 0.85,
    'feature_fraction_seed': 1,
    # Stop once validation scores have not improved for this many rounds.
    'early_stopping_round': 50,
    # Randomly select this fraction of the rows without resampling.
    'bagging_fraction': 0.8,
    # Bag every k iterations (0 disables bagging); requires bagging_fraction to be set as well.
    'bagging_freq': 1,
    'bagging_seed': 1,
    #'max_bin': 255,
})

# Logging, class re-weighting and the metrics evaluated on the validation set.
params.update({
    'verbose': 0,
    'scale_pos_weight': scale_pos_weight,
    'metric': ['binary_logloss', 'auc'],
})

# Train against the validation set, printing its metrics every 10 rounds.
model = lgb.train(
    params,
    train_set=lgb_train,
    valid_sets=lgb_val,
    verbose_eval=10,
)



Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.999242	valid_0's binary_logloss: 0.371639
[20]	valid_0's auc: 0.999316	valid_0's binary_logloss: 0.219727
[30]	valid_0's auc: 0.999395	valid_0's binary_logloss: 0.13901
[40]	valid_0's auc: 0.999429	valid_0's binary_logloss: 0.0934223
[50]	valid_0's auc: 0.999458	valid_0's binary_logloss: 0.0667748
[60]	valid_0's auc: 0.999489	valid_0's binary_logloss: 0.0510195
[70]	valid_0's auc: 0.99951	valid_0's binary_logloss: 0.0416451
[80]	valid_0's auc: 0.999524	valid_0's binary_logloss: 0.0358293
[90]	valid_0's auc: 0.999531	valid_0's binary_logloss: 0.0323158
[100]	valid_0's auc: 0.999538	valid_0's binary_logloss: 0.0301037
[110]	valid_0's auc: 0.999544	valid_0's binary_logloss: 0.0287088
[120]	valid_0's auc: 0.99956	valid_0's binary_logloss: 0.0278061
[130]	valid_0's auc: 0.999564	valid_0's binary_logloss: 0.027208
[140]	valid_0's auc: 0.999571	valid_0's binary_logloss: 0.0267933
[150]	valid_0's auc: 0.999576	

In [21]:
from sklearn.metrics import roc_auc_score

# Score the hold-out split with the early-stopped model. num_iteration is
# pinned to best_iteration so the reported AUC does not depend on which
# default iteration the installed LightGBM version uses in Booster.predict.
pred_val = model.predict(valset[feature_cols], num_iteration=model.best_iteration)
print(roc_auc_score(y_val, pred_val))

0.999584622243


In [22]:
# Per-feature importance from the trained booster, fetched once and reused below.
importance_values = model.feature_importance()

# Validate before building the Series: pd.Series raises its own, less
# descriptive error on a length mismatch, so this check has to come first
# to be reachable.
if len(importance_values) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(importance_values), len(feature_cols)))

# Label each importance with its feature name, rank from most to least
# important, and persist the table.
importance = pd.Series(importance_values, index=feature_cols)
importance = importance.sort_values(ascending=False)
importance.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/output/importance_5000k_col17_all.csv')

In [23]:
# Display the feature-importance Series, sorted in descending order.
importance

channel                      2015
os                           1502
ip_second_mean                977
ip_app_device_mean            678
hour_minute_second_mean       648
ip_channel_mean               634
app_os_hour_mean              564
device_minute_second_mean     457
app                           453
ip_mean                       433
ip_os_hour_count              428
device_minute_mean            350
ip_channel_count              329
ip_minute_count               298
app_channel_hour_mean         294
ip_app_mean                   284
app_os_channel_mean           246
dtype: int64

In [24]:
# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
df_test = df_test[list(valset[feature_cols].columns)]
df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = model.predict(df_test)
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/train_all_500k_col17_1.csv.gz', compression='gzip', index=False)



In [17]:
# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')

df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = bb

In [25]:
# Number of submission rows whose predicted probability exceeds 0.5.
int((df_sub['is_attributed'] > 0.5).sum())

207369

In [26]:
# Sanity check: fraction of rows NOT flagged positive, computed three ways.

# 4078 is a hard-coded count whose origin is not shown in this notebook.
# NOTE(review): presumably the >0.5 count of another submission - confirm.
a1 = 1 - 4078/len(df_sub)
print(a1)

# Count the current submission's >0.5 predictions here so the ratio stays
# correct whenever df_sub changes; a number pasted from an earlier cell's
# output would silently go stale.
n_pred_positive = int((df_sub['is_attributed'] > 0.5).sum())
a2 = 1 - n_pred_positive/len(df_sub)
print(a2)

# One minus the mean training label, i.e. the share of label-0 rows for 0/1 labels.
a3 = 1- y_train.sum()/len(y_train)
print(a3)

0.9997829750816757
0.9889641392133427
0.997235511111
