In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc

In [2]:
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_equalhour_supplementV1.csv'
day_list = ['day7', 'day8', 'day9']
df_dict = {}
for file in ['day7', 'day8', 'day9','test']: 
    df_dict[file] = pd.read_csv(load_path+file_format.format(file))
    print(file_format.format(file))
    

day7_equalhour_supplementV1.csv
day8_equalhour_supplementV1.csv
day9_equalhour_supplementV1.csv
test_equalhour_supplementV1.csv


# define parameter

In [6]:
target = 'is_attributed'

day_list = ['day7', 'day8', 'day9']

combine = 0
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 4000,
        'learning_rate': 0.1,
        'num_leaves': 61,
        'num_threads': 4, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': -1, # no limit. This is used to deal with over-fitting when #data is small.
        'min_data_in_leaf': 390,  #minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.7, #For example, if set to 0.8, will select 80% features before training each tree.  speed up training / deal with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round':60,
        'bagging_fraction': 0.7, #Randomly select part of data without resampling
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration. to enable bagging, bagging_fraction should be set as well
        'bagging_seed': 1,
        'verbose': 0,
        'scale_pos_weight': 400,
        'metric' : [ 'auc']
    }

categorical_col = [ 'app', 'device', 'os', 'channel', 'hour']
feature_cols = list(df_dict['test'].columns.values)

In [7]:
df_dict['day7'].columns.values

array(['ip_day_hour_count', 'ip_os_day_hour_count',
       'ip_app_day_hour_count', 'ip_app_os_day_hour_count',
       'app_day_hour_count', 'ip_device_os_count',
       'ip_app_device_os_count', 'ip_day_hour_countself',
       'ip_os_day_hour_countself', 'ip_app_day_hour_countself',
       'ip_app_os_day_hour_countself', 'app_day_hour_countself',
       'ip_device_os_countself', 'ip_app_device_os_countself',
       'ip_device_os_mean', 'ip_app_device_os_mean', 'ip_app_device_mean',
       'app_device_os_mean', 'ip_device_os_time2nextclick',
       'ip_app_device_os_time2nextclick', 'ip_app_device_time2nextclick',
       'ip_device_os_time2previousclick',
       'ip_app_device_os_time2previousclick',
       'ip_app_device_time2previousclick', 'ip_device_os_countfromfuture',
       'ip_app_device_os_countfromfuture', 'ip_app_device_countfromfuture',
       'ip_device_os_countfrompast', 'ip_app_device_os_countfrompast',
       'ip_app_device_countfrompast', 'ip_device_os_lasttimediff',
 

In [8]:
# get rid of extra features
# extra = set(['matrixFact_user_iposdeviceapp_item_channel', 'matrixFact_user_iposdeviceapp_item_device'])
extra = set(['ip_device_os_countself',
             'ip_day_hour_countself', 
             'app_day_hour_countself',
             'ip_app_device_os_countself',
             'ip_app_day_hour_countself', 
             'ip_os_day_hour_countself', 
             'ip_app_os_day_hour_countself' ])

# extra = set([])
# extra = set(['ip_device_os_count',
#              'ip_day_hour_count', 
#              'app_day_hour_count',
#              'ip_app_device_os_count',
#              'ip_app_day_hour_count', 
#              'ip_os_day_hour_count', 
#              'ip_app_os_day_hour_count' ])
feature_cols = list(set(feature_cols) - extra)

In [9]:
feature_cols

['ip_app_device_regression',
 'hour',
 'app_device_os_mean',
 'ip_app_device_os_countfromfuture',
 'ip_app_day_hour_count',
 'ip_device_os_lasttimediff',
 'ip_app_os_day_hour_count',
 'ip_day_hour_count',
 'ip_device_os_mean',
 'ip_device_os_count',
 'ip_device_os_countfrompast',
 'ip_app_device_countfrompast',
 'ip_device_os_countfromfuture',
 'ip_app_device_os_time2nextclick',
 'ip_app_device_os_count',
 'ip_app_device_firsttimediff',
 'matrixFact_user_ipchannel_item_appdeviceos',
 'matrixFact_user_ip_item_appdeviceos',
 'ip_app_device_os_channel_regression',
 'device',
 'ip_app_device_time2previousclick',
 'ip_app_device_os_regression',
 'matrixFact_user_iposdeviceapp_item_app',
 'ip_device_os_time2nextclick',
 'ip_app_device_time2nextclick',
 'ip_app_device_countfromfuture',
 'app',
 'ip_os_day_hour_count',
 'app_day_hour_count',
 'channel',
 'os',
 'ip_app_device_os_lasttimediff',
 'ip_device_os_regression',
 'ip_device_os_time2previousclick',
 'ip_app_device_os_mean',
 'ip_app_de

In [10]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values
    
    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        del param['early_stopping_round']
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

In [None]:
for day in day_list:
    train_day = list(set(day_list)-set([day]))
    file_name = 'concat_{}{}_val_{}_equalhour_supplement_V1.npy'.format(train_day[0],train_day[1],day)
    print(file_name)
    trainset = pd.concat([df_dict[train_day[0]],df_dict[train_day[1]]])
    valset = df_dict[day]
    print('building train val done!')
    model = train_lightgbm(trainset, valset, feature_cols, categorical_col, params)
    best_round = model.best_iteration
    df_all = pd.concat([trainset, valset])
    model = train_lightgbm(df_all, valset, feature_cols, categorical_col, params, best_round)
    del df_all
    gc.collect()
    pred = model.predict(df_dict['test'][feature_cols])
    np.save(load_path+file_name, pred)
    
    # prediction
    df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
    print('loading file done!')
    df_sub = pd.DataFrame()
    df_sub['click_id'] = df_test_raw['click_id']
    df_sub['is_attributed'] = pred
    print('predicting file done!')
    df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/concat_{}{}_val_{}_equalhour_supplement_V1.csv.gz'.format(train_day[0],train_day[1],day), compression='gzip', index=False)

    

concat_day9day8_val_day7_equalhour_supplement_V1.npy
building train val done!
start training




Training until validation scores don't improve for 60 rounds.
[10]	valid_0's auc: 0.980661
[20]	valid_0's auc: 0.982295
[30]	valid_0's auc: 0.983177
[40]	valid_0's auc: 0.984005
[50]	valid_0's auc: 0.984491
[60]	valid_0's auc: 0.984856
[70]	valid_0's auc: 0.984951
[80]	valid_0's auc: 0.98502
[90]	valid_0's auc: 0.985066
[100]	valid_0's auc: 0.985108
[110]	valid_0's auc: 0.985082
[120]	valid_0's auc: 0.985106
[130]	valid_0's auc: 0.985076
[140]	valid_0's auc: 0.985081
[150]	valid_0's auc: 0.985074
[160]	valid_0's auc: 0.985068
Early stopping, best iteration is:
[100]	valid_0's auc: 0.985108
start training
[10]	valid_0's auc: 0.981753
[20]	valid_0's auc: 0.983954
[30]	valid_0's auc: 0.985289
[40]	valid_0's auc: 0.986381
[50]	valid_0's auc: 0.987313
[60]	valid_0's auc: 0.987962
[70]	valid_0's auc: 0.988552
[80]	valid_0's auc: 0.989044
[90]	valid_0's auc: 0.989437
[100]	valid_0's auc: 0.98981
loading file done!
predicting file done!
concat_day9day7_val_day8_equalhour_supplement_V1.npy
buil

In [13]:
importance = pd.Series(model.feature_importance(importance_type='gain'), index=feature_cols)
# importance = pd.Series(model.feature_importance(), index=feature_cols)
importance = importance.sort_values(ascending=False)
if len(model.feature_importance()) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(model.feature_importance()), len(feature_cols)))
    
importance

matrixFact_user_ip_item_appdeviceos           3.258546e+08
matrixFact_user_iposdeviceapp_item_app        7.112340e+07
matrixFact_user_ipchannel_item_appdeviceos    2.337828e+07
app_device_os_mean                            1.697719e+07
channel                                       1.335874e+07
ip_app_device_os_time2nextclick               1.112984e+07
ip_app_device_os_lasttimediff                 6.341537e+06
os                                            5.938842e+06
app                                           5.669180e+06
ip_day_hour_count                             5.045337e+06
ip_device_os_count                            3.062189e+06
ip_app_device_os_countfromfuture              2.117160e+06
ip_app_os_day_hour_count                      1.342405e+06
ip_app_device_os_count                        1.310539e+06
ip_app_device_mean                            1.298044e+06
hour                                          1.106012e+06
ip_app_device_firsttimediff                   9.815325e+

# Average

In [14]:
#### load each and average
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
day7 = np.load(load_path+'concat_day8day9_val_day7_equalhour_supplement_V1.npy')
day8 = np.load(load_path+'concat_day7day9_val_day8_equalhour_supplement_V1.npy')
day9 = np.load(load_path+'concat_day7day8_val_day9_equalhour_supplement_V1.npy')

pred = (day7 + day8 + day9)/3

In [16]:
# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
print('loading file done!')
df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = pred
print('predicting file done!')
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/equal_hour_blend_equalhour_supplement_V1.csv.gz', compression='gzip', index=False)


loading file done!
predicting file done!
