# time realted feature is from concat([train,test])

In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc

In [4]:
dtypes = {
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'hour'          : 'uint8',
        'is_attributed' : 'uint8', 
        'ip_day_hour_count': 'uint32', 
        'ip_os_day_hour_count': 'uint32', 
        'ip_app_day_hour_count': 'uint32', 
        'ip_app_os_day_hour_count': 'uint32', 
        'app_day_hour_count': 'uint32', 
        'ip_device_os_count': 'uint32', 
        'ip_app_device_os_count': 'uint32', 
        'ip_device_os_mean': 'float16',
        'ip_app_device_os_mean': 'float16',
        'ip_app_device_mean': 'float16',
        'app_device_os_mean': 'float16',
        'ip_device_os_time2nextclick': 'int32',
        'ip_app_device_os_time2nextclick': 'int32',
        'ip_app_device_time2nextclick': 'int32',
        'ip_device_os_time2previousclick': 'int32',
        'ip_app_device_os_time2previousclick': 'int32',
        'ip_app_device_time2previousclick': 'int32',
        'ip_device_os_countfromfuture': 'uint32', 
        'ip_app_device_os_countfromfuture': 'uint32', 
        'ip_app_device_countfromfuture': 'uint32', 
        'ip_device_os_countfrompast': 'uint32', 
        'ip_app_device_os_countfrompast': 'uint32', 
        'ip_app_device_countfrompast': 'uint32', 
        'ip_device_os_lasttimediff': 'int32',
        'ip_app_device_os_lasttimediff': 'int32',
        'ip_app_device_lasttimediff': 'int32',
        'ip_device_os_firsttimediff': 'int32',
        'ip_app_device_os_firsttimediff': 'int32',
        'ip_app_device_firsttimediff': 'int32',
        'matrixFact_user_iposdeviceapp_item_app': 'float16',
        'matrixFact_user_ip_item_appdeviceos': 'float16',
        'matrixFact_user_ipchannel_item_appdeviceos': 'float16',
        'ip_device_os_regression': 'float16',
        'ip_app_device_os_regression': 'float16',
        'ip_app_device_regression': 'float16',
        'ip_app_device_os_channel_regression': 'float16'
        } 

## load equal hour index

In [2]:
path = '/home/kai/data/kaggle/talkingdata/data/'
df_hour = pd.read_csv(path+'hourdistri.csv', index_col='Unnamed: 0')
index = {}
for day in ['day7', 'day8','day9']:
    index[day] = list(range(df_hour.loc[day,'4start'], df_hour.loc[day,'6end0sec'])) + \
    list(range(df_hour.loc[day,'9start'], df_hour.loc[day,'11end0sec'])) + \
    list(range(df_hour.loc[day,'13start'], df_hour.loc[day,'15end0sec'])) 

## load extra data 

In [5]:
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_features_supplementV2_extra_timeonALL.csv'
df_extra_dict = {}

for file in ['day7', 'day8', 'day9']: 
    print('loading file {}'.format(file))
    df_extra_dict[file] = pd.read_csv(load_path+file_format.format(file), dtype=dtypes)
    print(file_format.format(file))
    
df_extra_all = pd.concat([df_extra_dict['day7'], df_extra_dict['day8'], df_extra_dict['day9']])
print('finish concat')
df_extra_dict = {}

for file in ['day7', 'day8', 'day9']: 
    print('getting equal hour {}'.format(file))
    df_extra_dict[file] = df_extra_all.iloc[index[file]]
    print(len(df_extra_dict[file]))
    print('-------')
df_extra_dict['test'] = pd.read_csv(load_path+'test_equalhoursV2_features_supplementV2_extra_timeonALL', dtype=dtypes)
print(len(df_extra_dict['test']))

loading file day7
day7_features_supplementV2_extra_timeonALL.csv
loading file day8
day8_features_supplementV2_extra_timeonALL.csv
loading file day9
day9_features_supplementV2_extra_timeonALL.csv
finish concat
getting equal hour day7
19534560
-------
getting equal hour day8
20446743
-------
getting equal hour day9
20898422
-------
57537505


## load equal hour data

In [6]:
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_equalhour_supplementV1.csv'
day_list = ['day7', 'day8', 'day9']
df_dict = {}
for file in ['day7', 'day8', 'day9','test']: 
    df_dict[file] = pd.read_csv(load_path+file_format.format(file), dtype=dtypes)
    print(file_format.format(file))
    

day7_equalhour_supplementV1.csv
day8_equalhour_supplementV1.csv
day9_equalhour_supplementV1.csv
test_equalhour_supplementV1.csv


## replace counts with extra data

In [10]:
for col in df_extra_dict['test'].columns:
    print(col)
    for day in ['day7', 'day8', 'day9','test']:
        print('{} - {}'.format(day, col))
        df_dict[day][col] = df_extra_dict[day][col]

ip_app_device_time2nextclick
day7 - ip_app_device_time2nextclick
day8 - ip_app_device_time2nextclick
day9 - ip_app_device_time2nextclick
test - ip_app_device_time2nextclick
ip_device_os_time2nextclick
day7 - ip_device_os_time2nextclick
day8 - ip_device_os_time2nextclick
day9 - ip_device_os_time2nextclick
test - ip_device_os_time2nextclick
ip_app_device_os_time2nextclick
day7 - ip_app_device_os_time2nextclick
day8 - ip_app_device_os_time2nextclick
day9 - ip_app_device_os_time2nextclick
test - ip_app_device_os_time2nextclick
ip_app_device_time2previousclick
day7 - ip_app_device_time2previousclick
day8 - ip_app_device_time2previousclick
day9 - ip_app_device_time2previousclick
test - ip_app_device_time2previousclick
ip_device_os_time2previousclick
day7 - ip_device_os_time2previousclick
day8 - ip_device_os_time2previousclick
day9 - ip_device_os_time2previousclick
test - ip_device_os_time2previousclick
ip_app_device_os_time2previousclick
day7 - ip_app_device_os_time2previousclick
day8 - ip_a

## Gabage collection

In [11]:
del df_extra_all
del df_extra_dict
gc.collect()

2939

# define parameter

In [14]:
target = 'is_attributed'

day_list = ['day7', 'day8', 'day9']

combine = 0
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 4000,
        'learning_rate': 0.1,
        'num_leaves': 61,
        'num_threads': 4, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': -1, # no limit. This is used to deal with over-fitting when #data is small.
        'min_data_in_leaf': 390,  #minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.7, #For example, if set to 0.8, will select 80% features before training each tree.  speed up training / deal with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round':60,
        'bagging_fraction': 0.7, #Randomly select part of data without resampling
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration. to enable bagging, bagging_fraction should be set as well
        'bagging_seed': 1,
        'verbose': 0,
        'scale_pos_weight': 400,
        'metric' : [ 'auc']
    }

categorical_col = [ 'app', 'device', 'os', 'channel', 'hour']
feature_cols = list(df_dict['test'].columns.values)

In [15]:
df_dict['day7'].columns.values

array(['ip_day_hour_count', 'ip_os_day_hour_count',
       'ip_app_day_hour_count', 'ip_app_os_day_hour_count',
       'app_day_hour_count', 'ip_device_os_count',
       'ip_app_device_os_count', 'ip_day_hour_countself',
       'ip_os_day_hour_countself', 'ip_app_day_hour_countself',
       'ip_app_os_day_hour_countself', 'app_day_hour_countself',
       'ip_device_os_countself', 'ip_app_device_os_countself',
       'ip_device_os_mean', 'ip_app_device_os_mean', 'ip_app_device_mean',
       'app_device_os_mean', 'ip_device_os_time2nextclick',
       'ip_app_device_os_time2nextclick', 'ip_app_device_time2nextclick',
       'ip_device_os_time2previousclick',
       'ip_app_device_os_time2previousclick',
       'ip_app_device_time2previousclick', 'ip_device_os_countfromfuture',
       'ip_app_device_os_countfromfuture', 'ip_app_device_countfromfuture',
       'ip_device_os_countfrompast', 'ip_app_device_os_countfrompast',
       'ip_app_device_countfrompast', 'ip_device_os_lasttimediff',
 

In [16]:
# get rid of extra features
# extra = set(['matrixFact_user_iposdeviceapp_item_channel', 'matrixFact_user_iposdeviceapp_item_device'])
extra = set(['ip_device_os_countself',
             'ip_day_hour_countself', 
             'app_day_hour_countself',
             'ip_app_device_os_countself',
             'ip_app_day_hour_countself', 
             'ip_os_day_hour_countself', 
             'ip_app_os_day_hour_countself' ])

# extra = set([])
# extra = set(['ip_device_os_count',
#              'ip_day_hour_count', 
#              'app_day_hour_count',
#              'ip_app_device_os_count',
#              'ip_app_day_hour_count', 
#              'ip_os_day_hour_count', 
#              'ip_app_os_day_hour_count' ])
feature_cols = list(set(feature_cols) - extra)

In [17]:
feature_cols

['ip_device_os_time2nextclick',
 'hour',
 'ip_app_device_regression',
 'ip_app_device_os_mean',
 'ip_app_device_firsttimediff',
 'os',
 'ip_os_day_hour_count',
 'ip_device_os_time2previousclick',
 'ip_app_device_os_countfrompast',
 'channel',
 'ip_app_device_countfrompast',
 'matrixFact_user_ip_item_appdeviceos',
 'ip_app_device_os_count',
 'matrixFact_user_iposdeviceapp_item_app',
 'ip_day_hour_count',
 'ip_app_device_os_firsttimediff',
 'ip_app_device_os_channel_regression',
 'ip_device_os_firsttimediff',
 'device',
 'app',
 'ip_app_device_time2nextclick',
 'ip_app_device_lasttimediff',
 'ip_app_device_mean',
 'ip_device_os_countfrompast',
 'ip_app_device_os_time2nextclick',
 'ip_app_device_os_time2previousclick',
 'ip_app_device_time2previousclick',
 'ip_app_device_countfromfuture',
 'ip_app_device_os_lasttimediff',
 'app_day_hour_count',
 'ip_device_os_mean',
 'ip_device_os_regression',
 'ip_app_device_os_countfromfuture',
 'ip_app_os_day_hour_count',
 'ip_app_day_hour_count',
 'ip

In [18]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values
    
    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        del param['early_stopping_round']
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

In [19]:
for day in day_list:
    train_day = list(set(day_list)-set([day]))
    file_name = 'concat_{}{}_val_{}_equalhour_supplement_V1_extra.npy'.format(train_day[0],train_day[1],day)
    print(file_name)
    trainset = pd.concat([df_dict[train_day[0]],df_dict[train_day[1]]])
    valset = df_dict[day]
    print('building train val done!')
    model = train_lightgbm(trainset, valset, feature_cols, categorical_col, params)
    best_round = model.best_iteration
    df_all = pd.concat([trainset, valset])
    model = train_lightgbm(df_all, valset, feature_cols, categorical_col, params, best_round)
    del df_all
    gc.collect()
    pred = model.predict(df_dict['test'][feature_cols])
    np.save(load_path+file_name, pred)
    
    # prediction
    df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
    print('loading file done!')
    df_sub = pd.DataFrame()
    df_sub['click_id'] = df_test_raw['click_id']
    df_sub['is_attributed'] = pred
    print('predicting file done!')
    df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/concat_{}{}_val_{}_equalhour_supplement_V1_extra.csv.gz'.format(train_day[0],train_day[1],day), compression='gzip', index=False)

    

concat_day9day8_val_day7_equalhour_supplement_V1_extra.npy
building train val done!
start training




Training until validation scores don't improve for 60 rounds.
[10]	valid_0's auc: 0.977104
[20]	valid_0's auc: 0.97848
[30]	valid_0's auc: 0.979543
[40]	valid_0's auc: 0.980086
[50]	valid_0's auc: 0.980625
[60]	valid_0's auc: 0.980941
[70]	valid_0's auc: 0.981102
[80]	valid_0's auc: 0.981188
[90]	valid_0's auc: 0.98117
[100]	valid_0's auc: 0.981218
[110]	valid_0's auc: 0.981217
[120]	valid_0's auc: 0.981238
[130]	valid_0's auc: 0.981242
[140]	valid_0's auc: 0.981225
[150]	valid_0's auc: 0.98122
[160]	valid_0's auc: 0.981199
[170]	valid_0's auc: 0.981191
[180]	valid_0's auc: 0.981168
Early stopping, best iteration is:
[127]	valid_0's auc: 0.981254
start training
[10]	valid_0's auc: 0.978195
[20]	valid_0's auc: 0.980116


KeyboardInterrupt: 

In [13]:
importance = pd.Series(model.feature_importance(importance_type='gain'), index=feature_cols)
# importance = pd.Series(model.feature_importance(), index=feature_cols)
importance = importance.sort_values(ascending=False)
if len(model.feature_importance()) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(model.feature_importance()), len(feature_cols)))
    
importance

matrixFact_user_ip_item_appdeviceos           3.258546e+08
matrixFact_user_iposdeviceapp_item_app        7.112340e+07
matrixFact_user_ipchannel_item_appdeviceos    2.337828e+07
app_device_os_mean                            1.697719e+07
channel                                       1.335874e+07
ip_app_device_os_time2nextclick               1.112984e+07
ip_app_device_os_lasttimediff                 6.341537e+06
os                                            5.938842e+06
app                                           5.669180e+06
ip_day_hour_count                             5.045337e+06
ip_device_os_count                            3.062189e+06
ip_app_device_os_countfromfuture              2.117160e+06
ip_app_os_day_hour_count                      1.342405e+06
ip_app_device_os_count                        1.310539e+06
ip_app_device_mean                            1.298044e+06
hour                                          1.106012e+06
ip_app_device_firsttimediff                   9.815325e+

# Average

In [14]:
#### load each and average
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
day7 = np.load(load_path+'concat_day8day9_val_day7_equalhour_supplement_V1.npy')
day8 = np.load(load_path+'concat_day7day9_val_day8_equalhour_supplement_V1.npy')
day9 = np.load(load_path+'concat_day7day8_val_day9_equalhour_supplement_V1.npy')

pred = (day7 + day8 + day9)/3

In [16]:
# prediction
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
print('loading file done!')
df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = pred
print('predicting file done!')
df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/equal_hour_blend_equalhour_supplement_V1.csv.gz', compression='gzip', index=False)


loading file done!
predicting file done!
