In [1]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
import lightgbm as lgb
import pickle
import time
import datetime
import math
import gc
import warnings
warnings.filterwarnings('ignore')



In [5]:
root_path = './'
predictors = []

In [6]:
########################################### Helper function ###########################################


In [8]:
def encode_onehot(df, column_name):
    df_onehot = pd.get_dummies(df[column_name], prefix=column_name)
    df_all = pd.concat([df.drop([column_name], axis=1), df_onehot], axis=1)
    predictors.append(column_name)
    return df_all


def encode_count(df, column_name):
    le = preprocessing.LabelEncoder()
    le.fit(list(df[column_name].values))
    df[column_name] = le.transform(list(df[column_name].values))
    predictors.append(column_name)
    return df


def merge_count(df, columns_groupby, new_column_name, type='uint64'):
    add = pd.DataFrame(df.groupby(columns_groupby).size()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_nunique(df, columns_groupby, column, new_column_name, type='uint64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].nunique()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_cumcount(df, columns_groupby, column, new_column_name, type='uint64'):
    df[new_column_name] = df.groupby(columns_groupby)[column].cumcount().values.astype(type)
    predictors.append(new_column_name)
    return df


def merge_median(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].median()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_mean(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].mean()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_sum(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].sum()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    # predictors.append(new_column_name)  # bug: twice
    return df


def merge_max(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].max()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_min(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].min()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_std(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].std()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_var(df, columns_groupby, column, new_column_name, type='float64'):
    add = pd.DataFrame(df.groupby(columns_groupby)[column].var()).reset_index()
    add.columns = columns_groupby + [new_column_name]
    df = df.merge(add, on=columns_groupby, how="left")
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_rank(df, columns_groupby, column, new_column_name, ascending=True, type='uint64'):
    df[new_column_name] = df.groupby(columns_groupby)[column].rank(ascending=ascending)
    df[new_column_name] = df[new_column_name].astype(type)
    predictors.append(new_column_name)
    return df


def merge_feat_count(df, df_feat, columns_groupby, column, new_column_name=""):
    df_count = pd.DataFrame(df_feat.groupby(columns_groupby)[column].count()).reset_index()
    if not new_column_name:
        df_count.columns = columns_groupby + [column + "_gb_%s_count" % ("_".join(columns_groupby))]
    else:
        df_count.columns = columns_groupby + [new_column_name]
    df = df.merge(df_count, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_count.columns[-1])
    return df


def merge_feat_nunique(df, df_feat, columns_groupby, column, new_column_name=""):
    df_nunique = pd.DataFrame(df_feat.groupby(columns_groupby)[column].nunique()).reset_index()
    if not new_column_name:
        df_nunique.columns = columns_groupby + [column + "_%s_nunique" % ("_".join(columns_groupby))]
    else:
        df_nunique.columns = columns_groupby + [new_column_name]
    df = df.merge(df_nunique, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_nunique.columns[-1])
    return df


def merge_feat_mean(df, df_feat, columns_groupby, column, new_column_name=""):
    df_mean = pd.DataFrame(df_feat.groupby(columns_groupby)[column].mean()).reset_index()
    if not new_column_name:
        df_mean.columns = columns_groupby + [column + "_%s_mean" % ("_".join(columns_groupby))]
    else:
        df_mean.columns = columns_groupby + [new_column_name]
    df = df.merge(df_mean, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_mean.columns[-1])
    return df


def merge_feat_std(df, df_feat, columns_groupby, column, new_column_name=""):
    df_std = pd.DataFrame(df_feat.groupby(columns_groupby)[column].std()).reset_index()
    if not new_column_name:
        df_std.columns = columns_groupby + [column + "_%s_std" % ("_".join(columns_groupby))]
    else:
        df_std.columns = columns_groupby + [new_column_name]
    df = df.merge(df_std, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_std.columns[-1])
    return df


def merge_feat_median(df, df_feat, columns_groupby, column, new_column_name=""):
    df_median = pd.DataFrame(df_feat.groupby(columns_groupby)[column].median()).reset_index()
    if not new_column_name:
        df_median.columns = columns_groupby + [column + "_%s_median" % ("_".join(columns_groupby))]
    else:
        df_median.columns = columns_groupby + [new_column_name]
    df = df.merge(df_median, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_median.columns[-1])
    return df


def merge_feat_max(df, df_feat, columns_groupby, column, new_column_name=""):
    df_max = pd.DataFrame(df_feat.groupby(columns_groupby)[column].max()).reset_index()
    if not new_column_name:
        df_max.columns = columns_groupby + [column + "_%s_max" % ("_".join(columns_groupby))]
    else:
        df_max.columns = columns_groupby + [new_column_name]
    df = df.merge(df_max, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_max.columns[-1])
    return df


def merge_feat_min(df, df_feat, columns_groupby, column, new_column_name=""):
    df_min = pd.DataFrame(df_feat.groupby(columns_groupby)[column].min()).reset_index()
    if not new_column_name:
        df_min.columns = columns_groupby + [column + "_%s_min" % ("_".join(columns_groupby))]
    else:
        df_min.columns = columns_groupby + [new_column_name]
    df = df.merge(df_min, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_min.columns[-1])
    return df


def merge_feat_sum(df, df_feat, columns_groupby, column, new_column_name=""):
    df_sum = pd.DataFrame(df_feat.groupby(columns_groupby)[column].sum()).reset_index()
    if not new_column_name:
        df_sum.columns = columns_groupby + [column + "_%s_sum" % ("_".join(columns_groupby))]
    else:
        df_sum.columns = columns_groupby + [new_column_name]
    df = df.merge(df_sum, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_sum.columns[-1])
    return df


def merge_feat_var(df, df_feat, columns_groupby, column, new_column_name=""):
    df_var = pd.DataFrame(df_feat.groupby(columns_groupby)[column].var()).reset_index()
    if not new_column_name:
        df_var.columns = columns_groupby + [column + "_%s_var" % ("_".join(columns_groupby))]
    else:
        df_var.columns = columns_groupby + [new_column_name]
    df = df.merge(df_var, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_var.columns[-1])
    return df


def merge_feat_quantile(df, df_feat, columns_groupby, column, quantile_n, new_column_name=""):
    df_quantile = pd.DataFrame(df_feat.groupby(columns_groupby)[column].quantile(quantile_n)).reset_index()
    if not new_column_name:
        df_quantile.columns = columns_groupby + [column + "_%s_quantile" % ("_".join(columns_groupby))]
    else:
        df_quantile.columns = columns_groupby + [new_column_name]
    df = df.merge(df_quantile, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_quantile.columns[-1])
    return df


def merge_feat_skew(df, df_feat, columns_groupby, column, new_column_name=""):
    df_skew = pd.DataFrame(df_feat.groupby(columns_groupby)[column].skew()).reset_index()
    if not new_column_name:
        df_skew.columns = columns_groupby + [column + "_%s_skew" % ("_".join(columns_groupby))]
    else:
        df_skew.columns = columns_groupby + [new_column_name]
    df = df.merge(df_skew, on=columns_groupby, how="left").fillna(0)
    predictors.append(df_skew.columns[-1])
    return df


def merge_rank_sp(df, feat1, feat2, ascending):
    df.sort_values([feat1, feat2], inplace=True, ascending=ascending)
    df['rank'] = range(df.shape[0])
    min_rank = df.groupby(feat1, as_index=False)['rank'].agg({'min_rank': 'min'})
    df = pd.merge(df, min_rank, on=feat1, how='left')
    df['rank'] = df['rank'] - df['min_rank']
    predictors.append('rank')
    del df['min_rank']
    return df


def log(info):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ' + str(info))


def log_shape(train, test):
    log('Train data shape: %s' % str(train.shape))
    log('Test data shape: %s' % str(test.shape))

In [7]:
def process_date(df):
    format = '%Y-%m-%d %H:%M:%S'
    df['date'] = pd.to_datetime(df['click_time'], format=format)
    df['month'] = df['date'].dt.month.astype('uint8')
    df['weekday'] = df['date'].dt.weekday.astype('uint8')
    df['day'] = df['date'].dt.day.astype('uint8')
    df['hour'] = df['date'].dt.hour.astype('uint8')
    df['minute'] = df['date'].dt.minute.astype('uint8')
    df['second'] = df['date'].dt.second.astype('uint8')
    df['tm_hour'] = (df['hour'] + df['minute'] / 60.0).astype('float32')
    df['tm_hour_sin'] = (df['tm_hour'].map(lambda x: math.sin((x - 12) / 24 * 2 * math.pi))).astype('float32')
    df['tm_hour_cos'] = (df['tm_hour'].map(lambda x: math.cos((x - 12) / 24 * 2 * math.pi))).astype('float32')
    del df['click_time']
    return df

In [10]:
########### Construct features function - begin ###########

In [3]:
import pandas as pd
dtypes = {
    'click_id': 'uint32',
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}

In [4]:
gyz = pd.read_csv('train.csv', nrows=1000, header=0, sep=',', dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

In [9]:
import math

In [10]:
gyz = process_date(gyz)
gyz.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,date,month,weekday,day,hour,minute,second,tm_hour,tm_hour_sin,tm_hour_cos
0,83230,3,1,13,379,0,2017-11-06 14:32:21,11,0,6,14,32,21,14.533334,0.615662,0.788011
1,17357,3,1,19,379,0,2017-11-06 14:33:34,11,0,6,14,33,34,14.55,0.619094,0.785317
2,35810,3,1,13,379,0,2017-11-06 14:34:12,11,0,6,14,34,12,14.566667,0.622515,0.782608
3,45745,14,1,13,478,0,2017-11-06 14:34:52,11,0,6,14,34,52,14.566667,0.622515,0.782608
4,161007,3,1,13,379,0,2017-11-06 14:35:08,11,0,6,14,35,8,14.583333,0.625923,0.779885


In [16]:
gyz[['ip', 'os', 'device', 'app', "date"]].groupby(['ip', 'os', 'device', 'app']).date.shift(-1) - gyz.date

0          NaT
1          NaT
2          NaT
3          NaT
4          NaT
5          NaT
6          NaT
7          NaT
8          NaT
9          NaT
10         NaT
11         NaT
12         NaT
13         NaT
14         NaT
15         NaT
16         NaT
17         NaT
18         NaT
19         NaT
20         NaT
21         NaT
22         NaT
23         NaT
24         NaT
25         NaT
26         NaT
27    01:00:48
28         NaT
29         NaT
        ...   
970        NaT
971        NaT
972        NaT
973        NaT
974        NaT
975        NaT
976        NaT
977        NaT
978        NaT
979        NaT
980        NaT
981        NaT
982        NaT
983        NaT
984        NaT
985        NaT
986        NaT
987        NaT
988        NaT
989        NaT
990        NaT
991        NaT
992        NaT
993        NaT
994        NaT
995        NaT
996        NaT
997        NaT
998        NaT
999        NaT
Name: date, Length: 1000, dtype: timedelta64[ns]

In [11]:
def cal_next_time_delta(df, suffix, type='float32'):
    groupby_columns = [
        {'columns': ['ip', 'app', 'channel', 'device', 'os']},
        {'columns': ['ip', 'os', 'device']},
        {'columns': ['ip', 'os', 'device', 'app']}
    ]
    #group by ip os device app and date
    # Calculate the time to next click for each group
    for spec in groupby_columns:
        # Name of new feature
        new_name = '{}_{}'.format('_'.join(spec['columns']), suffix)
        # Unique list of features to select
        all_features = spec['columns'] + ['date']
        # Run calculation
        log('Calculate ' + suffix + '...')
        df[new_name] = (df[all_features].groupby(spec['columns']).date.shift(-1) - df.date).dt.seconds.astype(type)
        predictors.append(new_name)
        gc.collect()
    return df


def cal_prev_time_delta(df, suffix, type='float32'):
    groupby_columns = [
        {'columns': ['ip', 'channel']},
        {'columns': ['ip', 'os']}
    ]
    # Calculate the time to prev click for each group
    for spec in groupby_columns:
        # Name of new feature
        new_name = '{}_{}'.format('_'.join(spec['columns']), suffix)
        # Unique list of features to select
        all_features = spec['columns'] + ['date']
        # Run calculation
        log('Calculate ' + suffix + '...')
        df[new_name] = (df.date - df[all_features].groupby(spec['columns']).date.shift(+1)).dt.seconds.astype(type)
        predictors.append(new_name)
        gc.collect()
    return df


def cal_cvr(train, test, type='float32'):
    train['cvr_gb_ip_day_hour'] = 0
    train['cvr_gb_ip_app'] = 0
    train['cvr_gb_ip_app_os'] = 0

    # Define group by list
    idh = ['ip', 'day', 'hour']
    ia = ['ip', 'app']
    iao = ['ip', 'app', 'os']

    kf = KFold(train.shape[0], n_folds=5, shuffle=True, random_state=7)

    for i, (train_index, test_index) in enumerate(kf):
        log('Fold ' + str(i) + ' begin...')

        # Divide train/test fold
        tr = train.iloc[train_index, :train.shape[1] - 3]
        te = train.iloc[test_index, :train.shape[1] - 3]

        # Calculate sum of label of train folds
        log('Cal sum_label_gb_ip_day_hour')
        tr = merge_sum(tr, idh, 'is_attributed', 'sum_label_gb_ip_day_hour')
        log('Cal sum_label_gb_ip_app')
        tr = merge_sum(tr, ia, 'is_attributed', 'sum_label_gb_ip_app')
        log('Cal sum_label_gb_ip_app_os')
        tr = merge_sum(tr, iao, 'is_attributed', 'sum_label_gb_ip_app_os')

        # Calculate cvr of train folds with using smothing technique
        tr['cvr_gb_ip_day_hour'] = GaussianSmoth().update_moment(tr['count_gb_ip_day_hour'], tr['sum_label_gb_ip_day_hour'])
        tr['cvr_gb_ip_app'] = GaussianSmoth().update_moment(tr['count_gb_ip_app'], tr['sum_label_gb_ip_app'])
        tr['cvr_gb_ip_app_os'] = GaussianSmoth().update_moment(tr['count_gb_ip_app_os'], tr['sum_label_gb_ip_app_os'])

        # Merge test fold with cvr features of train folds
        te = te.merge(tr[['cvr_gb_ip_day_hour'] + idh].drop_duplicates(subset=idh, keep='first'), on=idh, how='left')
        te = te.merge(tr[['cvr_gb_ip_app'] + ia].drop_duplicates(subset=ia, keep='first'), on=ia, how='left')
        te = te.merge(tr[['cvr_gb_ip_app_os'] + iao].drop_duplicates(subset=iao, keep='first'), on=iao, how='left')

        # Put it in train
        train['cvr_gb_ip_day_hour'] += te['cvr_gb_ip_day_hour']
        train['cvr_gb_ip_app'] += te['cvr_gb_ip_app']
        train['cvr_gb_ip_app_os'] += te['cvr_gb_ip_app_os']

        del tr, te
        log('Fold ' + str(i) + ' Done!')

    # Convert type
    train['cvr_gb_ip_day_hour'] = train['cvr_gb_ip_day_hour'].astype(type)
    train['cvr_gb_ip_app'] = train['cvr_gb_ip_app'].astype(type)
    train['cvr_gb_ip_app_os'] = train['cvr_gb_ip_app_os'].astype(type)

    # Merge cvr of train to test
    test = test.merge(train[['cvr_gb_ip_day_hour'] + idh].drop_duplicates(subset=idh, keep='first'), on=idh, how='left')
    test = test.merge(train[['cvr_gb_ip_app'] + ia].drop_duplicates(subset=ia, keep='first'), on=ia, how='left')
    test = test.merge(train[['cvr_gb_ip_app_os'] + iao].drop_duplicates(subset=iao, keep='first'), on=iao, how='left')

    predictors.append('cvr_gb_ip_day_hour')
    predictors.append('cvr_gb_ip_app')
    predictors.append('cvr_gb_ip_app_os')

    return train, test

In [12]:
########### Construct features function - end ###########

In [13]:
def spilt_local_train_test(df, train_size, test_size):
    local_train = df[:train_size]
    local_test = df[train_size:train_size + test_size]
    return local_train, local_test


def get_model_input_data(train, test, is_local):
    feat = ['ip', 'app', 'device', 'os', 'channel', 'hour']
    for f in feat:
        if f not in predictors:
            predictors.append(f)
    train_x = train[predictors]
    train_y = train.is_attributed.values
    if is_local == 1:
        test_x = test[train_x.columns.values]
        test_y = test.is_attributed.values
        return train_x, train_y, test_x, test_y
    else:
        test_x = test[train_x.columns.values]
        return train_x, train_y, test_x

In [10]:
def lgb_cv(train_feature, train_label, test_feature, test_label, params, folds, rounds):
    start = time.clock()
    print(train_feature.columns)
    params['scale_pos_weight'] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = lgb.Dataset(train_feature, label=train_label, categorical_feature=['app', 'device', 'os', 'channel', 'hour'])
    dtest = lgb.Dataset(test_feature, label=test_label, categorical_feature=['app', 'device', 'os', 'channel', 'hour'])
    num_round = rounds
    print('LightGBM run cv: ' + 'round: ' + str(rounds))
    res = lgb.train(params, dtrain, num_round, valid_sets=[dtest], valid_names=['test'], verbose_eval=1, early_stopping_rounds=20)
    elapsed = (time.clock() - start)
    print('Time used:', elapsed, 's')
    return res.best_iteration, res.best_score['test']['auc'], res


def lgb_predict(dtrain, test_feature, rounds, params):
#     dtrain = lgb.Dataset(train_feature, label=train_label, categorical_feature=['app', 'device', 'os', 'channel', 'hour'])
    num_round = rounds
    model = lgb.train(params, dtrain, num_round, valid_sets=[dtrain], verbose_eval=1)
    predict = model.predict(test_feature)
    return model, predict

In [16]:
def store_result(test_index, pred, name):
    result = pd.DataFrame({'click_id': test_index, 'is_attributed': pred})
    result.to_csv(name, index=False, sep=',')
    return result

In [17]:
class GaussianSmoth(object):
    def __init__(self, alpha=0, beta=0):
        self.alpha = alpha
        self.beta = beta

    def update_moment(self, tries, success):
        '''estimate alpha, beta using moment estimation'''
        mean, var = self.__compute_moment(tries, success)
        self.alpha = (mean + 0.000001) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)
        self.beta = (1.000001 - mean) * ((mean + 0.000001) * (1.000001 - mean) / (var + 0.000001) - 1)
        print(self.alpha, self.beta)
        return (self.alpha + success) / (self.alpha + self.beta + tries)

    def __compute_moment(self, tries, success):
        # Cal mean and variance
        '''moment estimation'''
        ctr_list = []
        mean = (success / tries).mean()
        if len(tries) == 1:
            var = 0
        else:
            var = (success / tries).var()
        return mean, var

In [18]:
########################################### Read data ###########################################

In [19]:
log('Read data...')
dtypes = {
    'click_id': 'uint32',
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
train = pd.read_csv(root_path + 'train.csv', header=0, sep=',', dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])
test_supplement = pd.read_csv(root_path + 'test_supplement.csv', header=0, sep=',', dtype=dtypes, usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time'])
gc.collect()
log('Read data done!')
log_shape(train, test_supplement)

2018-06-28 15:51:35 Read data...
2018-06-28 15:54:16 Read data done!
2018-06-28 15:54:16 Train data shape: (184903890, 7)
2018-06-28 15:54:16 Test data shape: (57537505, 6)


In [20]:
########################################### Preprocess ###########################################

In [23]:
log(('Process date...'))
train = process_date(train)
test_supplement = process_date(test_supplement)
gc.collect()
log('Process date done!')
log_shape(train, test_supplement)

2018-06-28 15:55:52 Process date...
2018-06-28 16:02:23 Process date done!
2018-06-28 16:02:23 Train data shape: (184903890, 16)
2018-06-28 16:02:23 Test data shape: (57537505, 15)


In [24]:
########################################### Feature engineer ###########################################

In [25]:
train_len = len(train)
log('Train size:' + str(train_len))

2018-06-28 16:02:23 Train size:184903890


In [26]:
log('Train append test_supplement...')
df = train.append(test_supplement).reset_index(drop=True)
del train
del test_supplement
gc.collect()
log('Train append test_supplement done!')

2018-06-28 16:02:23 Train append test_supplement...
2018-06-28 16:02:51 Train append test_supplement done!


In [27]:
log('Before feature engineer')
log('Num of features: ' + str(len(df.columns)))
log('Features: ' + str(df.columns))

2018-06-28 16:02:51 Before feature engineer
2018-06-28 16:02:51 Num of features: 16
2018-06-28 16:02:51 Features: Index(['app', 'channel', 'date', 'day', 'device', 'hour', 'ip',
       'is_attributed', 'minute', 'month', 'os', 'second', 'tm_hour',
       'tm_hour_cos', 'tm_hour_sin', 'weekday'],
      dtype='object')


In [28]:
# Construct features...

In [29]:
log('Cal next_time_delta')
df = cal_next_time_delta(df, 'next_time_delta', 'float32')
gc.collect()
log('Cal prev_time_delta')
df = cal_prev_time_delta(df, 'prev_time_delta', 'float32')
gc.collect()

2018-06-28 16:02:51 Cal next_time_delta
2018-06-28 16:02:51 Calculate next_time_delta...
2018-06-28 16:06:26 Calculate next_time_delta...
2018-06-28 16:08:04 Calculate next_time_delta...
2018-06-28 16:10:39 Cal prev_time_delta
2018-06-28 16:10:39 Calculate prev_time_delta...
2018-06-28 16:12:43 Calculate prev_time_delta...


0

In [31]:
log('Cal nunique_channel_gb_ip')
df = merge_nunique(df, ['ip'], 'channel', 'nunique_channel_gb_ip', 'uint32')
gc.collect()
log('Cal nunique_app_gb_ip_device_os')
df = merge_nunique(df, ['ip', 'device', 'os'], 'app', 'nunique_app_gb_ip_device_os', 'uint32')
gc.collect()
log('Cal nunique_hour_gb_ip_day')
df = merge_nunique(df, ['ip', 'day'], 'hour', 'nunique_hour_gb_ip_day', 'uint32')
gc.collect()
log('Cal nunique_app_gb_ip')
df = merge_nunique(df, ['ip'], 'app', 'nunique_app_gb_ip', 'uint32')
gc.collect()
log('Cal nunique_os_gb_ip_app')
df = merge_nunique(df, ['ip', 'app'], 'os', 'nunique_os_gb_ip_app', 'uint32')
gc.collect()
log('Cal nunique_device_gb_ip')
df = merge_nunique(df, ['ip'], 'device', 'nunique_device_gb_ip', 'uint32')
gc.collect()
log('Cal nunique_channel_gb_app')
df = merge_nunique(df, ['app'], 'channel', 'nunique_channel_gb_app', 'uint32')
gc.collect()

2018-06-28 16:18:57 Cal nunique_channel_gb_ip
2018-06-28 16:24:54 Cal nunique_app_gb_ip_device_os
2018-06-28 16:32:18 Cal nunique_hour_gb_ip_day
2018-06-28 16:37:43 Cal nunique_app_gb_ip
2018-06-28 16:44:02 Cal nunique_os_gb_ip_app
2018-06-28 16:51:35 Cal nunique_device_gb_ip
2018-06-28 16:57:13 Cal nunique_channel_gb_app


42

In [32]:
log('Cal cumcount_os_gb_ip')
df = merge_cumcount(df, ['ip'], 'os', 'cumcount_os_gb_ip', 'uint32');
gc.collect()
log('Cal cumcount_app_gb_ip_device_os')
df = merge_cumcount(df, ['ip', 'device', 'os'], 'app', 'cumcount_app_gb_ip_device_os', 'uint32');
gc.collect()

2018-06-28 17:10:51 Cal cumcount_os_gb_ip
2018-06-28 17:12:20 Cal cumcount_app_gb_ip_device_os


14

In [33]:
log('Cal count_gb_ip_day_hour')
df = merge_count(df, ['ip', 'day', 'hour'], 'count_gb_ip_day_hour', 'uint32');
gc.collect()
log('Cal count_gb_ip_app')
df = merge_count(df, ['ip', 'app'], 'count_gb_ip_app', 'uint32');
gc.collect()
log('Cal count_gb_ip_app_os')
df = merge_count(df, ['ip', 'app', 'os'], 'count_gb_ip_app_os', 'uint32');
gc.collect()

2018-06-28 17:20:37 Cal count_gb_ip_day_hour
2018-06-28 17:25:12 Cal count_gb_ip_app
2018-06-28 17:30:51 Cal count_gb_ip_app_os


56

In [35]:
log('Cal var_day_gb_ip_app_os')
df = merge_var(df, ['ip', 'app', 'os'], 'day', 'var_day_gb_ip_app_os', 'float32')
gc.collect()

2018-06-28 17:40:06 Cal var_day_gb_ip_app_os


129

In [36]:
log('After feature engineer')
log('Num of features: ' + str(len(df.columns)))
log('Features: ' + str(df.columns))

2018-06-28 17:47:24 After feature engineer
2018-06-28 17:47:24 Num of features: 34
2018-06-28 17:47:24 Features: Index(['app', 'channel', 'date', 'day', 'device', 'hour', 'ip',
       'is_attributed', 'minute', 'month', 'os', 'second', 'tm_hour',
       'tm_hour_cos', 'tm_hour_sin', 'weekday',
       'ip_app_channel_device_os_next_time_delta',
       'ip_os_device_next_time_delta', 'ip_os_device_app_next_time_delta',
       'ip_channel_prev_time_delta', 'ip_os_prev_time_delta',
       'nunique_channel_gb_ip', 'nunique_app_gb_ip_device_os',
       'nunique_hour_gb_ip_day', 'nunique_app_gb_ip', 'nunique_os_gb_ip_app',
       'nunique_device_gb_ip', 'nunique_channel_gb_app', 'cumcount_os_gb_ip',
       'cumcount_app_gb_ip_device_os', 'count_gb_ip_day_hour',
       'count_gb_ip_app', 'count_gb_ip_app_os', 'var_day_gb_ip_app_os'],
      dtype='object')


In [42]:
pickle.dump(df, open("chinese_person_model.pickle", "wb"), protocol=4)

In [43]:
a = pickle.load(open("chinese_person_model.pickle", "rb"))

In [45]:
del a

In [46]:
log('Train test_supplement divid...')
train = df[:train_len]
test_supplement = df[train_len:]
del df
gc.collect()
log_shape(train, test_supplement)
log('Train test_supplement divid done!')

2018-06-28 18:47:50 Train test_supplement divid...
2018-06-28 18:48:09 Train data shape: (184903890, 34)
2018-06-28 18:48:09 Test data shape: (57537505, 34)
2018-06-28 18:48:09 Train test_supplement divid done!


In [47]:
log('Read test...')
test = pd.read_csv(root_path + 'test.csv', header=0, sep=',', dtype=dtypes, usecols=['click_id', 'ip', 'app', 'device', 'os', 'channel', 'click_time'], parse_dates=['click_time'])
log('Test data original shape: ' + str(test.shape))

2018-06-28 18:48:22 Read test...
2018-06-28 18:48:42 Test data original shape: (18790469, 7)


In [48]:
test = test.merge(test_supplement.drop_duplicates(subset=['ip', 'app', 'device', 'os', 'channel', 'date'], keep='first'), left_on=['ip', 'app', 'device', 'os', 'channel', 'click_time'], right_on=['ip', 'app', 'device', 'os', 'channel', 'date'], how='left')
test.drop(['click_time'], axis=1, inplace=True)
del test_supplement
gc.collect()
log_shape(train, test)
log('Read test done!')

2018-06-28 18:51:54 Train data shape: (184903890, 34)
2018-06-28 18:51:54 Test data shape: (18790469, 35)
2018-06-28 18:51:54 Read test done!


In [49]:
log('Cal cvr...')
train, test = cal_cvr(train, test, 'float32')
log('Cal cvr done!')

2018-06-28 18:52:12 Cal cvr...
2018-06-28 18:52:36 Fold 0 begin...
2018-06-28 18:53:37 Cal sum_label_gb_ip_day_hour
2018-06-28 18:56:34 Cal sum_label_gb_ip_app
2018-06-28 19:00:02 Cal sum_label_gb_ip_app_os
0.0035311148268673486 1.656359258566487
0.001306208065960712 0.6615934683856983
0.000896427256614664 0.4242455477965775
2018-06-28 19:08:39 Fold 0 Done!
2018-06-28 19:08:42 Fold 1 begin...
2018-06-28 19:09:43 Cal sum_label_gb_ip_day_hour
2018-06-28 19:12:43 Cal sum_label_gb_ip_app
2018-06-28 19:16:30 Cal sum_label_gb_ip_app_os
0.0035423236432704022 1.6610043930628793
0.0013066384017462493 0.6614581849267422
0.0008975560377448102 0.42456757856918026
2018-06-28 19:25:04 Fold 1 Done!
2018-06-28 19:25:08 Fold 2 begin...
2018-06-28 19:26:06 Cal sum_label_gb_ip_day_hour
2018-06-28 19:29:07 Cal sum_label_gb_ip_app
2018-06-28 19:32:52 Cal sum_label_gb_ip_app_os
0.003544077155657474 1.6644437732869244
0.0013067475306259206 0.6628916191252506
0.000897447530380914 0.4253481040987524
2018-06-28

In [50]:
cvr_feats = ['cvr_gb_ip_day_hour', 'cvr_gb_ip_app', 'cvr_gb_ip_app_os']

In [51]:
pickle.dump(train[cvr_feats], open('train_cvr.p', 'wb'), protocol=4)
pickle.dump(test[cvr_feats], open('test_cvr.p', 'wb'), protocol=4)

In [52]:

########################################### Split dataset for local ##################################################

In [53]:
log('Split dataset to get local train/test set...')
local_train_size = 10000000  # 182403890
local_test_size = 2500000
local_train, local_test = spilt_local_train_test(train, local_train_size, local_test_size)
log('Split dataset to get local train/test set done!')

log('================================= Local data info =====================================')
log('Local train shape:' + str(local_train.shape))
log('Local test shape:' + str(local_test.shape))
log('Local train label ratio (0-1):' + str(local_train.is_attributed.value_counts().values * 1.0 / local_train.shape[0]))
log('Local train label number (0-1):' + str(local_train.is_attributed.value_counts().values))
log('Local train min/max date:' + str(local_train.date.min()) + ',' + str(local_train.date.max()))
log('Local test min/max date:' + str(local_test.date.min()) + ',' + str(local_test.date.max()))
log('=======================================================================================')

log('================================= Online data info =====================================')
log('Online train shape:' + str(train.shape))
log('Online test shape:' + str(test.shape))
log('Online train label ratio (0-1):' + str(train.is_attributed.value_counts().values * 1.0 / train.shape[0]))
log('Online train label number (0-1):' + str(train.is_attributed.value_counts().values))
log('Online train min/max date:' + str(train.date.min()) + ',' + str(train.date.max()))
log('Online train min/max date:' + str(test.date.min()) + ',' + str(test.date.max()))
log('=======================================================================================')

2018-06-28 20:18:03 Split dataset to get local train/test set...
2018-06-28 20:18:03 Split dataset to get local train/test set done!
2018-06-28 20:18:03 Local train shape:(10000000, 37)
2018-06-28 20:18:03 Local test shape:(2500000, 37)
2018-06-28 20:18:04 Local train label ratio (0-1):[0.9981283 0.0018717]
2018-06-28 20:18:04 Local train label number (0-1):[9981283   18717]
2018-06-28 20:18:04 Local train min/max date:2017-11-06 14:32:21,2017-11-07 00:12:03
2018-06-28 20:18:04 Local test min/max date:2017-11-07 00:12:03,2017-11-07 00:52:39
2018-06-28 20:18:04 Online train shape:(184903890, 37)
2018-06-28 20:18:04 Online test shape:(18790469, 38)
2018-06-28 20:18:06 Online train label ratio (0-1):[0.99752928 0.00247072]
2018-06-28 20:18:07 Online train label number (0-1):[184447044    456846]
2018-06-28 20:18:09 Online train min/max date:2017-11-06 14:32:21,2017-11-09 16:00:00
2018-06-28 20:18:09 Online train min/max date:2017-11-10 04:00:00,2017-11-10 15:00:00


In [54]:
log('Get local model input data...')
local_train_x, local_train_y, local_test_x, local_test_y = get_model_input_data(local_train, local_test, is_local=1)
del local_train
del local_test
gc.collect()
log_shape(local_train_x, local_test_x)
log('Get local model input data done!')

2018-06-28 20:18:30 Get local model input data...
2018-06-28 20:18:30 Train data shape: (10000000, 27)
2018-06-28 20:18:30 Test data shape: (2500000, 27)
2018-06-28 20:18:30 Get local model input data done!


In [55]:
log('Get online model input data...')
online_train_x, online_train_y, online_test_x = get_model_input_data(train, test, is_local=0)
del train
del test
gc.collect()
log_shape(online_train_x, online_test_x)
log('Get online model input data done!')

2018-06-28 20:18:56 Get online model input data...
2018-06-28 20:19:15 Train data shape: (184903890, 27)
2018-06-28 20:19:15 Test data shape: (18790469, 27)
2018-06-28 20:19:15 Get online model input data done!


In [56]:
########################################### LigthGBM ###########################################


In [57]:
config_lgb = {
    'rounds': 10000,
    'folds': 5
}

params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'xentropy',
    'metric': 'auc',
    'learning_rate': 0.02,
    # 'is_unbalance': 'true',  # Because training data is unbalance (replaced with scale_pos_weight)
    'scale_pos_weight': 200,  # Because training data is extremely unbalanced
    'num_leaves': 31,  # We should let it be smaller than 2^(max_depth)
    'max_depth': -1,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 128,  # Number of bucketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # Frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'subsample_for_bin': 200000,  # Number of samples for constructing bin
    'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
    'reg_alpha': 0.99,  # L1 regularization term on weights
    'reg_lambda': 0.9,  # L2 regularization term on weights
    'nthread': 14,
    'verbose': 1,
    'seed': 8
}

In [58]:
iterations_lgb, best_score_lgb, model_cv_lgb = lgb_cv(local_train_x, local_train_y, local_test_x, local_test_y, params_lgb, config_lgb['folds'], config_lgb['rounds'])

Index(['ip_app_channel_device_os_next_time_delta',
       'ip_os_device_next_time_delta', 'ip_os_device_app_next_time_delta',
       'ip_channel_prev_time_delta', 'ip_os_prev_time_delta',
       'nunique_channel_gb_ip', 'nunique_app_gb_ip_device_os',
       'nunique_hour_gb_ip_day', 'nunique_app_gb_ip', 'nunique_os_gb_ip_app',
       'nunique_device_gb_ip', 'nunique_channel_gb_app', 'cumcount_os_gb_ip',
       'cumcount_app_gb_ip_device_os', 'count_gb_ip_day_hour',
       'count_gb_ip_app', 'count_gb_ip_app_os', 'var_day_gb_ip_app_os',
       'cvr_gb_ip_day_hour', 'cvr_gb_ip_app', 'cvr_gb_ip_app_os', 'ip', 'app',
       'device', 'os', 'channel', 'hour'],
      dtype='object')
LightGBM run cv: round: 10000
[1]	test's auc: 0.903934
Training until validation scores don't improve for 20 rounds.
[2]	test's auc: 0.918299
[3]	test's auc: 0.952441
[4]	test's auc: 0.953262
[5]	test's auc: 0.954077
[6]	test's auc: 0.963225
[7]	test's auc: 0.963417
[8]	test's auc: 0.963978
[9]	test's auc: 0.9647

[281]	test's auc: 0.980642
[282]	test's auc: 0.98064
[283]	test's auc: 0.98064
[284]	test's auc: 0.980641
[285]	test's auc: 0.980648
[286]	test's auc: 0.980643
[287]	test's auc: 0.980643
[288]	test's auc: 0.980642
[289]	test's auc: 0.980647
[290]	test's auc: 0.980641
[291]	test's auc: 0.980645
[292]	test's auc: 0.980659
[293]	test's auc: 0.980652
[294]	test's auc: 0.980671
[295]	test's auc: 0.980671
[296]	test's auc: 0.980675
[297]	test's auc: 0.980674
[298]	test's auc: 0.980676
[299]	test's auc: 0.980688
[300]	test's auc: 0.980705
[301]	test's auc: 0.980704
[302]	test's auc: 0.980724
[303]	test's auc: 0.980736
[304]	test's auc: 0.980739
[305]	test's auc: 0.980729
[306]	test's auc: 0.980736
[307]	test's auc: 0.980746
[308]	test's auc: 0.98076
[309]	test's auc: 0.980752
[310]	test's auc: 0.980759
[311]	test's auc: 0.980747
[312]	test's auc: 0.980754
[313]	test's auc: 0.980754
[314]	test's auc: 0.98075
[315]	test's auc: 0.980747
[316]	test's auc: 0.980749
[317]	test's auc: 0.980753
[318]

[586]	test's auc: 0.981401
[587]	test's auc: 0.981402
[588]	test's auc: 0.981404
[589]	test's auc: 0.981406
[590]	test's auc: 0.981399
[591]	test's auc: 0.9814
[592]	test's auc: 0.981402
[593]	test's auc: 0.981406
[594]	test's auc: 0.981404
[595]	test's auc: 0.981398
[596]	test's auc: 0.981404
[597]	test's auc: 0.98141
[598]	test's auc: 0.981418
[599]	test's auc: 0.981413
[600]	test's auc: 0.981415
[601]	test's auc: 0.981424
[602]	test's auc: 0.981429
[603]	test's auc: 0.981417
[604]	test's auc: 0.981411
[605]	test's auc: 0.981409
[606]	test's auc: 0.981402
[607]	test's auc: 0.981398
[608]	test's auc: 0.981404
[609]	test's auc: 0.981405
[610]	test's auc: 0.981405
[611]	test's auc: 0.981403
[612]	test's auc: 0.981402
[613]	test's auc: 0.981406
[614]	test's auc: 0.981406
[615]	test's auc: 0.98141
[616]	test's auc: 0.98141
[617]	test's auc: 0.981411
[618]	test's auc: 0.981415
[619]	test's auc: 0.981412
[620]	test's auc: 0.981412
[621]	test's auc: 0.981423
[622]	test's auc: 0.981423
Early 

In [116]:
online_train_x.memory_usage()

Index                                       6847940240
ip_app_channel_device_os_next_time_delta     739615560
ip_os_device_next_time_delta                 739615560
ip_os_device_app_next_time_delta             739615560
ip_channel_prev_time_delta                   739615560
ip_os_prev_time_delta                        739615560
nunique_channel_gb_ip                        739615560
nunique_app_gb_ip_device_os                  739615560
nunique_hour_gb_ip_day                       739615560
nunique_app_gb_ip                            739615560
nunique_os_gb_ip_app                         739615560
nunique_device_gb_ip                         739615560
nunique_channel_gb_app                       739615560
cumcount_os_gb_ip                            739615560
cumcount_app_gb_ip_device_os                 739615560
count_gb_ip_day_hour                         739615560
count_gb_ip_app                              739615560
count_gb_ip_app_os                           739615560
var_day_gb

In [74]:
import sys

In [59]:
pred_lgb = model_cv_lgb.predict(online_test_x)

In [80]:
train_column = online_train_x.columns

In [99]:
pickle.dump(online_train_x, open("online_train_x.pickle", "wb"), protocol=4)

In [100]:
pickle.dump(online_train_y, open("online_train_y.pickle", "wb"), protocol=4)

In [101]:
pickle.dump(iterations_lgb, open("iterations_lgb.pickle", "wb"), protocol=4)

In [102]:
pickle.dump(params_lgb, open("params_lgb.pickle", "wb"), protocol=4)

In [103]:
pickle.dump(online_test_x, open("online_test_x.pickle", "wb"), protocol=4)

In [97]:
dtrain = lgb.Dataset(online_train_x, label=online_train_y, categorical_feature=['app', 'device', 'os', 'channel', 'hour'])
del dtrain

In [98]:
gc.collect()

0

In [94]:
model_lgb, pred_lgb = lgb_predict(dtrain, online_test_x, iterations_lgb, params_lgb)

MemoryError: 

In [88]:
importance_lgb = sorted(zip(online_train_x.columns, model_cv_lgb.feature_importance("gain")), key=lambda x: x[1], reverse=True)
importance_lgb = pd.DataFrame({'feature': importance_lgb})
importance_lgb = importance_lgb.apply(lambda x: pd.Series(x['feature']), axis=1)
importance_lgb.columns = ['feature', 'importance']
importance_lgb.to_csv('importance-lgb-20180507-%f(r%d).csv' % (best_score_lgb, iterations_lgb), index=False)

## reload

In [2]:
online_train_x2 = pickle.load(open("online_train_x.pickle", "rb"))

In [3]:
online_train_y2 = pickle.load(open("online_train_y.pickle", "rb"))

In [119]:
len(online_train_y2) == len(online_train_y)

True

In [4]:
iterations_lgb2 = pickle.load(open("iterations_lgb.pickle", "rb"))

In [5]:
params_lgb2 = pickle.load(open("params_lgb.pickle", "rb"))

In [6]:
online_test_x2 = pickle.load(open("online_test_x.pickle", "rb"))

In [8]:
dtrain = lgb.Dataset(online_train_x2, label=online_train_y2, categorical_feature=['app', 'device', 'os', 'channel', 'hour'])

In [11]:
model_lgb, pred_lgb = lgb_predict(dtrain, online_test_x2, iterations_lgb2, params_lgb2)

[1]	training's auc: 0.890661
[2]	training's auc: 0.926717
[3]	training's auc: 0.955742
[4]	training's auc: 0.958631
[5]	training's auc: 0.959335
[6]	training's auc: 0.959503
[7]	training's auc: 0.959542
[8]	training's auc: 0.959803
[9]	training's auc: 0.95994
[10]	training's auc: 0.96001
[11]	training's auc: 0.960017
[12]	training's auc: 0.960142
[13]	training's auc: 0.968521
[14]	training's auc: 0.96854
[15]	training's auc: 0.968633
[16]	training's auc: 0.968645
[17]	training's auc: 0.968735
[18]	training's auc: 0.968795
[19]	training's auc: 0.968817
[20]	training's auc: 0.968905
[21]	training's auc: 0.968916
[22]	training's auc: 0.969819
[23]	training's auc: 0.969864
[24]	training's auc: 0.970065
[25]	training's auc: 0.970237
[26]	training's auc: 0.970325
[27]	training's auc: 0.970521
[28]	training's auc: 0.970579
[29]	training's auc: 0.970889
[30]	training's auc: 0.970928
[31]	training's auc: 0.970821
[32]	training's auc: 0.971136
[33]	training's auc: 0.971192
[34]	training's auc: 0

[271]	training's auc: 0.982862
[272]	training's auc: 0.982879
[273]	training's auc: 0.982898
[274]	training's auc: 0.982918
[275]	training's auc: 0.982931
[276]	training's auc: 0.982946
[277]	training's auc: 0.982965
[278]	training's auc: 0.982983
[279]	training's auc: 0.983002
[280]	training's auc: 0.983021
[281]	training's auc: 0.983039
[282]	training's auc: 0.983047
[283]	training's auc: 0.983066
[284]	training's auc: 0.983082
[285]	training's auc: 0.9831
[286]	training's auc: 0.983121
[287]	training's auc: 0.983141
[288]	training's auc: 0.983161
[289]	training's auc: 0.983174
[290]	training's auc: 0.983183
[291]	training's auc: 0.98319
[292]	training's auc: 0.983204
[293]	training's auc: 0.983219
[294]	training's auc: 0.983232
[295]	training's auc: 0.983243
[296]	training's auc: 0.983254
[297]	training's auc: 0.983267
[298]	training's auc: 0.983284
[299]	training's auc: 0.983297
[300]	training's auc: 0.983312
[301]	training's auc: 0.983326
[302]	training's auc: 0.983344
[303]	train

[537]	training's auc: 0.984815
[538]	training's auc: 0.984816
[539]	training's auc: 0.98482
[540]	training's auc: 0.984823
[541]	training's auc: 0.984828
[542]	training's auc: 0.98483
[543]	training's auc: 0.984831
[544]	training's auc: 0.984832
[545]	training's auc: 0.984835
[546]	training's auc: 0.98484
[547]	training's auc: 0.984844
[548]	training's auc: 0.98485
[549]	training's auc: 0.984851
[550]	training's auc: 0.984853
[551]	training's auc: 0.984856
[552]	training's auc: 0.98486
[553]	training's auc: 0.984863
[554]	training's auc: 0.984869
[555]	training's auc: 0.984872
[556]	training's auc: 0.984876
[557]	training's auc: 0.984878
[558]	training's auc: 0.984882
[559]	training's auc: 0.984885
[560]	training's auc: 0.98489
[561]	training's auc: 0.984891
[562]	training's auc: 0.984892
[563]	training's auc: 0.984895
[564]	training's auc: 0.984902
[565]	training's auc: 0.984906
[566]	training's auc: 0.98491
[567]	training's auc: 0.98491
[568]	training's auc: 0.984911
[569]	training's

In [14]:
importance_lgb = sorted(zip(online_train_x2.columns, model_lgb.feature_importance("gain")), key=lambda x: x[1], reverse=True)
importance_lgb = pd.DataFrame({'feature': importance_lgb})
importance_lgb = importance_lgb.apply(lambda x: pd.Series(x['feature']), axis=1)
importance_lgb.columns = ['feature', 'importance']
importance_lgb.to_csv('importance-lgb-whole_model.csv', index=False)

In [17]:
res_lgb = store_result(pd.read_csv('./test.csv', header=0, sep=',', usecols=['click_id']).click_id.astype(int), pred_lgb, 'prediction_chinese.csv')


In [18]:
model_lgb.save_model("model_chinese.h5")
