In [1]:
import datetime
import numpy as np
import pandas as pd
from glob import glob

import warnings
warnings.simplefilter("ignore")

In [2]:
glob('./data2/*')

['./data2/login.csv',
 './data2/purchase_detail.csv',
 './data2/submission.csv',
 './data2/user_info.csv',
 './data2/user_label_train.csv']

# load data

In [3]:
df_login = pd.read_csv('./data2/login.csv')
df_user_info = pd.read_csv('./data2/user_info.csv')
df_submission = pd.read_csv('./data2/submission.csv')
df_user_label_train = pd.read_csv('./data2/user_label_train.csv')
df_purchase_detail = pd.read_csv('./data2/purchase_detail.csv')

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
for _df in [df_login,df_user_info,df_submission,df_user_label_train,df_purchase_detail]:
    _df = reduce_mem_usage(_df)

Mem. usage decreased to 628.64 Mb (45.8% reduction)
Mem. usage decreased to  8.14 Mb (57.5% reduction)
Mem. usage decreased to  0.29 Mb (50.0% reduction)
Mem. usage decreased to  2.04 Mb (68.7% reduction)
Mem. usage decreased to 141.21 Mb (52.5% reduction)


# feature engineer

## df_user_info

In [6]:
df_user_info['lifetime'] = pd.to_datetime('2020-07-31') - pd.to_datetime(df_user_info['enroll_time'])
df_user_info['lifetime'] = df_user_info['lifetime'].astype('timedelta64[D]')
df_user_info['age'] = 2020 - df_user_info['birth_year']

## login

In [7]:
df_login_feature = df_login.groupby('userid').agg({
    'login_times': ['sum', 'min', 'max', 'std', 'mean','median'],
    'date': ['count']
})
df_login_feature.columns = ["_".join(x) for x in df_login_feature.columns.ravel()]

In [8]:
df = pd.merge(df_user_info, df_login_feature, on='userid', how='inner')

##  purchase_detail

In [9]:
df_purchase_detail['month'] = df_purchase_detail['grass_date'].str[5:7].astype('int')

In [10]:
df_purchase_detail['grass_date'] = pd.to_datetime(df_purchase_detail.grass_date)
df_purchase_detail = df_purchase_detail.sort_values(['userid','grass_date'], ascending=[1,1])

In [11]:
df_purchase_detail['epochtime'] = (
    df_purchase_detail['grass_date'] - datetime.datetime(1970, 1, 1)).dt.total_seconds()
df_purchase_detail['amount_per_order'] = df_purchase_detail['total_amount'] / df_purchase_detail['order_count']

In [12]:
df_dt = df_purchase_detail[['userid','grass_date']].drop_duplicates()
df_dt['dt_diff'] = df_dt.groupby('userid')['grass_date'].diff().astype('timedelta64[D]')

In [13]:
df_purchase_dt_diff = df_dt.groupby('userid').agg({
    'dt_diff':['sum', 'min', 'max', 'std','mean']
})
df_purchase_dt_diff.columns = ["_".join(x) for x in df_purchase_dt_diff.columns.ravel()]

In [14]:
cat_map = df_purchase_detail['category_encoded'].value_counts(normalize=True)
df_purchase_detail['cat_encode'] = df_purchase_detail['category_encoded'].map(cat_map)

In [15]:
df_purchase_detail_ordersum = pd.crosstab(index=df_purchase_detail['userid'],
    columns=df_purchase_detail['category_encoded'],
    values=df_purchase_detail['order_count'],
    aggfunc='sum',
    margins=False).fillna(0)

df_purchase_detail_ordersum.columns = [f'ordercount_{x}' for x in df_purchase_detail_ordersum.columns]

In [16]:
df_purchase_detail_totalrsum = pd.crosstab(index=df_purchase_detail['userid'],
    columns=df_purchase_detail['category_encoded'],
    values=df_purchase_detail['total_amount'],
    aggfunc='sum',
    margins=False).fillna(0)

df_purchase_detail_totalrsum.columns = [f'totalcount_{x}' for x in df_purchase_detail_totalrsum.columns]

In [17]:
df_purchase_detail_peruserid_count = pd.crosstab(index=df_purchase_detail['userid'],
    columns=df_purchase_detail['category_encoded'],
    values=df_purchase_detail['order_count'],
    aggfunc='count',
    margins=False).fillna(0)

df_purchase_detail_peruserid_count.columns = [f'count_{x}' for x in df_purchase_detail_peruserid_count.columns]

In [18]:
df_purchase_detail_monthamount = pd.crosstab(index=df_purchase_detail['userid'],
    columns=df_purchase_detail['month'],
    values=df_purchase_detail['total_amount'],
    aggfunc='sum',
    margins=False).fillna(0)

df_purchase_detail_monthamount.columns = [f'monthamount_{x}' for x in df_purchase_detail_monthamount.columns]

In [19]:
df_purchase_detail_monthorder = pd.crosstab(index=df_purchase_detail['userid'],
    columns=df_purchase_detail['month'],
    values=df_purchase_detail['order_count'],
    aggfunc='sum',
    margins=False).fillna(0)

df_purchase_detail_monthorder.columns = [f'monthorder_{x}' for x in df_purchase_detail_monthorder.columns]

In [20]:
df_purchase_detail_monthratio = pd.crosstab(index=df_purchase_detail['userid'],
    columns=df_purchase_detail['month'],
    values=df_purchase_detail['amount_per_order'],
    aggfunc='mean',
    margins=False).fillna(0)

df_purchase_detail_monthratio.columns = [f'monthratio_{x}' for x in df_purchase_detail_monthratio.columns]

In [21]:
df_purchase_detail_peruserid = df_purchase_detail.groupby('userid').agg({
    'order_count':['count','mean','std','median','sum','max','min'],
    'total_amount':['count','mean','std','median','sum','max','min'],
    'cat_encode':['sum','mean','std','max','min'],
    'amount_per_order':['sum', 'min', 'max', 'std','mean'],
    'epochtime':['count','mean','std','median','sum','max','min'],
})
df_purchase_detail_peruserid.columns = [f'purchase_detail_{x[0]}_{x[1]}' for x in df_purchase_detail_peruserid.columns]

In [22]:
def normalization(_df):
    return (_df - _df.mean())/_df.std()

In [23]:
df_purchase_detail_peruserid = normalization(df_purchase_detail_peruserid)

In [24]:
df = pd.merge(df, df_purchase_dt_diff, on='userid', how='inner')
df = pd.merge(df, df_purchase_detail_ordersum, on='userid', how='inner')
df = pd.merge(df, df_purchase_detail_totalrsum, on='userid', how='inner')
df = pd.merge(df, df_purchase_detail_peruserid, on='userid', how='inner')
df = pd.merge(df, df_purchase_detail_peruserid_count, on='userid', how='inner')

df = pd.merge(df, df_purchase_detail_monthamount, on='userid', how='inner')
df = pd.merge(df, df_purchase_detail_monthorder, on='userid', how='inner')
df = pd.merge(df, df_purchase_detail_monthratio, on='userid', how='inner')

In [25]:
df = pd.merge(df, df_user_label_train, on='userid', how='outer')

In [26]:
df = reduce_mem_usage(df)

Mem. usage decreased to 153.73 Mb (70.0% reduction)


# model

In [27]:
x = [x for x in df.columns if x not in ['userid','birth_year','label','enroll_time']]
y = 'label'

In [58]:
from sklearn.model_selection import GridSearchCV,cross_val_score,train_test_split

df['gender'] = (df['gender'].fillna(-1)).astype('int')
df['is_seller'] = (df['is_seller'].fillna(-1)).astype('int')

X_train, X_valid, y_train, y_valid = train_test_split(
    df[df.label.notnull()][x], df[df.label.notnull()][y], test_size=0.2, random_state=2048)

X_all, y_all = df[df.label.notnull()][x].values , df[df.label.notnull()][y].values.astype('int')

In [59]:
X_test = df[df.label.isnull()][x].values

In [30]:
len(x)

134

## xgb

In [31]:
import gc
from bayes_opt import BayesianOptimization
from xgboost import XGBClassifier

n_folds = 5

def xgb_eval(gamma, max_depth,min_child_weight,scale_pos_weight,#num_leaves
             colsample_bytree,subsample):
    fit_params={'early_stopping_rounds': 100, 
                'verbose': False,
                'eval_set':[[X_valid,y_valid]]}
    clf = XGBClassifier(learning_rate=0.03,
                        n_estimators=1000,
                        tree_method='gpu_hist',
                        gpu_id='1',
                        max_depth=int(max_depth),
                        #num_leaves = int(num_leaves),
                        gamma = gamma,
                        scale_pos_weight=scale_pos_weight,
                        min_child_weight=min_child_weight, 
                        subsample=subsample, 
                        colsample_bytree=colsample_bytree,
                        #eval_metric='auc',
                        #verbose = 0,
                        n_jobs=10,
                        seed=666)
    cv_result = cross_val_score(estimator=clf,
                                X=X_train,
                                y=y_train,
                                scoring='roc_auc',
                                cv=n_folds, 
                                fit_params = fit_params,
                                verbose=0, 
                                n_jobs=1)    
    return cv_result.mean()

xgbBO = BayesianOptimization(xgb_eval, {'gamma':(0,1),
                                        #'num_leaves': (24, 45),
                                        'scale_pos_weight':(1,3),
                                        'max_depth': (3, 20),
                                        'min_child_weight': (2, 20),
                                        'colsample_bytree':(0.3,0.9),
                                        'subsample':(0.3,0.9)
                                        }, 
                             random_state=0)

In [32]:
print('Bayesian Optimization Start')
xgbBO.maximize(init_points=5 , n_iter=10)
print('Bayesian Optimization End')
print(xgbBO.max)

Bayesian Optimization Start
|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | scale_... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6918  [0m | [0m 0.6293  [0m | [0m 0.7152  [0m | [0m 13.25   [0m | [0m 11.81   [0m | [0m 1.847   [0m | [0m 0.6875  [0m |
| [0m 2       [0m | [0m 0.6829  [0m | [0m 0.5626  [0m | [0m 0.8918  [0m | [0m 19.38   [0m | [0m 8.902   [0m | [0m 2.583   [0m | [0m 0.6173  [0m |
| [95m 3       [0m | [95m 0.6992  [0m | [95m 0.6408  [0m | [95m 0.9256  [0m | [95m 4.208   [0m | [95m 3.568   [0m | [95m 1.04    [0m | [95m 0.7996  [0m |
| [0m 4       [0m | [0m 0.686   [0m | [0m 0.7669  [0m | [0m 0.87    [0m | [0m 19.64   [0m | [0m 16.38   [0m | [0m 1.923   [0m | [0m 0.7683  [0m |
| [0m 5       [0m | [0m 0.6943  [0m | [0m 0.371   [0m | [0m 0.6399  [0m | [0m 5.437   [0m | [0m 19.0    [0m 

In [33]:
model = XGBClassifier(learning_rate=0.03,
                      n_estimators=1000,
                      tree_method='gpu_hist',
                      gpu_id='1',
                      max_depth=int(xgbBO.max['params']['max_depth']),
                      gamma=xgbBO.max['params']['gamma'],
                      min_child_weight=xgbBO.max['params']['min_child_weight'],
                      subsample=xgbBO.max['params']['subsample'],
                      colsample_bytree=xgbBO.max['params']['colsample_bytree'],
                      n_jobs=10)

In [34]:
model.fit(X_all,y_all)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=1.0, gpu_id='1',
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=6,
              min_child_weight=12.238617986542883, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=1000, n_jobs=10, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

## lgb

In [35]:
import lightgbm as lgb
lgb_data = lgb.Dataset(data=X_all, label= y_all)
n_folds, random_seed = 5, 666

def lgb_eval(feature_fraction, bagging_fraction, max_depth, scale_pos_weight, #subsample_for_bin,num_leaves
             lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    params = {'objective': 'binary',
              'num_iterations': 1000,
              'learning_rate': 0.03,
              'metric':'auc',
              'boosting':'dart', # magic parameter
              'max_bin':256,     # magic parameter
              'n_jobs': 75}
    #params["subsample_for_bin"] = int(subsample_for_bin)
    #params["num_leaves"] = int(num_leaves)
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(max_depth)
    params['lambda_l1'] = max(lambda_l1, 0)
    params['lambda_l2'] = max(lambda_l2, 0)
    params['scale_pos_weight'] = scale_pos_weight
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    cv_result = lgb.cv(params, lgb_data, nfold=n_folds, early_stopping_rounds=100,
                       seed=random_seed, stratified=True)
    return max(cv_result['auc-mean'])

lgbBO = BayesianOptimization(lgb_eval, {#'num_leaves': (24, 60),
                                        'feature_fraction': (0.1, 0.9),
                                        'bagging_fraction': (0.5, 1),
                                        'max_depth': (4, 15),
                                        'scale_pos_weight':(1,2),
                                        #'subsample_for_bin': (3000, 6000),
                                        'lambda_l1': (0, 1),
                                        'lambda_l2': (0, 1),
                                        'min_split_gain': (0.001, 0.5),
                                        'min_child_weight': (5, 50)}, random_state=0)

In [36]:
print('Bayesian Optimzation Start')
lgbBO.maximize(init_points=5, n_iter=10)
print('Bayesian Optimzation End')

Bayesian Optimzation Start
|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | scale_... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6994  [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.6028  [0m | [0m 0.5449  [0m | [0m 8.66    [0m | [0m 34.07   [0m | [0m 0.2194  [0m | [0m 1.892   [0m |
| [0m 2       [0m | [0m 0.6993  [0m | [0m 0.9818  [0m | [0m 0.4068  [0m | [0m 0.7917  [0m | [0m 0.5289  [0m | [0m 10.25   [0m | [0m 46.65   [0m | [0m 0.03645 [0m | [0m 1.087   [0m |
| [95m 3       [0m | [95m 0.6994  [0m | [95m 0.5101  [0m | [95m 0.7661  [0m | [95m 0.7782  [0m | [95m 0.87    [0m | [95m 14.76   [0m | [95m 40.96   [0m | [95m 0.2313  [0m | [95m 1.781   [0m |
| [95m 4       [0m | [95m 0.6995  [0m | [95m 0.5591  [0m | [95m 0.6119  [0m | [95m 0.1434  [0m | [95m 

In [37]:
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(learning_rate=0.03,
                           num_iterations=1000,
                           bagging_fraction=lgbBO.max['params']['bagging_fraction'],
                           feature_fraction=lgbBO.max['params']['feature_fraction'],
                           lambda_l1=lgbBO.max['params']['lambda_l1'],
                           lambda_l2=lgbBO.max['params']['lambda_l2'],
                           max_depth=int(lgbBO.max['params']['max_depth']),
                           min_child_weight=lgbBO.max['params']['min_child_weight'],
                           min_split_gain=lgbBO.max['params']['min_split_gain'],
                           #num_leaves=int(lgbBO.max['params']['num_leaves']),
                           objective='binary',
                           n_jobs=50,
                           seed=666)

lgb_model.fit(X_all,y_all)

LGBMClassifier(bagging_fraction=0.9479309414899773,
               feature_fraction=0.5726969041872401,
               lambda_l1=0.13744679824568462, lambda_l2=0.531871524132905,
               learning_rate=0.03, max_depth=11,
               min_child_weight=26.97691050825829,
               min_split_gain=0.03258088585236473, n_jobs=50,
               num_iterations=1000, objective='binary', seed=666)

## cat

In [60]:
from catboost import CatBoostClassifier
cat_idx = [0,1]
cat_model = CatBoostClassifier(eval_metric='AUC', 
                               random_seed=42,
                               learning_rate=0.02,
                               depth=7,
                               l2_leaf_reg=40,
                               iterations=1500,
                               bootstrap_type='Bernoulli',
                               subsample=0.8,
                               task_type = "GPU",
                               devices="1",
                               verbose=False,
                               scale_pos_weight=280944/145888)
#cat_model.fit(X_train,y_train,eval_set=(X_valid,y_valid),
#              cat_features=cat_idx,verbose=False)

cat_model.fit(X_all, y_all,verbose_eval=False)

<catboost.core.CatBoostClassifier at 0x7f0cf158bfd0>

## stack

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
estimators = [
    ('xgb', model),
    ('lgb', lgb_model),
    ('cat', cat_model)
]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(), cv=5
)

In [50]:
clf.fit(X_all, y_all)

StackingClassifier(cv=5,
                   estimators=[('xgb',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=0.9, gamma=1.0,
                                              gpu_id='1',
                                              importance_type='gain',
                                              interaction_constraints='',
                                              learning_rate=0.03,
                                              max_delta_step=0, max_depth=6,
                                              min_child_weight=12.238617986542883,
                                              missing=nan,
                                              monotone_constraints='(0,0,0,0,0,0...
                                               feature_fraction=0.5726969

# mixture

In [51]:
xgb_pred = model.predict_proba(X_test)[:,1].reshape(-1)

In [52]:
lgb_pred = lgb_model.predict_proba(X_test)[:,1].reshape(-1)

In [61]:
cat_pred = cat_model.predict_proba(X_test)[:,1].reshape(-1)

In [62]:
stack_pred = clf.predict_proba(X_test)[:,0].reshape(-1)

In [63]:
def output_generate(y_pred):
    submit1 = pd.DataFrame({
        'userid': df[df.label.isnull()]['userid'].values,
        'label':y_pred
    })
    final = df_submission.merge(submit1)
    return final
df_submission = pd.read_csv('./data2/submission.csv')

In [64]:
output_generate(lgb_pred).to_csv('output_lgb1.csv',index=0)

In [65]:
output_generate(xgb_pred).to_csv('output_xgb1.csv',index=0)

In [66]:
output_generate(stack_pred).to_csv('output_stack.csv',index=0)

In [67]:
output_generate(cat_pred).to_csv('output_cat1.csv',index=0)

In [68]:
mix = (xgb_pred+lgb_pred+stack_pred+cat_pred)/4

In [69]:
output_generate(mix).to_csv('output_mix4.csv',index=0)