In [1]:
%load_ext autoreload
%autoreload 2
import datetime
import gc
import os
import sys
from glob import glob
from itertools import combinations

import lightgbm as lgb
import numpy as np
import pandas as pd
from joblib import delayed, Parallel
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm

from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature, get_factorize_feature
from func.parallel_utils import get_parallel_arg_list
gc.enable()

In [2]:
# Key columns of the transaction tables.
COLUMN_ID = 'TransactionID'
# Must be defined: COLUMNS_IGNORE and the path filter below both reference it.
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
# Columns that are carried along but skipped by the label-encoding loop.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded when its path contains at least one of these substrings.
PATH_KEYWORDS = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET,
                 'time_zone', 'fill__cnt', 'bin_', 'ProductCD')


def _select_feature_paths(paths, keywords=PATH_KEYWORDS):
    """Return the entries of `paths` (order preserved) that contain any of `keywords`."""
    return [path for path in paths if any(key in path for key in keywords)]


# The same filter is applied to the train and the test feature files.
train_paths = _select_feature_paths(glob('../feature/eda_base/*_train.gz'))
test_paths = _select_feature_paths(glob('../feature/eda_base/*_test.gz'))

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)

In [21]:
# Preview the first rows of the loaded training frame.
df_train.head()

Unnamed: 0,fill__cnt_V306,bin__C7,fill__P_emaildomain_bin,fill__cnt_V87,fill__cnt_V78,fill__cnt_V165,fill__cnt_C5,fill__cnt_addr2,fill__cnt_V314,fill__cnt_V156,...,fill__cnt_V317,fill__cnt_V281,fill__cnt_C8,fill__cnt_C6,fill__cnt_card2,fill__cnt_V265,fill__cnt_V54,fill__cnt_V37,fill__cnt_V67,ProductCD
0,244,1,1,31,36,110,12,31,273,16,...,113,3,6,12,89,98,8,10,11,W
1,244,1,7,31,36,110,12,31,273,16,...,194,3,6,12,286,98,7,28,11,W
2,244,1,8,31,36,110,12,31,273,16,...,194,3,6,12,326,98,8,28,11,W
3,103,1,11,31,36,110,12,31,273,16,...,139,3,6,10,13,98,8,28,11,W
4,244,1,7,3,2,106,12,31,273,11,...,194,3,4,12,169,25,9,10,10,H


In [None]:
# A category value is kept only if it occurs in more than this many train rows ...
MIN_TRAIN_COUNT = 20
# ... and the train share of its rows (train / (train + test)) lies strictly inside this range.
MIN_TRAIN_SHARE, MAX_TRAIN_SHARE = 0.05, 0.95

use_cols = [col for col in df_train.columns if col not in COLUMNS_IGNORE]
print('Transform all features to category.\n')
for usecol in use_cols:

    df_train[usecol] = df_train[usecol].astype('str')
    df_test[usecol] = df_test[usecol].astype('str')

    # Fit the LabelEncoder on the union of train and test values so both frames can be transformed.
    le = LabelEncoder().fit(
        np.unique(df_train[usecol].unique().tolist() +
                  df_test[usecol].unique().tolist()))

    # Codes start at 1; 0 is reserved for dropped values.
    df_train[usecol] = le.transform(df_train[usecol]) + 1
    df_test[usecol] = le.transform(df_test[usecol]) + 1

    # Rows per code in each frame. Codes that never occur in train can never pass the
    # train-count threshold, so only train codes need to be considered.
    cnt_train = df_train.groupby(usecol)[COLUMN_ID].count()
    cnt_test = (df_test.groupby(usecol)[COLUMN_ID].count()
                .reindex(cnt_train.index, fill_value=0))
    train_share = cnt_train / (cnt_train + cnt_test)

    # Keep sufficiently frequent codes whose train/test split is not too unbalanced.
    keep_mask = ((cnt_train > MIN_TRAIN_COUNT)
                 & (train_share > MIN_TRAIN_SHARE)
                 & (train_share < MAX_TRAIN_SHARE))
    kept_codes = cnt_train.index[keep_mask]

    # Replace every dropped code by 0. `where` works row by row on the column itself,
    # so the result does not depend on the frame having a default RangeIndex.
    df_train[usecol] = (df_train[usecol]
                        .where(df_train[usecol].isin(kept_codes), 0)
                        .astype('int').astype('category'))
    df_test[usecol] = (df_test[usecol]
                       .where(df_test[usecol].isin(kept_codes), 0)
                       .astype('int').astype('category'))

    # Release per-column temporaries before moving to the next column.
    del le, cnt_train, cnt_test, train_share, keep_mask, kept_codes
    gc.collect()
          
# Keep the labels and the row indices before the bookkeeping columns are removed.
y_train = np.array(df_train[COLUMN_TARGET])
train_ids = df_train.index
test_ids = df_test.index

# Remove the id/target columns in place. The test frame may not carry a target column,
# so each column is only deleted when it is actually present.
for frame in (df_train, df_test):
    for column in (COLUMN_TARGET, COLUMN_ID):
        if column in frame.columns:
            del frame[column]
del frame
# NOTE(review): COLUMN_DT is in COLUMNS_IGNORE but is not removed here, so if it was loaded
# it is still present when the frames are one-hot encoded further down — confirm this is intended.
gc.collect()

# Informational banner shown before the sparse one-hot encoding; one message per output line.
notices = (
    "If you don't want use Sparse Matrix choose Kernel Version 2 to get simple solution.\n",
    '--------------------------------------------------------------------------------------------------------',
    'Transform Data to Sparse Matrix.',
    'Sparse Matrix can be used to fit a lot of models, eg. XGBoost, LightGBM, Random Forest, K-Means and etc.',
    'To concatenate Sparse Matrices by column use hstack()',
    'Read more about Sparse Matrix https://docs.scipy.org/doc/scipy/reference/sparse.html',
    'Good Luck!',
    '--------------------------------------------------------------------------------------------------------',
)
for notice in notices:
    print(notice)


# Fit OneHotEncoder on train and test stacked together so both are encoded
# with the same set of output columns.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
# Sparse output is the encoder's default. The explicit `sparse=True` keyword is left out
# because it was renamed to `sparse_output` in scikit-learn 1.2 and later removed,
# so omitting it works on both old and new versions.
ohe = OneHotEncoder(categories='auto', dtype='uint8').fit(data)

In [20]:
# Transform data using small groups of rows to reduce memory usage.
m = 100000

# Encode the stacked train+test frame chunk by chunk with the fitted encoder.
# Stepping with range(0, n, m) never produces an empty trailing chunk, even when the
# row count is an exact multiple of m; format='csr' guarantees the result can be row-sliced.
ohe_all = vstack([ohe.transform(data[start:start + m])
                  for start in range(0, data.shape[0], m)],
                 format='csr')

# `data` was built as train rows followed by test rows, so split at len(df_train).
ohe_train = ohe_all[:len(df_train)]
ohe_test = ohe_all[len(df_train):]
save_npz('610_train.npz', ohe_train, compressed=True)
save_npz('610_test.npz',  ohe_test,  compressed=True)

del ohe, ohe_all, ohe_train, ohe_test, data
gc.collect()

1229

In [None]:
# 5-fold stratified cross-validation; each fold trains one LightGBM model and adds its
# test-set probabilities to `lgb_test_result`. `counter` ends up holding the number of
# fitted folds; the accumulated sum is NOT averaged in this cell.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_test_result = np.zeros(test_ids.shape[0])
counter = 0

print('\nLightGBM\n')

for train_index, test_index in skf.split(train_ids, y_train):

    print('Fold {}\n'.format(counter + 1))

    # Load the matrices written by the encoding step ('610_train.npz' / '610_test.npz').
    # The train matrix is re-read every fold and released before fitting to limit peak memory.
    train = load_npz('610_train.npz')
    # Row-select in chunks of `m` indices; range(0, n, m) avoids an empty trailing chunk.
    X_fit = vstack([train[train_index[start:start + m]]
                    for start in range(0, train_index.shape[0], m)])
    X_val = vstack([train[test_index[start:start + m]]
                    for start in range(0, test_index.shape[0], m)])
    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]

    del train
    gc.collect()

    lgb_model = lgb.LGBMClassifier(max_depth=-1,
                                   n_estimators=30000,
                                   learning_rate=0.05,
                                   num_leaves=2**12-1,
                                   colsample_bytree=0.28,
                                   objective='binary',
                                   n_jobs=-1)

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords were removed in
    # LightGBM 4.0 in favour of callbacks; kept as-is to match the version this notebook
    # was written for — confirm the installed version.
    lgb_model.fit(X_fit, y_fit, eval_metric='auc',
                  eval_set=[(X_val, y_val)],
                  verbose=100, early_stopping_rounds=100)

    del X_fit, X_val, y_fit, y_val, train_index, test_index
    gc.collect()

    # Accumulate this fold's positive-class probabilities for the test set.
    test = load_npz('610_test.npz')
    test = csr_matrix(test, dtype='float32')
    lgb_test_result += lgb_model.predict_proba(test)[:,1]
    counter += 1

    del test
    gc.collect()