In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook
from tqdm import tqdm
import matplotlib.pylab as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import math
warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed the random sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    :param seed: value applied to every generator  # type: int
    """
    hash_seed = str(seed)
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    # start-up, so this likely affects child processes only -- confirm.
    os.environ.update(PYTHONHASHSEED=hash_seed)
    np.random.seed(seed)
    
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32 (falling back to float64). A dtype is accepted when
    the column's min and max lie inside its representable range, bounds
    included. Only the range is checked, so float columns can lose precision
    when narrowed.

    :param df: frame whose columns are re-assigned in place  # type: pd.DataFrame
    :param verbose: print the resulting memory footprint     # type: bool
    :return: the same ``df`` object, for chaining
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose max is exactly 127 fits int8.
            # If min/max are NaN (empty column) nothing matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # All-NaN, infinite or out-of-range columns stay at full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
########################### Model
import lightgbm as lgb

def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=5):
    """Train LightGBM with grouped K-fold CV and average the test predictions.

    Folds come from ``GroupKFold`` using ``tr_df['DT_M']`` as the group label
    (presumably a month index -- confirm against the feature pipeline). One
    booster is trained per fold; its prediction on ``tt_df`` contributes
    ``1/NFOLDS`` to the final score and its prediction on the validation rows
    fills the out-of-fold vector.

    :param tr_df: training frame; needs ``features_columns``, ``target``, 'DT_M'
    :param tt_df: test frame; needs ``features_columns``, ``target``, 'TransactionID'
    :param features_columns: feature column names fed to the model
    :param target: name of the label column
    :param lgb_params: parameter dict passed to ``lgb.train``
    :param NFOLDS: number of group folds
    :return: copy of ``tt_df[['TransactionID', target]]`` plus a 'prediction' column

    Reads the module-level ``LOCAL_TEST`` flag to decide whether the holdout
    AUC is printed.
    """
    folds = GroupKFold(n_splits=NFOLDS)

    X, y = tr_df[features_columns], tr_df[target]
    P = tt_df[features_columns]
    split_groups = tr_df['DT_M']

    # Copy so that adding 'prediction' below writes to an independent frame
    # instead of a slice of the caller's data.
    tt_df = tt_df[['TransactionID', target]].copy()
    predictions = np.zeros(len(tt_df))
    oof = np.zeros(len(tr_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        print('Fold:', fold_)
        # split() yields positional indices, so select with .iloc; label-based
        # y[idx] is only correct when the index happens to be 0..n-1.
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print(len(tr_x), len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)
        vl_data = lgb.Dataset(vl_x, label=vl_y)

        # NOTE(review): `verbose_eval` was reportedly removed from lgb.train in
        # LightGBM 4.0 (replaced by callbacks) -- confirm the pinned version.
        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets=[tr_data, vl_data],
            verbose_eval=200,
        )

        predictions += estimator.predict(P) / NFOLDS

        # Out-of-fold scores are min-max scaled within each fold before being
        # pooled for the overall AUC.
        oof_preds = estimator.predict(vl_x)
        low, high = oof_preds.min(), oof_preds.max()
        if high > low:
            oof[val_idx] = (oof_preds - low) / (high - low)
        else:
            # Constant predictions: avoid 0/0 producing NaNs in the AUC input.
            oof[val_idx] = 0.0

        feature_imp = pd.DataFrame(
            sorted(zip(estimator.feature_importance(), X.columns)),
            columns=['Value', 'Feature'],
        )
        print(feature_imp)

        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()

    tt_df['prediction'] = predictions
    print('OOF AUC:', metrics.roc_auc_score(y, oof))
    if LOCAL_TEST:
        # Use the `target` argument rather than the global TARGET constant.
        print('Holdout AUC:', metrics.roc_auc_score(tt_df[target], tt_df['prediction']))

    return tt_df
## -------------------

In [4]:
########################### Vars
#################################################################################
# Label column of the training data.
TARGET = 'isFraud'
# Selects which input pickles are loaded and whether the holdout AUC is printed.
LOCAL_TEST = False
# Reference date (not used in the cells shown here).
START_DATE = datetime.datetime(2017, 11, 30)

# Seed every random source once, up front.
SEED = 42
seed_everything(SEED)

In [5]:
########################### Model params
lgb_params = {
    # task
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'n_jobs': -1,
    # learning schedule (overridden again in the training cell)
    'learning_rate': 0.01,
    # tree shape
    'num_leaves': 256,
    'max_depth': -1,
    'tree_learner': 'serial',
    # column / row subsampling
    'colsample_bytree': 0.5,
    'subsample_freq': 1,
    'subsample': 0.7,
    # boosting length and binning
    'n_estimators': 800,
    'max_bin': 255,
    'verbose': -1,
    'seed': SEED,
    'early_stopping_rounds': 100,
}

In [6]:
########################### DATA LOAD
#################################################################################
print('Load Data')

# The full-data pickles are used unless LOCAL_TEST is switched on.
if not LOCAL_TEST:
    train_df = pd.read_pickle('../input/ieee-fe-with-some-eda/train_df.pkl')
    test_df = pd.read_pickle('../input/ieee-fe-with-some-eda/test_df.pkl')
else:
    train_df = pd.read_pickle('../input/ieee-fe-for-local-test/train_df.pkl')
    test_df = pd.read_pickle('../input/ieee-fe-for-local-test/test_df.pkl')

# Columns that must never reach the model, stored alongside the full-data pickles.
remove_features = list(
    pd.read_pickle('../input/ieee-fe-with-some-eda/remove_features.pkl')['features_to_remove'].values
)
print('Shape control:', train_df.shape, test_df.shape)

Load Data
Shape control: (590540, 791) (506691, 791)


In [7]:
# fe with amount (user id)
def _join_as_str(df, columns):
    """Return the row-wise '_'-joined string form of ``columns`` of ``df``."""
    joined = df[columns[0]].astype(str)
    for name in columns[1:]:
        joined = joined + '_' + df[name].astype(str)
    return joined


def fe_uid(train_df, test_df, rm_features):
    """Add a 'cents' column and 70 string grouping keys ('uid_0'..'uid_69').

    Both frames are modified in place. ``uid_0``..``uid_9`` are built from raw
    columns; ``uid_10``..``uid_39`` extend ``uid_9`` and ``uid_40``..``uid_69``
    extend ``uid_8`` with the same 30 column groups.

    :param train_df: training frame (mutated)
    :param test_df: test frame (mutated)
    :param rm_features: list of columns excluded from modelling
    :return: (train_df, test_df, rm_features + uid names, uid_list, cid_list)
             where ``cid_list`` is always empty
    """
    print('==> processing uid...')

    cards = ['card1', 'card2', 'card3', 'card5']
    addrs = ['addr1', 'addr2']
    emails = ['P_emaildomain', 'R_emaildomain']

    # Position i defines uid_i.
    base_keys = [
        ['cents', 'ProductCD'],
        ['ProductCD'] + addrs + ['id_19', 'id_20'],
        ['ProductCD', 'card1', 'card2'],
        ['ProductCD'] + cards,
        ['ProductCD'] + addrs,
        ['ProductCD'] + cards + addrs,
        ['ProductCD'] + emails,
        ['ProductCD'] + cards + emails,
        ['ProductCD'] + addrs + emails,
        ['ProductCD'] + cards + addrs + emails,
    ]

    # Position k defines uid_(10+k) on top of uid_9 and uid_(40+k) on top of uid_8.
    extra_keys = [
        ['id_19', 'id_20'],
        ['C1', 'C2'],
        ['D1', 'D2'],
        # A dist1/dist2 key used to be written to this slot and was overwritten
        # on the very next line; only the surviving definition is built.
        ['D10', 'D11', 'D15'],
        ['C5', 'C6'],
        ['id_30', 'id_31'],
        # NOTE(review): identical to the C5/C6 entry two rows up, so uid_16
        # duplicates uid_14 (and uid_46 duplicates uid_44). Kept so the
        # generated feature set is unchanged -- confirm the intended columns.
        ['C5', 'C6'],
        ['C9', 'C11'],
        ['C13', 'C14'],
        ['D3', 'D4', 'D5'],
        ['dist1'],
        ['dist2'],
        ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9'],
        ['V129', 'V130', 'V131'],
        ['V310', 'V311', 'V312', 'V313', 'V314', 'V315'],
        ['C4', 'C6', 'C8', 'C10'],
        ['D8', 'D9'],
        ['DeviceInfo', 'DeviceType'],
        ['id_01', 'id_02'],
        ['id_05', 'id_06', 'id_07', 'id_08'],
        ['id_13', 'id_14', 'id_15', 'id_16'],
        ['id_30', 'id_31', 'id_33'],
        ['TransactionAmt'],
        ['id_09', 'id_10', 'id_11'],
        ['DT_M'],
        ['DT_W'],
        ['DT_D'],
        ['DT_hour'],
        ['DT_day_week'],
        ['DT_day_month'],
    ]

    for df in [train_df, test_df]:
        # Fractional part of the amount. Vectorised and NaN-safe; float64
        # matches what the previous per-row `x - int(x)` produced.
        amount = df['TransactionAmt'].astype('float64')
        df['cents'] = amount - np.trunc(amount)

        for idx, columns in enumerate(base_keys):
            df['uid_' + str(idx)] = _join_as_str(df, columns)

        for base_uid, first_idx in (('uid_9', 10), ('uid_8', 40)):
            for offset, columns in enumerate(extra_keys):
                df['uid_' + str(first_idx + offset)] = _join_as_str(df, [base_uid] + columns)

    uid_list = ['uid_' + str(i) for i in range(70)]
    cid_list = []

    # The raw uid strings are grouping keys only, so they join the removal list.
    rm_features = rm_features + uid_list
    return train_df, test_df, rm_features, uid_list, cid_list
# Usage: train_df, test_df, rm_features, uid_list, cid_list = fe_uid(train_df, test_df, rm_features)

In [8]:
# group aggregation
def _map_group_stats(train_df, test_df, key, value_col, stat_to_name):
    """Add per-``key`` statistics of ``value_col`` to both frames in place.

    The statistics are computed over train and test stacked together, then
    looked up per row through ``key``. ``stat_to_name`` maps each aggregation
    name (e.g. 'mean') to the column that receives it.
    """
    stacked = pd.concat([train_df[[key, value_col]], test_df[[key, value_col]]])
    # Stack and group once per key; every statistic reuses the same grouping.
    grouped = stacked.groupby(key)[value_col]
    for stat, new_col_name in stat_to_name.items():
        lookup = grouped.agg(stat)  # Series indexed by the unique key values
        train_df[new_col_name] = train_df[key].map(lookup)
        test_df[new_col_name] = test_df[key].map(lookup)


def fe_agg(train_df, test_df, rm_features, uid_list, cid_list):
    """Attach group-level aggregate features for every uid/cid key.

    For each key in ``uid_list``: mean and std of 'TransactionAmt' as
    '<key>_TransactionAmt_mean' / '<key>_TransactionAmt_std'. For each key in
    ``uid_list + cid_list``: the non-null 'TransactionDT' count as
    '<key>_count'. All statistics are computed over train and test combined.
    Finally ``np.inf`` is replaced by 999 across both frames.

    :param train_df: training frame (new columns are added in place)
    :param test_df: test frame (new columns are added in place)
    :param rm_features: list of columns excluded from modelling
    :param uid_list: key columns for amount statistics and counts
    :param cid_list: extra key columns that only receive counts
    :return: (train_df, test_df, rm_features) -- the frames are the new objects
             produced by ``replace``; ``rm_features`` is an unchanged copy
    """
    print('==> processing aggregation...')

    amount_stats = ['mean', 'std']
    for col in tqdm_notebook(uid_list):
        names = {stat: col + '_' + 'TransactionAmt' + '_' + stat for stat in amount_stats}
        _map_group_stats(train_df, test_df, col, 'TransactionAmt', names)

    for col in tqdm_notebook(uid_list + cid_list):
        _map_group_stats(train_df, test_df, col, 'TransactionDT', {'count': col + '_count'})

    train_df = train_df.replace(np.inf, 999)
    test_df = test_df.replace(np.inf, 999)

    # Nothing new to exclude here; hand back a fresh list like before.
    rm_features = rm_features + []
    return train_df, test_df, rm_features

# Usage: train_df, test_df, rm_features = fe_agg(train_df, test_df, rm_features, uid_list, cid_list)

In [None]:
def fe(train_df, test_df, rm_features):
    """Run feature engineering: build the uid keys, then aggregate over them.

    :return: (train_df, test_df, rm_features) as returned by ``fe_agg``
    """
    # fe_uid returns (train_df, test_df, rm_features, uid_list, cid_list),
    # which is exactly the positional signature of fe_agg.
    uid_stage = fe_uid(train_df, test_df, rm_features)
    train_df, test_df, rm_features = fe_agg(*uid_stage)
    return train_df, test_df, rm_features

train_df, test_df, remove_features = fe(
    train_df=train_df,
    test_df=test_df,
    rm_features=remove_features,
)

==> processing uid...


In [None]:
# Sanity check after feature engineering: frame shapes and the exclusion list.
print('Shape control: {} {}'.format(train_df.shape, test_df.shape))
print(remove_features)

In [None]:
########################### Final features list
# Every training column that is not on the exclusion list is fed to the model.
features_columns = [name for name in train_df.columns if name not in remove_features]

########################### Final Minification
## Downcasting alters the stored float values slightly, which is undesirable
## but accepted here. Training LightGBM without minification would need
## changes to the model code; that is left for later.
if LOCAL_TEST:
    pass
else:
    train_df = reduce_mem_usage(train_df)
    test_df = reduce_mem_usage(test_df)

In [None]:
# Preview the first rows of the engineered training frame.
train_df.head()

In [None]:
########################### Model Train
# Local runs get a larger tree budget and fewer folds; full runs get fewer
# trees and more folds. Learning rate and early stopping are the same in both.
lgb_params.update({
    'learning_rate': 0.01,
    'n_estimators': 10000 if LOCAL_TEST else 2000,
    'early_stopping_rounds': 100,
})
test_predictions = make_predictions(
    train_df,
    test_df,
    features_columns,
    TARGET,
    lgb_params,
    NFOLDS=4 if LOCAL_TEST else 6,
)

In [None]:
########################### Export
# Write the submission file for full (non-local) runs only.
if not LOCAL_TEST:
    test_predictions['isFraud'] = test_predictions['prediction']
    # Create the output folder first so a missing directory cannot fail the
    # write after the whole training run has completed.
    os.makedirs('../submissions', exist_ok=True)
    test_predictions[['TransactionID','isFraud']].to_csv('../submissions/submission_4.csv', index=False)