In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import math
warnings.filterwarnings('ignore')

In [8]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_rows', 100)

# 1. All feature engineering for CV 9424 LB 9506 solution

### 1.0 Some preprocessing steps & feature engineering from public kernel (9450 LB)

In [2]:
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
## ------------------- 

## -------------------
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
########################### Vars
#################################################################################
SEED = 42
seed_everything(SEED)
LOCAL_TEST = False

########################### DATA LOAD
#################################################################################
print('Load Data')
train_df = pd.read_csv('train_transaction.csv')
test_df = pd.read_csv('test_transaction.csv')
test_df['isFraud'] = 0

train_identity = pd.read_csv('train_identity.csv')
test_identity = pd.read_csv('test_identity.csv')

Load Data


In [4]:
########################### Merge Identity columns
temp_df = train_df[['TransactionID']]
temp_df = temp_df.merge(train_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
train_df = pd.concat([train_df,temp_df], axis=1)
    
temp_df = test_df[['TransactionID']]
temp_df = temp_df.merge(test_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
test_df = pd.concat([test_df,temp_df], axis=1)

In [9]:
train_df = reduce_mem_usage(train_df)
test_df  = reduce_mem_usage(test_df)

train_identity = reduce_mem_usage(train_identity)
test_identity  = reduce_mem_usage(test_identity)

Mem. usage decreased to 645.97 Mb (67.0% reduction)
Mem. usage decreased to 561.98 Mb (66.5% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
Mem. usage decreased to 25.44 Mb (42.7% reduction)


In [10]:
train_df.shape
test_df.shape
train_df.head()
test_df.head()

(590540, 434)

(506691, 434)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,,0.0,T,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,,0.0,,,,M0,T,T,,,,,,,,,,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,,315.0,T,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,,111.0,,,,M0,T,F,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,28.0,0.0,10.0,4.0,1.0,38.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,28.0,0.0,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,1803.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15560.0,169690.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,M4,M5,M6,M7,M8,M9,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,isFraud,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,3663549,18403224,31.953125,W,10409,111.0,150.0,visa,226.0,debit,170.0,87.0,1.0,,gmail.com,,6.0,6.0,0.0,0.0,3.0,4.0,0.0,0.0,6.0,0.0,5.0,1.0,115.0,6.0,419.0,419.0,27.0,398.0,27.0,,,,,418.0,203.0,,,,409.0,T,T,F,,,F,T,T,T,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,47.950001,0.0,0.0,47.950001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,47.950001,0.0,0.0,47.950001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,299.0,87.0,4.0,,aol.com,,3.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,12.0,2.0,149.0,149.0,7.0,634.0,7.0,,,,,231.0,634.0,,,,634.0,T,F,F,M0,,F,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,280.0,77.0,0.0,280.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,280.0,77.0,0.0,280.0,0.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,472.0,87.0,2636.0,,hotmail.com,,2.0,2.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,2.0,0.0,22.0,2.0,137.0,137.0,10.0,97.0,10.0,,,,,136.0,136.0,,,,97.0,T,T,F,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,968.0,0.0,0.0,705.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1321.0,0.0,0.0,1058.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3663552,18403310,285.0,W,10989,360.0,150.0,visa,166.0,debit,205.0,87.0,17.0,,gmail.com,,5.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,7.0,4.0,42.0,42.0,41.0,242.0,41.0,,,,,242.0,242.0,,,,242.0,T,T,T,,,T,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,282.5,282.5,282.5,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3663553,18403317,67.9375,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,87.0,6.0,,gmail.com,,6.0,6.0,0.0,0.0,2.0,5.0,0.0,0.0,5.0,0.0,6.0,0.0,14.0,6.0,22.0,22.0,0.0,22.0,0.0,,,,,22.0,22.0,,,,22.0,T,T,T,,,F,F,T,T,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,67.949997,183.850006,67.949997,67.9375,183.850006,67.949997,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,67.949997,183.850006,67.949997,67.9375,183.850006,67.9375,67.949997,67.9375,183.875,67.9375,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
import gc
gc.collect()

0

In [12]:
train_df['P_isproton']=(train_df['P_emaildomain']=='protonmail.com')
train_df['R_isproton']=(train_df['R_emaildomain']=='protonmail.com')
test_df['P_isproton']=(test_df['P_emaildomain']=='protonmail.com')
test_df['R_isproton']=(test_df['R_emaildomain']=='protonmail.com')

train_df['nulls1'] = train_df.isna().sum(axis=1)
test_df['nulls1'] = test_df.isna().sum(axis=1)

a = np.zeros(train_df.shape[0])
train_df["lastest_df_browser"] = a
a = np.zeros(test_df.shape[0])
test_df["lastest_df_browser"] = a
def setbrowser(df):
    df.loc[df["id_31"]=="samsung browser 7.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="opera 53.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="mobile safari 10.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="google search application 49.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="firefox 60.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="edge 17.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 69.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 67.0 for android",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for android",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for ios",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 64.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for android",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for ios",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 65.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for android",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for ios",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 66.0",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for android",'lastest_df_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for ios",'lastest_df_browser']=1
    return df
train_df=setbrowser(train_df)
test_df=setbrowser(test_df)

train_df['lastest_df_browser'].head()

emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
us_emails = ['gmail', 'net', 'edu']
for c in ['P_emaildomain', 'R_emaildomain']:
    train_df[c + '_bin'] = train_df[c].map(emails)
    test_df[c + '_bin'] = test_df[c].map(emails)
    
    train_df[c + '_suffix'] = train_df[c].map(lambda x: str(x).split('.')[-1])
    test_df[c + '_suffix'] = test_df[c].map(lambda x: str(x).split('.')[-1])
    
    train_df[c + '_suffix'] = train_df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    test_df[c + '_suffix'] = test_df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

train_df['card1_count_full'] = train_df['card1'].map(pd.concat([train_df['card1'], test_df['card1']], ignore_index=True).value_counts(dropna=False))
test_df['card1_count_full'] = test_df['card1'].map(pd.concat([train_df['card1'], test_df['card1']], ignore_index=True).value_counts(dropna=False))

train_df['card2_count_full'] = train_df['card2'].map(pd.concat([train_df['card2'], test_df['card2']], ignore_index=True).value_counts(dropna=False))
test_df['card2_count_full'] = test_df['card2'].map(pd.concat([train_df['card2'], test_df['card2']], ignore_index=True).value_counts(dropna=False))

train_df['card3_count_full'] = train_df['card3'].map(pd.concat([train_df['card3'], test_df['card3']], ignore_index=True).value_counts(dropna=False))
test_df['card3_count_full'] = test_df['card3'].map(pd.concat([train_df['card3'], test_df['card3']], ignore_index=True).value_counts(dropna=False))

train_df['card4_count_full'] = train_df['card4'].map(pd.concat([train_df['card4'], test_df['card4']], ignore_index=True).value_counts(dropna=False))
test_df['card4_count_full'] = test_df['card4'].map(pd.concat([train_df['card4'], test_df['card4']], ignore_index=True).value_counts(dropna=False))

train_df['card5_count_full'] = train_df['card5'].map(pd.concat([train_df['card5'], test_df['card5']], ignore_index=True).value_counts(dropna=False))
test_df['card5_count_full'] = test_df['card5'].map(pd.concat([train_df['card5'], test_df['card5']], ignore_index=True).value_counts(dropna=False))

train_df['card6_count_full'] = train_df['card6'].map(pd.concat([train_df['card6'], test_df['card6']], ignore_index=True).value_counts(dropna=False))
test_df['card6_count_full'] = test_df['card6'].map(pd.concat([train_df['card6'], test_df['card6']], ignore_index=True).value_counts(dropna=False))


train_df['addr1_count_full'] = train_df['addr1'].map(pd.concat([train_df['addr1'], test_df['addr1']], ignore_index=True).value_counts(dropna=False))
test_df['addr1_count_full'] = test_df['addr1'].map(pd.concat([train_df['addr1'], test_df['addr1']], ignore_index=True).value_counts(dropna=False))

train_df['addr2_count_full'] = train_df['addr2'].map(pd.concat([train_df['addr2'], test_df['addr2']], ignore_index=True).value_counts(dropna=False))
test_df['addr2_count_full'] = test_df['addr2'].map(pd.concat([train_df['addr2'], test_df['addr2']], ignore_index=True).value_counts(dropna=False))


train_df['TransactionAmt_to_mean_card1'] = train_df['TransactionAmt'] / train_df.groupby(['card1'])['TransactionAmt'].transform('mean')
train_df['TransactionAmt_to_mean_card4'] = train_df['TransactionAmt'] / train_df.groupby(['card4'])['TransactionAmt'].transform('mean')
train_df['TransactionAmt_to_std_card1'] = train_df['TransactionAmt'] / train_df.groupby(['card1'])['TransactionAmt'].transform('std')
train_df['TransactionAmt_to_std_card4'] = train_df['TransactionAmt'] / train_df.groupby(['card4'])['TransactionAmt'].transform('std')

test_df['TransactionAmt_to_mean_card1'] = test_df['TransactionAmt'] / test_df.groupby(['card1'])['TransactionAmt'].transform('mean')
test_df['TransactionAmt_to_mean_card4'] = test_df['TransactionAmt'] / test_df.groupby(['card4'])['TransactionAmt'].transform('mean')
test_df['TransactionAmt_to_std_card1'] = test_df['TransactionAmt'] / test_df.groupby(['card1'])['TransactionAmt'].transform('std')
test_df['TransactionAmt_to_std_card4'] = test_df['TransactionAmt'] / test_df.groupby(['card4'])['TransactionAmt'].transform('std')

train_df['id_02_to_mean_card1'] = train_df['id_02'] / train_df.groupby(['card1'])['id_02'].transform('mean')
train_df['id_02_to_mean_card4'] = train_df['id_02'] / train_df.groupby(['card4'])['id_02'].transform('mean')
train_df['id_02_to_std_card1'] = train_df['id_02'] / train_df.groupby(['card1'])['id_02'].transform('std')
train_df['id_02_to_std_card4'] = train_df['id_02'] / train_df.groupby(['card4'])['id_02'].transform('std')

test_df['id_02_to_mean_card1'] = test_df['id_02'] / test_df.groupby(['card1'])['id_02'].transform('mean')
test_df['id_02_to_mean_card4'] = test_df['id_02'] / test_df.groupby(['card4'])['id_02'].transform('mean')
test_df['id_02_to_std_card1'] = test_df['id_02'] / test_df.groupby(['card1'])['id_02'].transform('std')
test_df['id_02_to_std_card4'] = test_df['id_02'] / test_df.groupby(['card4'])['id_02'].transform('std')

train_df['D15_to_mean_card1'] = train_df['D15'] / train_df.groupby(['card1'])['D15'].transform('mean')
train_df['D15_to_mean_card4'] = train_df['D15'] / train_df.groupby(['card4'])['D15'].transform('mean')
train_df['D15_to_std_card1'] = train_df['D15'] / train_df.groupby(['card1'])['D15'].transform('std')
train_df['D15_to_std_card4'] = train_df['D15'] / train_df.groupby(['card4'])['D15'].transform('std')

test_df['D15_to_mean_card1'] = test_df['D15'] / test_df.groupby(['card1'])['D15'].transform('mean')
test_df['D15_to_mean_card4'] = test_df['D15'] / test_df.groupby(['card4'])['D15'].transform('mean')
test_df['D15_to_std_card1'] = test_df['D15'] / test_df.groupby(['card1'])['D15'].transform('std')
test_df['D15_to_std_card4'] = test_df['D15'] / test_df.groupby(['card4'])['D15'].transform('std')

train_df['D15_to_mean_addr1'] = train_df['D15'] / train_df.groupby(['addr1'])['D15'].transform('mean')
train_df['D15_to_mean_card4'] = train_df['D15'] / train_df.groupby(['card4'])['D15'].transform('mean')
train_df['D15_to_std_addr1'] = train_df['D15'] / train_df.groupby(['addr1'])['D15'].transform('std')
train_df['D15_to_std_card4'] = train_df['D15'] / train_df.groupby(['card4'])['D15'].transform('std')

test_df['D15_to_mean_addr1'] = test_df['D15'] / test_df.groupby(['addr1'])['D15'].transform('mean')
test_df['D15_to_mean_card4'] = test_df['D15'] / test_df.groupby(['card4'])['D15'].transform('mean')
test_df['D15_to_std_addr1'] = test_df['D15'] / test_df.groupby(['addr1'])['D15'].transform('std')
test_df['D15_to_std_card4'] = test_df['D15'] / test_df.groupby(['card4'])['D15'].transform('std')

train_df['Transaction_day_of_week'] = np.floor((train_df['TransactionDT'] / (3600 * 24) - 1) % 7)
test_df['Transaction_day_of_week'] = np.floor((test_df['TransactionDT'] / (3600 * 24) - 1) % 7)

train_df['Transaction_hour_of_day'] = np.floor(train_df['TransactionDT'] / 3600) % 24
test_df['Transaction_hour_of_day'] = np.floor(test_df['TransactionDT'] / 3600) % 24

train_df['TransactionAmt_decimal'] = ((train_df['TransactionAmt'] - train_df['TransactionAmt'].astype(int)) * 1000).astype(int)
test_df['TransactionAmt_decimal'] = ((test_df['TransactionAmt'] - test_df['TransactionAmt'].astype(int)) * 1000).astype(int)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: lastest_df_browser, dtype: float64

In [13]:
from sklearn import preprocessing

for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2', 
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    train_df[feature] = train_df[f1].astype(str) + '_' + train_df[f2].astype(str)
    test_df[feature] = test_df[f1].astype(str) + '_' + test_df[f2].astype(str)

    le =preprocessing.LabelEncoder()
    le.fit(list(train_df[feature].astype(str).values) + list(test_df[feature].astype(str).values))
    train_df[feature] = le.transform(list(train_df[feature].astype(str).values))
    test_df[feature] = le.transform(list(test_df[feature].astype(str).values))
    
for feature in ['id_34', 'id_36']:
    # Count encoded for both train_df and test_df
    train_df[feature + '_count_full'] = train_df[feature].map(pd.concat([train_df[feature], test_df[feature]], ignore_index=True).value_counts(dropna=False))
    test_df[feature + '_count_full'] = test_df[feature].map(pd.concat([train_df[feature], test_df[feature]], ignore_index=True).value_counts(dropna=False))
        
for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
    # Count encoded separately for train_df and test_df
    train_df[feature + '_count_dist'] = train_df[feature].map(train_df[feature].value_counts(dropna=False))
    test_df[feature + '_count_dist'] = test_df[feature].map(test_df[feature].value_counts(dropna=False))

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [14]:
for col in ['card4', 'card6', 'ProductCD']:
    print('Encoding', col)
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    col_encoded = temp_df[col].value_counts().to_dict()   
    train_df[col] = train_df[col].map(col_encoded)
    test_df[col]  = test_df[col].map(col_encoded)
    print(col_encoded)
    
for col in ['M1','M2','M3','M5','M6','M7','M8','M9']:
    train_df[col] = train_df[col].map({'T':1, 'F':0})
    test_df[col]  = test_df[col].map({'T':1, 'F':0})

for col in ['M4']:
    print('Encoding', col)
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    col_encoded = temp_df[col].value_counts().to_dict()   
    train_df[col] = train_df[col].map(col_encoded)
    test_df[col]  = test_df[col].map(col_encoded)
    print(col_encoded)

Encoding card4
{'mastercard': 347386, 'discover': 9524, 'visa': 719649, 'american express': 16009}
Encoding card6
{'debit': 824959, 'credit': 267648, 'charge card': 16, 'debit or credit': 30}
Encoding ProductCD
{'W': 800657, 'S': 23046, 'C': 137785, 'H': 62397, 'R': 73346}
Encoding M4
{'M0': 357789, 'M2': 122947, 'M1': 97306}


In [15]:
def minify_identity_df(df):

    df['id_12'] = df['id_12'].map({'Found':1, 'NotFound':0})
    df['id_15'] = df['id_15'].map({'New':2, 'Found':1, 'Unknown':0})
    df['id_16'] = df['id_16'].map({'Found':1, 'NotFound':0})

    df['id_23'] = df['id_23'].map({'TRANSPARENT':4, 'IP_PROXY':3, 'IP_PROXY:ANONYMOUS':2, 'IP_PROXY:HIDDEN':1})

    df['id_27'] = df['id_27'].map({'Found':1, 'NotFound':0})
    df['id_28'] = df['id_28'].map({'New':2, 'Found':1})

    df['id_29'] = df['id_29'].map({'Found':1, 'NotFound':0})

    df['id_35'] = df['id_35'].map({'T':1, 'F':0})
    df['id_36'] = df['id_36'].map({'T':1, 'F':0})
    df['id_37'] = df['id_37'].map({'T':1, 'F':0})
    df['id_38'] = df['id_38'].map({'T':1, 'F':0})

    df['id_34'] = df['id_34'].fillna(':0')
    df['id_34'] = df['id_34'].apply(lambda x: x.split(':')[1]).astype(np.int8)
    df['id_34'] = np.where(df['id_34']==0, np.nan, df['id_34'])
    
    df['id_33'] = df['id_33'].fillna('0x0')
    df['id_33_0'] = df['id_33'].apply(lambda x: x.split('x')[0]).astype(int)
    df['id_33_1'] = df['id_33'].apply(lambda x: x.split('x')[1]).astype(int)
    df['id_33'] = np.where(df['id_33']=='0x0', np.nan, df['id_33'])

    df['DeviceType'].map({'desktop':1, 'mobile':0})
    return df

train_identity = minify_identity_df(train_identity)
test_identity = minify_identity_df(test_identity)

for col in ['id_33']:
    train_identity[col] = train_identity[col].fillna('unseen_before_label')
    test_identity[col]  = test_identity[col].fillna('unseen_before_label')
    
    le = LabelEncoder()
    le.fit(list(train_identity[col])+list(test_identity[col]))
    train_identity[col] = le.transform(train_identity[col])
    test_identity[col]  = le.transform(test_identity[col])

LabelEncoder()

In [16]:
train_df = reduce_mem_usage(train_df)
test_df  = reduce_mem_usage(test_df)

train_identity = reduce_mem_usage(train_identity)
test_identity  = reduce_mem_usage(test_identity)

Mem. usage decreased to 707.92 Mb (13.7% reduction)
Mem. usage decreased to 618.04 Mb (13.2% reduction)
Mem. usage decreased to 15.54 Mb (44.6% reduction)
Mem. usage decreased to 15.29 Mb (44.6% reduction)


In [17]:
def values_normalization(dt_df, periods, columns):
    for period in periods:
        for col in columns:
            new_col = col +'_'+ period
            dt_df[col] = dt_df[col].astype(float)  

            temp_min = dt_df.groupby([period])[col].agg(['min']).reset_index()
            temp_min.index = temp_min[period].values
            temp_min = temp_min['min'].to_dict()

            temp_max = dt_df.groupby([period])[col].agg(['max']).reset_index()
            temp_max.index = temp_max[period].values
            temp_max = temp_max['max'].to_dict()

            temp_mean = dt_df.groupby([period])[col].agg(['mean']).reset_index()
            temp_mean.index = temp_mean[period].values
            temp_mean = temp_mean['mean'].to_dict()

            temp_std = dt_df.groupby([period])[col].agg(['std']).reset_index()
            temp_std.index = temp_std[period].values
            temp_std = temp_std['std'].to_dict()

            dt_df['temp_min'] = dt_df[period].map(temp_min)
            dt_df['temp_max'] = dt_df[period].map(temp_max)
            dt_df['temp_mean'] = dt_df[period].map(temp_mean)
            dt_df['temp_std'] = dt_df[period].map(temp_std)

            dt_df[new_col+'_min_max'] = (dt_df[col]-dt_df['temp_min'])/(dt_df['temp_max']-dt_df['temp_min'])
            dt_df[new_col+'_std_score'] = (dt_df[col]-dt_df['temp_mean'])/(dt_df['temp_std'])
            del dt_df['temp_min'],dt_df['temp_max'],dt_df['temp_mean'],dt_df['temp_std']
    return dt_df

def frequency_encoding(train_df, test_df, columns, self_encoding=False):
    for col in columns:
        temp_df = pd.concat([train_df[[col]], test_df[[col]]])
        fq_encode = temp_df[col].value_counts(dropna=False).to_dict()
        if self_encoding:
            train_df[col] = train_df[col].map(fq_encode)
            test_df[col]  = test_df[col].map(fq_encode)            
        else:
            train_df[col+'_fq_enc'] = train_df[col].map(fq_encode)
            test_df[col+'_fq_enc']  = test_df[col].map(fq_encode)
    return train_df, test_df

def timeblock_frequency_encoding(train_df, test_df, periods, columns, 
                                 with_proportions=True, only_proportions=False):
    for period in periods:
        for col in columns:
            new_col = col +'_'+ period
            train_df[new_col] = train_df[col].astype(str)+'_'+train_df[period].astype(str)
            test_df[new_col]  = test_df[col].astype(str)+'_'+test_df[period].astype(str)

            temp_df = pd.concat([train_df[[new_col]], test_df[[new_col]]])
            fq_encode = temp_df[new_col].value_counts().to_dict()

            train_df[new_col] = train_df[new_col].map(fq_encode)
            test_df[new_col]  = test_df[new_col].map(fq_encode)
            
            if only_proportions:
                train_df[new_col] = train_df[new_col]/train_df[period+'_total']
                test_df[new_col]  = test_df[new_col]/test_df[period+'_total']

            if with_proportions:
                train_df[new_col+'_proportions'] = train_df[new_col]/train_df[period+'_total']
                test_df[new_col+'_proportions']  = test_df[new_col]/test_df[period+'_total']

    return train_df, test_df

def uid_aggregation(train_df, test_df, main_columns, uids, aggregations):
    for main_column in main_columns:  
        for col in uids:
            for agg_type in aggregations:
                new_col_name = col+'_'+main_column+'_'+agg_type
                temp_df = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_col_name].to_dict()   

                train_df[new_col_name] = train_df[col].map(temp_df)
                test_df[new_col_name]  = test_df[col].map(temp_df)
    return train_df, test_df

def uid_aggregation_and_normalization(train_df, test_df, main_columns, uids, aggregations):
    for main_column in main_columns:  
        for col in uids:
            
            new_norm_col_name = col+'_'+main_column+'_std_norm'
            norm_cols = []
            
            for agg_type in aggregations:
                new_col_name = col+'_'+main_column+'_'+agg_type
                temp_df = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_col_name].to_dict()   

                train_df[new_col_name] = train_df[col].map(temp_df)
                test_df[new_col_name]  = test_df[col].map(temp_df)
                norm_cols.append(new_col_name)
            
            train_df[new_norm_col_name] = (train_df[main_column]-train_df[norm_cols[0]])/train_df[norm_cols[1]]
            test_df[new_norm_col_name]  = (test_df[main_column]-test_df[norm_cols[0]])/test_df[norm_cols[1]]          
            
            del train_df[norm_cols[0]], train_df[norm_cols[1]]
            del test_df[norm_cols[0]], test_df[norm_cols[1]]
                                              
    return train_df, test_df


def check_cor_and_remove(train_df, test_df, i_cols, new_columns, remove=False):
    # Check correllation
    print('Correlations','#'*10)
    for col in new_columns:
        cor_cof = np.corrcoef(train_df[TARGET], train_df[col].fillna(0))[0][1]
        print(col, cor_cof)

    if remove:
        print('#'*10)
        print('Best options:')
        best_fe_columns = []
        for main_col in i_cols:
            best_option = ''
            best_cof = 0
            for col in new_columns:
                if main_col in col:
                    cor_cof = np.corrcoef(train_df[TARGET], train_df[col].fillna(0))[0][1]
                    cor_cof = (cor_cof**2)**0.5
                    if cor_cof>best_cof:
                        best_cof = cor_cof
                        best_option = col

            print(main_col, best_option, best_cof)            
            best_fe_columns.append(best_option)

        for col in new_columns:
            if col not in best_fe_columns:
                del train_df[col], test_df[col]

    return train_df, test_df

In [18]:
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')
remove_features = []

In [19]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

# Let's add temporary "time variables" for aggregations
# and add normal "time variables"
for df in [train_df, test_df]:
    
    # Temporary variables for aggregation
    df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = ((df['DT'].dt.year-2017)*12 + df['DT'].dt.month).astype(np.int8)
    df['DT_W'] = ((df['DT'].dt.year-2017)*52 + df['DT'].dt.weekofyear).astype(np.int8)
    df['DT_D'] = ((df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear).astype(np.int16)
    
    df['DT_hour'] = (df['DT'].dt.hour).astype(np.int8)
    df['DT_day_week'] = (df['DT'].dt.dayofweek).astype(np.int8)
    df['DT_day_month'] = (df['DT'].dt.day).astype(np.int8)
        
    # Possible solo feature
    df['is_december'] = df['DT'].dt.month
    df['is_december'] = (df['is_december']==12).astype(np.int8)

    # Holidays
    df['is_holiday'] = (df['DT'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

# Remove temporary features from final list
remove_features += ['DT','DT_M','DT_W','DT_D','DT_hour','DT_day_week','DT_day_month']
    
# Total transactions per timeblock
for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()
            
    train_df[col+'_total'] = train_df[col].map(fq_encode)
    test_df[col+'_total']  = test_df[col].map(fq_encode)
    
    # We can't use it as solo feature
    remove_features.append(col+'_total')

In [20]:
########################### Card columns "outliers"
for col in ['card1']: 
    valid_card = pd.concat([train_df[[col]], test_df[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card_std = valid_card.values.std()

    invalid_cards = valid_card[valid_card<=2]
    print('Rare cards',len(invalid_cards))

    valid_card = valid_card[valid_card>2]
    valid_card = list(valid_card.index)

    print('No intersection in Train', len(train_df[~train_df[col].isin(test_df[col])]))
    print('Intersection in Train', len(train_df[train_df[col].isin(test_df[col])]))
    
    train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)

    train_df[col] = np.where(train_df[col].isin(valid_card), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(valid_card), test_df[col], np.nan)
    print('#'*20)

for col in ['card2','card3','card4','card5','card6',]: 
    print('No intersection in Train', col, len(train_df[~train_df[col].isin(test_df[col])]))
    print('Intersection in Train', col, len(train_df[train_df[col].isin(test_df[col])]))
    
    train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)
    print('#'*20)

Rare cards 5993
No intersection in Train 10396
Intersection in Train 580144
####################
No intersection in Train card2 5012
Intersection in Train card2 585528
####################
No intersection in Train card3 47
Intersection in Train card3 590493
####################
No intersection in Train card4 0
Intersection in Train card4 590540
####################
No intersection in Train card5 7279
Intersection in Train card5 583261
####################
No intersection in Train card6 30
Intersection in Train card6 590510
####################


In [21]:
train_df['uid'] = train_df['card1'].astype(str)+'_'+train_df['card2'].astype(str)
test_df['uid'] = test_df['card1'].astype(str)+'_'+test_df['card2'].astype(str)

train_df['uid2'] = train_df['uid'].astype(str)+'_'+train_df['card3'].astype(str)+'_'+train_df['card5'].astype(str)
test_df['uid2'] = test_df['uid'].astype(str)+'_'+test_df['card3'].astype(str)+'_'+test_df['card5'].astype(str)

train_df['uid3'] = train_df['uid2'].astype(str)+'_'+train_df['addr1'].astype(str)+'_'+train_df['addr2'].astype(str)
test_df['uid3'] = test_df['uid2'].astype(str)+'_'+test_df['addr1'].astype(str)+'_'+test_df['addr2'].astype(str)

train_df['uid4'] = train_df['uid3'].astype(str)+'_'+train_df['P_emaildomain'].astype(str)
test_df['uid4'] = test_df['uid3'].astype(str)+'_'+test_df['P_emaildomain'].astype(str)

train_df['uid5'] = train_df['uid3'].astype(str)+'_'+train_df['R_emaildomain'].astype(str)
test_df['uid5'] = test_df['uid3'].astype(str)+'_'+test_df['R_emaildomain'].astype(str)

# Add values remove list
new_columns = ['uid','uid2','uid3','uid4','uid5']
remove_features += new_columns

print('#'*10)
print('Most common uIds:')
for col in new_columns:
    print('#'*10, col)
    print(train_df[col].value_counts()[:10])

# Do Global frequency encoding 
i_cols = ['card1','card2','card3','card5'] + new_columns
train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=False)

##########
Most common uIds:
########## uid
7919.0_194.0     14891
9500.0_321.0     14112
15885.0_545.0    10332
17188.0_321.0    10312
15066.0_170.0     7918
12695.0_490.0     7079
6019.0_583.0      6766
12544.0_321.0     6760
2803.0_100.0      6126
7585.0_553.0      5325
Name: uid, dtype: int64
########## uid2
9500.0_321.0_150.0_226.0     14112
15885.0_545.0_185.0_138.0    10332
17188.0_321.0_150.0_226.0    10312
7919.0_194.0_150.0_166.0      8844
15066.0_170.0_150.0_102.0     7918
12695.0_490.0_150.0_226.0     7079
6019.0_583.0_150.0_226.0      6766
12544.0_321.0_150.0_226.0     6760
2803.0_100.0_150.0_226.0      6126
7919.0_194.0_150.0_nan        6047
Name: uid2, dtype: int64
########## uid3
15885.0_545.0_185.0_138.0_nan_nan       9900
17188.0_321.0_150.0_226.0_299.0_87.0    5862
12695.0_490.0_150.0_226.0_325.0_87.0    5766
9500.0_321.0_150.0_226.0_204.0_87.0     4647
3154.0_408.0_185.0_224.0_nan_nan        4398
12839.0_321.0_150.0_226.0_264.0_87.0    3538
16132.0_111.0_150.0_226.0

In [22]:
for df in [train_df, test_df]:
    df['bank_type'] = df['card3'].astype(str) +'_'+ df['card5'].astype(str)
remove_features.append('bank_type') 

encoding_mean = {
    1: ['DT_D','DT_hour','_hour_dist','DT_hour_mean'],
    2: ['DT_W','DT_day_week','_week_day_dist','DT_day_week_mean'],
    3: ['DT_M','DT_day_month','_month_day_dist','DT_day_month_mean'],
    }

encoding_best = {
    1: ['DT_D','DT_hour','_hour_dist_best','DT_hour_best'],
    2: ['DT_W','DT_day_week','_week_day_dist_best','DT_day_week_best'],
    3: ['DT_M','DT_day_month','_month_day_dist_best','DT_day_month_best'],   
    }

# Some ugly code here (even worse than in other parts)
for col in ['card3','card5','bank_type']:
    for df in [train_df, test_df]:
        for encode in encoding_mean:
            encode = encoding_mean[encode].copy()
            new_col = col + '_' + encode[0] + encode[2]
            df[new_col] = df[col].astype(str) +'_'+ df[encode[0]].astype(str)

            temp_dict = df.groupby([new_col])[encode[1]].agg(['mean']).reset_index().rename(
                                                                    columns={'mean': encode[3]})
            temp_dict.index = temp_dict[new_col].values
            temp_dict = temp_dict[encode[3]].to_dict()
            df[new_col] = df[encode[1]] - df[new_col].map(temp_dict)

        for encode in encoding_best:
            encode = encoding_best[encode].copy()
            new_col = col + '_' + encode[0] + encode[2]
            df[new_col] = df[col].astype(str) +'_'+ df[encode[0]].astype(str)
            temp_dict = df.groupby([col,encode[0],encode[1]])[encode[1]].agg(['count']).reset_index().rename(
                                                                    columns={'count': encode[3]})

            temp_dict.sort_values(by=[col,encode[0],encode[3]], inplace=True)
            temp_dict = temp_dict.drop_duplicates(subset=[col,encode[0]], keep='last')
            temp_dict[new_col] = temp_dict[col].astype(str) +'_'+ temp_dict[encode[0]].astype(str)
            temp_dict.index = temp_dict[new_col].values
            temp_dict = temp_dict[encode[1]].to_dict()
            df[new_col] = df[encode[1]] - df[new_col].map(temp_dict)

In [23]:
i_cols = ['bank_type'] #['uid','uid2','uid3','uid4','uid5','bank_type']
periods = ['DT_M','DT_W','DT_D']

# We have few options to encode it here:
# - Just count transactions
# (but some timblocks have more transactions than others)
# - Devide to total transactions per timeblock (proportions)
# - Use both
# - Use only proportions
train_df, test_df = timeblock_frequency_encoding(train_df, test_df, periods, i_cols, 
                                 with_proportions=False, only_proportions=True)

In [24]:
i_cols = ['D'+str(i) for i in range(1,16)]
uids = ['uid','uid2','uid3','uid4','uid5','bank_type']
aggregations = ['mean','std']

####### uIDs aggregations
train_df, test_df = uid_aggregation(train_df, test_df, i_cols, uids, aggregations)

####### Cleaning Neagtive values and columns transformations
for df in [train_df, test_df]:

    for col in i_cols:
        df[col] = df[col].clip(0) 
    
    # Lets transform D8 and D9 column
    # As we almost sure it has connection with hours
    df['D9_not_na'] = np.where(df['D9'].isna(),0,1)
    df['D8_not_same_day'] = np.where(df['D8']>=1,1,0)
    df['D8_D9_decimal_dist'] = df['D8'].fillna(0)-df['D8'].fillna(0).astype(int)
    df['D8_D9_decimal_dist'] = ((df['D8_D9_decimal_dist']-df['D9'])**2)**0.5
    df['D8'] = df['D8'].fillna(-1).astype(int)

####### Values Normalization
i_cols.remove('D1')
i_cols.remove('D2')
i_cols.remove('D9')
periods = ['DT_D','DT_W','DT_M']
for df in [train_df, test_df]:
    df = values_normalization(df, periods, i_cols)

for col in ['D1','D2']:
    for df in [train_df, test_df]:
        df[col+'_scaled'] = df[col]/train_df[col].max()
        
####### Global Self frequency encoding
# self_encoding=True because 
# we don't need original values anymore
i_cols = ['D'+str(i) for i in range(1,16)]
train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=True)

In [25]:
i_cols = ['TransactionAmt']
periods = ['DT_D']

temp_df = pd.concat([train_df[['TransactionDT']+i_cols+periods], test_df[['TransactionDT']+i_cols+periods]])

# Clip Values
train_df['TransactionAmt'] = train_df['TransactionAmt'].clip(0,5000)
test_df['TransactionAmt']  = test_df['TransactionAmt'].clip(0,5000)

# Check if the Transaction Amount is common or not (we can use freq encoding here)
# In our dialog with a model we are telling to trust or not to these values   
train_df['TransactionAmt_check'] = np.where(train_df['TransactionAmt'].isin(test_df['TransactionAmt']), 1, 0)
test_df['TransactionAmt_check']  = np.where(test_df['TransactionAmt'].isin(train_df['TransactionAmt']), 1, 0)

# For our model current TransactionAmt is a noise
# https://www.kaggle.com/kyakovlev/ieee-check-noise
# (even if features importances are telling contrariwise)
# There are many unique values and model doesn't generalize well
# Lets do some aggregations
i_cols = ['TransactionAmt']
uids = ['card1','card2','card3','card5','uid','uid2','uid3','uid4','uid5','bank_type']
aggregations = ['mean','std']

# uIDs aggregations
train_df, test_df = uid_aggregation(train_df, test_df, i_cols, uids, aggregations)
 
# TransactionAmt Normalization
periods = ['DT_D','DT_W','DT_M']
for df in [train_df, test_df]:
    df = values_normalization(df, periods, i_cols)

# Product type
train_df['product_type'] = train_df['ProductCD'].astype(str)+'_'+train_df['TransactionAmt'].astype(str)
test_df['product_type'] = test_df['ProductCD'].astype(str)+'_'+test_df['TransactionAmt'].astype(str)

i_cols = ['product_type']
periods = ['DT_D','DT_W','DT_M']
train_df, test_df = timeblock_frequency_encoding(train_df, test_df, periods, i_cols, 
                                                 with_proportions=False, only_proportions=True)
train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=True)

# Small "hack" to transform distribution 
# (doesn't affect auc much, but I like it more)
# please see how distribution transformation can boost your score 
# (not our case but related)
# https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html
train_df['TransactionAmt'] = np.log1p(train_df['TransactionAmt'])
test_df['TransactionAmt'] = np.log1p(test_df['TransactionAmt'])

In [26]:
i_cols = ['C'+str(i) for i in range(1,15)]

####### Global Self frequency encoding
# self_encoding=False because 
# I want to keep original values
train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=False)

####### Clip max values
for df in [train_df, test_df]:
    for col in i_cols:
        max_value = train_df[train_df['DT_M']==train_df['DT_M'].max()][col].max()
        df[col] = df[col].clip(None,max_value) 

In [28]:
train_identity.head()
test_identity.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_33_0,id_33_1,DeviceInfo_device,DeviceInfo_version,id_30_device,id_30_version,id_31_device
0,2987004,0.0,70787.0,,,,,,,,,100.0,0,,-480.0,2.0,0.0,166.0,,542.0,144.0,,,,,,,,2.0,0.0,android 7.0,samsung browser 6.2,32.0,267,2.0,1.0,0.0,1.0,1.0,mobile,samsung sm-g892a build/nrd90m,2220,1080,samsungsmgabuildnrdm,89290.0,android,70.0,samsungbrowser
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,,100.0,0,49.0,-300.0,2.0,0.0,166.0,,621.0,500.0,,,,,,,,2.0,0.0,ios 11.1.2,mobile safari 11.0,32.0,79,1.0,1.0,0.0,0.0,1.0,mobile,ios device,1334,750,iosdevice,,ios,1112.0,mobilesafari
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,100.0,0,52.0,,1.0,1.0,121.0,,410.0,142.0,,,,,,,,1.0,1.0,unknown_device,chrome 62.0,,460,,0.0,0.0,1.0,1.0,desktop,windows,0,0,windows,,unknowndevice,,chrome
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,,100.0,0,52.0,,2.0,0.0,225.0,,176.0,507.0,,,,,,,,2.0,0.0,unknown_device,chrome 62.0,,460,,0.0,0.0,1.0,1.0,desktop,unknown_device,0,0,unknowndevice,,unknowndevice,,chrome
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,100.0,0,,-300.0,1.0,1.0,166.0,15.0,529.0,575.0,,,,,,,,1.0,1.0,mac os x 10_11_6,chrome 62.0,24.0,67,2.0,1.0,0.0,1.0,1.0,desktop,macos,1280,800,macos,,macosx,10116.0,chrome


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,id_33_0,id_33_1,DeviceInfo_device,DeviceInfo_version,id_30_device,id_30_version,id_31_device
0,3663586,-45.0,280290.0,,,0.0,0.0,,,,,100.0,0,27.0,,2.0,0.0,225.0,15.0,427.0,563.0,,,,,,,,2.0,0.0,unknown_device,chrome 67.0 for android,,460,,0.0,0.0,1.0,0.0,mobile,mya-l13 build/huaweimya-l13,0,0,myalbuildhuaweimyal,1313,unknowndevice,,chromeforandroid
1,3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,0.0,100.0,1,,-300.0,1.0,1.0,166.0,,542.0,368.0,,,,,,,,1.0,1.0,android 6.0.1,chrome 67.0 for android,24.0,61,2.0,1.0,0.0,1.0,1.0,mobile,lgls676 build/mxb48t,1280,720,lglsbuildmxbt,67648,android,601.0,chromeforandroid
2,3663597,-5.0,185210.0,,,1.0,0.0,,,,,100.0,0,52.0,-360.0,2.0,0.0,225.0,,271.0,507.0,,,,,,,,2.0,0.0,unknown_device,ie 11.0 for tablet,,460,,0.0,1.0,1.0,0.0,desktop,trident/7.0,0,0,trident,70,unknowndevice,,iefortablet
3,3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,,,0.0,0.0,100.0,0,27.0,,1.0,1.0,225.0,15.0,427.0,563.0,,,,,,,,1.0,1.0,unknown_device,chrome 67.0 for android,,460,,0.0,0.0,1.0,0.0,mobile,mya-l13 build/huaweimya-l13,0,0,myalbuildhuaweimyal,1313,unknowndevice,,chromeforandroid
4,3663602,-95.0,328680.0,,,7.0,-33.0,,,,,100.0,0,27.0,,2.0,0.0,225.0,15.0,567.0,507.0,,,,,,,,2.0,0.0,unknown_device,chrome 67.0 for android,,460,,0.0,0.0,1.0,0.0,mobile,sm-g9650 build/r16nw,0,0,smgbuildrnw,965016,unknowndevice,,chromeforandroid


In [30]:
for df in [train_identity, test_identity]:
    ########################### Device info
    df['DeviceInfo'] = df['DeviceInfo'].fillna('unknown_device').str.lower()
    df['DeviceInfo_device'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['DeviceInfo_version'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Device info 2
    df['id_30'] = df['id_30'].fillna('unknown_device').str.lower()
    df['id_30_device'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['id_30_version'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Browser
    df['id_31'] = df['id_31'].fillna('unknown_device').str.lower()
    df['id_31_device'] = df['id_31'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    

i_cols = [
          'DeviceInfo','DeviceInfo_device','DeviceInfo_version',
          'id_30','id_30_device','id_30_version',
          'id_31','id_31_device',
          'id_33',
         ]

temp_df = train_df[['TransactionID']]
temp_df = temp_df.merge(train_identity, on=['TransactionID'], how='left')
temp_df = temp_df[[fea for fea in temp_df if fea not in train_df]]
del temp_df['TransactionID']
train_df = pd.concat([train_df,temp_df], axis=1)
    
temp_df = test_df[['TransactionID']]
temp_df = temp_df.merge(test_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
temp_df = temp_df[[fea for fea in temp_df if fea not in test_df]]
test_df = pd.concat([test_df,temp_df], axis=1)

####### Global Self frequency encoding
# self_encoding=True because 
# we don't need original values anymore
train_df, test_df = frequency_encoding(train_df, test_df, i_cols, self_encoding=True)

In [32]:
train_df.shape
test_df.shape

(590540, 841)

(506691, 841)

In [33]:
TARGET = 'isFraud'

In [34]:
for col in ['ProductCD','M4']:
    temp_dict = train_df.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
                                                        columns={'mean': col+'_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col+'_target_mean'].to_dict()

    train_df[col] = train_df[col].map(temp_dict)
    test_df[col]  = test_df[col].map(temp_dict)

In [35]:
for col in list(train_df):
    if train_df[col].dtype=='O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col]  = test_df[col].fillna('unseen_before_label')
        
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)
        
        le = LabelEncoder()
        le.fit(list(train_df[col])+list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col]  = le.transform(test_df[col])
        
        train_df[col] = train_df[col].astype('category')
        test_df[col] = test_df[col].astype('category')

P_emaildomain


LabelEncoder()

R_emaildomain


LabelEncoder()

id_12


LabelEncoder()

id_15


LabelEncoder()

id_16


LabelEncoder()

id_23


LabelEncoder()

id_27


LabelEncoder()

id_28


LabelEncoder()

id_29


LabelEncoder()

id_34


LabelEncoder()

id_35


LabelEncoder()

id_36


LabelEncoder()

id_37


LabelEncoder()

id_38


LabelEncoder()

DeviceType


LabelEncoder()

P_emaildomain_bin


LabelEncoder()

P_emaildomain_suffix


LabelEncoder()

R_emaildomain_bin


LabelEncoder()

R_emaildomain_suffix


LabelEncoder()

uid


LabelEncoder()

uid2


LabelEncoder()

uid3


LabelEncoder()

uid4


LabelEncoder()

uid5


LabelEncoder()

bank_type


LabelEncoder()

In [36]:
data_backup = pd.concat([train_df, test_df])

In [37]:
import gc
gc.collect()

1387

In [38]:
data_backup.to_pickle('tmp')

### 1.1 add four way combination 

In [40]:
from copy import deepcopy

In [41]:
data = deepcopy(data_backup)

In [42]:
# Specified four-way operations
#     This function calculates a specified set of four-way operations between pairs of columns x and y, specifically
#     x+y, x-y, xy, x/y
#     where in each case the operation is done elementwise.
# 
#     This function assumes that the columns to be acted on are normalized to (a subset of) the interval [-1,1]
#     The function can be used for columns with values outside [-1,1] but there are no checks included to ensure 
#     reasonable operation or useful output
#
#     Currently fourwayColumns is an array where
#               fourwayColumns[0] is a list of column names, and 
#               fourwayColumns[1] is 1 if the column should be included in the summing operation, and zero otherwise
#               fourwayColumns[2] is 1 if the column should be included in the difference operation, and zero otherwise
#               fourwayColumns[3] is 1 if the column should be included in the product operation, and zero otherwise
#               fourwayColumns[4] is 1 if the column should be included in the quotient (division) operation, and zero otherwise
#     The intent of using this structure is to allow the info in fourwayColumns[1] through [4] to be later provided
#     within a ColumnRules object

import itertools

def process_feed_four_way_ops(df, fourwayColumns):
    #TODO:HH - protect against logs of zero, and zero divisions 
    addedColumnList = []
    
    baseColumnList = []
    for n in range(len(fourwayColumns)):
        baseColumnList.append(fourwayColumns[n][0])
    # Check that there are at least two columns to operate on
    if len(baseColumnList)==0:
        return df, addedColumnList
    
    # Summing operation
    sumColumns = []
    for n, col in enumerate(baseColumnList):
        if fourwayColumns[n][1]==1:
            sumColumns.append(col)
    if len(sumColumns) >= 2:
        pairs = itertools.combinations(sumColumns, 2)
        for pair in pairs:
            col1 = pair[0]
            col2 = pair[1]
            print(col1, col2)
            sumTitle = col1 + '_plus_' + col2
            addedColumnList.append(sumTitle)
            df[sumTitle] = df[col1] + df[col2]
        print("Added sums of columns")
        
    # Difference operation
    diffColumns = []
    for n, col in enumerate(baseColumnList):
        if fourwayColumns[n][2]==1:
            diffColumns.append(col)
    if len(diffColumns) >= 2:
        pairs = itertools.combinations(diffColumns, 2)
        for pair in pairs:
            col1 = pair[0]
            col2 = pair[1]
            diffTitle = col1 + '_minus_' + col2
            addedColumnList.append(diffTitle)
            df[diffTitle] = df[col1] - df[col2]
        print("Added differences of columns")
        
    # Product operation
    prodColumns = []
    for n, col in enumerate(baseColumnList):
        if fourwayColumns[n][3]==1:
            prodColumns.append(col)
    if len(prodColumns) >= 2:
        pairs = itertools.combinations(prodColumns, 2)
        for pair in pairs:
            col1 = pair[0]
            col2 = pair[1]
            prodTitle = col1 + '_times_' + col2
            addedColumnList.append(prodTitle)
            df[prodTitle] = df[col1] * df[col2]
        print("Added products of columns")

    # Quotient operation
    quotientColumns = []
    for n, col in enumerate(baseColumnList):
        if fourwayColumns[n][4]==1:
            quotientColumns.append(col)
    if len(quotientColumns) >= 2:
        pairs = itertools.combinations(quotientColumns, 2)
        for pair in pairs:
            col1 = pair[0]
            col2 = pair[1]
            quotientTitle = col1 + '_divide_' + col2
            addedColumnList.append(quotientTitle)
            df[quotientTitle] = df[col1] / df[col2].clip_lower(0.000001) #Clip to avoid division by zero
        print("Added quotients of columns")
        
    return df, addedColumnList

In [81]:
features = ['id_01_count_dist',
 'C6_fq_enc',
 'card5_fq_enc',
 'C1_fq_enc',
 'card5_count_full',
 'C14_fq_enc',
 'C13_fq_enc',
 'id_31_count_dist',
 'card2_fq_enc',
 'card2_count_full',
 'addr1_count_full',
 'card1_count_full',
 'card1_count_full',
 'card2_count_full',
 'card3_count_full',
 'card4_count_full',
 'card5_count_full',
 'card6_count_full',
 'addr1_count_full',
 'addr2_count_full',
 'id_34_count_full',
 'id_36_count_full',
 'id_01_count_dist',
 'id_31_count_dist',
 'id_33_count_dist',
 'id_35_count_dist',
 'id_36_count_dist',
 'card2_fq_enc',
 'card3_fq_enc',
 'card5_fq_enc',
 'C1_fq_enc',
 'C2_fq_enc',
 'C3_fq_enc',
 'C4_fq_enc',
 'C5_fq_enc',
 'C6_fq_enc',
 'C7_fq_enc',
 'C8_fq_enc',
 'C9_fq_enc',
 'C10_fq_enc',
 'C11_fq_enc',
 'C13_fq_enc',
 'C14_fq_enc',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15']


len(features)

72

In [79]:
# len(features)

# for i,fea in enumerate(features):
#     if fea[-3:] == 'new':
#         features[i] = fea[:-3]

# len(features)

# features = [fea for fea in features if fea in data]
# len(features)

97

97

72

In [82]:
fourwayColumns = []
# for fea in c_features:
#for fea in d_features:
for fea in features:
    fourwayColumns.append([fea,1,1,1,1])
fourwayColumns

[['id_01_count_dist', 1, 1, 1, 1],
 ['C6_fq_enc', 1, 1, 1, 1],
 ['card5_fq_enc', 1, 1, 1, 1],
 ['C1_fq_enc', 1, 1, 1, 1],
 ['card5_count_full', 1, 1, 1, 1],
 ['C14_fq_enc', 1, 1, 1, 1],
 ['C13_fq_enc', 1, 1, 1, 1],
 ['id_31_count_dist', 1, 1, 1, 1],
 ['card2_fq_enc', 1, 1, 1, 1],
 ['card2_count_full', 1, 1, 1, 1],
 ['addr1_count_full', 1, 1, 1, 1],
 ['card1_count_full', 1, 1, 1, 1],
 ['card1_count_full', 1, 1, 1, 1],
 ['card2_count_full', 1, 1, 1, 1],
 ['card3_count_full', 1, 1, 1, 1],
 ['card4_count_full', 1, 1, 1, 1],
 ['card5_count_full', 1, 1, 1, 1],
 ['card6_count_full', 1, 1, 1, 1],
 ['addr1_count_full', 1, 1, 1, 1],
 ['addr2_count_full', 1, 1, 1, 1],
 ['id_34_count_full', 1, 1, 1, 1],
 ['id_36_count_full', 1, 1, 1, 1],
 ['id_01_count_dist', 1, 1, 1, 1],
 ['id_31_count_dist', 1, 1, 1, 1],
 ['id_33_count_dist', 1, 1, 1, 1],
 ['id_35_count_dist', 1, 1, 1, 1],
 ['id_36_count_dist', 1, 1, 1, 1],
 ['card2_fq_enc', 1, 1, 1, 1],
 ['card3_fq_enc', 1, 1, 1, 1],
 ['card5_fq_enc', 1, 1, 1, 

In [83]:
data[features].head()

Unnamed: 0,id_01_count_dist,C6_fq_enc,card5_fq_enc,C1_fq_enc,card5_count_full,C14_fq_enc,C13_fq_enc,id_31_count_dist,card2_fq_enc,card2_count_full,addr1_count_full,card1_count_full,card1_count_full.1,card2_count_full.1,card3_count_full,card4_count_full,card5_count_full.1,card6_count_full,addr1_count_full.1,addr2_count_full,id_34_count_full,id_36_count_full,id_01_count_dist.1,id_31_count_dist.1,id_33_count_dist,id_35_count_dist,id_36_count_dist,card2_fq_enc.1,card3_fq_enc,card5_fq_enc.1,C1_fq_enc.1,C2_fq_enc,C3_fq_enc,C4_fq_enc,C5_fq_enc,C6_fq_enc.1,C7_fq_enc,C8_fq_enc,C9_fq_enc,C10_fq_enc,C11_fq_enc,C13_fq_enc.1,C14_fq_enc.1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15
0,446307,631237,309,586742,309,598520,367617,450258,22739,17587,43035,56,56,17587,956845,9524,309,267648,43035,956415,947251,819269,446307,450258,517251,449555,449555,22739,956845,309,586742,581696,1081890,823454,707008,631237,961237,816304,420354,824420,168780,367617,598520,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,5548,515566,11525,245773,534216,899261,998181,947967,947967,3208,2304,963260,911895,919850,357981
1,446307,631237,49491,586742,49491,598520,367617,450258,5593,5593,76902,1338,1338,5593,956845,347386,49491,267648,76902,956415,947251,819269,446307,450258,517251,449555,449555,5593,956845,49491,586742,581696,1081890,823454,707008,631237,961237,816304,341561,824420,731792,367617,598520,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,507854,515566,466020,344864,534216,899261,998181,947967,947967,449929,455805,963260,911895,919850,357981
2,446307,631237,102930,586742,102930,598520,367617,450258,70496,70496,48387,1794,1794,70496,956845,719649,102930,824959,48387,956415,947251,819269,446307,450258,517251,449555,449555,70496,956845,102930,586742,581696,1081890,823454,707008,631237,961237,816304,420354,824420,731792,367617,598520,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,507854,515566,466020,344864,534216,899261,998181,947967,947967,449929,487,963260,911895,919850,683
3,446307,34225,47061,197520,47061,598520,5659,450258,11287,11287,17455,7635,7635,11287,956845,347386,47061,824959,17455,956415,947251,819269,446307,450258,517251,449555,449555,11287,956845,47061,197520,31682,1081890,823454,707008,34225,961237,816304,420354,824420,731792,5659,598520,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,1440,1374,146326,894,139264,899261,998181,947967,947967,1588,455805,963260,911895,919850,994
4,19555,631237,49491,586742,49491,598520,367617,1061,27225,27225,7107,30,30,27225,956845,347386,49491,267648,7107,956415,132185,267353,19555,1061,544,77814,134066,27225,956845,49491,586742,581696,1081890,823454,707008,631237,961237,202326,341561,192574,731792,367617,598520,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,507854,515566,466020,245773,534216,899261,998181,947967,947967,88567,455805,963260,911895,919850,101182


In [84]:
data, addedColumns = process_feed_four_way_ops(data, fourwayColumns)
gc.collect()

id_01_count_dist C6_fq_enc
id_01_count_dist card5_fq_enc
id_01_count_dist C1_fq_enc
id_01_count_dist card5_count_full
id_01_count_dist C14_fq_enc
id_01_count_dist C13_fq_enc
id_01_count_dist id_31_count_dist
id_01_count_dist card2_fq_enc
id_01_count_dist card2_count_full
id_01_count_dist addr1_count_full
id_01_count_dist card1_count_full
id_01_count_dist card1_count_full
id_01_count_dist card2_count_full
id_01_count_dist card3_count_full
id_01_count_dist card4_count_full
id_01_count_dist card5_count_full
id_01_count_dist card6_count_full
id_01_count_dist addr1_count_full
id_01_count_dist addr2_count_full
id_01_count_dist id_34_count_full
id_01_count_dist id_36_count_full
id_01_count_dist id_01_count_dist
id_01_count_dist id_31_count_dist
id_01_count_dist id_33_count_dist
id_01_count_dist id_35_count_dist
id_01_count_dist id_36_count_dist
id_01_count_dist card2_fq_enc
id_01_count_dist card3_fq_enc
id_01_count_dist card5_fq_enc
id_01_count_dist C1_fq_enc
id_01_count_dist C2_fq_enc
id_01_

KeyboardInterrupt: 

In [85]:
gc.collect()

5160

In [86]:
data.head()
del data # note - we cannot put all features together in the same dataframe due to the memory limit

Unnamed: 0,C1,C10,C10_fq_enc,C11,C11_fq_enc,C12,C12_fq_enc,C13,C13_fq_enc,C14,C14_fq_enc,C1_fq_enc,C2,C2_fq_enc,C3,C3_fq_enc,C4,C4_fq_enc,C5,C5_fq_enc,C6,C6_fq_enc,C7,C7_fq_enc,C8,C8_fq_enc,C9,C9_fq_enc,D1,D10,D10_DT_D_min_max,D10_DT_D_std_score,D10_DT_M_min_max,D10_DT_M_std_score,D10_DT_W_min_max,D10_DT_W_std_score,D11,D11_DT_D_min_max,D11_DT_D_std_score,D11_DT_M_min_max,D11_DT_M_std_score,D11_DT_W_min_max,D11_DT_W_std_score,D11__DeviceInfo,D12,D12_DT_D_min_max,D12_DT_D_std_score,D12_DT_M_min_max,D12_DT_M_std_score,D12_DT_W_min_max,D12_DT_W_std_score,D13,D13_DT_D_min_max,D13_DT_D_std_score,D13_DT_M_min_max,D13_DT_M_std_score,D13_DT_W_min_max,D13_DT_W_std_score,D14,D14_DT_D_min_max,D14_DT_D_std_score,D14_DT_M_min_max,D14_DT_M_std_score,D14_DT_W_min_max,D14_DT_W_std_score,D15,D15_DT_D_min_max,D15_DT_D_std_score,D15_DT_M_min_max,D15_DT_M_std_score,D15_DT_W_min_max,D15_DT_W_std_score,D15_to_mean_addr1,D15_to_mean_card1,D15_to_mean_card4,D15_to_std_addr1,D15_to_std_card1,D15_to_std_card4,D1_scaled,D2,D2_scaled,D3,D3_DT_D_min_max,D3_DT_D_std_score,D3_DT_M_min_max,D3_DT_M_std_score,D3_DT_W_min_max,D3_DT_W_std_score,D4,D4_DT_D_min_max,D4_DT_D_std_score,D4_DT_M_min_max,D4_DT_M_std_score,D4_DT_W_min_max,D4_DT_W_std_score,D5,D5_DT_D_min_max,D5_DT_D_std_score,D5_DT_M_min_max,D5_DT_M_std_score,D5_DT_W_min_max,D5_DT_W_std_score,D6,D6_DT_D_min_max,D6_DT_D_std_score,D6_DT_M_min_max,D6_DT_M_std_score,D6_DT_W_min_max,D6_DT_W_std_score,D7,D7_DT_D_min_max,D7_DT_D_std_score,D7_DT_M_min_max,D7_DT_M_std_score,D7_DT_W_min_max,D7_DT_W_std_score,D8,D8_D9_decimal_dist,D8_DT_D_min_max,D8_DT_D_std_score,D8_DT_M_min_max,D8_DT_M_std_score,D8_DT_W_min_max,D8_DT_W_std_score,D8_not_same_day,D9,D9_not_na,DT,DT_D,DT_D_total,DT_M,DT_M_total,DT_W,DT_W_total,DT_day_month,DT_day_week,DT_hour,DeviceInfo,DeviceInfo__P_emaildomain,DeviceInfo_device,DeviceInfo_version,DeviceType,M1,M2,M3,M4,M5,M6,M7,M8,M9,P_emaildomain,P_emaildomain__C2,P_emaildomain_bin,P_emaildomain_suffix,P_isproton,ProductCD,R_emaildomain,R_emaildomain_bin,R_emaildomain_suffix,R_isproton,TransactionAmt,TransactionAmt_DT_D_min_max,TransactionAmt_DT_D_std_score,TransactionAmt_DT_M_min_max,TransactionAmt_DT_M_std_score,TransactionAmt_DT_W_min_max,TransactionAmt_DT_W_std_score,TransactionAmt_check,TransactionAmt_decimal,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,TransactionDT,TransactionID,Transaction_day_of_week,Transaction_hour_of_day,V1,V10,V100,V101,V102,V103,V104,V105,V106,V107,V108,V109,V11,V110,V111,V112,V113,V114,V115,V116,V117,V118,V119,V12,V120,V121,V122,V123,V124,V125,V126,V127,V128,V129,V13,V130,V131,V132,V133,V134,V135,V136,V137,V138,V139,V14,V140,V141,V142,V143,V144,V145,V146,V147,V148,V149,V15,V150,V151,V152,V153,V154,V155,V156,V157,V158,V159,V16,V160,V161,V162,V163,V164,V165,V166,V167,V168,V169,V17,V170,V171,V172,V173,V174,V175,V176,V177,V178,V179,V18,V180,V181,V182,V183,V184,V185,V186,V187,V188,V189,V19,V190,V191,V192,V193,V194,V195,V196,V197,V198,V199,V2,V20,V200,V201,V202,V203,V204,V205,V206,V207,V208,V209,V21,V210,V211,V212,V213,V214,V215,V216,V217,V218,V219,V22,V220,V221,V222,V223,V224,V225,V226,V227,V228,V229,V23,V230,V231,V232,V233,V234,V235,V236,V237,V238,V239,V24,V240,V241,V242,V243,V244,V245,V246,V247,V248,V249,V25,V250,V251,V252,V253,V254,V255,V256,V257,V258,V259,V26,V260,V261,V262,V263,V264,V265,V266,V267,V268,V269,V27,V270,V271,V272,V273,V274,V275,V276,V277,V278,V279,V28,V280,V281,V282,V283,V284,V285,V286,V287,V288,V289,V29,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V3,V30,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V31,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V32,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V33,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339,V34,V35,V36,V37,V38,V39,V4,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V5,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V6,V60,V61,V62,V63,V64,V65,V66,V67,V68,V69,V7,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V8,V80,V81,V82,V83,V84,V85,V86,V87,V88,V89,V9,V90,V91,V92,V93,V94,V95,V96,V97,V98,V99,addr1,addr1__card1,addr1_count_full,addr2,addr2_count_full,bank_type,bank_type_D10_mean,bank_type_D10_std,bank_type_D11_mean,bank_type_D11_std,bank_type_D12_mean,bank_type_D12_std,bank_type_D13_mean,bank_type_D13_std,bank_type_D14_mean,bank_type_D14_std,bank_type_D15_mean,bank_type_D15_std,bank_type_D1_mean,bank_type_D1_std,bank_type_D2_mean,bank_type_D2_std,bank_type_D3_mean,bank_type_D3_std,bank_type_D4_mean,bank_type_D4_std,bank_type_D5_mean,bank_type_D5_std,bank_type_D6_mean,bank_type_D6_std,bank_type_D7_mean,bank_type_D7_std,bank_type_D8_mean,bank_type_D8_std,bank_type_D9_mean,bank_type_D9_std,bank_type_DT_D,bank_type_DT_D_hour_dist,bank_type_DT_D_hour_dist_best,bank_type_DT_M,bank_type_DT_M_month_day_dist,bank_type_DT_M_month_day_dist_best,bank_type_DT_W,bank_type_DT_W_week_day_dist,bank_type_DT_W_week_day_dist_best,bank_type_TransactionAmt_mean,bank_type_TransactionAmt_std,card1,card1_TransactionAmt_mean,card1_TransactionAmt_std,card1__card5,card1_count_full,card1_fq_enc,card2,card2_TransactionAmt_mean,card2_TransactionAmt_std,card2__dist1,card2__id_20,card2_count_full,card2_fq_enc,card3,card3_DT_D_hour_dist,card3_DT_D_hour_dist_best,card3_DT_M_month_day_dist,card3_DT_M_month_day_dist_best,card3_DT_W_week_day_dist,card3_DT_W_week_day_dist_best,card3_TransactionAmt_mean,card3_TransactionAmt_std,card3_count_full,card3_fq_enc,card4,card4_count_full,card5,card5_DT_D_hour_dist,card5_DT_D_hour_dist_best,card5_DT_M_month_day_dist,card5_DT_M_month_day_dist_best,card5_DT_W_week_day_dist,card5_DT_W_week_day_dist_best,card5_TransactionAmt_mean,card5_TransactionAmt_std,card5__P_emaildomain,card5_count_full,card5_fq_enc,card6,card6_count_full,dist1,dist2,id_01,id_01_count_dist,id_02,id_02__D8,id_02__id_20,id_02_to_mean_card1,id_02_to_mean_card4,id_02_to_std_card1,id_02_to_std_card4,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_30_device,id_30_version,id_31,id_31_count_dist,id_31_device,id_32,id_33,id_33_0,id_33_1,id_33_count_dist,id_34,id_34_count_full,id_35,id_35_count_dist,id_36,id_36_count_dist,id_36_count_full,id_37,id_38,isFraud,is_december,is_holiday,lastest_df_browser,nulls1,product_type,product_type_DT_D,product_type_DT_M,product_type_DT_W,uid,uid2,uid2_D10_mean,uid2_D10_std,uid2_D11_mean,uid2_D11_std,uid2_D12_mean,uid2_D12_std,uid2_D13_mean,uid2_D13_std,uid2_D14_mean,uid2_D14_std,uid2_D15_mean,uid2_D15_std,uid2_D1_mean,uid2_D1_std,uid2_D2_mean,uid2_D2_std,uid2_D3_mean,uid2_D3_std,uid2_D4_mean,uid2_D4_std,uid2_D5_mean,uid2_D5_std,uid2_D6_mean,uid2_D6_std,uid2_D7_mean,uid2_D7_std,uid2_D8_mean,uid2_D8_std,uid2_D9_mean,uid2_D9_std,uid2_TransactionAmt_mean,uid2_TransactionAmt_std,uid2_fq_enc,uid3,uid3_D10_mean,uid3_D10_std,uid3_D11_mean,uid3_D11_std,uid3_D12_mean,uid3_D12_std,uid3_D13_mean,uid3_D13_std,uid3_D14_mean,uid3_D14_std,uid3_D15_mean,uid3_D15_std,uid3_D1_mean,uid3_D1_std,uid3_D2_mean,uid3_D2_std,uid3_D3_mean,uid3_D3_std,uid3_D4_mean,uid3_D4_std,uid3_D5_mean,uid3_D5_std,uid3_D6_mean,uid3_D6_std,uid3_D7_mean,uid3_D7_std,uid3_D8_mean,uid3_D8_std,uid3_D9_mean,uid3_D9_std,uid3_TransactionAmt_mean,uid3_TransactionAmt_std,uid3_fq_enc,uid4,uid4_D10_mean,uid4_D10_std,uid4_D11_mean,uid4_D11_std,uid4_D12_mean,uid4_D12_std,uid4_D13_mean,uid4_D13_std,uid4_D14_mean,uid4_D14_std,uid4_D15_mean,uid4_D15_std,uid4_D1_mean,uid4_D1_std,uid4_D2_mean,uid4_D2_std,uid4_D3_mean,uid4_D3_std,uid4_D4_mean,uid4_D4_std,uid4_D5_mean,uid4_D5_std,uid4_D6_mean,uid4_D6_std,uid4_D7_mean,uid4_D7_std,uid4_D8_mean,uid4_D8_std,uid4_D9_mean,uid4_D9_std,uid4_TransactionAmt_mean,uid4_TransactionAmt_std,uid4_fq_enc,uid5,uid5_D10_mean,uid5_D10_std,uid5_D11_mean,uid5_D11_std,uid5_D12_mean,uid5_D12_std,uid5_D13_mean,uid5_D13_std,uid5_D14_mean,uid5_D14_std,uid5_D15_mean,uid5_D15_std,uid5_D1_mean,uid5_D1_std,uid5_D2_mean,uid5_D2_std,uid5_D3_mean,uid5_D3_std,uid5_D4_mean,uid5_D4_std,uid5_D5_mean,uid5_D5_std,uid5_D6_mean,uid5_D6_std,uid5_D7_mean,uid5_D7_std,uid5_D8_mean,uid5_D8_std,uid5_D9_mean,uid5_D9_std,uid5_TransactionAmt_mean,uid5_TransactionAmt_std,uid5_fq_enc,uid_D10_mean,uid_D10_std,uid_D11_mean,uid_D11_std,uid_D12_mean,uid_D12_std,uid_D13_mean,uid_D13_std,uid_D14_mean,uid_D14_std,uid_D15_mean,uid_D15_std,uid_D1_mean,uid_D1_std,uid_D2_mean,uid_D2_std,uid_D3_mean,uid_D3_std,uid_D4_mean,uid_D4_std,uid_D5_mean,uid_D5_std,uid_D6_mean,uid_D6_std,uid_D7_mean,uid_D7_std,uid_D8_mean,uid_D8_std,uid_D9_mean,uid_D9_std,uid_TransactionAmt_mean,uid_TransactionAmt_std,uid_fq_enc,id_01_count_dist_plus_C6_fq_enc,id_01_count_dist_plus_card5_fq_enc,id_01_count_dist_plus_C1_fq_enc,id_01_count_dist_plus_card5_count_full,id_01_count_dist_plus_C14_fq_enc,id_01_count_dist_plus_C13_fq_enc,id_01_count_dist_plus_id_31_count_dist,id_01_count_dist_plus_card2_fq_enc,id_01_count_dist_plus_card2_count_full,id_01_count_dist_plus_addr1_count_full,id_01_count_dist_plus_card1_count_full,id_01_count_dist_plus_card3_count_full,id_01_count_dist_plus_card4_count_full,id_01_count_dist_plus_card6_count_full,id_01_count_dist_plus_addr2_count_full,id_01_count_dist_plus_id_34_count_full,id_01_count_dist_plus_id_36_count_full,id_01_count_dist_plus_id_01_count_dist,id_01_count_dist_plus_id_33_count_dist,id_01_count_dist_plus_id_35_count_dist,id_01_count_dist_plus_id_36_count_dist,id_01_count_dist_plus_card3_fq_enc,id_01_count_dist_plus_C2_fq_enc,id_01_count_dist_plus_C3_fq_enc,id_01_count_dist_plus_C4_fq_enc,id_01_count_dist_plus_C5_fq_enc,id_01_count_dist_plus_C7_fq_enc,id_01_count_dist_plus_C8_fq_enc,id_01_count_dist_plus_C9_fq_enc,id_01_count_dist_plus_C10_fq_enc,id_01_count_dist_plus_C11_fq_enc,id_01_count_dist_plus_C1,id_01_count_dist_plus_C2,id_01_count_dist_plus_C3,id_01_count_dist_plus_C4,id_01_count_dist_plus_C5,id_01_count_dist_plus_C6,id_01_count_dist_plus_C7,id_01_count_dist_plus_C8,id_01_count_dist_plus_C9,id_01_count_dist_plus_C10,id_01_count_dist_plus_C11,id_01_count_dist_plus_C12,id_01_count_dist_plus_C13,id_01_count_dist_plus_C14,id_01_count_dist_plus_D1,id_01_count_dist_plus_D2,id_01_count_dist_plus_D3,id_01_count_dist_plus_D4,id_01_count_dist_plus_D5,id_01_count_dist_plus_D6,id_01_count_dist_plus_D7,id_01_count_dist_plus_D8,id_01_count_dist_plus_D9,id_01_count_dist_plus_D10,id_01_count_dist_plus_D11,id_01_count_dist_plus_D12,id_01_count_dist_plus_D13,id_01_count_dist_plus_D14,id_01_count_dist_plus_D15,C6_fq_enc_plus_card5_fq_enc,C6_fq_enc_plus_C1_fq_enc,C6_fq_enc_plus_card5_count_full,C6_fq_enc_plus_C14_fq_enc,C6_fq_enc_plus_C13_fq_enc,C6_fq_enc_plus_id_31_count_dist,C6_fq_enc_plus_card2_fq_enc,C6_fq_enc_plus_card2_count_full,C6_fq_enc_plus_addr1_count_full,C6_fq_enc_plus_card1_count_full,C6_fq_enc_plus_card3_count_full,C6_fq_enc_plus_card4_count_full,C6_fq_enc_plus_card6_count_full,C6_fq_enc_plus_addr2_count_full,C6_fq_enc_plus_id_34_count_full,C6_fq_enc_plus_id_36_count_full,C6_fq_enc_plus_id_01_count_dist,C6_fq_enc_plus_id_33_count_dist,C6_fq_enc_plus_id_35_count_dist,C6_fq_enc_plus_id_36_count_dist,C6_fq_enc_plus_card3_fq_enc,C6_fq_enc_plus_C2_fq_enc,C6_fq_enc_plus_C3_fq_enc,C6_fq_enc_plus_C4_fq_enc,C6_fq_enc_plus_C5_fq_enc,C6_fq_enc_plus_C6_fq_enc,C6_fq_enc_plus_C7_fq_enc,C6_fq_enc_plus_C8_fq_enc,C6_fq_enc_plus_C9_fq_enc,C6_fq_enc_plus_C10_fq_enc,C6_fq_enc_plus_C11_fq_enc,C6_fq_enc_plus_C1,C6_fq_enc_plus_C2,C6_fq_enc_plus_C3,C6_fq_enc_plus_C4,C6_fq_enc_plus_C5,C6_fq_enc_plus_C6,C6_fq_enc_plus_C7,C6_fq_enc_plus_C8,C6_fq_enc_plus_C9,C6_fq_enc_plus_C10,C6_fq_enc_plus_C11,C6_fq_enc_plus_C12,C6_fq_enc_plus_C13,C6_fq_enc_plus_C14,C6_fq_enc_plus_D1,C6_fq_enc_plus_D2,C6_fq_enc_plus_D3,C6_fq_enc_plus_D4,C6_fq_enc_plus_D5,C6_fq_enc_plus_D6,C6_fq_enc_plus_D7,C6_fq_enc_plus_D8,C6_fq_enc_plus_D9,C6_fq_enc_plus_D10,C6_fq_enc_plus_D11,C6_fq_enc_plus_D12,C6_fq_enc_plus_D13,C6_fq_enc_plus_D14,C6_fq_enc_plus_D15,card5_fq_enc_plus_C1_fq_enc,card5_fq_enc_plus_card5_count_full,card5_fq_enc_plus_C14_fq_enc,card5_fq_enc_plus_C13_fq_enc,card5_fq_enc_plus_id_31_count_dist,card5_fq_enc_plus_card2_fq_enc,card5_fq_enc_plus_card2_count_full,card5_fq_enc_plus_addr1_count_full,card5_fq_enc_plus_card1_count_full,card5_fq_enc_plus_card3_count_full,card5_fq_enc_plus_card4_count_full,card5_fq_enc_plus_card6_count_full,card5_fq_enc_plus_addr2_count_full,card5_fq_enc_plus_id_34_count_full,card5_fq_enc_plus_id_36_count_full,card5_fq_enc_plus_id_01_count_dist,card5_fq_enc_plus_id_33_count_dist,card5_fq_enc_plus_id_35_count_dist,card5_fq_enc_plus_id_36_count_dist,card5_fq_enc_plus_card3_fq_enc,card5_fq_enc_plus_card5_fq_enc,card5_fq_enc_plus_C2_fq_enc,card5_fq_enc_plus_C3_fq_enc,card5_fq_enc_plus_C4_fq_enc,card5_fq_enc_plus_C5_fq_enc,card5_fq_enc_plus_C6_fq_enc,card5_fq_enc_plus_C7_fq_enc,card5_fq_enc_plus_C8_fq_enc,card5_fq_enc_plus_C9_fq_enc,card5_fq_enc_plus_C10_fq_enc,card5_fq_enc_plus_C11_fq_enc,card5_fq_enc_plus_C1,card5_fq_enc_plus_C2,card5_fq_enc_plus_C3,card5_fq_enc_plus_C4
0,1.0,0.0,824420,2.0,168780,0.0,856080,1.0,367617,1.0,598520,586742,1.0,581696,0.0,1081890,0.0,823454,0.0,707008,1.0,631237,0.0,961237,0.0,816304,1.0,420354,5548,3208,0.018705,-0.70985,0.018031,-0.608978,0.018705,-0.689842,2304,0.026804,-0.866757,0.024904,-0.681681,0.026639,-0.894949,42,963260,,,,,,,911895,,,,,,,919850,,,,,,,357981,0.0,-1.052971,0.0,-0.807884,0.0,-1.079525,0.0,0.0,0.0,0.0,0.0,0.0,0.021881,515566,,11525,0.026694,-0.29547,0.019288,-0.261741,0.026694,-0.26841,245773,,,,,,,534216,,,,,,,899261,,,,,,,998181,,,,,,,947967,,0.0,-0.185638,0.0,-0.27198,0.0,-0.188084,0,947967,0,2017-12-01 00:00:00,335,5122,12,137321,48,12093,1,4,0,863508,8523,811091,811091,2,1.0,1.0,1.0,0.113739,0.0,1.0,,,,49,4604,8,6,False,0.020399,49,8,6,False,4.241327,0.020518,-0.307812,0.013642,-0.296515,0.016626,-0.280889,1,500,0.19458,0.257812,0.0,0.0,86400,2987000,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,1.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,1.0,,,,,,,,,,,1.0,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,315.0,29248,43035,87.0,956415,331,69.125,125.3125,100.1875,157.0,,,0.0,,5.714844,15.117188,122.75,183.25,49.46875,96.875,100.625,118.375,26.390625,49.0625,91.6875,146.625,41.90625,77.6875,0.0,0.0,,,170.5,inf,0.595215,0.2854,0.00039,-10.0,-20,0.000269,-14.081081,-30,0.000496,-1.333333,-2,185.22583,322.071136,13926.0,316.574768,351.523865,4648,56,56,,,,30942,9239,17587,22739,150.0,-14.682021,-16.0,-15.135826,-21.0,-0.838393,0.0,147.520355,249.324692,956845,956845,9524.0,9524,142.0,-10.0,-20.0,-14.081081,-30.0,-1.333333,-2.0,185.22583,322.071136,508,309,309,267648.0,267648,19.0,,,446307,,259908,270749,,,,,,,,,,,,,,2,,,3,2,,,,,,,3,,,,2,2,2,949007,811091,811091,820324,450258,811091,,953271,,,517251,4,947251,2,449555,2,449555,819269,2,2,0,1,0,0.0,234,1324,0.002538,0.001085,0.00215,2662,2764,2.5,5.167969,4.667969,7.234375,,,,,,,0.333252,0.516113,2.333984,5.714844,,,6.5,9.195312,0.0,0.0,,,,,,,,,,,257.916656,210.732864,6,12577,6.5,9.195312,13.0,,,,,,,,0.0,0.0,7.0,9.898438,,,13.0,,0.0,,,,,,,,,,,,193.0,176.069595,2,30423,13.0,,13.0,,,,,,,,0.0,,14.0,,,,13.0,,,,,,,,,,,,,,68.5,,1,21429,6.5,9.195312,13.0,,,,,,,,0.0,0.0,7.0,9.898438,,,13.0,,0.0,,,,,,,,,,,,193.0,176.069595,2,2.5,5.167969,4.667969,7.234375,,,,,,,0.333252,0.516113,2.333984,5.714844,,,6.5,9.195312,0.0,0.0,,,,,,,,,,,257.916656,210.732864,6,1077544,446616,1033049,446616,1044827,813924,896565,469046,463894,489342,446363,1403152,455831,713955,1402722,1393558,1265576,892614,963558,895862,895862,1403152,1028003,1528197,1269761,1153315,1407544,1262611,866661,1270727,615087,446308.0,446308.0,446307.0,446307.0,446307.0,446308.0,446307.0,446307.0,446308.0,446307.0,446309.0,446307.0,446308.0,446308.0,451855,961873,457832,692080,980523,1345568,1444488,1394274,1394274,449515,448611,1409567,1358202,1366157,804288,631546,1217979,631546,1229757,998854,1081495,653976,648824,674272,631293,1588082,640761,898885,1587652,1578488,1450506,1077544,1148488,1080792,1080792,1588082,1212933,1713127,1454691,1338245,1262474,1592474,1447541,1051591,1455657,800017,631238.0,631238.0,631237.0,631237.0,631237.0,631238.0,631237.0,631237.0,631238.0,631237.0,631239.0,631237.0,631238.0,631238.0,636785,1146803,642762,877010,1165453,1530498,1629418,1579204,1579204,634445,633541,1594497,1543132,1551087,989218,587051,618,598829,367926,450567,23048,17896,43344,365,957154,9833,267957,956724,947560,819578,446616,517560,449864,449864,957154,618,582005,1082199,823763,707317,631546,961546,816613,420663,824729,169089,310.0,310.0,309.0,309.0
1,1.0,0.0,824420,1.0,731792,0.0,856080,1.0,367617,1.0,598520,586742,1.0,581696,0.0,1081890,0.0,823454,0.0,707008,1.0,631237,0.0,961237,0.0,816304,0.0,341561,507854,449929,0.0,-0.786407,0.0,-0.688037,0.0,-0.767932,455805,,,,,,,3627,963260,,,,,,,911895,,,,,,,919850,,,,,,,357981,0.0,-1.052971,0.0,-0.807884,0.0,-1.079525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,515566,,466020,,,,,,,344864,0.0,-0.918008,0.0,-0.737244,0.0,-0.951856,534216,,,,,,,899261,,,,,,,998181,,,,,,,947967,,0.0,-0.185638,0.0,-0.27198,0.0,-0.188084,0,947967,0,2017-12-01 00:00:01,335,5122,12,137321,48,12093,1,4,0,863508,8507,811091,811091,2,,,,0.036649,1.0,1.0,,,,16,1585,4,0,False,0.020399,49,8,6,False,3.401197,0.00835,-0.466953,0.005742,-0.493464,0.006766,-0.468352,1,0,0.123779,0.219116,0.0,0.114258,86401,2987001,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,325.0,32205,76902,87.0,956415,308,131.875,202.0,144.0,207.75,41.75,119.3125,23.984375,72.75,94.75,163.125,157.875,224.875,67.1875,142.375,156.125,181.125,33.09375,83.875,127.875,207.75,48.25,110.875,132.25,183.25,47.375,136.0,171.125,inf,0.597168,0.289795,0.0328,-14.511905,-18,0.050954,-15.29384,-23,0.035558,-0.897674,0,230.463577,356.551208,2755.0,212.240234,380.296265,10936,1338,1338,404.0,226.811493,369.575531,18042,5273,5593,5593,150.0,-14.682021,-16.0,-15.135826,-21.0,-0.838393,0.0,147.520355,249.324692,956845,956845,347386.0,347386,102.0,-14.068421,-18.0,-15.445075,-21.0,-0.894068,0.0,211.665604,341.981689,43,49491,49491,267648.0,267648,,,,446307,,259908,270749,,,,,,,,,,,,,,2,,,3,2,,,,,,,3,,,,2,2,2,949007,811091,811091,820324,450258,811091,,953271,,,517251,4,947251,2,449555,2,449555,819269,2,2,0,1,0,0.0,230,19686,0.009762,0.01195,0.008931,6229,6455,174.875,236.875,203.375,250.125,0.0,,17.125,50.65625,60.46875,200.75,208.625,253.0,95.8125,178.75,178.875,215.375,23.640625,58.96875,173.75,247.5,40.75,104.125,43.5,155.625,145.625,182.625,204.75,inf,0.61084,0.32251,212.80986,381.502014,1328,29384,107.0,161.875,71.25,126.4375,,,0.0,,0.0,0.0,131.125,188.5,59.3125,101.75,118.625,117.1875,11.5,20.1875,112.3125,201.0,52.375,141.625,0.0,,,,316.0,inf,0.466553,0.355225,239.984528,503.766602,105,73058,98.0625,180.0,134.625,inf,,,0.0,,0.0,,120.4375,218.125,9.953125,25.90625,66.3125,27.6875,49.25,40.03125,95.75,183.75,40.625,106.5625,0.0,,,,124.25,132.125,0.770508,0.029343,512.464294,996.705078,21,51039,108.125,162.375,71.25,126.4375,,,,,,,132.5,189.0,63.59375,104.0625,120.9375,117.125,11.5,20.1875,114.4375,202.25,52.375,141.625,,,,,,,,,244.116074,520.628235,98,174.875,236.875,203.375,250.125,0.0,,17.125,50.65625,60.46875,200.75,208.625,253.0,95.8125,178.75,178.875,215.375,23.640625,58.96875,173.75,247.5,40.75,104.125,43.5,155.625,145.625,182.625,204.75,inf,0.61084,0.32251,212.80986,381.502014,1328,1077544,495798,1033049,495798,1044827,813924,896565,451900,451900,523209,447645,1403152,793693,713955,1402722,1393558,1265576,892614,963558,895862,895862,1403152,1028003,1528197,1269761,1153315,1407544,1262611,787868,1270727,1178099,446308.0,446308.0,446307.0,446307.0,446307.0,446308.0,446307.0,446307.0,446307.0,446307.0,446308.0,446307.0,446308.0,446308.0,954161,961873,912327,791171,980523,1345568,1444488,1394274,1394274,896236,902112,1409567,1358202,1366157,804288,680728,1217979,680728,1229757,998854,1081495,636830,636830,708139,632575,1588082,978623,898885,1587652,1578488,1450506,1077544,1148488,1080792,1080792,1588082,1212933,1713127,1454691,1338245,1262474,1592474,1447541,972798,1455657,1363029,631238.0,631238.0,631237.0,631237.0,631237.0,631238.0,631237.0,631237.0,631237.0,631237.0,631238.0,631237.0,631238.0,631238.0,1139091,1146803,1097257,976101,1165453,1530498,1629418,1579204,1579204,1081166,1087042,1594497,1543132,1551087,989218,636233,98982,648011,417108,499749,55084,55084,126393,50829,1006336,396877,317139,1005906,996742,868760,495798,566742,499046,499046,1006336,98982,631187,1131381,872945,756499,680728,1010728,865795,391052,873911,781283,49492.0,49492.0,49491.0,49491.0
2,1.0,0.0,824420,1.0,731792,0.0,856080,1.0,367617,1.0,598520,586742,1.0,581696,0.0,1081890,0.0,823454,0.0,707008,1.0,631237,0.0,961237,0.0,816304,1.0,420354,507854,449929,0.0,-0.786407,0.0,-0.688037,0.0,-0.767932,487,0.649485,0.97366,0.603448,1.233641,0.645492,0.92502,248,963260,,,,,,,911895,,,,,,,919850,,,,,,,683,0.453237,0.641171,0.433884,0.93751,0.453237,0.625623,1.611328,2.519531,1.865234,1.486328,1.851562,1.541016,0.0,515566,,466020,,,,,,,344864,0.0,-0.918008,0.0,-0.737244,0.0,-0.951856,534216,,,,,,,899261,,,,,,,998181,,,,,,,947967,,0.0,-0.185638,0.0,-0.27198,0.0,-0.188084,0,947967,0,2017-12-01 00:01:09,335,5122,12,137321,48,12093,1,4,0,863508,8527,811091,811091,2,1.0,1.0,1.0,0.036649,0.0,0.0,0.0,0.0,0.0,35,5059,5,0,False,0.020399,49,8,6,False,4.094345,0.017591,-0.346086,0.011742,-0.343883,0.014254,-0.325975,1,0,0.608398,0.443115,0.589355,0.258545,86469,2987002,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,330.0,35560,48387,87.0,956415,343,128.0,191.375,136.25,194.25,9.867188,35.8125,15.335938,59.21875,54.65625,146.25,167.0,216.25,85.8125,143.75,140.0,160.5,27.90625,61.125,142.375,201.875,46.75,98.0,115.5,199.125,34.84375,110.625,123.625,213.25,0.565918,0.30127,0.13647,-16.02432,-21,0.073973,-14.190096,0,0.123294,-0.757881,0,99.52317,141.788864,4663.0,104.87664,130.383972,12999,1794,1794,490.0,136.141693,227.658859,22906,6795,70496,70496,150.0,-14.682021,-16.0,-15.135826,-21.0,-0.838393,0.0,147.520355,249.324692,956845,956845,719649.0,719649,166.0,-16.004274,-21.0,-14.215954,0.0,-0.76016,0.0,98.772415,140.992554,750,102930,102930,824959.0,824959,287.0,,,446307,,259908,270749,,,,,,,,,,,,,,2,,,3,2,,,,,,,3,,,,2,2,2,949007,811091,811091,820324,450258,811091,,953271,,,517251,4,947251,2,449555,2,449555,819269,2,2,0,1,0,0.0,211,57946,0.035923,0.03285,0.036633,7350,7616,107.3125,174.25,98.5625,159.125,,,,,,,143.875,199.5,51.8125,93.0625,88.625,107.125,22.84375,49.125,130.125,188.625,51.40625,106.875,,,,,131.375,113.1875,0.736328,0.023956,104.827538,130.366119,1787,34475,63.09375,139.75,150.25,193.875,,,,,,,145.25,203.125,22.609375,51.96875,86.6875,70.6875,46.5,62.3125,84.1875,157.5,82.0,99.9375,,,,,,,,,65.685463,55.175182,46,85989,98.25,71.75,413.25,71.75,,,,,,,413.25,71.75,97.75,71.25,130.375,35.3125,51.65625,42.65625,98.25,71.75,56.34375,36.84375,,,,,,,,,73.5,17.058722,4,60241,63.09375,139.75,150.25,193.875,,,,,,,145.25,203.125,23.109375,52.46875,86.6875,70.6875,46.5,62.3125,84.1875,157.5,82.0,99.9375,,,,,,,,,65.47847,55.780586,45,107.3125,174.25,98.5625,159.125,,,,,,,143.875,199.5,51.8125,93.0625,88.625,107.125,22.84375,49.125,130.125,188.625,51.40625,106.875,,,,,131.375,113.1875,0.736328,0.023956,104.827538,130.366119,1787,1077544,549237,1033049,549237,1044827,813924,896565,516803,516803,494694,448101,1403152,1165956,1271266,1402722,1393558,1265576,892614,963558,895862,895862,1403152,1028003,1528197,1269761,1153315,1407544,1262611,866661,1270727,1178099,446308.0,446308.0,446307.0,446307.0,446307.0,446308.0,446307.0,446307.0,446308.0,446307.0,446308.0,446307.0,446308.0,446308.0,954161,961873,912327,791171,980523,1345568,1444488,1394274,1394274,896236,446794,1409567,1358202,1366157,446990,734167,1217979,734167,1229757,998854,1081495,701733,701733,679624,633031,1588082,1350886,1456196,1587652,1578488,1450506,1077544,1148488,1080792,1080792,1588082,1212933,1713127,1454691,1338245,1262474,1592474,1447541,1051591,1455657,1363029,631238.0,631238.0,631237.0,631237.0,631237.0,631238.0,631237.0,631237.0,631238.0,631237.0,631238.0,631237.0,631238.0,631238.0,1139091,1146803,1097257,976101,1165453,1530498,1629418,1579204,1579204,1081166,631724,1594497,1543132,1551087,631920,689672,205860,701450,470547,553188,173426,173426,151317,104724,1059775,822579,927889,1059345,1050181,922199,549237,620181,552485,552485,1059775,205860,684626,1184820,926384,809938,734167,1064167,919234,523284,927350,834722,102931.0,102931.0,102930.0,102930.0
3,2.0,0.0,824420,1.0,731792,0.0,856080,25.0,5659,1.0,598520,197520,5.0,31682,0.0,1081890,0.0,823454,0.0,707008,4.0,34225,0.0,961237,0.0,816304,1.0,420354,1440,1588,0.120863,-0.291731,0.116505,-0.177196,0.120863,-0.263349,455805,,,,,,,3627,963260,,,,,,,911895,,,,,,,919850,,,,,,,994,0.159712,-0.455988,0.152893,-0.192841,0.159712,-0.478663,0.686035,0.550293,0.720215,0.57666,0.520508,0.558594,0.175049,1374,0.175049,146326,0.0,-0.542432,0.0,-0.468147,0.0,-0.503406,894,0.171846,-0.396551,0.129477,-0.187538,0.143075,-0.427985,139264,0.0,-0.539373,0.0,-0.496167,0.0,-0.527816,899261,,,,,,,998181,,,,,,,947967,,0.0,-0.185638,0.0,-0.27198,0.0,-0.188084,0,947967,0,2017-12-01 00:01:39,335,5122,12,137321,48,12093,1,4,0,863508,8544,811091,811091,2,,,,0.036649,1.0,0.0,,,,55,6410,9,0,False,0.020399,49,8,6,False,3.931826,0.014819,-0.382346,0.009942,-0.388757,0.012008,-0.368688,1,0,0.405029,0.377686,0.259521,0.196899,86499,2987003,0.0,0.0,,,4.0,1.0,38.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,1.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,,,0.0,,,,,,,,,,1.0,0.0,28.0,0.0,0.0,0.0,0.0,10.0,0.0,4.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,38.0,24.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,0.0,,,,,,,,,0.0,,,,,,,,,,,0.0,1.0,1.0,1.0,1.0,0.0,,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,48.0,28.0,0.0,10.0,476.0,47566,17455,87.0,956415,313,184.375,229.625,197.625,230.375,38.75,132.0,28.890625,87.0,36.59375,131.125,231.375,251.375,130.625,178.125,190.375,184.875,27.796875,62.34375,191.25,235.75,43.8125,95.4375,125.6875,234.0,60.15625,157.625,183.625,251.625,0.579102,0.304688,0.047442,-15.555556,-17,0.03537,-14.703521,0,0.044323,-0.783582,0,124.382835,191.876907,18132.0,120.957214,196.450729,9636,7635,7635,567.0,133.447342,222.226135,29266,8638,11287,11287,150.0,-14.682021,-16.0,-15.135826,-21.0,-0.838393,0.0,147.520355,249.324692,956845,956845,347386.0,347386,117.0,-15.555556,-17.0,-14.703521,0.0,-0.783582,0.0,124.38945,191.875519,170,47061,47061,824959.0,824959,,,,446307,,259908,270749,,,,,,,,,,,,,,2,,,3,2,,,,,,,3,,,,2,2,2,949007,811091,811091,820324,450258,811091,,953271,,,517251,4,947251,2,449555,2,449555,819269,2,2,0,1,0,0.0,227,3070,0.032019,0.008666,0.033325,5503,5700,180.875,227.375,203.0,237.375,1.666992,4.082031,89.3125,114.6875,25.0,100.5625,235.25,255.25,150.375,196.25,214.5,202.0,28.4375,66.4375,197.5,240.5,44.78125,99.0,199.375,246.625,22.140625,109.125,169.875,190.25,0.562988,0.314209,120.965797,196.710419,7602,25581,125.9375,147.875,216.375,215.5,,,,,0.0,,208.875,196.875,133.125,133.5,163.125,128.875,10.632812,36.375,194.125,194.375,18.90625,53.8125,,,,,135.5,190.5,0.78125,0.157349,113.090599,211.77887,362,62691,115.5,76.8125,206.5,209.125,,,,,,,170.75,134.875,117.6875,86.5,127.6875,76.6875,6.851562,40.5,144.0,133.75,10.117188,49.15625,,,,,133.75,,0.75,,74.112679,57.115608,142,44042,125.9375,147.875,216.375,215.5,,,,,,,208.875,196.875,136.125,133.375,163.125,128.875,10.632812,36.375,194.125,194.375,18.90625,53.8125,,,,,,,,,113.697166,213.744934,354,180.875,227.375,203.0,237.375,1.666992,4.082031,89.3125,114.6875,25.0,100.5625,235.25,255.25,150.375,196.25,214.5,202.0,28.4375,66.4375,197.5,240.5,44.78125,99.0,199.375,246.625,22.140625,109.125,169.875,190.25,0.562988,0.314209,120.965797,196.710419,7602,480532,493368,643827,493368,1044827,451966,896565,457594,457594,463762,453942,1403152,793693,1271266,1402722,1393558,1265576,892614,963558,895862,895862,1403152,477989,1528197,1269761,1153315,1407544,1262611,866661,1270727,1178099,446309.0,446312.0,446307.0,446307.0,446307.0,446311.0,446307.0,446307.0,446308.0,446307.0,446308.0,446307.0,446332.0,446308.0,447747,447681,592633,447201,585571,1345568,1444488,1394274,1394274,447895,902112,1409567,1358202,1366157,447301,81286,231745,81286,632745,39884,484483,45512,45512,51680,41860,991070,381611,859184,990640,981476,853494,480532,551476,483780,483780,991070,65907,1116115,857679,741233,68450,995462,850529,454579,858645,766017,34227.0,34230.0,34225.0,34225.0,34225.0,34229.0,34225.0,34225.0,34226.0,34225.0,34226.0,34225.0,34250.0,34226.0,35665,35599,180551,35119,173489,933486,1032406,982192,982192,35813,490030,997485,946120,954075,35219,244581,94122,645581,52720,497319,58348,58348,64516,54696,1003906,394447,872020,1003476,994312,866330,493368,564312,496616,496616,1003906,94122,78743,1128951,870515,754069,81286,1008298,863365,467415,871481,778853,47063.0,47066.0,47061.0,47061.0
4,1.0,1.0,192574,1.0,731792,0.0,856080,1.0,367617,1.0,598520,586742,1.0,581696,0.0,1081890,0.0,823454,0.0,707008,1.0,631237,0.0,961237,1.0,202326,0.0,341561,507854,88567,,,,,,,455805,,,,,,,2452,963260,,,,,,,911895,,,,,,,919850,,,,,,,101182,,,,,,,,,,,,,0.0,515566,,466020,,,,,,,245773,,,,,,,534216,,,,,,,899261,,,,,,,998181,,,,,,,947967,,0.0,-0.185638,0.0,-0.27198,0.0,-0.188084,0,947967,0,2017-12-01 00:01:46,335,5122,12,137321,48,12093,1,4,0,675,4375,358,9,1,,,,,,,,,,16,1585,4,0,False,0.047662,49,8,6,False,3.931826,0.014819,-0.382346,0.009942,-0.388757,0.012008,-0.368688,1,0,0.515625,0.377686,0.882812,0.196899,86506,2987004,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,6.0,18.0,140.0,0.0,0.0,0.0,0.0,,1803.0,49.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,15560.0,,169690.796875,0.0,0.0,0.0,515.0,5155.0,2840.0,0.0,0.0,0.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,420.0,40560,7107,87.0,956415,308,131.875,202.0,144.0,207.75,41.75,119.3125,23.984375,72.75,94.75,163.125,157.875,224.875,67.1875,142.375,156.125,181.125,33.09375,83.875,127.875,207.75,48.25,110.875,132.25,183.25,47.375,136.0,171.125,inf,0.597168,0.289795,0.0328,-14.511905,-18,0.050954,-15.29384,-23,0.035558,-0.897674,0,230.463577,356.551208,4497.0,99.813019,69.834427,12822,30,30,514.0,221.781326,370.584381,25382,7169,27225,27225,150.0,-14.682021,-16.0,-15.135826,-21.0,-0.838393,0.0,147.520355,249.324692,956845,956845,347386.0,347386,102.0,-14.068421,-18.0,-15.445075,-21.0,-0.894068,0.0,211.665604,341.981689,43,49491,49491,267648.0,267648,,,0.0,19555,70787.0,221143,229802,0.764648,0.373291,1.75293,0.425781,,,,,,,,,100.0,1,,-480.0,1,1,166.0,,542.0,144.0,,,3,,,,2,1,1,3573,11783,3573,1067,1061,4294,32.0,1430,2220.0,1080.0,544,3,132185,1,77814,0,134066,267353,1,1,0,1,0,0.0,137,18496,0.018743,0.04724,0.029604,7266,7527,232.25,inf,204.375,232.25,,,0.0,0.0,0.0,0.0,222.0,inf,95.5625,179.0,204.75,217.625,53.40625,115.875,188.75,233.875,57.0625,102.625,92.75,185.5,371.0,,205.75,216.625,0.625,0.310547,99.813019,69.834427,30,34053,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,50.0,,1,85074,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,50.0,,1,59585,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,50.0,,1,232.25,inf,204.375,232.25,,,0.0,0.0,0.0,0.0,222.0,inf,95.5625,179.0,204.75,217.625,53.40625,115.875,188.75,233.875,57.0625,102.625,92.75,185.5,371.0,,205.75,216.625,0.625,0.310547,99.813019,69.834427,30,650792,69046,606297,69046,618075,387172,20616,46780,46780,26662,19585,976400,366941,287203,975970,151740,286908,39110,20099,97369,153621,976400,601251,1101445,843009,726563,980792,221881,361116,212129,751347,19556.0,19556.0,19555.0,19555.0,19555.0,19556.0,19555.0,19556.0,19555.0,19556.0,19556.0,19555.0,19556.0,19556.0,527409,535121,485575,265328,553771,918816,1017736,967522,967522,108122,475360,982815,931450,939405,120737,680728,1217979,680728,1229757,998854,632298,658462,658462,638344,631267,1588082,978623,898885,1587652,763422,898590,650792,631781,709051,765303,1588082,1212933,1713127,1454691,1338245,1262474,1592474,833563,972798,823811,1363029,631238.0,631238.0,631237.0,631237.0,631237.0,631238.0,631237.0,631238.0,631237.0,631238.0,631238.0,631237.0,631238.0,631238.0,1139091,1146803,1097257,877010,1165453,1530498,1629418,1579204,1579204,719804,1087042,1594497,1543132,1551087,732419,636233,98982,648011,417108,50552,76716,76716,56598,49521,1006336,396877,317139,1005906,181676,316844,69046,50035,127305,183557,1006336,98982,631187,1131381,872945,756499,680728,1010728,251817,391052,242065,781283,49492.0,49492.0,49491.0,49491.0


### 1.2 Add target mean features

In [87]:
data = deepcopy(data_backup)

In [88]:
train_df = data[:590540]
test_df= data[590540:]

In [92]:
gc.collect()

8

In [95]:
n_values = 50

In [96]:
# decrease the cardinality of all feature down to 50 in order to do a proper target mean 

categorical_features = []
for fea in data:
    if data[fea].nunique() > 50:
        categorical_features.append(fea)

categories = []
for column in categorical_features:
    print(column)
    categories.append(list(train_df[column].value_counts().iloc[: n_values - 1].index) + ['Other'])
    values2use = categories[-1]
    train_df[column] = train_df[column].apply(lambda x: x if x in values2use else 'Other')
    test_df[column] = test_df[column].apply(lambda x: x if x in values2use else 'Other')

C1
C10
C10_fq_enc
C11
C11_fq_enc
C12
C12_fq_enc
C13
C13_fq_enc
C14
C14_fq_enc
C1_fq_enc
C2
C2_fq_enc
C4
C4_fq_enc
C5
C5_fq_enc
C6
C6_fq_enc
C7
C7_fq_enc
C8
C8_fq_enc
C9
C9_fq_enc
D1
D10
D10_DT_D_min_max
D10_DT_D_std_score
D10_DT_M_min_max
D10_DT_M_std_score
D10_DT_W_min_max
D10_DT_W_std_score
D11
D11_DT_D_min_max
D11_DT_D_std_score
D11_DT_M_min_max
D11_DT_M_std_score
D11_DT_W_min_max
D11_DT_W_std_score
D11__DeviceInfo
D12
D12_DT_D_min_max
D12_DT_D_std_score
D12_DT_M_min_max
D12_DT_M_std_score
D12_DT_W_min_max
D12_DT_W_std_score
D13
D13_DT_D_min_max
D13_DT_D_std_score
D13_DT_M_min_max
D13_DT_M_std_score
D13_DT_W_min_max
D13_DT_W_std_score
D14
D14_DT_D_min_max
D14_DT_D_std_score
D14_DT_M_min_max
D14_DT_M_std_score
D14_DT_W_min_max
D14_DT_W_std_score
D15
D15_DT_D_min_max
D15_DT_D_std_score
D15_DT_M_min_max
D15_DT_M_std_score
D15_DT_W_min_max
D15_DT_W_std_score
D15_to_mean_addr1
D15_to_mean_card1
D15_to_mean_card4
D15_to_std_addr1
D15_to_std_card1
D15_to_std_card4
D1_scaled
D2
D2_scaled
D3

In [97]:
categorical_features = list(data)

In [98]:
# add the target mean 

for col in categorical_features:
    print(col)
    temp_dict = train_df.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
                                                        columns={'mean': col+'_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col+'_target_mean'].to_dict()

    train_df[col+'_target_mean'] = train_df[col].map(temp_dict)
    test_df[col+'_target_mean']  = test_df[col].map(temp_dict)

C1
C10
C10_fq_enc
C11
C11_fq_enc
C12
C12_fq_enc
C13
C13_fq_enc
C14


KeyboardInterrupt: 

In [99]:
gc.collect()
del data 

8

### 1.3. Add df[A] - df.groupby(B)[A].transform('mean') features

In [100]:
def add_GroupBy_mean(df, by,  window = 10, target = 'TransactionAmt'):
    for fea_by in by:
        print('Processing', fea_by)
        #df[fea_by + target +'minus_mean_window'] =  df[target] - df.groupby(fea_by)[target].transform(lambda x: x.rolling(window, 1).mean())
        df[fea_by + target +'minus_mean_all'] =   df[target] - df.groupby(fea_by)[target].transform('mean')
    
    return df

In [102]:
data = deepcopy(data_backup)

START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

data['TransactionDT'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

data['Date'] = data['TransactionDT'].apply(lambda x: str(x)[:10])

data['Date+Hour'] = data['TransactionDT'].apply(lambda x: str(x)[:13])

data['dayofwweek'] = data['TransactionDT'].dt.dayofweek
data['isweekend'] = ( data['TransactionDT'].dt.dayofweek >= 5 )
data['hour'] = data['TransactionDT'].dt.hour

##### 1.3.1 Add non time-depend interactions

In [103]:
gc.collect()

62

In [104]:
cat_list = []
for fea in data:
    if data[fea].nunique() <= 100:
        cat_list.append(fea)  
        
for fea in cat_list:
    print(fea)
    data[fea] = data[fea].astype(str)

C10_fq_enc
C12_fq_enc
C3
C3_fq_enc
C4_fq_enc
C7_fq_enc
C8_fq_enc
D8_D9_decimal_dist
D8_not_same_day
D9
D9_not_na
DT_M
DT_M_total
DT_W
DT_W_total
DT_day_month
DT_day_week
DT_hour
DeviceType
M1
M2
M3
M4
M5
M6
M7
M8
M9
P_emaildomain
P_emaildomain_bin
P_emaildomain_suffix
P_isproton
ProductCD
R_emaildomain
R_emaildomain_bin
R_emaildomain_suffix
R_isproton
TransactionAmt_check
Transaction_day_of_week
Transaction_hour_of_day
V1
V10
V100
V104
V105
V106
V107
V108
V109
V11
V110
V111
V112
V113
V114
V115
V116
V117
V118
V119
V12
V120
V121
V122
V123
V124
V125
V13
V138
V139
V14
V140
V141
V142
V144
V146
V147
V148
V149
V15
V151
V152
V153
V154
V155
V156
V157
V158
V16
V169
V17
V170
V171
V172
V173
V174
V175
V18
V181
V184
V185
V186
V188
V189
V19
V191
V192
V193
V194
V195
V196
V197
V198
V2
V20
V200
V201
V21
V22
V220
V221
V223
V224
V225
V227
V23
V235
V236
V237
V238
V239
V24
V240
V241
V242
V243
V244
V245
V247
V248
V249
V25
V250
V251
V252
V254
V255
V256
V259
V26
V260
V261
V262
V27
V28
V281
V282
V283
V284
V285


In [105]:
num_list = ['TransactionAmt', 'D15']

In [106]:
for target in num_list:
    print(target)
    data = add_GroupBy_mean(data, by = cat_list,  window = 100, target = target)

TransactionAmt
Processing C10_fq_enc
Processing C12_fq_enc
Processing C3
Processing C3_fq_enc
Processing C4_fq_enc
Processing C7_fq_enc
Processing C8_fq_enc
Processing D8_D9_decimal_dist
Processing D8_not_same_day
Processing D9
Processing D9_not_na
Processing DT_M
Processing DT_M_total
Processing DT_W
Processing DT_W_total
Processing DT_day_month
Processing DT_day_week
Processing DT_hour
Processing DeviceType
Processing M1
Processing M2
Processing M3
Processing M4
Processing M5
Processing M6
Processing M7
Processing M8
Processing M9
Processing P_emaildomain
Processing P_emaildomain_bin
Processing P_emaildomain_suffix
Processing P_isproton
Processing ProductCD
Processing R_emaildomain
Processing R_emaildomain_bin
Processing R_emaildomain_suffix
Processing R_isproton
Processing TransactionAmt_check
Processing Transaction_day_of_week
Processing Transaction_hour_of_day
Processing V1
Processing V10
Processing V100
Processing V104
Processing V105
Processing V106
Processing V107
Processing V1

KeyboardInterrupt: 

##### 1.3.2 Add time-depend interactions

In [121]:
num_list  =['TransactionAmt',
 'dist1',
 'dist2',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'id_02',
 'id_05',
 'id_06',
 'nulls1',
 'V307',
 'V310',
 'V313',
 'C5_fq_enc',
 'P_emaildomain_bin',
 'card5_count_full',
 'card5_TransactionAmt_std',
 'V315',
 'id_02_to_mean_card1',
 'C11_fq_enc',
 'C9_fq_enc',
 'D15_to_mean_addr1',
 'D15_to_mean_card4',
 'card5_TransactionAmt_mean',
 'C2_fq_enc',
 'D15_to_std_card1',
 'C14_fq_enc',
 'C6_fq_enc',
 'id_31_count_dist',
 'Transaction_day_of_week',
 'C1_fq_enc',
 'uid_TransactionAmt_std',
 'D15_to_mean_card1',
 'TransactionAmt_decimal',
 'card2_TransactionAmt_std',
 'TransactionAmt_to_mean_card4',
 'uid_TransactionAmt_mean',
 'C13_fq_enc',
 'Transaction_hour_of_day',
 'TransactionAmt_to_std_card1',
 'TransactionAmt_to_std_card4',
 'card1_TransactionAmt_std',
 'card2_TransactionAmt_mean',
 'V13TransactionAmtminus_mean_all',
 'TransactionAmt_to_mean_card1',
 'card1_TransactionAmt_mean',
 'uid2_TransactionAmt_std',
 'card2_fq_enc',
 'uid2_TransactionAmt_mean',
 'card1_count_full']

In [122]:
cat_list = ['Date', 'Date+Hour', 'dayofwweek', 'isweekend', 'hour']

In [123]:
for fea in num_list:
    print(fea)
    data[fea] = data[fea].astype('float')

TransactionAmt
dist1
dist2
C1
C2
C3
C4
C5
C6
C7
C8
C9
C10
C11
C12
C13
C14
D1
D2
D3
D4
D5
D6
D7
D8
D9
D10
D11
D12
D13
D14
D15
id_02
id_05
id_06
nulls1
V307
V310
V313
C5_fq_enc
P_emaildomain_bin
card5_count_full
card5_TransactionAmt_std
V315
id_02_to_mean_card1
C11_fq_enc
C9_fq_enc
D15_to_mean_addr1
D15_to_mean_card4
card5_TransactionAmt_mean
C2_fq_enc
D15_to_std_card1
C14_fq_enc
C6_fq_enc
id_31_count_dist
Transaction_day_of_week
C1_fq_enc
uid_TransactionAmt_std
D15_to_mean_card1
TransactionAmt_decimal
card2_TransactionAmt_std
TransactionAmt_to_mean_card4
uid_TransactionAmt_mean
C13_fq_enc
Transaction_hour_of_day
TransactionAmt_to_std_card1
TransactionAmt_to_std_card4
card1_TransactionAmt_std
card2_TransactionAmt_mean
V13TransactionAmtminus_mean_all
TransactionAmt_to_mean_card1
card1_TransactionAmt_mean
uid2_TransactionAmt_std
card2_fq_enc
uid2_TransactionAmt_mean
card1_count_full


In [124]:
gc.collect()

973

In [125]:
for target in num_list:
    print(target)
    data = add_GroupBy_mean(data, by = cat_list,  window = 100, target = target)

TransactionAmt
Processing Date
Processing Date+Hour
Processing dayofwweek
Processing isweekend
Processing hour
dist1
Processing Date
Processing Date+Hour
Processing dayofwweek
Processing isweekend
Processing hour
dist2
Processing Date
Processing Date+Hour
Processing dayofwweek
Processing isweekend


KeyboardInterrupt: 

### Note: the above features engineering can give you LB 9506

# 2. All feature engineering for CV 9437 solution

- features used: 373 features selected + 5 features from team member AutoML
- 5 other features : 'D1_reverse','gap1_reverse','user_def_2_dist1_std', 'gap2','user_def_4_act_rank_reverse'

# 3 All feature for CV 9445 solution

### 3.1 Add some time-depend pecentage statistic

In [126]:
def add_GroupBy_percentage(df, by,  window = 10, target = 'TransactionAmt'):
    for fea_by in by:
        print('Processing', fea_by)
        #df[fea_by + target +'minus_mean_window'] =  df[target] - df.groupby(fea_by)[target].transform(lambda x: x.rolling(window, 1).mean())

        #df[fea_by + target +'minus_mean_all'] =   df[target] - df.groupby(fea_by)[target].transform('mean')
        df[fea_by + target +'_percentage_all'] =   data.groupby([fea_by,target])[target].transform('count')  / data.groupby(fea_by)[target].transform('count')
    
    return df

In [127]:
data = deepcopy(data_backup)

START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

data['TransactionDT'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

data['Date'] = data['TransactionDT'].apply(lambda x: str(x)[:10])

data['Date+Hour'] = data['TransactionDT'].apply(lambda x: str(x)[:13])

data['dayofwweek'] = data['TransactionDT'].dt.dayofweek
data['isweekend'] = ( data['TransactionDT'].dt.dayofweek >= 5 )
data['hour'] = data['TransactionDT'].dt.hour

In [128]:
cat_list = ['Date', 'Date+Hour', 'dayofwweek', 'isweekend', 'hour']

In [None]:
train_df = data[:590540]
test_df= data[590540:]

# decrease the cardinality of all feature down to 50
categorical_features = []
for fea in data:
    if data[fea].nunique() > 50:
        categorical_features.append(fea)
        
for column in categorical_features:
    categories.append(list(train_df[column].value_counts().iloc[: n_values - 1].index) + ['Other'])
    values2use = categories[-1]
    train_df[column] = train_df[column].apply(lambda x: x if x in values2use else 'Other')
    test_df[column] = test_df[column].apply(lambda x: x if x in values2use else 'Other')

In [None]:
num_list = list(data)

In [None]:
for target in num_list:
    print(target)
    data = add_GroupBy_mean(data, by = cat_list,  window = 100, target = target)

# 5 Apply PCA to V features

In [130]:
train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
sample_submission = pd.read_csv('sample_submission.csv', index_col='TransactionID')

# merge 
train_df = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test_df = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print("Train shape : "+str(train_df.shape))
print("Test shape  : "+str(test_df.shape))

Train shape : (590540, 433)
Test shape  : (506691, 432)


In [131]:
data = pd.concat([train_df, test_df])

In [132]:
features1 = ['V'+str(i) for i in range(1,12)]
features2 = ['V'+str(i) for i in range(12,35)]
features3 = ['V'+str(i) for i in range(35,53)]
features4 = ['V'+str(i) for i in range(53,75)]
features5 = ['V'+str(i) for i in range(75,95)]
features6 = ['V'+str(i) for i in range(95,138)]
features7 = ['V'+str(i) for i in range(138,167)]
features8 = ['V'+str(i) for i in range(167,217)]
features9 = ['V'+str(i) for i in range(217,279)]
features10 = ['V'+str(i) for i in range(279,322)]
features11 = ['V'+str(i) for i in range(322,340)]

In [133]:
group_list = [features1,features2,features3,features4,features5,features6,
              features7,features8,features9,features10,features11]

In [135]:
for i,group in enumerate(group_list):
    print(i)
    data['PCA_V_Group_'+ str(i)] = np.nan

0
1
2
3
4
5
6
7
8
9
10


In [136]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1,  whiten=True)

for i,group in enumerate(group_list):
    print(i)
    data[group][data[group].isnull().sum(axis=1) == 0].shape
    tmp = pca.fit_transform(data[group][data[group].isnull().sum(axis=1) == 0])
    tmp.shape
    data['PCA_V_Group_'+ str(i)][data[group].isnull().sum(axis=1) == 0] = tmp

0


(641426, 11)

(641426, 1)

1


(1008569, 23)

(1008569, 1)

2


(851408, 18)

(851408, 1)

3


(1007236, 22)

(1007236, 1)

4


(995986, 20)

(995986, 1)

5


(1096917, 43)

(1096917, 1)

6


(157730, 29)

(157730, 1)

7


(275920, 50)

(275920, 1)

8


(253988, 62)

(253988, 1)

9


(1089920, 43)

(1089920, 1)

10


(158782, 18)

(158782, 1)