# Prepare Data

In [None]:
!gdown https://drive.google.com/uc?id=1x6xmQifUHQWZi7nDJs53do1G4q5aEnWI&export=download

Downloading...
From: https://drive.google.com/uc?id=1x6xmQifUHQWZi7nDJs53do1G4q5aEnWI
To: /content/ieee-fraud-detection.zip
100% 124M/124M [00:01<00:00, 121MB/s] 


In [None]:
!unzip ieee-fraud-detection.zip

Archive:  ieee-fraud-detection.zip
  inflating: sample_submission.csv   
  inflating: test_identity.csv       
  inflating: test_transaction.csv    
  inflating: train_identity.csv      
  inflating: train_transaction.csv   


In [None]:
# General imports
import numpy as np
import pandas as pd
import os, warnings, datetime, math

from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [None]:
########################### Helpers
#################################################################################
## -------------------
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Only columns whose dtype is listed in ``numerics`` are touched. An integer
    column gets the first of int8/int16/int32/int64 whose limits lie strictly
    outside the column's min and max; a float column gets float16 or float32
    under the same test and float64 otherwise (which is also what happens when
    min/max are NaN, because the comparisons are then False).

    :df pandas dataframe to reduce size             # type: pd.DataFrame()
    :verbose print the resulting size and saving    # type: bool
    :return the same (mutated) dataframe object     # type: pd.DataFrame()
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; leave the column alone if none fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # No narrower float type qualified.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df
## -------------------

In [None]:
########################### Vars
#################################################################################
# Base timestamp: the TransactionDT offsets (in seconds) are added to it to
# build the DT column.
START_DATE = datetime.datetime(2017, 11, 30)

In [None]:
########################### DATA LOAD
#################################################################################
print('Load Data')

# Transaction tables (train first, then test).
train_df, test_df = (
    pd.read_csv(path) for path in ('train_transaction.csv', 'test_transaction.csv')
)
# Placeholder target so the test table has an isFraud column as well.
test_df['isFraud'] = 0

# Identity tables (train first, then test).
train_identity, test_identity = (
    pd.read_csv(path) for path in ('train_identity.csv', 'test_identity.csv')
)

Load Data




# Data Pre-Processing

In [None]:
import re
# Rewrite every test_identity column whose name contains 'id' as
# 'id_' + <its two-digit number>, the form minify_identity_df indexes
# (e.g. 'id_12').
# NOTE(review): presumably test_identity.csv ships these columns as 'id-XX' -- confirm.
# The list is rebuilt in one pass instead of being patched in place through
# names.index(name), which rescans the list for every column and updates the
# wrong slot when a rewritten name equals another entry.
names = [
    'id_' + ' '.join(re.findall(r'\d{2}', name)) if re.search(r'id', name) else name
    for name in test_identity.columns.tolist()
]
test_identity.columns = names


In [None]:
########################### Base check
#################################################################################

# reduce_mem_usage downcasts the frame in place (and returns the same object),
# so a copy is kept to detect columns whose values changed.
for df in [train_df, test_df, train_identity, test_identity]:
    original = df.copy()
    df = reduce_mem_usage(df)

    for col in list(df):
        if df[col].dtype!='O':
            # Sum the absolute differences: in a signed sum, positive and
            # negative rounding errors can cancel and hide a lossy downcast.
            if (df[col]-original[col]).abs().sum()!=0:
                # Restore the original values (and dtype) of the damaged column.
                df[col] = original[col]
                print('Bad transformation', col)

Mem. usage decreased to 542.35 Mb (69.4% reduction)
Bad transformation TransactionAmt
Bad transformation dist1
Bad transformation dist2
Bad transformation C1
Bad transformation C2
Bad transformation C4
Bad transformation C6
Bad transformation C7
Bad transformation C8
Bad transformation C10
Bad transformation C11
Bad transformation C12
Bad transformation C13
Bad transformation D8
Bad transformation D9
Bad transformation V126
Bad transformation V127
Bad transformation V128
Bad transformation V129
Bad transformation V130
Bad transformation V131
Bad transformation V132
Bad transformation V133
Bad transformation V134
Bad transformation V135
Bad transformation V136
Bad transformation V137
Bad transformation V150
Bad transformation V159
Bad transformation V164
Bad transformation V202
Bad transformation V203
Bad transformation V204
Bad transformation V205
Bad transformation V206
Bad transformation V207
Bad transformation V208
Bad transformation V209
Bad transformation V210
Bad transformation V

In [None]:
########################### TransactionDT
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

for df in [train_df, test_df]:

    # Temporary variables for aggregation
    # TransactionDT is treated as an offset in seconds from START_DATE;
    # built in one vectorised call instead of a per-row apply.
    df['DT'] = pd.Timestamp(START_DATE) + pd.to_timedelta(df['TransactionDT'], unit='s')
    dt = df['DT'].dt

    # .dt.weekofyear is gone from recent pandas; use isocalendar() when the
    # installed version has it and fall back to the old attribute otherwise.
    if hasattr(dt, 'isocalendar'):
        week_of_year = dt.isocalendar().week.astype(np.int64)
    else:
        week_of_year = dt.weekofyear

    # Month / week / day counters running on from the start of 2017.
    # NOTE(review): ISO weeks around New Year can belong to the neighbouring
    # ISO year, so DT_W is not strictly increasing there -- kept unchanged.
    df['DT_M'] = ((dt.year-2017)*12 + dt.month).astype(np.int8)
    df['DT_W'] = ((dt.year-2017)*52 + week_of_year).astype(np.int8)
    df['DT_D'] = ((dt.year-2017)*365 + dt.dayofyear).astype(np.int16)

    df['DT_hour'] = (dt.hour).astype(np.int8)
    df['DT_day_week'] = (dt.dayofweek).astype(np.int8)
    df['DT_day_month'] = (dt.day).astype(np.int8)
    # Week of the month: days 1-7 -> 1, 8-14 -> 2, ...
    df['DT_week_month'] = np.ceil(dt.day/7).astype(np.int64)

    # Possible solo feature
    df['is_december'] = (dt.month==12).astype(np.int8)

    # Holidays: compare the midnight-normalised timestamp with the calendar.
    df['is_holiday'] = (dt.normalize().isin(us_holidays)).astype(np.int8)

# Total transactions per timeblock (counted over train and test together)
for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()

    train_df[col+'_total'] = train_df[col].map(fq_encode)
    test_df[col+'_total']  = test_df[col].map(fq_encode)

In [None]:
########################### card4, card6, ProductCD
#################################################################################
# Converting Strings to ints(or floats if nan in column) using frequency encoding
# We will be able to use these columns as category or as numerical feature

for col in ['card4', 'card6', 'ProductCD']:
    print('Encoding', col)
    # Value counts over train and test together, so both tables share one mapping.
    col_encoded = pd.concat([train_df[col], test_df[col]]).value_counts().to_dict()
    for frame in (train_df, test_df):
        frame[col] = frame[col].map(col_encoded)
    print(col_encoded)

Encoding card4
{'visa': 719649, 'mastercard': 347386, 'american express': 16009, 'discover': 9524}
Encoding card6
{'debit': 824959, 'credit': 267648, 'debit or credit': 30, 'charge card': 16}
Encoding ProductCD
{'W': 800657, 'C': 137785, 'R': 73346, 'H': 62397, 'S': 23046}


In [None]:
########################### M columns
#################################################################################
# Converting Strings to ints(or floats if nan in column)

# 'T'/'F' flags become 1/0; any other value (including NaN) maps to NaN.
flag_codes = {'T':1, 'F':0}
for col in ['M1','M2','M3','M5','M6','M7','M8','M9']:
    for frame in (train_df, test_df):
        frame[col] = frame[col].map(flag_codes)

# M4 is frequency encoded with counts taken over train and test together.
for col in ['M4']:
    print('Encoding', col)
    col_encoded = pd.concat([train_df[col], test_df[col]]).value_counts().to_dict()
    for frame in (train_df, test_df):
        frame[col] = frame[col].map(col_encoded)
    print(col_encoded)

Encoding M4
{'M0': 357789, 'M2': 122947, 'M1': 97306}


In [None]:
# Peek at the raw id_34 values before minify_identity_df parses them.
train_identity['id_34']

0         match_status:2
1         match_status:1
2                    NaN
3                    NaN
4         match_status:2
               ...      
144228               NaN
144229    match_status:2
144230               NaN
144231    match_status:2
144232               NaN
Name: id_34, Length: 144233, dtype: object

In [None]:
########################### Identity columns
#################################################################################

def minify_identity_df(df):
    """Replace the string-valued identity columns of ``df`` with numeric codes.

    The frame is modified in place and also returned. A value that is missing
    from a mapping (NaN included) becomes NaN.

    :df identity table with the id_12 ... id_38 and DeviceType columns  # type: pd.DataFrame()
    :return the same (mutated) dataframe                                # type: pd.DataFrame()

    Beyond the plain mappings:
      * id_34 keeps only the number after ':' ('match_status:2' -> 2); 0, which
        is also the placeholder used for missing values, is turned into NaN.
      * id_33 ('<a>x<b>' strings) gains the integer columns id_33_0 / id_33_1
        (0 for missing); id_33 itself keeps its strings, with missing as NaN.
      * DeviceType is mapped to desktop=1 / mobile=0 (the original computed
        this mapping but discarded the result).
    """
    found_codes = {'Found':1, 'NotFound':0}
    flag_codes = {'T':1, 'F':0}

    for col in ['id_12', 'id_16', 'id_27', 'id_29']:
        df[col] = df[col].map(found_codes)
    for col in ['id_35', 'id_36', 'id_37', 'id_38']:
        df[col] = df[col].map(flag_codes)

    df['id_15'] = df['id_15'].map({'New':2, 'Found':1, 'Unknown':0})
    df['id_23'] = df['id_23'].map({'TRANSPARENT':4, 'IP_PROXY':3, 'IP_PROXY:ANONYMOUS':2, 'IP_PROXY:HIDDEN':1})
    df['id_28'] = df['id_28'].map({'New':2, 'Found':1})

    # ':0' lets missing rows go through the same split/int conversion.
    df['id_34'] = df['id_34'].fillna(':0')
    df['id_34'] = df['id_34'].apply(lambda x: x.split(':')[1]).astype(np.int8)
    df['id_34'] = np.where(df['id_34']==0, np.nan, df['id_34'])

    # '0x0' plays the same placeholder role for the two numbers around 'x'.
    df['id_33'] = df['id_33'].fillna('0x0')
    df['id_33_0'] = df['id_33'].apply(lambda x: x.split('x')[0]).astype(int)
    df['id_33_1'] = df['id_33'].apply(lambda x: x.split('x')[1]).astype(int)
    df['id_33'] = np.where(df['id_33']=='0x0', np.nan, df['id_33'])

    # Store the mapping result; without the assignment this line had no effect.
    df['DeviceType'] = df['DeviceType'].map({'desktop':1, 'mobile':0})
    return df

train_identity = minify_identity_df(train_identity)
test_identity = minify_identity_df(test_identity)

for col in ['id_33']:
    # Give missing values their own label so they can be encoded as well.
    for frame in (train_identity, test_identity):
        frame[col] = frame[col].fillna('unseen_before_label')

    # Fit on the values of both tables so a label gets the same code in each.
    le = LabelEncoder().fit(list(train_identity[col])+list(test_identity[col]))
    for frame in (train_identity, test_identity):
        frame[col] = le.transform(frame[col])

In [None]:
########################### Deltas

# For every D column except D9, store the calendar date obtained by stepping
# that many whole days back from the transaction timestamp DT.
for df in [train_df, test_df]:
    for col in ['D'+str(i) for i in range(1,16) if i!=9]:
        new_col = 'uid_td_'+str(col)
        # NaN -> 0 is only a placeholder (masked again below); astype(int)
        # drops any fractional part. One vectorised conversion replaces the
        # per-row pd.Timedelta apply.
        day_offset = pd.to_timedelta(df[col].fillna(0).astype(int), unit='D')
        df[new_col] = (df['DT'] - day_offset).dt.date.astype(str)
        # Rows without a delta get NaN instead of a made-up date.
        df[new_col] = np.where(df[col].isna(), np.nan, df[new_col])

In [None]:
########################### Final check
#################################################################################

# reduce_mem_usage downcasts the frame in place (and returns the same object),
# so a copy is kept to detect columns whose values changed.
for df in [train_df, test_df, train_identity, test_identity]:
    original = df.copy()
    df = reduce_mem_usage(df)

    for col in list(df):
        if df[col].dtype!='O':
            # Sum the absolute differences: in a signed sum, positive and
            # negative rounding errors can cancel and hide a lossy downcast.
            if (df[col]-original[col]).abs().sum()!=0:
                # Restore the original values (and dtype) of the damaged column.
                df[col] = original[col]
                print('Bad transformation', col)

Mem. usage decreased to 585.15 Mb (32.4% reduction)
Bad transformation TransactionAmt
Bad transformation dist1
Bad transformation dist2
Bad transformation C1
Bad transformation C2
Bad transformation C4
Bad transformation C6
Bad transformation C7
Bad transformation C8
Bad transformation C10
Bad transformation C11
Bad transformation C12
Bad transformation C13
Bad transformation D8
Bad transformation D9
Bad transformation V126
Bad transformation V127
Bad transformation V128
Bad transformation V129
Bad transformation V130
Bad transformation V131
Bad transformation V132
Bad transformation V133
Bad transformation V134
Bad transformation V135
Bad transformation V136
Bad transformation V137
Bad transformation V150
Bad transformation V159
Bad transformation V164
Bad transformation V202
Bad transformation V203
Bad transformation V204
Bad transformation V205
Bad transformation V206
Bad transformation V207
Bad transformation V208
Bad transformation V209
Bad transformation V210
Bad transformation V

In [None]:
########################### Export
#################################################################################

# Write each processed table to its pickle file.
for frame, path in (
    (train_df, 'train_transaction.pkl'),
    (test_df, 'test_transaction.pkl'),
    (train_identity, 'train_identity.pkl'),
    (test_identity, 'test_identity.pkl'),
):
    frame.to_pickle(path)

In [None]:
########################### Full minification for fast tests
#################################################################################
# Downcast everything again, this time without the check that restores columns
# whose values changed. reduce_mem_usage works in place, so its return value
# is not needed.
for df in (train_df, test_df, train_identity, test_identity):
    reduce_mem_usage(df)

Mem. usage decreased to 585.15 Mb (28.5% reduction)
Mem. usage decreased to 509.80 Mb (27.5% reduction)
Mem. usage decreased to 15.54 Mb (5.0% reduction)
Mem. usage decreased to 15.29 Mb (5.0% reduction)


In [None]:
########################### Export
#################################################################################

# Write the fully minified tables to their '_mini' pickle files.
for frame, path in (
    (train_df, 'train_transaction_mini.pkl'),
    (test_df, 'test_transaction_mini.pkl'),
    (train_identity, 'train_identity_mini.pkl'),
    (test_identity, 'test_identity_mini.pkl'),
):
    frame.to_pickle(path)

In [None]:
########################### Export
#################################################################################
import pickle
possible_goups = [['V1', 'V2', 'V6', 'V7', 'V8', 'V9'],
 ['V1', 'V2', 'V3', 'V6', 'V7', 'V8', 'V9'],
 ['V2', 'V3', 'V6', 'V7', 'V8', 'V9'],
 ['V4', 'V5'],
 ['V10', 'V11'],
 ['V12', 'V13'],
 ['V14', 'V65'],
 ['V15', 'V16', 'V33', 'V34', 'V57', 'V58', 'V79', 'V94'],
 ['V15', 'V16', 'V33', 'V34', 'V57'],
 ['V17', 'V18', 'V21', 'V22'],
 ['V19', 'V20'],
 ['V17', 'V18', 'V21', 'V22', 'V63', 'V84'],
 ['V23', 'V24'],
 ['V25', 'V26'],
 ['V27', 'V28', 'V68', 'V89'],
 ['V29', 'V30', 'V69', 'V90', 'V91'],
 ['V29', 'V30', 'V70', 'V90', 'V91'],
 ['V31', 'V32', 'V50', 'V71', 'V92', 'V93'],
 ['V31', 'V32', 'V92'],
 ['V15', 'V16', 'V33', 'V34', 'V51', 'V94'],
 ['V15', 'V16', 'V33', 'V34'],
 ['V35', 'V36'],
 ['V37', 'V38'],
 ['V39', 'V40'],
 ['V41', 'V46', 'V47'],
 ['V42', 'V43', 'V84'],
 ['V42', 'V43'],
 ['V44', 'V45'],
 ['V48', 'V49'],
 ['V31', 'V50', 'V71', 'V92'],
 ['V33', 'V51', 'V52', 'V73', 'V94'],
 ['V51', 'V52'],
 ['V53', 'V54'],
 ['V15', 'V16', 'V57', 'V58', 'V73', 'V79', 'V94'],
 ['V15', 'V57', 'V58', 'V79'],
 ['V59', 'V60', 'V63'],
 ['V59', 'V60'],
 ['V61', 'V62'],
 ['V21', 'V59', 'V63', 'V64', 'V84'],
 ['V63', 'V64'],
 ['V66', 'V67'],
 ['V29', 'V69', 'V70', 'V90'],
 ['V30', 'V69', 'V70', 'V90', 'V91'],
 ['V31', 'V50', 'V71', 'V72', 'V92', 'V93'],
 ['V71', 'V72', 'V92', 'V93'],
 ['V51', 'V57', 'V73', 'V74', 'V94'],
 ['V73', 'V74'],
 ['V75', 'V76'],
 ['V80', 'V81', 'V84'],
 ['V80', 'V81'],
 ['V82', 'V83'],
 ['V21', 'V42', 'V63', 'V80', 'V84', 'V85'],
 ['V84', 'V85'],
 ['V86', 'V87'],
 ['V29', 'V30', 'V69', 'V70', 'V90', 'V91'],
 ['V31', 'V32', 'V50', 'V71', 'V72', 'V92', 'V93'],
 ['V31', 'V71', 'V72', 'V92', 'V93'],
 ['V15', 'V33', 'V51', 'V57', 'V73', 'V94'],
 ['V101',
  'V102',
  'V103',
  'V143',
  'V167',
  'V168',
  'V177',
  'V178',
  'V179',
  'V279',
  'V280',
  'V293',
  'V295',
  'V322',
  'V323',
  'V324',
  'V95',
  'V96',
  'V97'],
 ['V101',
  'V102',
  'V103',
  'V143',
  'V167',
  'V168',
  'V177',
  'V178',
  'V179',
  'V279',
  'V280',
  'V293',
  'V294',
  'V295',
  'V322',
  'V323',
  'V324',
  'V95',
  'V96',
  'V97'],
 ['V105', 'V106', 'V296', 'V298', 'V299', 'V329', 'V330'],
 ['V105', 'V106', 'V298', 'V299', 'V329', 'V330'],
 ['V111', 'V113'],
 ['V126', 'V128', 'V132', 'V134'],
 ['V127', 'V128', 'V133', 'V134'],
 ['V126', 'V127', 'V128', 'V132', 'V133', 'V134', 'V332'],
 ['V129', 'V266', 'V269', 'V309', 'V334'],
 ['V130', 'V310'],
 ['V131', 'V312'],
 ['V126', 'V128', 'V132', 'V133', 'V134'],
 ['V127', 'V128', 'V132', 'V133', 'V134'],
 ['V126', 'V127', 'V128', 'V132', 'V133', 'V134', 'V318', 'V332'],
 ['V136', 'V137'],
 ['V101',
  'V102',
  'V103',
  'V143',
  'V167',
  'V177',
  'V178',
  'V179',
  'V279',
  'V280',
  'V293',
  'V295',
  'V322',
  'V323',
  'V324',
  'V95',
  'V96',
  'V97'],
 ['V144', 'V145', 'V150', 'V151'],
 ['V148', 'V149', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158'],
 ['V144', 'V145', 'V150', 'V151', 'V152'],
 ['V151', 'V152'],
 ['V161', 'V163'],
 ['V162', 'V163'],
 ['V161', 'V162', 'V163'],
 ['V101',
  'V102',
  'V103',
  'V167',
  'V168',
  'V177',
  'V178',
  'V179',
  'V279',
  'V280',
  'V293',
  'V295',
  'V322',
  'V323',
  'V324',
  'V95',
  'V96',
  'V97'],
 ['V176', 'V190', 'V199', 'V228', 'V246', 'V257'],
 ['V180', 'V182', 'V183'],
 ['V181', 'V328'],
 ['V180', 'V182', 'V183', 'V330'],
 ['V186', 'V191', 'V196'],
 ['V187', 'V192'],
 ['V187', 'V192', 'V193'],
 ['V192', 'V193', 'V196'],
 ['V194', 'V197'],
 ['V195', 'V198'],
 ['V186', 'V191', 'V193', 'V196'],
 ['V202', 'V204', 'V211', 'V213'],
 ['V203', 'V204', 'V212'],
 ['V202', 'V203', 'V204', 'V213'],
 ['V202', 'V211', 'V213'],
 ['V203', 'V212', 'V213'],
 ['V202', 'V204', 'V211', 'V212', 'V213'],
 ['V214', 'V276', 'V337'],
 ['V215', 'V216', 'V277', 'V278', 'V338', 'V339'],
 ['V217', 'V219', 'V231', 'V233'],
 ['V218', 'V219', 'V232', 'V233'],
 ['V217', 'V218', 'V219', 'V231', 'V232', 'V233'],
 ['V222', 'V230'],
 ['V224', 'V225'],
 ['V229', 'V230', 'V258'],
 ['V222', 'V229', 'V230', 'V258'],
 ['V236', 'V237'],
 ['V238', 'V239'],
 ['V240', 'V241', 'V247', 'V252', 'V260'],
 ['V242', 'V244'],
 ['V245', 'V259'],
 ['V240', 'V241', 'V247', 'V249', 'V252'],
 ['V248', 'V249', 'V254'],
 ['V247', 'V248', 'V249', 'V252'],
 ['V250', 'V251'],
 ['V248', 'V254'],
 ['V255', 'V256'],
 ['V240', 'V241', 'V260'],
 ['V263', 'V265', 'V273', 'V274', 'V275'],
 ['V264', 'V265'],
 ['V263', 'V264', 'V265'],
 ['V129', 'V266', 'V269', 'V309', 'V334', 'V336'],
 ['V268', 'V336'],
 ['V270', 'V272'],
 ['V263', 'V273', 'V274', 'V275'],
 ['V291', 'V292'],
 ['V102', 'V280', 'V294', 'V295', 'V323', 'V96'],
 ['V105', 'V296', 'V298', 'V299', 'V329'],
 ['V105', 'V106', 'V296', 'V298', 'V299', 'V329'],
 ['V105', 'V106', 'V296', 'V298', 'V299', 'V330'],
 ['V300', 'V301'],
 ['V302', 'V304'],
 ['V303', 'V304'],
 ['V302', 'V303', 'V304'],
 ['V306', 'V308', 'V316', 'V318'],
 ['V307', 'V308', 'V317'],
 ['V306', 'V307', 'V308', 'V318'],
 ['V313', 'V315'],
 ['V306', 'V316', 'V318'],
 ['V307', 'V317', 'V318'],
 ['V134', 'V306', 'V308', 'V316', 'V317', 'V318'],
 ['V320', 'V321'],
 ['V326', 'V327'],
 ['V105', 'V106', 'V296', 'V298', 'V329', 'V330'],
 ['V105', 'V106', 'V183', 'V299', 'V329', 'V330'],
 ['V331', 'V332', 'V333'],
 ['V128', 'V134', 'V331', 'V332', 'V333'],
 ['V335', 'V336'],
 ['V266', 'V268', 'V269', 'V334', 'V335', 'V336']]

# Serialise the list of column groups with the highest available pickle protocol.
with open('possible_goups.pickle', 'wb') as fh:
    pickle.dump(possible_goups, fh, protocol=pickle.HIGHEST_PROTOCOL)

# IEEE - Basic FE - part 1

In [None]:
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime, math

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold,GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [None]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global generator with ``seed``."""
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    
## Global frequency encoding    
def frequency_encoding(df, columns, self_encoding=False):
    """Map every value of the given columns to its number of occurrences in ``df``.

    :df frame to encode (modified in place and returned)          # type: pd.DataFrame()
    :columns names of the columns to encode                       # type: list
    :self_encoding overwrite the column itself when True,
                   otherwise write to '<col>_fq_enc'              # type: bool

    NaN is counted like any other value (``dropna=False``).
    """
    for col in columns:
        counts = df[col].value_counts(dropna=False).to_dict()
        target = col if self_encoding else col+'_fq_enc'
        df[target] = df[col].map(counts)
    return df


def values_normalization(dt_df, periods, columns, enc_type='both'):
    """Scale ``columns`` relative to the rows that share the same period value.

    For every (period, column) pair the column is cast to float and, depending
    on ``enc_type``, these columns are added to ``dt_df``:
      '<col>_<period>_min_max'   = (value - group min) / (group max - group min)
      '<col>_<period>_std_score' = (value - group mean) / group std

    :dt_df frame to extend (modified in place and returned)      # type: pd.DataFrame()
    :periods names of the grouping columns                       # type: list
    :columns names of the columns to scale                       # type: list
    :enc_type 'both', 'norm' (std score only) or 'min_max';
              any other value adds no column                     # type: str
    """
    # statistic name -> scratch column that holds it while the scores are built
    temp_cols = {'min': 'temp_min', 'max': 'temp_max', 'mean': 'temp_mean', 'std': 'temp_std'}
    add_min_max = enc_type in ('both', 'min_max')
    add_std_score = enc_type in ('both', 'norm')

    for period in periods:
        for col in columns:
            new_col = col +'_'+ period
            dt_df[col] = dt_df[col].astype(float)

            # {period value: statistic} lookups, all computed before any
            # scratch column is written.
            lookups = {
                stat: dt_df.groupby([period])[col].agg(stat).to_dict()
                for stat in temp_cols
            }
            for stat, temp_col in temp_cols.items():
                dt_df[temp_col] = dt_df[period].map(lookups[stat])

            if add_min_max:
                value_range = dt_df['temp_max']-dt_df['temp_min']
                dt_df[new_col+'_min_max'] = (dt_df[col]-dt_df['temp_min'])/value_range
            if add_std_score:
                dt_df[new_col+'_std_score'] = (dt_df[col]-dt_df['temp_mean'])/(dt_df['temp_std'])

            for temp_col in temp_cols.values():
                del dt_df[temp_col]
    return dt_df

def get_new_columns(temp_list):
    """Tabulate the columns of the global ``full_df`` that are not in ``temp_list``.

    :temp_list column names to treat as already known            # type: list
    :return two-column frame: every new column name (sorted), and the same
            names with those listed in the global ``remove_features``
            replaced by '-' (sorted again)                        # type: pd.DataFrame()
    """
    added = sorted(col for col in list(full_df) if col not in temp_list)
    usable = sorted(col if col not in remove_features else '-' for col in added)

    return pd.DataFrame.from_dict({'New columns (including dummy)': added,
                                   'New Features': usable})

In [None]:
########################### Vars
#################################################################################
# Seed passed to seed_everything below and to LightGBM ('seed' in lgb_params).
SEED = 42
seed_everything(SEED)
# NOTE(review): LOCAL_TEST is not read anywhere in the visible part of the
# notebook -- confirm it is used further down.
LOCAL_TEST = True
# Gates the make_test() validation runs after the feature-engineering steps.
MAKE_TESTS = True
# Name of the label column.
TARGET = 'isFraud'

In [None]:
########################### Model params
# LightGBM settings handed to lgb.train() in make_test.
# n_estimators is only an upper bound: training stops once the validation
# score has not improved for early_stopping_rounds rounds.
lgb_params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    learning_rate=0.01,
    num_leaves=2**8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=0.7,
    n_estimators=80000,
    max_bin=255,
    verbose=-1,
    seed=SEED,
    early_stopping_rounds=100,
)

In [None]:
########################### Model
import lightgbm as lgb

def make_test(old_score=0, output=False):
    """Train LightGBM on the local train split and score it on the local hold-out.

    Reads the notebook globals ``full_df``, ``remove_features``,
    ``local_train_id``, ``local_test_id``, ``lgb_params`` and ``TARGET``.

    :old_score previous global AUC, used only for the printed difference  # type: float
    :output write the hold-out predictions to 'oof.csv' when True         # type: bool
    :return (tt_df, feature_imp, m_results, estimator):
            tt_df       hold-out rows with TransactionID, DT_W, target and prediction
            feature_imp feature importances sorted by value
            m_results   [global AUC, AUC of each DT_W week in order]; NaN for a
                        week that is empty or contains a single class
            estimator   the trained LightGBM booster
    """
    features_columns = [col for col in list(full_df) if col not in remove_features]
    train_mask = full_df['TransactionID'].isin(local_train_id['TransactionID'])
    test_mask = full_df['TransactionID'].isin(local_test_id['TransactionID'])

    # Rows and columns are selected in one .loc call; full_df[mask][cols] would
    # first copy every column of the masked frame.
    X,y = full_df.loc[train_mask, features_columns], full_df.loc[train_mask, TARGET]
    P,P_y = full_df.loc[test_mask, features_columns], full_df.loc[test_mask, TARGET]

    # Label-encode string columns with one encoder fitted on both splits, then
    # mark them as categorical.
    for col in list(X):
        if X[col].dtype=='O':
            X[col] = X[col].fillna('unseen_before_label')
            P[col] = P[col].fillna('unseen_before_label')

            X[col] = X[col].astype(str)
            P[col] = P[col].astype(str)

            le = LabelEncoder()
            le.fit(list(X[col])+list(P[col]))
            X[col] = le.transform(X[col])
            P[col]  = le.transform(P[col])

            X[col] = X[col].astype('category')
            P[col] = P[col].astype('category')

    # Explicit copy: a 'prediction' column is added to this small frame below.
    tt_df = full_df.loc[test_mask, ['TransactionID','DT_W',TARGET]].copy()
    tt_df['prediction'] = 0

    tr_data = lgb.Dataset(X, label=y)
    vl_data = lgb.Dataset(P, label=P_y)
    estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 200,
        )

    tt_df['prediction'] = estimator.predict(P)
    feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])

    if output:
        tt_df[['TransactionID','prediction']].to_csv('oof.csv',index=False)
        print('---Wrote OOF to file---')

    m_results = []
    print('#'*20)
    g_auc = metrics.roc_auc_score(tt_df[TARGET], tt_df['prediction'])
    score_diff = g_auc - old_score
    print('Global AUC', g_auc)
    m_results.append(g_auc)

    # tt_df holds the same DT_W values as the hold-out rows of full_df, so the
    # week range is taken from it without copying the full frame again.
    for i in range(tt_df['DT_W'].min(), tt_df['DT_W'].max()+1):
        mask = tt_df['DT_W']==i
        week_target = tt_df.loc[mask, TARGET]
        if week_target.nunique() < 2:
            # ROC AUC is undefined for an empty or single-class week; record
            # NaN instead of letting the whole report fail.
            w_auc = np.nan
        else:
            w_auc = metrics.roc_auc_score(week_target, tt_df.loc[mask, 'prediction'])
        print('Week', i, w_auc, len(tt_df[mask]))
        m_results.append(w_auc)

    print('#'*20)
    print('Features Preformance:', g_auc)
    print('Diff with previous__:', score_diff)

    return tt_df, feature_imp, m_results, estimator

In [None]:
########################### DATA LOAD
#################################################################################
print('Load Data')
train_df = pd.read_pickle('train_transaction.pkl')
test_df = pd.read_pickle('test_transaction.pkl')

# Full Data set (careful with target encoding)
full_df = pd.concat([train_df, test_df]).reset_index(drop=True)

# Local test IDs with one month gap: validate on the last train month and
# train on everything before the month preceding it.
last_month = train_df['DT_M'].max()
local_test_id  = train_df.loc[train_df['DT_M']==last_month, ['TransactionID']].reset_index(drop=True)
local_train_id = train_df.loc[train_df['DT_M']<(last_month-1), ['TransactionID']].reset_index(drop=True)
del train_df, test_df

# Identity Data set
train_identity = pd.read_pickle('train_identity.pkl')
test_identity = pd.read_pickle('test_identity.pkl')
identity_df = pd.concat([train_identity, test_identity]).reset_index(drop=True)
del train_identity, test_identity

print('Shape control (for local test):', local_train_id.shape, local_test_id.shape)

Load Data
Shape control (for local test): (417559, 1) (89326, 1)


In [None]:
########################### All features columns
#################################################################################
# Add list of feature that we will remove for sure
# Columns that are never passed to the model.
# NOTE(review): 'DTT', 'temp' and 'weight' are not created in the visible part
# of the notebook -- presumably added later; confirm.
remove_features = [
    'TransactionID','TransactionDT',
    TARGET,
    'DT','DT_M','DT_W','DT_D','DTT',
    'DT_hour','DT_day_week','DT_day_month',
    'DT_M_total','DT_W_total','DT_D_total',
    'is_december','is_holiday','temp','weight',
    ]

# Keep TransactionAmt as float64 so that aggregations do not lose values.
full_df['TransactionAmt'] = full_df['TransactionAmt'].astype(float)

# Columns collected for frequency encoding, and a snapshot of the initial columns.
fq_encode = []
base_columns = list(full_df)

# V columns are not needed in the initial phase; excluding them makes the
# test runs faster.
remove_features.extend('V'+str(i) for i in range(1,340))

# Exclude the transformed D columns as well.
remove_features.extend('uid_td_D'+str(i) for i in range(1,16) if i!=9)

# Make sure m_results exists before the first make_test() call reads it.
m_results = [0]

In [None]:
########################### This is start baseline
# Baseline run on the current feature set; m_results[0] (global AUC) is passed
# to later make_test() calls to print the score difference.
if MAKE_TESTS:
    tt_df, feature_imp, m_results, model = make_test()

Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.955715	valid_1's auc: 0.896712
[400]	training's auc: 0.982231	valid_1's auc: 0.909409
[600]	training's auc: 0.991262	valid_1's auc: 0.912886
[800]	training's auc: 0.995355	valid_1's auc: 0.914254
Early stopping, best iteration is:
[830]	training's auc: 0.995736	valid_1's auc: 0.914355
####################
Global AUC 0.9143548683934609
Week 70 0.9230980050488626 18970
Week 71 0.9067099636764909 20726
Week 72 0.9051352197870371 20332
Week 73 0.9249483056957462 19010
Week 74 0.9078872187307547 10288
####################
Features Preformance: 0.9143548683934609
Diff with previous__: 0.9143548683934609


In [None]:
########################### Fix card columns and encode
print('Fix card4 and card6 values')
saved_state = list(full_df)
####

####
# card4 and card6 have a strong connection
# with card1 - we can unify values
# to guarantee that the combinations are the same
# for all data.

# I've tried to fill other NaNs
# but it seems that there are no more bad values.
# All remaining NaNs are meaningful.
####

# card6 holds frequency counts at this point; 30 and 16 are the counts printed
# for 'debit or credit' and 'charge card' by the encoding cell, so these two
# rare categories are blanked out.
for rare_count in (30, 16):
    full_df['card6'] = np.where(full_df['card6']==rare_count, np.nan, full_df['card6'])

i_cols = ['card4','card6']

for col in i_cols:
    # For every card1 keep its most frequent value of `col` ...
    temp_df = (
        full_df.groupby(['card1',col])[col].agg(['count']).reset_index()
        .sort_values(by=['card1','count'], ascending=False)
        .drop(columns=['count'])
        .drop_duplicates(subset=['card1'], keep='first')
        .reset_index(drop=True)
    )
    temp_df.index = temp_df['card1'].values
    temp_df = temp_df[col].to_dict()
    # ... and assign that value to every row of the card1.
    full_df[col] = full_df['card1'].map(temp_df)

# Add cards features for later encoding
i_cols = ['card1','card2','card3','card4','card5','card6']
fq_encode.extend(i_cols)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Fix card4 and card6 values
Empty DataFrame
Columns: [New columns (including dummy), New Features]
Index: []
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.955504	valid_1's auc: 0.897144
[400]	training's auc: 0.98198	valid_1's auc: 0.910281
[600]	training's auc: 0.991192	valid_1's auc: 0.913685
[800]	training's auc: 0.995295	valid_1's auc: 0.914555
Early stopping, best iteration is:
[865]	training's auc: 0.996118	valid_1's auc: 0.914708
####################
Global AUC 0.914707567231095
Week 70 0.9236173561571233 18970
Week 71 0.9067164773859334 20726
Week 72 0.9055725078756296 20332
Week 73 0.9252666186012978 19010
Week 74 0.9088326006873009 10288
####################
Features Preformance: 0.914707567231095
Diff with previous__: 0.00035269883763411336


In [None]:
########################### Client Virtual ID
print('Create client identification ID')
saved_state = list(full_df)
####

####
# Client subgroups:

# bank_type -> looking at card3 and card5 distributions
# I would say it is bank branch and country
# full_addr -> Client registration address in bank
# uid1 -> client identification by bank and card type
# uid2 -> client identification with additional geo information
####

# Bank type: '<card3>_<card5>'
full_df['bank_type'] = full_df['card3'].astype(str).str.cat(full_df['card5'].astype(str), sep='_')

# Full address: '<addr1>_<addr2>'
full_df['full_addr'] = full_df['addr1'].astype(str).str.cat(full_df['addr2'].astype(str), sep='_')

# Virtual client uid: the six card columns, each followed by '_'
i_cols = ['card1','card2','card3','card4','card5','card6']
full_df['uid1'] = ''
for col in i_cols:
    full_df['uid1'] = full_df['uid1'] + full_df[col].astype(str) + '_'

# Virtual client uid + full_addr
full_df['uid2'] = full_df['uid1'].str.cat(full_df['full_addr'].astype(str), sep='_')


# Add uids features for later encoding
i_cols = ['full_addr','bank_type','uid1','uid2']
fq_encode.extend(i_cols)

# We can't use these features directly because
# test data will have many unknown values
remove_features.extend(i_cols)

# We've created just "ghost" features -> no need to run test
if False:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Create client identification ID


In [None]:
########################### Client identification using deltas
print('Create client identification ID using deltas')
saved_state = list(full_df)
####

# Temporary list
client_cols = []

# Convert all delta columns to some date.
# D8 and D9 are not day deltas -
# we can try to convert D8 to int and
# it will probably give us a date,
# but I'm very unsure about it.

# We transform all D columns
# (keeping the original values) because
# they are used later for other features.

# Transaction time in (fractional) days; it is the same for every D column.
transaction_day = full_df['TransactionDT'] / (24*60*60)

for col in ['D'+str(i) for i in range(1,16) if i!=9]:
    new_col = 'uid_td_'+str(col)
    # Day number reached by stepping the delta back from the transaction day.
    full_df[new_col] = np.floor(transaction_day - full_df[col])
    # Date is useless itself -> add to dummy features
    remove_features.append(new_col)


# The most possible deltas to identify account or client
# initial activity are 'D1','D10','D15'
# We can try to find certain client using uid and date
# If client is the same uid+date combination will be
# unique per client and all his transactions
for col in ['D1','D10','D15']:
    new_col = 'uid_td_'+str(col)

    # card1 + full_addr + date
    full_df[new_col+'_cUID_1'] = full_df['card1'].astype(str)+'_'+full_df['full_addr'].astype(str)+'_'+full_df[new_col].astype(str)
    
    # uid1 + full_addr + date
    full_df[new_col+'_cUID_2'] = full_df['uid2'].astype(str)+'_'+full_df[new_col].astype(str)

    # columns 'D1','D2' are clipped we can't trust maximum values
    if col in ['D1','D2']:
        full_df[new_col+'_cUID_1'] = np.where(full_df[col]>=640, 'very_old_client', full_df[new_col+'_cUID_1'])
        full_df[new_col+'_cUID_2'] = np.where(full_df[col]>=640, 'very_old_client', full_df[new_col+'_cUID_2'])

    full_df[new_col+'_cUID_1'] = np.where(full_df[col].isna(), np.nan, full_df[new_col+'_cUID_1'])
    full_df[new_col+'_cUID_2'] = np.where(full_df[col].isna(), np.nan, full_df[new_col+'_cUID_2'])

    # reset cUID_1 if both address are nan (very unstable prediction)
    full_df[new_col+'_cUID_1'] = np.where(full_df['addr1'].isna()&full_df['addr2'].isna(), np.nan, full_df[new_col+'_cUID_1'])

    # cUID is useless itself -> add to dummy features
    remove_features += [new_col+'_cUID_1',new_col+'_cUID_2']
    
    # Add to temporary list (to join with encoding list later)
    client_cols += [new_col+'_cUID_1',new_col+'_cUID_2']
    
## Best candidate for client complete identification
## uid_td_D1_cUID_1
        
# Add cUIDs features for later encoding
fq_encode += client_cols

# We will save this list and even append 
# few more columns for later use
client_cols += ['card1','card2','card3','card4','card5',
                'uid1','uid2']

####
# We've created just "ghost" features -> no need to run test
if False: 
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Create client identification ID using deltas


In [None]:
########################### Mark card columns "outliers"
print('Outliers mark')
saved_state = list(full_df)
####

####
# For every card / uid column we check whether its values are
# active during the whole year or show up in a single period only
# (single card/account use cases).
# The flag is 1 when the value is seen in exactly one period, else 0.

# These features are categorical ones and
# Catboost benefits the most from them.

# Strange things (author's observations):
# - "Time window" should be big enough
# - Doesn't work for DT_W and DT_D
#   even when the local test shows a score boost.

# It seems that catboost starts to combine
# them with themselves and loses the "magic".
####

i_cols = client_cols.copy()
periods = ['DT_M']

for period in periods:
    for col in i_cols:
        flag_col = col+'_catboost_check_'+period
        # number of distinct periods each value of `col` appears in
        periods_seen = full_df.groupby([col])[period].transform('nunique')
        # int8 keeps the 0/1 flag compact and matches the
        # 'Categorical outliers' cell that builds the same kind of flag
        full_df[flag_col] = np.where(periods_seen==1,1,0).astype(np.int8)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Outliers mark
            New columns (including dummy)                           New Features
0               card1_catboost_check_DT_M              card1_catboost_check_DT_M
1               card2_catboost_check_DT_M              card2_catboost_check_DT_M
2               card3_catboost_check_DT_M              card3_catboost_check_DT_M
3               card4_catboost_check_DT_M              card4_catboost_check_DT_M
4               card5_catboost_check_DT_M              card5_catboost_check_DT_M
5                uid1_catboost_check_DT_M               uid1_catboost_check_DT_M
6                uid2_catboost_check_DT_M               uid2_catboost_check_DT_M
7   uid_td_D10_cUID_1_catboost_check_DT_M  uid_td_D10_cUID_1_catboost_check_DT_M
8   uid_td_D10_cUID_2_catboost_check_DT_M  uid_td_D10_cUID_2_catboost_check_DT_M
9   uid_td_D15_cUID_1_catboost_check_DT_M  uid_td_D15_cUID_1_catboost_check_DT_M
10  uid_td_D15_cUID_2_catboost_check_DT_M  uid_td_D15_cUID_2_catboost_check_DT_M
11   uid_td_D1

In [None]:
########################### V columns compact and assign groups
print('V columns / Nan groups')
saved_state = list(full_df)
####

####
# V columns are bucketed by their total number of missing values;
# columns sharing the same NaN count form one "nan group".
#
# Nan group identification flags are categorical features
# and Catboost benefits the most from them.
#
# Sum/mean per group are just occasional transformations.
####

nans_groups = {}
nans_df = full_df.isna()

i_cols = ['V'+str(i) for i in range(1,340)]
for col in i_cols:
    cur_group = nans_df[col].sum()
    # setdefault creates the bucket on first sight of this NaN count;
    # unlike a bare try/except it cannot hide unrelated errors
    nans_groups.setdefault(cur_group, []).append(col)

for nan_count, group_cols in nans_groups.items():
    group_values = full_df[group_cols].to_numpy()

    # Very doubtful features -> seem to work in tandem with other features,
    # but the author is not sure.
    # Plain numpy reductions: a row with any NaN in the group yields NaN.
    full_df['nan_group_sum_'+str(nan_count)] = group_values.sum(axis=1)
    full_df['nan_group_mean_'+str(nan_count)] = group_values.mean(axis=1)

    # 1 if the row misses at least one value of the group, else 0.
    # lgbm doesn't benefit from such a feature ->
    # it goes straight to the dummy features list
    full_df['nan_group_catboost_'+str(nan_count)] = nans_df[group_cols].any(axis=1).to_numpy().astype(np.int8)
    remove_features.append('nan_group_catboost_'+str(nan_count))

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

V columns / Nan groups
   New columns (including dummy)           New Features
0      nan_group_catboost_101245                      -
1          nan_group_catboost_15                      -
2      nan_group_catboost_245823                      -
3         nan_group_catboost_314                      -
4      nan_group_catboost_455805                      -
5        nan_group_catboost_7300                      -
6      nan_group_catboost_818499                      -
7      nan_group_catboost_820866                      -
8      nan_group_catboost_821037                      -
9      nan_group_catboost_840073                      -
10      nan_group_catboost_88662                      -
11      nan_group_catboost_89995                      -
12     nan_group_catboost_938449                      -
13     nan_group_catboost_939225                      -
14     nan_group_catboost_939501                      -
15         nan_group_mean_101245  nan_group_mean_101245
16             nan_group_

In [None]:
########################### Mean encoding using M columns
print('Mean encoding, using M columns')
saved_state = list(full_df)
####

# Grouping key -> indices of the M columns averaged within that key
m_indices = {
             'uid_td_D1_cUID_1':   [2,3,5,7,8,9],
             'uid_td_D1_cUID_2':   [2,3,5,6,9],
             'uid_td_D10_cUID_1':  [5,7,8,9],
             'uid_td_D10_cUID_2':  [3,6,7,8],
             'uid_td_D15_cUID_1':  [2,3,5,6,8],
             'uid_td_D15_cUID_2':  [2,3,5,6,7,8],
             'card1':  [2,3,5,6,7,8,9],
             'card2':  [1,2,3,7,9],
             'card4':  [3,7,8],
             'card5':  [5,6,8],
             'uid1':   [3,5,6,7,8,9],
             'uid2':   [2,3,5,6,7,8,9],
            }
main_cols = {key: ['M'+str(i) for i in idx] for key, idx in m_indices.items()}

# New columns are named <key>_<M column>_<aggregation>
for main_col,i_cols in main_cols.items():
    for agg_type in ['mean']:
        temp_df = (
            full_df[[main_col]+i_cols]
            .groupby([main_col])[i_cols]
            .transform(agg_type)
            .add_prefix(main_col+'_')
            .add_suffix('_'+agg_type)
        )
        full_df = pd.concat([full_df,temp_df], axis=1)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Mean encoding, using M columns
   New columns (including dummy)              New Features
0                  card1_M2_mean             card1_M2_mean
1                  card1_M3_mean             card1_M3_mean
2                  card1_M5_mean             card1_M5_mean
3                  card1_M6_mean             card1_M6_mean
4                  card1_M7_mean             card1_M7_mean
..                           ...                       ...
56      uid_td_D1_cUID_2_M2_mean  uid_td_D1_cUID_2_M2_mean
57      uid_td_D1_cUID_2_M3_mean  uid_td_D1_cUID_2_M3_mean
58      uid_td_D1_cUID_2_M5_mean  uid_td_D1_cUID_2_M5_mean
59      uid_td_D1_cUID_2_M6_mean  uid_td_D1_cUID_2_M6_mean
60      uid_td_D1_cUID_2_M9_mean  uid_td_D1_cUID_2_M9_mean

[61 rows x 2 columns]
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.971478	valid_1's auc: 0.907757
[400]	training's auc: 0.991866	valid_1's auc: 0.922012
[600]	training's auc: 0.996987	valid_1's auc: 0.92657
[800]	train

In [None]:
########################### D Columns Mean/Std
print('D columns Mean/Std')
saved_state = list(full_df)
####

# Grouping key -> D columns aggregated within that key
main_cols = {
             'uid_td_D1_cUID_1': ['D'+str(i) for i in [1,2,3,10,11,14,15]],
            }

# New columns are named <key>_<D column>_<aggregation>
for main_col,i_cols in main_cols.items():
    print(main_col)
    for agg_type in ['mean','std']:
        temp_df = (
            full_df.groupby([main_col])[i_cols]
            .transform(agg_type)
            .add_prefix(main_col+'_')
            .add_suffix('_'+agg_type)
        )
        full_df = pd.concat([full_df,temp_df], axis=1)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

D columns Mean/Std
uid_td_D1_cUID_1
   New columns (including dummy)               New Features
0      uid_td_D1_cUID_1_D10_mean  uid_td_D1_cUID_1_D10_mean
1       uid_td_D1_cUID_1_D10_std   uid_td_D1_cUID_1_D10_std
2      uid_td_D1_cUID_1_D11_mean  uid_td_D1_cUID_1_D11_mean
3       uid_td_D1_cUID_1_D11_std   uid_td_D1_cUID_1_D11_std
4      uid_td_D1_cUID_1_D14_mean  uid_td_D1_cUID_1_D14_mean
5       uid_td_D1_cUID_1_D14_std   uid_td_D1_cUID_1_D14_std
6      uid_td_D1_cUID_1_D15_mean  uid_td_D1_cUID_1_D15_mean
7       uid_td_D1_cUID_1_D15_std   uid_td_D1_cUID_1_D15_std
8       uid_td_D1_cUID_1_D1_mean   uid_td_D1_cUID_1_D1_mean
9        uid_td_D1_cUID_1_D1_std    uid_td_D1_cUID_1_D1_std
10      uid_td_D1_cUID_1_D2_mean   uid_td_D1_cUID_1_D2_mean
11       uid_td_D1_cUID_1_D2_std    uid_td_D1_cUID_1_D2_std
12      uid_td_D1_cUID_1_D3_mean   uid_td_D1_cUID_1_D3_mean
13       uid_td_D1_cUID_1_D3_std    uid_td_D1_cUID_1_D3_std
Training until validation scores don't improve for 100 rounds.
[

In [None]:
########################### TransactionAmt
print('TransactionAmt normalization')
saved_state = list(full_df)
####

# Decimal part of the amount, times 100 and rounded, stored as int8
amt_fraction = full_df['TransactionAmt'] - np.floor(full_df['TransactionAmt'])
full_df['TransactionAmt_cents'] = np.round(100.*amt_fraction,0).astype(np.int8)

# Clip top values (done after the cents were extracted)
full_df['TransactionAmt'] = full_df['TransactionAmt'].clip(lower=0, upper=5000)

# Normalization by product:
# (amount - group mean) / group std, groups being (key, ProductCD)
main_cols = [
             'uid_td_D1_cUID_1','uid_td_D1_cUID_2',
             'uid_td_D10_cUID_1','uid_td_D10_cUID_2',
             'uid_td_D15_cUID_1','uid_td_D15_cUID_2',
             'card1','card3',
            ]

for col in main_cols:
    amt_by_group = full_df.groupby([col,'ProductCD'])['TransactionAmt']
    group_mean = amt_by_group.transform('mean')
    group_std = amt_by_group.transform('std')
    # only the normalized value is kept, mean/std are not stored
    full_df[col+'_Product_norm'] = (full_df['TransactionAmt']-group_mean)/group_std


####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

TransactionAmt normalization
    New columns (including dummy)                    New Features
0            TransactionAmt_cents            TransactionAmt_cents
1              card1_Product_norm              card1_Product_norm
2              card3_Product_norm              card3_Product_norm
3  uid_td_D10_cUID_1_Product_norm  uid_td_D10_cUID_1_Product_norm
4  uid_td_D10_cUID_2_Product_norm  uid_td_D10_cUID_2_Product_norm
5  uid_td_D15_cUID_1_Product_norm  uid_td_D15_cUID_1_Product_norm
6  uid_td_D15_cUID_2_Product_norm  uid_td_D15_cUID_2_Product_norm
7   uid_td_D1_cUID_1_Product_norm   uid_td_D1_cUID_1_Product_norm
8   uid_td_D1_cUID_2_Product_norm   uid_td_D1_cUID_2_Product_norm
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.980482	valid_1's auc: 0.922491
[400]	training's auc: 0.994725	valid_1's auc: 0.933014
[600]	training's auc: 0.998444	valid_1's auc: 0.93513
[800]	training's auc: 0.999475	valid_1's auc: 0.935413
Early stopping, best iteratio

In [None]:
########################### TransactionAmt clients columns encoding
print('TransactionAmt encoding clients columns')
saved_state = list(full_df)
####

i_cols = ['TransactionAmt']
main_cols = list(client_cols)

# Per client key: mean and std of TransactionAmt,
# appended as <key>_TransactionAmt_mean / <key>_TransactionAmt_std
for main_col in main_cols:
    print(main_col)
    amt_by_client = full_df.groupby([main_col])[i_cols]
    agg_frames = [
        amt_by_client.transform(agg_type)
                     .add_prefix(main_col+'_')
                     .add_suffix('_'+agg_type)
        for agg_type in ['mean','std']
    ]
    full_df = pd.concat([full_df]+agg_frames, axis=1)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

TransactionAmt encoding clients columns
uid_td_D1_cUID_1
uid_td_D1_cUID_2
uid_td_D10_cUID_1
uid_td_D10_cUID_2
uid_td_D15_cUID_1
uid_td_D15_cUID_2
card1
card2
card3
card4
card5
uid1
uid2
            New columns (including dummy)                           New Features
0               card1_TransactionAmt_mean              card1_TransactionAmt_mean
1                card1_TransactionAmt_std               card1_TransactionAmt_std
2               card2_TransactionAmt_mean              card2_TransactionAmt_mean
3                card2_TransactionAmt_std               card2_TransactionAmt_std
4               card3_TransactionAmt_mean              card3_TransactionAmt_mean
5                card3_TransactionAmt_std               card3_TransactionAmt_std
6               card4_TransactionAmt_mean              card4_TransactionAmt_mean
7                card4_TransactionAmt_std               card4_TransactionAmt_std
8               card5_TransactionAmt_mean              card5_TransactionAmt_mean
9   

In [None]:
########################### Mark card columns "outliers"
print('Categorical outliers')
##
saved_state = list(full_df)
####

i_cols = ['TransactionAmt','ProductCD','P_emaildomain','R_emaildomain',]
periods = ['DT_M']

# Flag = 1 when the value of `col` occurs in exactly one period, else 0
for period in periods:
    for col in i_cols:
        flag_col = col+'_catboost_check_'+period
        periods_seen = full_df.groupby([col])[period].transform('nunique')
        full_df[flag_col] = (periods_seen.to_numpy()==1).astype(np.int8)


####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Categorical outliers
        New columns (including dummy)                        New Features
0   P_emaildomain_catboost_check_DT_M   P_emaildomain_catboost_check_DT_M
1       ProductCD_catboost_check_DT_M       ProductCD_catboost_check_DT_M
2   R_emaildomain_catboost_check_DT_M   R_emaildomain_catboost_check_DT_M
3  TransactionAmt_catboost_check_DT_M  TransactionAmt_catboost_check_DT_M
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.982078	valid_1's auc: 0.923681
[400]	training's auc: 0.995681	valid_1's auc: 0.93326
[600]	training's auc: 0.998933	valid_1's auc: 0.93497
[800]	training's auc: 0.999692	valid_1's auc: 0.935572
Early stopping, best iteration is:
[764]	training's auc: 0.99962	valid_1's auc: 0.935635
####################
Global AUC 0.9356351161172467
Week 70 0.9386337795379774 18970
Week 71 0.9288584915781568 20726
Week 72 0.9300257999972269 20332
Week 73 0.9470109589041096 19010
Week 74 0.9292052820468619 10288
####################
Fe

In [None]:
########################### D Columns Normalize and remove original columns
print('D columns transformations')
##
saved_state = list(full_df)
####

# Original D1..D15 go to the dummy list:
# test data will have many unknown values
all_d_cols = ['D'+str(i) for i in range(1,16)]
remove_features += all_d_cols

####### Values Normalization
# D1, D2 and D9 are handled separately below; every other D column
# is clipped at zero and passed to values_normalization per 'DT_D'
# (the recorded cell output lists <col>_DT_D_std_score columns)
i_cols = [name for name in all_d_cols if name not in ('D1','D2','D9')]
periods = ['DT_D']
for col in i_cols:
    full_df[col] = full_df[col].clip(lower=0)
full_df = values_normalization(full_df, periods, i_cols, enc_type='norm')

# D1, D2, D9: divide by the column maximum
i_cols = ['D1','D2','D9']
for col in i_cols:
    col_max = full_df[col].max()
    full_df[col+'_scaled'] = full_df[col]/col_max


####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

D columns transformations
   New columns (including dummy)        New Features
0             D10_DT_D_std_score  D10_DT_D_std_score
1             D11_DT_D_std_score  D11_DT_D_std_score
2             D12_DT_D_std_score  D12_DT_D_std_score
3             D13_DT_D_std_score  D13_DT_D_std_score
4             D14_DT_D_std_score  D14_DT_D_std_score
5             D15_DT_D_std_score  D15_DT_D_std_score
6                      D1_scaled           D1_scaled
7                      D2_scaled           D2_scaled
8              D3_DT_D_std_score   D3_DT_D_std_score
9              D4_DT_D_std_score   D4_DT_D_std_score
10             D5_DT_D_std_score   D5_DT_D_std_score
11             D6_DT_D_std_score   D6_DT_D_std_score
12             D7_DT_D_std_score   D7_DT_D_std_score
13             D8_DT_D_std_score   D8_DT_D_std_score
14                     D9_scaled           D9_scaled
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.982209	valid_1's auc: 0.923013
[400]	tr

In [None]:
########################### Dist
print('Distance normalization')
##
saved_state = list(full_df)
####

i_cols = ['dist1','dist2']
main_cols = [
             'uid_td_D1_cUID_1',
             'card1',
            ]


# For each key: (dist - group mean) / group std,
# stored as <key>_<dist>_norm; mean/std themselves are not kept
for main_col in main_cols:
    print(main_col)
    dist_by_key = full_df.groupby([main_col])[i_cols]
    group_mean = dist_by_key.transform('mean')
    group_std = dist_by_key.transform('std')

    for col in i_cols:
        full_df[main_col+'_'+col+'_norm'] = (full_df[col]-group_mean[col])/group_std[col]


####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Distance normalization
uid_td_D1_cUID_1
card1
  New columns (including dummy)                 New Features
0              card1_dist1_norm             card1_dist1_norm
1              card1_dist2_norm             card1_dist2_norm
2   uid_td_D1_cUID_1_dist1_norm  uid_td_D1_cUID_1_dist1_norm
3   uid_td_D1_cUID_1_dist2_norm  uid_td_D1_cUID_1_dist2_norm
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.981946	valid_1's auc: 0.922745
[400]	training's auc: 0.995923	valid_1's auc: 0.931731
[600]	training's auc: 0.999088	valid_1's auc: 0.934337
[800]	training's auc: 0.999761	valid_1's auc: 0.935391
[1000]	training's auc: 0.999941	valid_1's auc: 0.93551
Early stopping, best iteration is:
[1007]	training's auc: 0.999943	valid_1's auc: 0.935584
####################
Global AUC 0.9355834593166266
Week 70 0.9385412718738577 18970
Week 71 0.9298409888577251 20726
Week 72 0.9294413982411064 20332
Week 73 0.9472956741167987 19010
Week 74 0.9279291163401926 10288
####

In [None]:
########################### Count similar transactions per period
print('Similar transactions per period')
##
saved_state = list(full_df)
####

periods = ['DT_W','DT_D']

# Number of rows sharing (period, ProductCD, TransactionAmt),
# divided by the <period>_total column
for period in periods:
    same_amt_count = (
        full_df.groupby([period,'ProductCD','TransactionAmt'])['TransactionAmt']
               .transform('count')
    )
    full_df['TransactionAmt_Product_counts_' + period] = same_amt_count / full_df[period+'_total']

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Similar transactions per period
        New columns (including dummy)                        New Features
0  TransactionAmt_Product_counts_DT_D  TransactionAmt_Product_counts_DT_D
1  TransactionAmt_Product_counts_DT_W  TransactionAmt_Product_counts_DT_W
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.982573	valid_1's auc: 0.924481
[400]	training's auc: 0.996356	valid_1's auc: 0.933441
[600]	training's auc: 0.999231	valid_1's auc: 0.936153
[800]	training's auc: 0.99981	valid_1's auc: 0.937201
Early stopping, best iteration is:
[899]	training's auc: 0.999902	valid_1's auc: 0.937398
####################
Global AUC 0.9373973028683664
Week 70 0.9401675379206669 18970
Week 71 0.9323021748126351 20726
Week 72 0.9325083875766098 20332
Week 73 0.9475128334534967 19010
Week 74 0.9294423770460063 10288
####################
Features Preformance: 0.9373973028683664
Diff with previous__: 0.0018138435517398577


In [None]:
########################### Find nunique dates per client
print('Nunique dates per client')
##
saved_state = list(full_df)
####

# uid_td_D2 .. uid_td_D15 (without D9) plus the raw D8 and D9 columns
date_cols = ['uid_td_D{}'.format(i) for i in range(2,16) if i!=9]
main_cols = {
            'uid_td_D1_cUID_1': date_cols + ['D8','D9'],
            }

# Number of distinct values of each column within one client key
for main_col,i_cols in main_cols.items():
    for col in i_cols:
        feature = '{}_catboost_check_{}'.format(col, main_col)
        full_df[feature] = full_df.groupby([main_col])[col].transform('nunique')

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Nunique dates per client
                 New columns (including dummy)                                New Features
0           D8_catboost_check_uid_td_D1_cUID_1          D8_catboost_check_uid_td_D1_cUID_1
1           D9_catboost_check_uid_td_D1_cUID_1          D9_catboost_check_uid_td_D1_cUID_1
2   uid_td_D10_catboost_check_uid_td_D1_cUID_1  uid_td_D10_catboost_check_uid_td_D1_cUID_1
3   uid_td_D11_catboost_check_uid_td_D1_cUID_1  uid_td_D11_catboost_check_uid_td_D1_cUID_1
4   uid_td_D12_catboost_check_uid_td_D1_cUID_1  uid_td_D12_catboost_check_uid_td_D1_cUID_1
5   uid_td_D13_catboost_check_uid_td_D1_cUID_1  uid_td_D13_catboost_check_uid_td_D1_cUID_1
6   uid_td_D14_catboost_check_uid_td_D1_cUID_1  uid_td_D14_catboost_check_uid_td_D1_cUID_1
7   uid_td_D15_catboost_check_uid_td_D1_cUID_1  uid_td_D15_catboost_check_uid_td_D1_cUID_1
8    uid_td_D2_catboost_check_uid_td_D1_cUID_1   uid_td_D2_catboost_check_uid_td_D1_cUID_1
9    uid_td_D3_catboost_check_uid_td_D1_cUID_1   uid_td_D3_catboo

In [None]:
########################### Email transformation
print('Email split')
saved_state = list(full_df)
####

p = 'P_emaildomain'
r = 'R_emaildomain'

# Both domains joined into one key
full_df['full_email'] = full_df[p].astype(str) +'_'+ full_df[r].astype(str)

# Split str(value) on '.': the last piece is the extension,
# the first piece is the domain (extensions first, then domains)
for part, piece in (('extension', -1), ('domain', 0)):
    for side, source in (('p', p), ('r', r)):
        full_df['email_'+side+'_'+part] = full_df[source].apply(lambda x: str(x).split('.')[piece])

i_cols = [p, r,
          'full_email',
          'email_p_extension','email_r_extension',
          'email_p_domain','email_r_domain']

full_df = frequency_encoding(full_df, i_cols, self_encoding=True)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Email split
  New columns (including dummy)       New Features
0                email_p_domain     email_p_domain
1             email_p_extension  email_p_extension
2                email_r_domain     email_r_domain
3             email_r_extension  email_r_extension
4                    full_email         full_email
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.982567	valid_1's auc: 0.924148
[400]	training's auc: 0.996307	valid_1's auc: 0.933213
[600]	training's auc: 0.99922	valid_1's auc: 0.935671
[800]	training's auc: 0.999795	valid_1's auc: 0.936018
[1000]	training's auc: 0.999949	valid_1's auc: 0.936216
Early stopping, best iteration is:
[1012]	training's auc: 0.999954	valid_1's auc: 0.936295
####################
Global AUC 0.9362955394479312
Week 70 0.9381353594567502 18970
Week 71 0.9312575290817968 20726
Week 72 0.9302946432920185 20332
Week 73 0.9482524152847873 19010
Week 74 0.928360834104704 10288
####################
Features Preforma

In [None]:
########################### Device info and identity
print('Identity sets')
saved_state = list(full_df)
####

########################### Device info / Device info 2 / Browser
# For each raw identity column: fill gaps with 'unknown_device',
# lower-case it, keep only the letters as <col>_device and,
# where requested, only the numeric characters as <col>_version.
for src_col, add_version in [('DeviceInfo', True), ('id_30', True), ('id_31', False)]:
    identity_df[src_col] = identity_df[src_col].fillna('unknown_device').str.lower()
    identity_df[src_col+'_device'] = identity_df[src_col].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    if add_version:
        identity_df[src_col+'_version'] = identity_df[src_col].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))

########################### Merge Identity columns
# validate='many_to_one' makes pandas raise if identity_df holds a
# TransactionID more than once, instead of silently adding rows.
temp_df = full_df[['TransactionID']].merge(
    identity_df, on=['TransactionID'], how='left', validate='many_to_one')
del temp_df['TransactionID']
# merge() returns a fresh 0..n-1 index while concat(axis=1) aligns on the
# index: reuse full_df's index so rows stay paired even when full_df does
# not carry a default index (no effect when it does).
temp_df.index = full_df.index
full_df = pd.concat([full_df,temp_df], axis=1)

i_cols = [
          'DeviceInfo','DeviceInfo_device','DeviceInfo_version',
          'id_30','id_30_device','id_30_version',
          'id_31','id_31_device',
          'id_33','DeviceType'
         ]

####### Global Self frequency encoding
full_df = frequency_encoding(full_df, i_cols, self_encoding=True)

####
if MAKE_TESTS:
    print(get_new_columns(saved_state))
    tt_df, feature_imp, m_results, model = make_test(m_results[0])
####

Identity sets
   New columns (including dummy)        New Features
0                     DeviceInfo          DeviceInfo
1              DeviceInfo_device   DeviceInfo_device
2             DeviceInfo_version  DeviceInfo_version
3                     DeviceType          DeviceType
4                          id_01               id_01
5                          id_02               id_02
6                          id_03               id_03
7                          id_04               id_04
8                          id_05               id_05
9                          id_06               id_06
10                         id_07               id_07
11                         id_08               id_08
12                         id_09               id_09
13                         id_10               id_10
14                         id_11               id_11
15                         id_12               id_12
16                         id_13               id_13
17                         id_14

In [None]:
########################### Export
# Full feature frame
full_df.to_pickle('baseline_full_df.pkl')

# Names of the columns to drop before training, as a one-column frame
# NOTE(review): the recorded output of this cell is a NameError, so one of
# the names used here was not defined in that session - confirm on a fresh run
remove_features_df = pd.DataFrame({'features_to_remove': remove_features})
remove_features_df.to_pickle('baseline_remove_features.pkl')

NameError: ignored