In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime, psutil

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [3]:
########################### Model
def make_predictions(tr_df, tt_df, features_columns, target, cat_params, NFOLDS=2, kfold_mode='grouped'):
    
    X,y = tr_df[features_columns], tr_df[target]    
    P,P_y = tt_df[features_columns], tt_df[target]  
    split_groups = tr_df['DT_M']

    tt_df = tt_df[['TransactionID',target]] 
    tr_df = tr_df[['TransactionID',target]] 
    
    predictions = np.zeros(len(tt_df))
    oof = np.zeros(len(tr_df))

    if kfold_mode=='grouped':
        folds = GroupKFold(n_splits=NFOLDS)
        folds_split = folds.split(X, y, groups=split_groups)
    else:
        folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
        folds_split = folds.split(X, y)

    for fold_, (trn_idx, val_idx) in enumerate(folds_split):        
        print('Fold:',fold_)
        
        estimator = CatBoostClassifier(**cat_params)        
        estimator.fit(
            X.iloc[trn_idx,:],y[trn_idx],
            eval_set=(X.iloc[val_idx,:], y[val_idx]),
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)
        
        pp_p = estimator.predict_proba(P)[:,1]
        predictions += pp_p/NFOLDS
        
        oof_preds = estimator.predict_proba(X.iloc[val_idx,:])[:,1]
        oof[val_idx] = (oof_preds - oof_preds.min())/(oof_preds.max() - oof_preds.min())
        
        del estimator
        gc.collect()
        
    tt_df['prediction'] = predictions
    print('OOF AUC:', metrics.roc_auc_score(y, oof))
    if LOCAL_TEST:
        print('Holdout AUC:', metrics.roc_auc_score(tt_df[TARGET], tt_df['prediction']))
    
    return tt_df
## -------------------

In [4]:
########################### Vars
SEED = 42
seed_everything(SEED)
LOCAL_TEST = False
TARGET = 'isFraud'
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

In [5]:
########################### Model params
cat_params = {
                'n_estimators':5000,
                'learning_rate': 0.07,
                'eval_metric':'AUC',
                'loss_function':'Logloss',
                'random_seed':SEED,
                'metric_period':500,
                'od_wait':500,
                'task_type':'GPU',
                'depth': 8,
                #'colsample_bylevel':0.7,
                } 

In [6]:
########################### DATA LOAD
#################################################################################
print('Load Data')

if LOCAL_TEST:
    train_df = pd.read_pickle('../input/ieee-fe-for-local-test/train_df.pkl')
    test_df = pd.read_pickle('../input/ieee-fe-for-local-test/test_df.pkl')
else:
    train_df = pd.read_pickle('../input/ieee-fe-with-some-eda/train_df.pkl')
    test_df = pd.read_pickle('../input/ieee-fe-with-some-eda/test_df.pkl')
    
remove_features = pd.read_pickle('../input/ieee-fe-with-some-eda/remove_features.pkl')
remove_features = list(remove_features['features_to_remove'].values)
print('Shape control:', train_df.shape, test_df.shape)

Load Data
Shape control: (590540, 791) (506691, 791)


In [7]:
########################### Encode NaN goups
nans_groups = {}
temp_df = train_df.isna()
temp_df2 = test_df.isna()
nans_df = pd.concat([temp_df, temp_df2])

for col in list(nans_df):
    cur_group = nans_df[col].sum()
    if cur_group>0:
        try:
            nans_groups[cur_group].append(col)
        except:
            nans_groups[cur_group]=[col]

add_category = []
for col in nans_groups:
    if len(nans_groups[col])>1:
        train_df['nan_group_'+str(col)] = np.where(temp_df[nans_groups[col]].sum(axis=1)>0,1,0).astype(np.int8)
        test_df['nan_group_'+str(col)]  = np.where(temp_df2[nans_groups[col]].sum(axis=1)>0,1,0).astype(np.int8)
        add_category.append('nan_group_'+str(col))
        
del temp_df, temp_df2, nans_df, nans_groups

In [8]:
########################### Copy original Categorical features
categorical_features = ['ProductCD','M4',
                        'card1','card2','card3','card4','card5','card6',
                        'addr1','addr2','dist1','dist2',
                        'P_emaildomain','R_emaildomain',
                       ]

o_trans = pd.concat([pd.read_pickle('../input/ieee-data-minification/train_transaction.pkl'),
                     pd.read_pickle('../input/ieee-data-minification/test_transaction.pkl')])

o_ident = pd.concat([pd.read_pickle('../input/ieee-data-minification/train_identity.pkl'),
                     pd.read_pickle('../input/ieee-data-minification/test_identity.pkl')])

o_trans = o_trans.merge(o_ident, on=['TransactionID'], how='left')
o_trans = o_trans[['TransactionID'] + categorical_features]
o_features = categorical_features.copy()
categorical_features = [col+'_cat' for col in categorical_features]
o_trans.columns = ['TransactionID'] + categorical_features
del o_ident

temp_df = train_df[['TransactionID']]
temp_df = temp_df.merge(o_trans, on=['TransactionID'], how='left')
del temp_df['TransactionID']
train_df = pd.concat([train_df, temp_df], axis=1)

temp_df = test_df[['TransactionID']]
temp_df = temp_df.merge(o_trans, on=['TransactionID'], how='left')
del temp_df['TransactionID']
test_df = pd.concat([test_df, temp_df], axis=1)
del temp_df, o_trans

for col in o_features:
    if train_df[col].equals(train_df[col+'_cat']):
        print('No transformation (keep only categorical)', col)
        del train_df[col], test_df[col]
        
    col = col+'_cat'    
    train_df[col] = train_df[col].fillna(-999)
    test_df[col]  = test_df[col].fillna(-999)

categorical_features += add_category

No transformation (keep only categorical) card4
No transformation (keep only categorical) addr1
No transformation (keep only categorical) addr2
No transformation (keep only categorical) dist1
No transformation (keep only categorical) dist2


In [9]:
########################### Transform Heavy Dominated columns
total_items = len(train_df)
keep_cols = [TARGET,'C3_fq_enc']

for col in list(train_df):
    if train_df[col].dtype.name!='category':
        cur_dominator = list(train_df[col].fillna(-999).value_counts())[0]
        if (cur_dominator/total_items > 0.85) and (col not in keep_cols):
            cur_dominator = train_df[col].fillna(-999).value_counts().index[0]
            print('Column:', col, ' | Dominator:', cur_dominator)
            train_df[col] = np.where(train_df[col].fillna(-999)==cur_dominator,1,0)
            test_df[col] = np.where(test_df[col].fillna(-999)==cur_dominator,1,0)

            train_df[col] = train_df[col].fillna(-999).astype(int)
            test_df[col] = test_df[col].fillna(-999).astype(int)

            if col not in categorical_features:
                categorical_features.append(col)
                
categorical_features +=['D8_not_same_day','TransactionAmt_check']

Column: card3  | Dominator: 150.0
Column: C3  | Dominator: 0.0
Column: C7  | Dominator: 0.0
Column: D6  | Dominator: 899261
Column: D7  | Dominator: 998181
Column: D8  | Dominator: 947967
Column: D9  | Dominator: 947967
Column: D12  | Dominator: 963260
Column: D13  | Dominator: 911895
Column: D14  | Dominator: 919850
Column: V14  | Dominator: 1.0
Column: V27  | Dominator: 0.0
Column: V28  | Dominator: 0.0
Column: V65  | Dominator: 1.0
Column: V68  | Dominator: 0.0
Column: V95  | Dominator: 0.0
Column: V98  | Dominator: 0.0
Column: V101  | Dominator: 0.0
Column: V102  | Dominator: 0.0
Column: V103  | Dominator: 0.0
Column: V104  | Dominator: 0.0
Column: V105  | Dominator: 0.0
Column: V106  | Dominator: 0.0
Column: V107  | Dominator: 1.0
Column: V108  | Dominator: 1.0
Column: V109  | Dominator: 1.0
Column: V110  | Dominator: 1.0
Column: V111  | Dominator: 1.0
Column: V112  | Dominator: 1.0
Column: V113  | Dominator: 1.0
Column: V114  | Dominator: 1.0
Column: V115  | Dominator: 1.0
Column

In [10]:
########################### Restore some categorical features
## These features weren't useful for lgbm
## but catboost can use it
restore_features = [
                    'uid','uid2','uid3','uid4','uid5','bank_type',
                    ]

for col in restore_features:
    categorical_features.append(col)
    remove_features.remove(col)

In [11]:
########################### Remove 100% duplicated columns
cols_sum = {}
bad_types = ['datetime64[ns]', 'category','object']

for col in list(train_df):
    if train_df[col].dtype.name not in bad_types:
        cur_col = train_df[col].values
        cur_sum = cur_col.mean()
        try:
            cols_sum[cur_sum].append(col)
        except:
            cols_sum[cur_sum] = [col]

cols_sum = {k:v for k,v in cols_sum.items() if len(v)>1}   

for k,v in cols_sum.items():
    for col in v[1:]:
        if train_df[v[0]].equals(train_df[col]):
            print('Duplicate', col)
            del train_df[col], test_df[col]

Duplicate card3_fq_enc
Duplicate card3_TransactionAmt_mean
Duplicate card3_TransactionAmt_std
Duplicate card3_cat
Duplicate C7_fq_enc
Duplicate D6_DT_D_min_max
Duplicate D6_DT_D_std_score
Duplicate D6_DT_W_min_max
Duplicate D6_DT_W_std_score
Duplicate D6_DT_M_min_max
Duplicate D6_DT_M_std_score
Duplicate nan_group_899261
Duplicate D7_DT_D_min_max
Duplicate D7_DT_D_std_score
Duplicate D7_DT_W_min_max
Duplicate D7_DT_W_std_score
Duplicate D7_DT_M_min_max
Duplicate D7_DT_M_std_score
Duplicate nan_group_998181
Duplicate D9
Duplicate D9_not_na
Duplicate D8_D9_decimal_dist
Duplicate D8_DT_D_min_max
Duplicate D8_DT_W_min_max
Duplicate D8_DT_M_min_max
Duplicate id_09
Duplicate id_10
Duplicate nan_group_947967
Duplicate D12_DT_D_min_max
Duplicate D12_DT_D_std_score
Duplicate D12_DT_W_min_max
Duplicate D12_DT_W_std_score
Duplicate D12_DT_M_min_max
Duplicate D12_DT_M_std_score
Duplicate nan_group_963260
Duplicate D13_DT_D_min_max
Duplicate D13_DT_D_std_score
Duplicate D13_DT_W_min_max
Duplicate D

In [12]:
########################### Encode Str columns
# As we restored some original features
# we nned to run LabelEncoder to reduce
# memory usage and garant that there are no nans
for col in list(train_df):
    if train_df[col].dtype=='O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col]  = test_df[col].fillna('unseen_before_label')
        
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)
        
        le = LabelEncoder()
        le.fit(list(train_df[col])+list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col]  = le.transform(test_df[col])
    
    elif col in categorical_features:
        train_df[col] = train_df[col].astype(float).fillna(-999)
        test_df[col]  = test_df[col].astype(float).fillna(-999)
        
        le = LabelEncoder()
        le.fit(list(train_df[col])+list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col]  = le.transform(test_df[col])

P_emaildomain_cat
R_emaildomain_cat


In [13]:
########################### Final features list
features_columns = [col for col in list(train_df) if col not in remove_features]
categorical_features = [col for col in categorical_features if col in features_columns]

########################### Final Minification
## I don't like this part as it changes float numbers
## small change but change.
## To be able to train catboost without 
## minification we need to do some changes on model
## we will do it later.
if not LOCAL_TEST:
    train_df = reduce_mem_usage(train_df)
    test_df  = reduce_mem_usage(test_df)
    
train_df = train_df[['TransactionID','DT_M',TARGET]+features_columns]
test_df  = test_df[['TransactionID','DT_M',TARGET]+features_columns]
gc.collect()

Mem. usage decreased to 1092.02 Mb (54.0% reduction)
Mem. usage decreased to 943.73 Mb (53.8% reduction)


112

In [14]:
########################### Cleaning
# Check what variables consume memory
for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))
print('Memory in Gb', get_memory_usage())

# Confirm thar variable exist
temp_df = 0

del temp_df
gc.collect()

                      train_df:   1.1GiB
                       test_df: 930.2MiB
              features_columns:   6.1KiB
                           _i2:   2.4KiB
                           _i8:   1.7KiB
                           _i3:   1.7KiB
          categorical_features:   1.2KiB
                      cols_sum:   1.2KiB
                           _ii:   1.1KiB
                          _i12:   1.1KiB
Memory in Gb 4.21


0

In [15]:
########################### Model Train
if LOCAL_TEST:
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, cat_params, 
                                        NFOLDS=4, kfold_mode='grouped')

else:    
    # Why NFOLDS = 6 -> we have 6 months -> let's split it by month))
    NFOLDS = 6
    folds = GroupKFold(n_splits=NFOLDS)

    X,y = train_df[features_columns], train_df[TARGET]    
    P,P_y = test_df[features_columns], test_df[TARGET]  
    
    split_groups = train_df['DT_M']
    # We don't need original sets anymore
    # let's reduce it
    train_df = train_df[['TransactionID',TARGET]] 
    test_df = test_df[['TransactionID',TARGET]] 
    test_df['prediction'] = 0
    gc.collect()
    
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        print('Fold:',fold_)
        
        estimator = CatBoostClassifier(**cat_params)        
        estimator.fit(
            X.iloc[trn_idx,:],y[trn_idx],
            eval_set=(X.iloc[val_idx,:], y[val_idx]),
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)

        oof_preds = estimator.predict_proba(X.iloc[val_idx,:])[:,1]
        oof[val_idx] = (oof_preds - oof_preds.min())/(oof_preds.max() - oof_preds.min())
        test_df['prediction'] += estimator.predict_proba(P)[:,1]/NFOLDS
        
        del estimator
        gc.collect()
        
    print('OOF AUC:', metrics.roc_auc_score(y, oof))

Fold: 0
0:	learn: 0.8613991	test: 0.7356347	best: 0.7356347 (0)	total: 144ms	remaining: 11m 57s
500:	learn: 0.9709161	test: 0.9051092	best: 0.9051092 (500)	total: 1m 4s	remaining: 9m 35s
1000:	learn: 0.9775985	test: 0.9116461	best: 0.9116461 (1000)	total: 2m 7s	remaining: 8m 27s
1500:	learn: 0.9827214	test: 0.9151939	best: 0.9152679 (1487)	total: 3m 10s	remaining: 7m 24s
2000:	learn: 0.9866871	test: 0.9154721	best: 0.9158686 (1749)	total: 4m 14s	remaining: 6m 21s
2500:	learn: 0.9898605	test: 0.9164411	best: 0.9167305 (2472)	total: 5m 18s	remaining: 5m 17s
3000:	learn: 0.9924441	test: 0.9172840	best: 0.9175852 (2967)	total: 6m 22s	remaining: 4m 14s
bestTest = 0.9175851643
bestIteration = 2967
Shrink model to first 2968 iterations.
Fold: 1
0:	learn: 0.8663313	test: 0.8422300	best: 0.8422300 (0)	total: 150ms	remaining: 12m 30s
500:	learn: 0.9680697	test: 0.9430394	best: 0.9430426 (498)	total: 1m 5s	remaining: 9m 47s
1000:	learn: 0.9750525	test: 0.9443727	best: 0.9443772 (995)	total: 2m 10

In [16]:
########################### Export
if not LOCAL_TEST:
    test_df['isFraud'] = test_df['prediction']
    test_df[['TransactionID','isFraud']].to_csv('submission.csv', index=False)