In [1]:
# Notebook-level flag, presumably meant to toggle optional sanity checks;
# none of the cells shown in this notebook read it.
run_checks = False

### Overview
This notebook works on the IEEE-CIS Fraud Detection competition. Here I build a simple XGBoost model based on a balanced dataset.

### Lessons:

- Keep the categorical variables as single items.

- Use a high `max_depth` for XGBoost (maybe 40).


### Ideas to try:

- Train the divergence-from-expected-value features (e.g. for TransactionAmt and distance) on the non-fraud subset only (not on the whole dataset, as is currently the case).

- Try a temporal (time-based) approach to cross-validation (CV).

In [2]:
# all imports necessary for this notebook
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import gc
import copy
import missingno as msno 
import xgboost
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split 
from sklearn.metrics import roc_auc_score, r2_score

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-preprocessed/master_df_top_all.csv
/kaggle/input/ieee-preprocessed/master_df_top_300.csv
/kaggle/input/ieee-preprocessed/master_df_top_100.csv
/kaggle/input/ieee-preprocessed/master_df_top_200.csv


In [3]:
# Helpers
    
def seed_everything(seed=0):
    '''Seed Python's `random` module and NumPy's global RNG with `seed`,
    and store `str(seed)` in the PYTHONHASHSEED environment variable.'''
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    
def drop_correlated_cols(df, threshold, sample_frac = 1):
    '''Drop, in place, the later column of each highly correlated column pair.

    Columns are scanned in frame order. A column is dropped when its Pearson
    correlation with any earlier column that is itself being kept is
    >= `threshold`. Only positive correlation is tested (no absolute value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune; it is modified in place.
    threshold : float
        Correlation value at or above which the later column is dropped.
    sample_frac : float, default 1
        Fraction of rows used to estimate the correlations. Any value other
        than 1 draws a random sample with `df.sample(frac=sample_frac)`.

    Returns
    -------
    None
    '''
    dataset = df if sample_frac == 1 else df.sample(frac = sample_frac)

    # Restrict to numeric/bool columns explicitly: recent pandas versions raise
    # from `corr()` when the frame still holds non-numeric columns.
    corr_matrix = dataset.select_dtypes(include = ['number', 'bool']).corr()
    del dataset
    gc.collect()

    col_names = list(corr_matrix.columns)
    corr_values = corr_matrix.to_numpy()

    col_corr = set() # Set of all the names of deleted columns
    for i, colname in enumerate(col_names):
        for j in range(i):
            # NaN correlations compare False here, so such pairs are never dropped.
            if corr_values[i, j] >= threshold and col_names[j] not in col_corr:
                col_corr.add(colname)
                break  # one kept, correlated predecessor is enough
    df.drop(columns = list(col_corr), inplace = True)

def calc_feature_difference(df, feature_name, indep_features, min_r2 = 0.1, min_r2_improv = 0, frac1 = 0.1,
                              max_depth_start = 2, max_depth_step = 4):
    '''Return `feature_name` minus its value predicted from `indep_features`.

    XGBRegressor models of increasing `max_depth` are fitted on a random sample
    of the rows where `feature_name` is not null. Each model is scored with R2
    on a second random sample of those rows. The search stops as soon as a
    depth no longer improves enough on the best score so far, or when the
    depth has grown past `max_depth_start * max_depth_step` while the best R2
    is still below `min_r2 / 2`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding `feature_name` and all `indep_features`.
    feature_name : str
        Column to be modelled.
    indep_features : list
        Columns used as predictors.
    min_r2 : float, default 0.1
        The best R2 must exceed this for the residual to be returned.
    min_r2_improv : float, default 0
        Relative improvement over the previous best R2 that a new depth must
        reach for the search to continue.
    frac1 : float, default 0.1
        Fraction of the non-null rows drawn for each of the two samples.
    max_depth_start : int, default 2
        First `max_depth` tried.
    max_depth_step : int, default 4
        Increment applied to `max_depth` after every fit.

    Returns
    -------
    pandas.Series
        `df[feature_name] - prediction` over all rows when the best R2 is
        greater than `min_r2`; otherwise `df[feature_name]` unchanged.
    '''
    print("Feature name %s" %feature_name)

    # Rows with a known target; loop-invariant, so filtered once and kept as a
    # single column instead of re-filtering the whole frame on every iteration.
    known_target = df.loc[df[feature_name].notnull(), feature_name]

    curr_max_depth = max_depth_start
    best_r2 = float("-inf")
    clf_best = None

    while True:
        clf = XGBRegressor(max_depth = curr_max_depth)

        train_idx = known_target.sample(frac = frac1).index
        clf.fit(df.loc[train_idx, indep_features], df.loc[train_idx, feature_name])

        # NOTE: the scoring sample is drawn independently of the training
        # sample, so the two can share rows and the R2 is not purely out-of-sample.
        eval_idx = known_target.sample(frac = frac1).index
        pred_y = clf.predict(df.loc[eval_idx, indep_features])
        r2Score = r2_score(df.loc[eval_idx, feature_name], pred_y)
        print("%d, R2 score %.4f" % (curr_max_depth, r2Score))

        curr_max_depth = curr_max_depth + max_depth_step

        # Compare with the best score seen BEFORE this iteration. Comparing
        # after `best_r2` has been overwritten would measure the score against
        # itself and stop at once for any positive `min_r2_improv`.
        if clf_best is None:
            required_r2 = float("-inf")
        else:
            required_r2 = best_r2 + abs(best_r2) * min_r2_improv
        keeps_improving = r2Score >= required_r2

        if r2Score > best_r2:
            best_r2 = r2Score
            clf_best = copy.deepcopy(clf)

        # Give up once the depth is large and the fit is still poor.
        hopeless = curr_max_depth > max_depth_start * max_depth_step and best_r2 < min_r2 / 2
        if not keeps_improving or hopeless:
            break

    print("The best R2 score of %.4f" % ( best_r2))

    if best_r2 > min_r2:
        pred_feature = clf_best.predict(df.loc[:, indep_features])
        return (df[feature_name] - pred_feature)
    else:
        return df[feature_name]

In [4]:
# Seed the RNGs with the default seed and let pandas display up to 500 columns.
seed_everything(seed=0)
pd.options.display.max_columns = 500

In [5]:
# Load the preprocessed frame and preview its first rows.
master_df_path = '/kaggle/input/ieee-preprocessed/master_df_top_100.csv'
master_df = pd.read_csv(master_df_path)
master_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C5,C6,C8,C9,C10,C11,C13,C14,D1,D2,D3,D4,D5,D8,D9,D10,D11,D14,D15,M3,M4,M5,M6,M7,M8,M9,V4,V12,V20,V35,V36,V53,V56,V62,V70,V75,V76,V82,V87,V91,V94,V99,V102,V103,V112,V126,V127,V128,V130,V131,V133,V141,V145,V149,V152,V162,V173,V176,V177,V188,V189,V191,V201,V206,V208,V209,V223,V230,V244,V245,V254,V257,V258,V269,V271,V283,V285,V294,V304,V306,V307,V308,V310,V312,V313,V314,V315,V317,V320,V324,V329,id_01,id_02,id_05,id_06,id_13,id_17,id_19,id_20,id_26,DeviceInfo,is_train_df,P_emaildomain_2,P_emaildomain_3,id_31_edge,id_31_chrome_version_newness,id_31_safari_version_newness,id_30_android,id_33_resolution,TransactionAmt_decimal,weekday,hours,card3_144.0,card3_150.0,card3_185.0,R_emaildomain_2_com,id_14_-420.0,id_14_60.0,M4_M0,M4_M1,card6_credit,M8_F,hours_5.0,hours_11.0,card4_american express,R_emaildomain_anonymous.com,addr2_87.0,M6_F,M3_F,M5_F,card1_9500,card1_9633,card1_17188,card1_infrequent_category,P_emaildomain_comcast.net,P_emaildomain_gmail.com,P_emaildomain_hotmail.com,P_emaildomain_me.com,P_emaildomain_msn.com,P_emaildomain_optonline.net,P_emaildomain_verizon.net,P_emaildomain_infrequent_category,addr1_204.0,addr1_299.0,addr1_325.0,addr1_337.0,addr1_472.0,addr1_485.0,addr1_infrequent_category,ProductCD_H,ProductCD_R,id_20_333.0,id_20_549.0,id_20_612.0,card2_225.0,card2_268.0,card2_321.0,card2_481.0,card2_490.0,card2_553.0,card2_555.0,card2_567.0,card2_infrequent_category,M7_F,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,card5_166.0,card5_224.0,P_emaildomain_1_infrequent_category,id_17_166.0,id_32_24.0
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,14.0,,13.0,,,,,13.0,13.0,,0.0,T,M2,F,T,,,,1.0,1.0,1.0,,,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,117.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,,,,,,,,,,,,,1,,,,,,,,500,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,0.0,,,,0.0,,,0.0,,M0,T,T,,,,,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,,,0.0,,,,0.0,315.0,,315.0,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,4.0,0.0,1.0,0.0,1.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,84.0,,,111.0,,M0,T,F,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,38.0,24.0,1.0,50.0,1758.0,925.0,354.0,135.0,1404.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,10.0,38.0,0.0,50.0,1758.0,925.0,354.0,135.0,0.0,0.0,0.0,1404.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,0.0,64.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,166.0,542.0,144.0,,SAMSUNG SM-G892A Build/NRD90M,1,com,,False,,,True,14.689979,0,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


master_df.shape

In [6]:
# Replace every non-numeric column with its integer category codes
# (pandas assigns the code -1 to missing values).
non_numeric_cols = master_df.select_dtypes(exclude='number').columns
for col in non_numeric_cols:
    as_category = master_df[col].astype('category')
    master_df[col] = as_category.cat.codes

In [7]:
# Rows that carry a label (isFraud is not null) form the training set.
has_label = master_df['isFraud'].notnull()
train_balanced = master_df.loc[has_label]

# Target, identifier, timestamp and train/test flag are not used as features.
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']

print(train_balanced.shape)

train_features = train_balanced.drop(columns=temp_list_to_drop)
train_target = train_balanced['isFraud']

clf = XGBClassifier(max_depth=50)
clf.fit(train_features, train_target)

(590540, 193)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [8]:
# Run a garbage-collection pass; the cell shows the value gc.collect() returns.
gc.collect()

19

In [9]:
# prepare submission
# Score the unlabeled rows (is_train_df == 0) chunk by chunk and write the
# predicted fraud probability into master_df['isFraud'].

# Columns never fed to the model: target, identifier, timestamp, train/test flag.
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']
# Not used for prediction; kept only so the name holds the same value as before.
temp_list_to_include = list(train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT']).columns)

# Feature columns in frame order, i.e. the same columns and order the model was
# fitted on. Computed once rather than on every chunk.
feature_cols = [col for col in master_df.columns if col not in temp_list_to_drop]

chunk_size = 10000

# First index label of the rows to score. The loop assumes a default integer
# index with all rows to score sitting contiguously at the end of master_df.
counter_from = master_df.loc[master_df['is_train_df']==0, 'isFraud'].index[0]

len_master_df = len(master_df)

print(counter_from)
print(len_master_df)
print('start!!')
while counter_from < len_master_df:
    print(counter_from)
    # .loc slices include the end label, so stop one label short of the next
    # chunk to avoid scoring the boundary row twice.
    counter_to = counter_from + chunk_size - 1
    chunk_proba = clf.predict_proba(master_df.loc[counter_from:counter_to, feature_cols])[:, 1]
    # Single .loc assignment on the frame itself: the chained form
    # master_df['isFraud'].loc[...] = ... may write to a temporary copy.
    master_df.loc[counter_from:counter_to, 'isFraud'] = chunk_proba

    counter_from += chunk_size
    gc.collect()
#print(temp_list_to_include)

590540
1097231
start!!
590540
600540
610540
620540
630540
640540
650540
660540
670540
680540
690540
700540
710540
720540
730540
740540
750540
760540
770540
780540
790540
800540
810540
820540
830540
840540
850540
860540
870540
880540
890540
900540
910540
920540
930540
940540
950540
960540
970540
980540
990540
1000540
1010540
1020540
1030540
1040540
1050540
1060540
1070540
1080540
1090540


In [10]:
#sample_submission.head()

In [11]:
# Take TransactionID and the predicted isFraud from the first test row
# (is_train_df == 0) to the end of the frame, and renumber the rows from 0.
counter_from = master_df.index[master_df['is_train_df'] == 0][0]
submission = master_df.loc[counter_from:, ['TransactionID', 'isFraud']].reset_index(drop = True)
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000637
1,3663550,0.000576
2,3663551,0.00126
3,3663552,0.000719
4,3663553,0.000409


In [12]:
# Summary statistics of the submission frame, as a quick look at the predicted isFraud values.
submission.describe()

Unnamed: 0,TransactionID,isFraud
count,506691.0,506691.0
mean,3916894.0,0.020971
std,146269.2,0.118523
min,3663549.0,5.9e-05
25%,3790222.0,0.000497
50%,3916894.0,0.00111
75%,4043566.0,0.003014
max,4170239.0,0.999622


In [13]:
# Write the submission frame to 'submission.csv' without the index column.
submission.to_csv('submission.csv', index=False)