In [1]:
# Flag presumably intended to switch optional sanity-check cells on/off;
# NOTE(review): nothing in the visible notebook reads it - confirm before relying on it.
run_checks = False

### Overview
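This notebook works on the IEEE-CIS Fraud Detection competition. Here I build a simple XGBoost model. An earlier experiment trained on a class-balanced subsample (that cell is kept below but disabled); the model used for the submission is fit on all labelled training rows.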

### Lessons:

- Keep the categorical variables as single items.

- Use a high `max_depth` for XGBoost (maybe 40).


### Ideas to try:

- Train the divergence-from-expected-value features (e.g. for TransactionAmt and distance) on the non-fraud subset only, rather than on all rows as is done now.

- Try a temporal (time-based) approach to cross-validation.

In [2]:
# all imports necessary for this notebook
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import gc
import copy
import missingno as msno 
import xgboost
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split 
from sklearn.metrics import roc_auc_score, r2_score

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

/kaggle/input/master-df-time-adjusted-top-100-v2csv/master_df_time_adjusted_top_100.csv
/kaggle/input/master-df-time-adjusted-top-100csv/master_df_time_adjusted_top_100.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-preprocessed/master_df_top_300.csv
/kaggle/input/ieee-preprocessed/master_df_top_200.csv
/kaggle/input/ieee-preprocessed/master_df_top_100.csv
/kaggle/input/ieee-preprocessed/master_df_top_all.csv


In [3]:
# Helpers
    
def seed_everything(seed=0):
    '''Seed Python's `random` and NumPy's global generator with `seed`.

    Also stores `str(seed)` in the PYTHONHASHSEED environment variable.
    '''
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    
def drop_correlated_cols(df, threshold, cols_to_keep, sample_frac = 1):
    '''Drop, in place, the later of two columns whose Pearson correlation is >= threshold.

    Parameters
    ----------
    df : DataFrame that is modified in place (columns are removed from it).
    threshold : a pair counts as correlated when its correlation is >= this value.
        The signed correlation is compared, so strongly *negative* correlations
        are not treated as redundant.
    cols_to_keep : column names excluded from pruning: they are never dropped,
        and being correlated with one of them never causes another column to be dropped.
    sample_frac : fraction of rows used to estimate the correlations (1 = all rows).

    Returns
    -------
    None
    '''
    dataset = df if sample_frac == 1 else df.sample(frac = sample_frac)
    # Only numeric/bool columns can be correlated; selecting them explicitly avoids
    # the error raised by DataFrame.corr() on string columns in current pandas.
    corr_matrix = dataset.select_dtypes(include = ['number', 'bool']).corr()
    del dataset
    gc.collect()

    protected = set(cols_to_keep)
    col_names = list(corr_matrix.columns)
    corr_values = corr_matrix.values  # plain array: avoids an .iloc call per cell
    col_corr = set() # Set of all the names of deleted columns
    for i, colname in enumerate(col_names):
        # A protected column must never be the one that gets dropped.
        if colname in protected:
            continue
        for j in range(i):
            partner = col_names[j]
            # Ignore partners that are protected or have already been dropped themselves.
            if partner in protected or partner in col_corr:
                continue
            if corr_values[i, j] >= threshold:
                col_corr.add(colname)
                break  # one surviving correlated partner is enough

    df.drop(columns = list(col_corr), inplace = True)

def calc_feature_difference(df, feature_name, indep_features, min_r2 = 0.1, min_r2_improv = 0, frac1 = 0.1,
                              max_depth_start = 2, max_depth_step = 4):
    '''Return `feature_name` minus its value predicted from `indep_features`, when predictable.

    XGBRegressor models of increasing max_depth (max_depth_start, then
    + max_depth_step per round) are fitted on a random `frac1` sample of the rows
    where `feature_name` is not null, and scored (R2) on a second, independently
    drawn `frac1` sample. NOTE: the two samples may overlap, so the reported R2
    is somewhat optimistic.

    The search stops when a round fails to beat the previous best R2 by the
    relative margin `min_r2_improv`, or when the depth has grown past
    max_depth_start * max_depth_step while the best R2 is still below min_r2 / 2.

    Parameters
    ----------
    df : DataFrame holding `feature_name` and all `indep_features`.
    feature_name : name of the column to explain.
    indep_features : list of column names used as predictors.
    min_r2 : the residual is only returned if the best R2 exceeds this value.
    min_r2_improv : required relative R2 improvement over the previous best to keep searching.
    frac1 : fraction of the non-null rows drawn for fitting and for scoring.
    max_depth_start, max_depth_step : first max_depth tried and its per-round increment.

    Returns
    -------
    Series: df[feature_name] - prediction (all rows) if best R2 > min_r2,
    otherwise df[feature_name] unchanged.
    '''
    from copy import deepcopy

    print("Feature name %s" %feature_name)
    #print("Indep_features %s" %indep_features)

    # Loop-invariant: rows with a known target. Keeping only the target column avoids
    # copying the whole (wide) frame twice per round just to draw sample indices.
    known_target = df.loc[df[feature_name].notnull(), [feature_name]]
    if known_target.empty:
        # Nothing to fit on; hand the feature back untouched.
        return df[feature_name]

    is_improving = True
    curr_max_depth = max_depth_start
    best_r2 = float("-inf")
    clf_best = None

    while is_improving:
        clf = XGBRegressor(max_depth = curr_max_depth)

        fit_indices = known_target.sample(frac = frac1).index
        clf.fit(df.loc[fit_indices, indep_features], df.loc[fit_indices, feature_name])

        score_indices = known_target.sample(frac = frac1).index

        pred_y = clf.predict(df.loc[score_indices, indep_features])
        r2Score = r2_score(df.loc[score_indices, feature_name], pred_y)
        print("%d, R2 score %.4f" % (curr_max_depth, r2Score))

        curr_max_depth = curr_max_depth + max_depth_step

        # Remember the best score *before* this round: the improvement margin has to be
        # measured against it, not against a best_r2 that already includes this round.
        prev_best_r2 = best_r2
        if r2Score > best_r2:
            best_r2 = r2Score
            clf_best = deepcopy(clf)

        if prev_best_r2 == float("-inf"):
            stalled = False  # first round: there is nothing to compare against yet
        else:
            # abs() keeps the margin pointing upwards when the previous best is negative
            stalled = r2Score < prev_best_r2 + abs(prev_best_r2) * min_r2_improv
        hopeless = curr_max_depth > max_depth_start * max_depth_step and best_r2 < min_r2 / 2
        if stalled or hopeless:
            is_improving = False

    print("The best R2 score of %.4f" % ( best_r2))

    if best_r2 > min_r2:
        pred_feature = clf_best.predict(df.loc[:, indep_features])
        return (df[feature_name] - pred_feature)
    else:
        return df[feature_name]

In [4]:
# Seed the random generators with the helper's default seed (0) before any sampling.
seed_everything()
# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [5]:
# Load the preprocessed top-100 feature table. Later cells rely on its `is_train_df`
# flag to tell training rows from test rows, so it holds both sets.
master_df = pd.read_csv('/kaggle/input/ieee-preprocessed/master_df_top_100.csv')
master_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C5,C6,C8,C9,C10,C11,C13,C14,D1,D2,D3,D4,D5,D8,D9,D10,D11,D14,D15,M3,M4,M5,M6,M7,M8,M9,V4,V12,V20,V35,V36,V53,V56,V62,V70,V75,V76,V82,V87,V91,V94,V99,V102,V103,V112,V126,V127,V128,V130,V131,V133,V141,V145,V149,V152,V162,V173,V176,V177,V188,V189,V191,V201,V206,V208,V209,V223,V230,V244,V245,V254,V257,V258,V269,V271,V283,V285,V294,V304,V306,V307,V308,V310,V312,V313,V314,V315,V317,V320,V324,V329,id_01,id_02,id_05,id_06,id_13,id_17,id_19,id_20,id_26,DeviceInfo,is_train_df,P_emaildomain_2,P_emaildomain_3,id_31_edge,id_31_chrome_version_newness,id_31_safari_version_newness,id_30_android,id_33_resolution,TransactionAmt_decimal,weekday,hours,card3_144.0,card3_150.0,card3_185.0,R_emaildomain_2_com,id_14_-420.0,id_14_60.0,M4_M0,M4_M1,card6_credit,M8_F,hours_5.0,hours_11.0,card4_american express,R_emaildomain_anonymous.com,addr2_87.0,M6_F,M3_F,M5_F,card1_9500,card1_9633,card1_17188,card1_infrequent_category,P_emaildomain_comcast.net,P_emaildomain_gmail.com,P_emaildomain_hotmail.com,P_emaildomain_me.com,P_emaildomain_msn.com,P_emaildomain_optonline.net,P_emaildomain_verizon.net,P_emaildomain_infrequent_category,addr1_204.0,addr1_299.0,addr1_325.0,addr1_337.0,addr1_472.0,addr1_485.0,addr1_infrequent_category,ProductCD_H,ProductCD_R,id_20_333.0,id_20_549.0,id_20_612.0,card2_225.0,card2_268.0,card2_321.0,card2_481.0,card2_490.0,card2_553.0,card2_555.0,card2_567.0,card2_infrequent_category,M7_F,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,card5_166.0,card5_224.0,P_emaildomain_1_infrequent_category,id_17_166.0,id_32_24.0
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,14.0,,13.0,,,,,13.0,13.0,,0.0,T,M2,F,T,,,,1.0,1.0,1.0,,,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,117.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,,,,,,,,,,,,,1,,,,,,,,500,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,0.0,,,,0.0,,,0.0,,M0,T,T,,,,,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,,,0.0,,,,0.0,315.0,,315.0,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,4.0,0.0,1.0,0.0,1.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,84.0,,,111.0,,M0,T,F,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,38.0,24.0,1.0,50.0,1758.0,925.0,354.0,135.0,1404.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,10.0,38.0,0.0,50.0,1758.0,925.0,354.0,135.0,0.0,0.0,0.0,1404.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,0.0,64.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,166.0,542.0,144.0,,SAMSUNG SM-G892A Build/NRD90M,1,com,,False,,,True,14.689979,0,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [6]:
# Load the time-adjusted version (v2 dataset) of the feature table; it shares the
# TransactionID key with master_df and is merged onto it further down.
master_df_time_adjusted = pd.read_csv('/kaggle/input/master-df-time-adjusted-top-100-v2csv/master_df_time_adjusted_top_100.csv')
master_df_time_adjusted.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,P_emaildomain,R_emaildomain,C1,C2,C6,C8,C11,C13,C14,D1,D10,D15,M3,M4,M5,M6,M7,M8,M9,V12,V35,V36,V53,V70,V75,V76,V87,V91,V99,V126,V127,V128,V133,V145,V149,V152,V176,V177,V230,V257,V258,V285,V304,V306,V307,id_02,id_13,id_17,id_19,id_20,id_26,DeviceInfo,is_train_df,P_emaildomain_2,P_emaildomain_3,id_31_edge,id_30_android,TransactionAmt_decimal,weekday,hours,R_emaildomain_2_com,id_14_-420.0,M4_M0,M8_F,R_emaildomain_anonymous.com,M6_F,M5_F,P_emaildomain_hotmail.com,id_20_333.0,id_20_549.0,M7_F,id_32_24.0
0,2987000,0.0,0,4,12696,-1,50,1,42,1,215,77,,,0.452748,-56.367245,-35.609619,0.147989,-7.33456,-76.363113,-4.731055,-7.852631,-13.998543,-219.237869,T,M2,F,T,,,,0.332354,,,0.311933,-0.696159,0.42011,0.371908,-0.400271,-0.477729,-5.658206,-442.928955,-6130.06543,-47.239407,93.224512,,,,,,,,,-2.146344,0.000325,-98.38842,-65.309341,,,,,,,,1,,,,,100.97171,0,0,-0.01599,-0.019621,-0.265431,-0.164929,-0.003047,-0.080853,0.437072,-0.025767,-0.006322,-8e-06,-0.390713,-0.016675
1,2987001,0.0,1,4,1726,304,50,2,2,1,225,77,gmail.com,,-3.620259,-56.254463,-4.153918,-0.184928,-9.3722,-2.555614,-0.118743,-88.157257,-157.705399,-163.875885,,M0,T,T,,,,-0.726845,-0.738347,-0.600466,-0.84568,-0.669546,-0.7288,-0.656142,-0.179304,-0.418249,-7.665982,-29.248983,-1255.726807,10.947166,294.409943,,,,,,,,,-0.67422,0.007776,-1461.669067,325.092743,,,,,,,,1,com,,,,-256.13684,0,0,-0.000335,0.000125,0.185291,-0.182496,0.005825,-0.163323,-0.52669,0.002152,-0.154146,-8e-06,-0.2217,-0.00572
2,2987002,0.0,2,4,3597,390,50,3,66,2,230,77,outlook.com,,-4.552312,-57.456963,-5.875078,-0.109141,-13.440516,-8.605875,-3.505404,-48.127869,-70.123619,91.240555,T,M0,F,F,F,F,F,0.2998,0.297722,0.394673,0.299507,-0.521132,0.398391,0.318474,0.029664,-0.472105,-8.090937,-14.131649,-8801.166016,-127.531815,47.391029,,,,,,,,,-0.530957,0.000325,-206.973801,-566.554932,,,,,,,,1,com,,,,-366.5482,0,0,0.001292,6.3e-05,0.431426,0.819766,0.029431,0.422267,0.758143,0.015687,-0.000358,-8e-06,0.765233,0.001808
3,2987003,0.0,3,4,16830,467,50,2,17,2,376,77,yahoo.com,,-28.246696,-45.159729,-2.702492,0.607816,-11.248108,-48.973129,4.087717,-83.858826,-13.188324,-202.70694,,M0,T,F,,,,0.28404,0.588696,0.456558,0.216863,-0.823855,0.633004,0.214649,0.066225,-0.269714,0.613984,-16.667755,-7303.942383,875.265041,1494.660744,,,,,,,,,8.82947,0.002005,109.774151,1643.618919,,,,,,,,1,com,,,,-61.43119,0,0,0.007336,0.001207,0.319998,-0.146063,0.014364,0.494657,-0.807073,-0.036007,0.005367,-8e-06,-0.233716,-0.00109
4,2987004,0.0,4,1,3434,414,50,2,2,1,320,77,gmail.com,,1.002734,-29.206808,0.961519,1.349431,-0.743106,-4.84097,5.024339,1.039412,,,,,,,,,,,,,,,,,,,-0.641993,-134.359344,-621.891052,-380.488739,-776.441528,55.91214,-0.341633,33.539467,-1.079679,-0.108888,-18.394489,-0.234223,-0.001191,0.114226,-0.017394,-2210.778564,-884.677673,-93479.875,,166.0,542.0,144.0,,SAMSUNG SM-G892A Build/NRD90M,1,com,,False,True,-25.592527,0,0,-0.335331,-0.059717,-0.229276,0.016135,-0.174344,0.035477,0.056471,-0.084155,0.007734,0.005053,0.01736,-0.443237


In [7]:
# Strip these columns before the merge: master_df already provides them, so they
# would otherwise come back as suffixed duplicates.
shared_with_master = [
    'hours', 'weekday', 'TransactionDT', 'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'is_train_df', 'isFraud',
]
master_df_time_adjusted = master_df_time_adjusted.drop(columns = shared_with_master)

In [8]:
# Left-join the time-adjusted features onto master_df by TransactionID. validate='1:1'
# makes pandas raise if the key is duplicated on either side. Column names present in
# both frames receive the default suffixes: _x (master_df) and _y (time-adjusted).
master_df_merged = pd.merge(master_df, master_df_time_adjusted, on='TransactionID', how='left', validate = '1:1')

In [9]:
# Sanity check: a 1:1 left join must leave the row count equal to master_df's.
master_df_merged.shape

(1097231, 262)

In [10]:
# Eyeball the merged frame, in particular the _x / _y column pairs created by the merge.
master_df_merged.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain_x,R_emaildomain_x,C1_x,C2_x,C5,C6_x,C8_x,C9,C10,C11_x,C13_x,C14_x,D1_x,D2,D3,D4,D5,D8,D9,D10_x,D11,D14,D15_x,M3_x,M4_x,M5_x,M6_x,M7_x,M8_x,M9_x,V4,V12_x,V20,V35_x,V36_x,V53_x,V56,V62,V70_x,V75_x,V76_x,V82,V87_x,V91_x,V94,V99_x,V102,V103,V112,V126_x,V127_x,V128_x,V130,V131,V133_x,V141,V145_x,V149_x,V152_x,V162,V173,V176_x,V177_x,V188,V189,V191,V201,V206,V208,V209,V223,V230_x,V244,V245,V254,V257_x,V258_x,V269,V271,V283,V285_x,V294,V304_x,V306_x,V307_x,V308,V310,V312,V313,V314,V315,V317,V320,V324,V329,id_01,id_02_x,id_05,id_06,id_13_x,id_17_x,id_19_x,id_20_x,id_26_x,DeviceInfo_x,is_train_df,P_emaildomain_2_x,P_emaildomain_3_x,id_31_edge_x,id_31_chrome_version_newness,id_31_safari_version_newness,id_30_android_x,id_33_resolution,TransactionAmt_decimal_x,weekday,hours,card3_144.0,card3_150.0,card3_185.0,R_emaildomain_2_com_x,id_14_-420.0_x,id_14_60.0,M4_M0_x,M4_M1,card6_credit,M8_F_x,hours_5.0,hours_11.0,card4_american 
express,R_emaildomain_anonymous.com_x,addr2_87.0,M6_F_x,M3_F,M5_F_x,card1_9500,card1_9633,card1_17188,card1_infrequent_category,P_emaildomain_comcast.net,P_emaildomain_gmail.com,P_emaildomain_hotmail.com_x,P_emaildomain_me.com,P_emaildomain_msn.com,P_emaildomain_optonline.net,P_emaildomain_verizon.net,P_emaildomain_infrequent_category,addr1_204.0,addr1_299.0,addr1_325.0,addr1_337.0,addr1_472.0,addr1_485.0,addr1_infrequent_category,ProductCD_H,ProductCD_R,id_20_333.0_x,id_20_549.0_x,id_20_612.0,card2_225.0,card2_268.0,card2_321.0,card2_481.0,card2_490.0,card2_553.0,card2_555.0,card2_567.0,card2_infrequent_category,M7_F_x,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,card5_166.0,card5_224.0,P_emaildomain_1_infrequent_category,id_17_166.0,id_32_24.0_x,P_emaildomain_y,R_emaildomain_y,C1_y,C2_y,C6_y,C8_y,C11_y,C13_y,C14_y,D1_y,D10_y,D15_y,M3_y,M4_y,M5_y,M6_y,M7_y,M8_y,M9_y,V12_y,V35_y,V36_y,V53_y,V70_y,V75_y,V76_y,V87_y,V91_y,V99_y,V126_y,V127_y,V128_y,V133_y,V145_y,V149_y,V152_y,V176_y,V177_y,V230_y,V257_y,V258_y,V285_y,V304_y,V306_y,V307_y,id_02_y,id_13_y,id_17_y,id_19_y,id_20_y,id_26_y,DeviceInfo_y,P_emaildomain_2_y,P_emaildomain_3_y,id_31_edge_y,id_30_android_y,TransactionAmt_decimal_y,R_emaildomain_2_com_y,id_14_-420.0_y,M4_M0_y,M8_F_y,R_emaildomain_anonymous.com_y,M6_F_y,M5_F_y,P_emaildomain_hotmail.com_y,id_20_333.0_y,id_20_549.0_y,M7_F_y,id_32_24.0_y
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,14.0,,13.0,,,,,13.0,13.0,,0.0,T,M2,F,T,,,,1.0,1.0,1.0,,,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,117.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,,,,,,,,,,,,,1,,,,,,,,500,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0.452748,-56.367245,-35.609619,0.147989,-7.33456,-76.363113,-4.731055,-7.852631,-13.998543,-219.237869,T,M2,F,T,,,,0.332354,,,0.311933,-0.696159,0.42011,0.371908,-0.400271,-0.477729,-5.658206,-442.928955,-6130.06543,-47.239407,93.224512,,,,,,,,,-2.146344,0.000325,-98.38842,-65.309341,,,,,,,,,,,,100.97171,-0.01599,-0.019621,-0.265431,-0.164929,-0.003047,-0.080853,0.437072,-0.025767,-0.006322,-8e-06,-0.390713,-0.016675
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,,,0.0,,,,0.0,,,0.0,,M0,T,T,,,,,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,gmail.com,,-3.620259,-56.254463,-4.153918,-0.184928,-9.3722,-2.555614,-0.118743,-88.157257,-157.705399,-163.875885,,M0,T,T,,,,-0.726845,-0.738347,-0.600466,-0.84568,-0.669546,-0.7288,-0.656142,-0.179304,-0.418249,-7.665982,-29.248983,-1255.726807,10.947166,294.409943,,,,,,,,,-0.67422,0.007776,-1461.669067,325.092743,,,,,,,,com,,,,-256.13684,-0.000335,0.000125,0.185291,-0.182496,0.005825,-0.163323,-0.52669,0.002152,-0.154146,-8e-06,-0.2217,-0.00572
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,,,0.0,,,,0.0,315.0,,315.0,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,outlook.com,,-4.552312,-57.456963,-5.875078,-0.109141,-13.440516,-8.605875,-3.505404,-48.127869,-70.123619,91.240555,T,M0,F,F,F,F,F,0.2998,0.297722,0.394673,0.299507,-0.521132,0.398391,0.318474,0.029664,-0.472105,-8.090937,-14.131649,-8801.166016,-127.531815,47.391029,,,,,,,,,-0.530957,0.000325,-206.973801,-566.554932,,,,,,,,com,,,,-366.5482,0.001292,6.3e-05,0.431426,0.819766,0.029431,0.422267,0.758143,0.015687,-0.000358,-8e-06,0.765233,0.001808
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,4.0,0.0,1.0,0.0,1.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,84.0,,,111.0,,M0,T,F,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,38.0,24.0,1.0,50.0,1758.0,925.0,354.0,135.0,1404.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,10.0,38.0,0.0,50.0,1758.0,925.0,354.0,135.0,0.0,0.0,0.0,1404.0,0.0,,,,,,,,,,,,,1,com,,,,,,,0,0.0,0.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,yahoo.com,,-28.246696,-45.159729,-2.702492,0.607816,-11.248108,-48.973129,4.087717,-83.858826,-13.188324,-202.70694,,M0,T,F,,,,0.28404,0.588696,0.456558,0.216863,-0.823855,0.633004,0.214649,0.066225,-0.269714,0.613984,-16.667755,-7303.942383,875.265041,1494.660744,,,,,,,,,8.82947,0.002005,109.774151,1643.618919,,,,,,,,com,,,,-61.43119,0.007336,0.001207,0.319998,-0.146063,0.014364,0.494657,-0.807073,-0.036007,0.005367,-8e-06,-0.233716,-0.00109
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,140.0,0.0,64.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,166.0,542.0,144.0,,SAMSUNG SM-G892A Build/NRD90M,1,com,,False,,,True,14.689979,0,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,gmail.com,,1.002734,-29.206808,0.961519,1.349431,-0.743106,-4.84097,5.024339,1.039412,,,,,,,,,,,,,,,,,,,-0.641993,-134.359344,-621.891052,-380.488739,-776.441528,55.91214,-0.341633,33.539467,-1.079679,-0.108888,-18.394489,-0.234223,-0.001191,0.114226,-0.017394,-2210.778564,-884.677673,-93479.875,,166.0,542.0,144.0,,SAMSUNG SM-G892A Build/NRD90M,com,,False,True,-25.592527,-0.335331,-0.059717,-0.229276,0.016135,-0.174344,0.035477,0.056471,-0.084155,0.007734,0.005053,0.01736,-0.443237


In [11]:
# Both inputs are now contained in master_df_merged; drop the references and
# ask the garbage collector to reclaim the memory.
del master_df, master_df_time_adjusted
gc.collect()

0

In [12]:
# Disabled experiment, kept as a bare string so that nothing in it executes:
# prune columns with drop_correlated_cols at threshold 0.9, estimating correlations
# on a 20% row sample and passing the listed columns as cols_to_keep.
'''
cols_to_keep = ['TransactionID', 'hours', 'weekday', 'TransactionDT', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5'
                                                      , 'card6', 'addr1', 'addr2', 'is_train_df', 'isFraud']
drop_correlated_cols(master_df_merged, 0.9, cols_to_keep = cols_to_keep, sample_frac = 0.2)
master_df_merged.head()
'''

"\ncols_to_keep = ['TransactionID', 'hours', 'weekday', 'TransactionDT', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5'\n                                                      , 'card6', 'addr1', 'addr2', 'is_train_df', 'isFraud']\ndrop_correlated_cols(master_df_merged, 0.9, cols_to_keep = cols_to_keep, sample_frac = 0.2)\nmaster_df_merged.head()\n"

In [13]:
# Reclaim memory, then re-display the shape (unchanged while the pruning cell above is disabled).
gc.collect()
master_df_merged.shape

(1097231, 262)

In [14]:
# From here on the merged frame goes by the name master_df; this only rebinds the
# name (no copy is made) and removes the old one.
master_df = master_df_merged
del master_df_merged

In [15]:
# Replace every non-numeric column by the integer codes of its pandas categorical
# representation (pandas assigns code -1 to missing values).
non_numeric_cols = list(master_df.select_dtypes(exclude='number').columns)
encoded_cols = {name: master_df[name].astype('category').cat.codes for name in non_numeric_cols}
for name, codes in encoded_cols.items():
    master_df[name] = codes

In [16]:
# Disabled experiment, kept as a bare string so that nothing in it executes:
# undersample the non-fraud rows to the number of fraud rows, hold out a stratified 25%,
# fit XGBClassifier(max_depth=5, n_estimators=1000), print the hold-out ROC AUC and
# plot the top-20 feature importances by gain and by weight.
'''
length_ones = len(master_df[master_df['isFraud']==1])
train_balanced = pd.concat([master_df[master_df['isFraud']==1], (master_df[master_df['isFraud']==0]).sample(length_ones)], axis=0)

#train_balanced = train_balanced.sample(10000)


X_train, X_test, y_train, y_test = train_test_split(
    train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT']), train_balanced['isFraud'], 
    test_size=1/4, stratify =train_balanced['isFraud'],  random_state=0)

print(X_train.shape)
print(X_test.shape)

clf = XGBClassifier(max_depth=5, n_estimators=1000, verbosity=1)
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')
'''

'\nlength_ones = len(master_df[master_df[\'isFraud\']==1])\ntrain_balanced = pd.concat([master_df[master_df[\'isFraud\']==1], (master_df[master_df[\'isFraud\']==0]).sample(length_ones)], axis=0)\n\n#train_balanced = train_balanced.sample(10000)\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n    train_balanced.drop(columns=[\'isFraud\', \'TransactionID\', \'TransactionDT\']), train_balanced[\'isFraud\'], \n    test_size=1/4, stratify =train_balanced[\'isFraud\'],  random_state=0)\n\nprint(X_train.shape)\nprint(X_test.shape)\n\nclf = XGBClassifier(max_depth=5, n_estimators=1000, verbosity=1)\nclf.fit(X_train, y_train)\npred_prob = clf.predict_proba(X_test)\npred_prob[:, 1]\nroc_score = roc_auc_score(y_test, pred_prob[:, 1])\nprint("roc_auc score %.4f" % roc_score)\nxgboost.plot_importance(clf, max_num_features=20, importance_type=\'gain\')\nxgboost.plot_importance(clf, max_num_features=20, importance_type=\'weight\')\n'

In [17]:
# Training rows: flagged as train AND labelled. Filtering on the flag as well keeps this
# cell safe to re-run after the submission cell has written predictions into the test
# rows' isFraud (a label-only filter would then pull the test rows into training).
# Note: despite its name, this frame is NOT class-balanced; it holds all training rows.
is_labelled_train_row = (master_df['is_train_df'] == 1) & master_df['isFraud'].notnull()
train_balanced = master_df[is_labelled_train_row]

# Target, identifiers and the train/test flag must not be used as features.
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']

print(train_balanced.shape)


clf = XGBClassifier(max_depth=50)
clf.fit(train_balanced.drop(columns=temp_list_to_drop), train_balanced['isFraud'])

(590540, 262)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
# Reclaim memory left over from the training cell before scoring the test rows.
gc.collect()

0

In [19]:
# prepare submission
# Columns that must not be fed to the model (same list as used when fitting clf); the
# remaining columns keep master_df's order, i.e. the order the model was trained on.
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']
feature_cols = [col for col in master_df.columns if col not in temp_list_to_drop]
chunk_size = 10000  # rows scored per predict_proba call, to bound peak memory

# Index label of the first test row. The loop below compares labels with len(master_df)
# and slices by label, so it relies on the default 0..n-1 index with all test rows
# forming one contiguous block at the end.
counter_from = master_df.loc[master_df['is_train_df']==0, 'isFraud'].index[0]
len_master_df = len(master_df)
assert (master_df.loc[counter_from:, 'is_train_df'] == 0).all(), \
    "test rows are expected to form one contiguous block at the end of master_df"

print(counter_from)
print(len_master_df)
print('start!!')
while counter_from < len_master_df:
    print(counter_from)
    # .loc label slices include BOTH endpoints: stop one short of the next chunk's start
    # so that no row is scored twice.
    counter_to = counter_from + chunk_size - 1
    # Write through a single .loc call on master_df itself. The chained form
    # master_df['isFraud'].loc[...] = ... assigns into a temporary and is a silent
    # no-op under pandas copy-on-write.
    master_df.loc[counter_from:counter_to, 'isFraud'] = clf.predict_proba(
        master_df.loc[counter_from:counter_to, feature_cols])[:, 1]

    counter_from += chunk_size
    gc.collect()

590540
1097231
start!!
590540
600540
610540
620540
630540
640540
650540
660540
670540
680540
690540
700540
710540
720540
730540
740540
750540
760540
770540
780540
790540
800540
810540
820540
830540
840540
850540
860540
870540
880540
890540
900540
910540
920540
930540
940540
950540
960540
970540
980540
990540
1000540
1010540
1020540
1030540
1040540
1050540
1060540
1070540
1080540
1090540


In [20]:
#sample_submission.head()

In [21]:
# Index label of the first test row (first row whose is_train_df flag is 0).
counter_from = master_df.index[master_df['is_train_df'] == 0][0]
# Everything from that label to the end, reduced to the two submission columns
# and renumbered from 0.
submission = master_df.loc[counter_from:, ['TransactionID', 'isFraud']].reset_index(drop = True)
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000361
1,3663550,0.000206
2,3663551,0.001328
3,3663552,0.001711
4,3663553,0.000328


In [22]:
# Summary statistics of the predictions: isFraud should lie within [0, 1] and have no missing values.
submission.describe()

Unnamed: 0,TransactionID,isFraud
count,506691.0,506691.0
mean,3916894.0,0.020619
std,146269.2,0.117921
min,3663549.0,6.2e-05
25%,3790222.0,0.000533
50%,3916894.0,0.001112
75%,4043566.0,0.00288
max,4170239.0,0.999617


In [23]:
# Write the two-column submission file; index=False keeps the row index out of the CSV.
submission.to_csv('submission.csv', index=False)