In [1]:
# Notebook-wide boolean switch, off by default.
# NOTE(review): presumably gates optional sanity-check cells further down — confirm against its users.
run_checks = False

### Overview
This notebook works on the IEEE-CIS Fraud Detection competition. Here I build a simple XGBoost model based on a balanced dataset.

### Lessons:

- Keep the categorical variables as single items.

- Use a high `max_depth` for XGBoost (maybe 40).


### Ideas to try:

- Train the divergence from the expected value (e.g. for TransactionAmt and distance) on the non-fraud subset only, rather than on the whole dataset as is the case now.

- Try a temporal approach to cross-validation (CV).

In [2]:
# all imports necessary for this notebook
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import gc
import copy
import missingno as msno 
import xgboost
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split 
from sklearn.metrics import roc_auc_score, r2_score

import warnings
# Suppress every warning for the rest of the session (no category filter is given).
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of each file found,
# so the dataset paths used by the read_csv cells below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/master-df-time-adjusted-top-100csv/master_df_time_adjusted_top_100.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/master-df-time-adjusted-top-200csv/master_df_time_adjusted_top_200.csv
/kaggle/input/ieee-preprocessed/master_df_top_all.csv
/kaggle/input/ieee-preprocessed/master_df_top_300.csv
/kaggle/input/ieee-preprocessed/master_df_top_100.csv
/kaggle/input/ieee-preprocessed/master_df_top_200.csv


In [3]:
# Helpers
    
def seed_everything(seed=0):
    '''Seed the random sources used by this notebook.

    Exports `seed` as the PYTHONHASHSEED environment variable and seeds both
    the stdlib `random` module and NumPy's global generator with it.

    Parameters
    ----------
    seed : int, default 0
        Value passed to `random.seed` and `np.random.seed`, and written
        (as a string) to `os.environ['PYTHONHASHSEED']`.
    '''
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    
def drop_correlated_cols(df, threshold, cols_to_keep, sample_frac = 1):
    '''Drop, in place, the later column of each highly correlated column pair.

    Columns are scanned left to right. A column is dropped when its Pearson
    correlation with an earlier, still-retained column is >= `threshold`.
    The comparison is signed, so strong negative correlations do not trigger
    a drop.

    Columns listed in `cols_to_keep` are left out of the comparison entirely:
    they are never dropped and never cause another column to be dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place.
    threshold : float
        Correlation at or above which the later column of a pair is dropped.
    cols_to_keep : iterable of str
        Column names excluded from the pruning.
    sample_frac : float, default 1
        Fraction of rows randomly sampled to estimate the correlations;
        1 uses every row.

    Returns
    -------
    None
    '''
    protected = set(cols_to_keep)

    # Estimating correlations on a random subset is cheaper on large frames.
    dataset = df if sample_frac == 1 else df.sample(frac = sample_frac)

    # Select numeric/bool columns explicitly: DataFrame.corr() in pandas >= 2.0
    # raises on non-numeric columns instead of silently skipping them.
    corr_matrix = dataset.select_dtypes(include = ['number', 'bool']).corr()
    columns = list(corr_matrix.columns)

    col_corr = set() # Set of all the names of deleted columns
    for i, candidate in enumerate(columns):
        # A protected column must never be dropped, whatever it correlates with.
        if candidate in protected:
            continue
        for j in range(i):
            earlier = columns[j]
            # Skip partners that are protected or already scheduled for removal.
            if earlier in protected or earlier in col_corr:
                continue
            if corr_matrix.iloc[i, j] >= threshold:
                col_corr.add(candidate)
                break

    del dataset, corr_matrix
    gc.collect()
    df.drop(columns = list(col_corr), inplace = True)

def calc_feature_difference(df, feature_name, indep_features, min_r2 = 0.1, min_r2_improv = 0, frac1 = 0.1,
                              max_depth_start = 2, max_depth_step = 4):
    '''Return `feature_name` minus its value predicted from `indep_features`.

    An XGBRegressor is fitted on a random sample of the rows where
    `feature_name` is not null and scored (R2) on a second, independently
    drawn sample. The tree depth starts at `max_depth_start` and grows by
    `max_depth_step` per round until the score stops improving. If the best
    R2 exceeds `min_r2`, the residual (actual - predicted) over the whole
    frame is returned; otherwise the feature is returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; not modified.
    feature_name : str
        Column to be explained.
    indep_features : list of str
        Columns used as regressors.
    min_r2 : float, default 0.1
        R2 the best model must exceed for the residual to be returned. The
        search is also abandoned once the depth has passed
        `max_depth_start * max_depth_step` while the best R2 is below
        `min_r2 / 2`.
    min_r2_improv : float, default 0
        Minimum relative gain over the previous best R2 required to try a
        deeper model.
    frac1 : float, default 0.1
        Fraction of the non-null rows drawn for each of the two samples.
    max_depth_start, max_depth_step : int
        First depth tried and the increment applied after each round.

    Returns
    -------
    pandas.Series
        Residuals of `feature_name`, or the original column when no model is
        good enough.
    '''
    print("Feature name %s" %feature_name)
    #print("Indep_features %s" %indep_features)

    # Only rows where the target is known can be used for fitting or scoring.
    # The mask does not change between rounds, so it is computed once.
    known_index = df.index[df[feature_name].notnull()]
    if len(known_index) == 0:
        # Nothing to learn from: fall back to the unchanged feature.
        return df[feature_name]

    curr_max_depth = max_depth_start
    best_r2 = float("-inf")
    clf_best = None

    while True:
        clf = XGBRegressor(max_depth = curr_max_depth)

        fit_index = known_index.to_series().sample(frac = frac1).index
        clf.fit(df.loc[fit_index, indep_features], df.loc[fit_index, feature_name])

        # Score on a second, independently drawn sample (it may overlap the first).
        eval_index = known_index.to_series().sample(frac = frac1).index
        pred_y = clf.predict(df.loc[eval_index, indep_features])
        r2Score = r2_score(df.loc[eval_index, feature_name], pred_y)
        print("%d, R2 score %.4f" % (curr_max_depth, r2Score))

        curr_max_depth = curr_max_depth + max_depth_step

        # Remember the best score from BEFORE this round: the improvement test
        # must compare against it, not against a value this round just set.
        prev_best = best_r2
        if r2Score > best_r2:
            best_r2 = r2Score
            clf_best = copy.deepcopy(clf)

        if prev_best == float("-inf"):
            # First round: there is nothing to compare against yet.
            gain_too_small = False
        else:
            gain_too_small = r2Score < prev_best + abs(prev_best) * min_r2_improv

        # Give up early when deeper trees still explain almost nothing.
        hopeless = curr_max_depth > max_depth_start * max_depth_step and best_r2 < min_r2 / 2

        if gain_too_small or hopeless:
            break

    print("The best R2 score of %.4f" % ( best_r2))

    if best_r2 > min_r2:
        pred_feature = clf_best.predict(df.loc[:, indep_features])
        return (df[feature_name] - pred_feature)
    else:
        return df[feature_name]

def reduce_mem_usage(df):
    """ Downcast the columns of a dataframe, in place, to reduce memory usage.

        - object columns are converted to 'category';
        - integer columns are cast to the smallest signed integer type that
          holds their min and max, when that type is narrower than the
          current one;
        - float columns are cast to float16 or float32 when their range fits
          strictly inside that type, otherwise to float64. The float16 cast
          is lossy: values are rounded to roughly three significant digits;
        - every other dtype (bool, category, datetime, nullable extension
          types, ...) is left untouched, so the function can safely be
          applied to an already reduced frame.

        Prints the memory usage before and after, and returns the same
        dataframe object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) have no numpy
        # min/max based downcast here; leave them as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            if df[col].empty:
                continue
            # Plain Python ints keep the bound checks exact for unsigned input.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            for target in int_targets:
                bounds = np.iinfo(target)
                if bounds.min <= c_min and c_max <= bounds.max:
                    # Only cast when it actually shrinks the column.
                    if np.dtype(target).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(target)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min = df[col].min()
            c_max = df[col].max()
            for target in float_targets:
                bounds = np.finfo(target)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max are NaN (all-missing column).
                df[col] = df[col].astype(np.float64)
        # bool, datetime, timedelta and complex columns are intentionally skipped.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
# Seed the random sources with the default seed (0) before any sampling below.
seed_everything()
# Let rendered frames show up to 500 columns instead of being truncated.
pd.set_option('display.max_columns', 500)

In [5]:
# Load the pre-processed master frame from the Kaggle input directory and
# downcast its dtypes immediately (reduce_mem_usage modifies the frame in place).
master_df = pd.read_csv('/kaggle/input/ieee-preprocessed/master_df_top_200.csv')
master_df = reduce_mem_usage(master_df)
master_df.head()

Memory usage of dataframe is 2913.18 MB
Memory usage after optimization is: 632.13 MB
Decreased by 78.3%


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C5,C6,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D13,D14,D15,M2,M3,M4,M5,M6,M7,M8,M9,V2,V3,V4,V5,V7,V10,V12,V13,V19,V20,V21,V35,V36,V37,V38,V40,V44,V45,V47,V48,V49,V53,V54,V55,V56,V61,V62,V64,V67,V70,V74,V75,V76,V77,V78,V79,V82,V83,V86,V87,V91,V94,V96,V97,V99,V100,V102,V103,V105,V110,V112,V115,V117,V122,V126,V127,V128,V129,V130,V131,V132,V133,V134,V136,V137,V141,V144,V145,V149,V150,V152,V162,V165,V166,V171,V173,V176,V177,V188,V189,V191,V194,V201,V202,V203,V206,V208,V209,V211,V221,V223,V224,V225,V230,V243,V244,V245,V247,V249,V254,V257,V258,V263,V264,V269,V271,V280,V281,V282,V283,V285,V288,V291,V293,V294,V296,V298,V301,V302,V303,V304,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V320,V322,V323,V324,V329,V333,id_01,id_02,id_05,id_06,id_13,id_14,id_16,id_17,id_18,id_19,id_20,id_25,id_26,id_36,id_38,DeviceType,DeviceInfo,is_train_df,P_emaildomain_2,P_emaildomain_3,P_emaildomain_4,id_31_edge,id_31_ie,id_31_chrome_version_newness,id_31_safari_version_newness,id_30_android,id_33_resolution,TransactionAmt_decimal,TransactionAmt_decimal_length,weekday,hours,id_18_15.0,card3_143.0,card3_144.0,card3_150.0,card3_185.0,R_emaildomain_2_com,id_14_-420.0,id_14_60.0,M4_M0,M4_M1,card6_credit,M8_F,hours_1.0,hours_3.0,hours_5.0,hours_11.0,hours_15.0,hours_16.0,hours_17.0,hours_18.0,hours_19.0,hours_20.0,hours_21.0,hours_22.0,card4_american 
express,card4_discover,card4_mastercard,id_13_19.0,id_13_49.0,R_emaildomain_anonymous.com,R_emaildomain_gmail.com,R_emaildomain_hotmail.com,P_emaildomain_4_com,id_31_tablet_False,addr2_87.0,id_19_193.0,id_19_271.0,id_19_312.0,id_19_321.0,id_19_infrequent_category,M6_F,M3_F,M5_F,card1_6019,card1_9500,card1_9633,card1_10616,card1_12695,card1_12839,card1_15885,card1_16132,card1_17188,card1_infrequent_category,M2_F,P_emaildomain_anonymous.com,P_emaildomain_bellsouth.net,P_emaildomain_comcast.net,P_emaildomain_gmail.com,P_emaildomain_hotmail.com,P_emaildomain_live.com,P_emaildomain_me.com,P_emaildomain_msn.com,P_emaildomain_optonline.net,P_emaildomain_outlook.com,P_emaildomain_verizon.net,P_emaildomain_yahoo.com,P_emaildomain_infrequent_category,addr1_181.0,addr1_184.0,addr1_204.0,addr1_231.0,addr1_264.0,addr1_299.0,addr1_315.0,addr1_325.0,addr1_330.0,addr1_337.0,addr1_433.0,addr1_441.0,addr1_472.0,addr1_485.0,addr1_infrequent_category,ProductCD_H,ProductCD_R,M9_F,id_20_333.0,id_20_401.0,id_20_500.0,id_20_507.0,id_20_533.0,id_20_549.0,id_20_612.0,card2_111.0,card2_170.0,card2_174.0,card2_225.0,card2_268.0,card2_321.0,card2_360.0,card2_361.0,card2_481.0,card2_490.0,card2_514.0,card2_553.0,card2_555.0,card2_567.0,card2_infrequent_category,M7_F,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,card5_166.0,card5_195.0,card5_224.0,card5_226.0,card5_229.0,card5_236.0,P_emaildomain_1_outlook,P_emaildomain_1_infrequent_category,id_17_100.0,id_17_166.0,id_32_24.0,DeviceInfo_SM-J700M Build/MMB29K,DeviceInfo_Trident/7.0,DeviceInfo_Windows
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,0.0,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,,,,,,,,,,500,1,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,0.0,,,M0,T,T,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,0,1,0.0,0.0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,315.0,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,0,1,0.0,0.0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,111.0,,,M0,T,F,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,48.0,28.0,10.0,4.0,38.0,24.0,0.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.0,0.0,0.0,0.0,10.0,0.0,1.0,1.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,0,1,0.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,140.0,0.0,1803.0,64.0,0.0,5155.0,2840.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,-480.0,NotFound,166.0,,542.0,144.0,,,F,T,mobile,SAMSUNG SM-G892A Build/NRD90M,1,com,,com,False,False,,,True,14.6875,0,1,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [6]:
# Force a garbage-collection pass; the displayed value is the number of unreachable objects found.
gc.collect()

0

In [7]:
# Load the time-adjusted variant of the master frame and downcast its dtypes
# the same way as master_df.
master_df_time_adjusted = pd.read_csv('/kaggle/input/master-df-time-adjusted-top-200csv/master_df_time_adjusted_top_200.csv')
master_df_time_adjusted = reduce_mem_usage(master_df_time_adjusted)
master_df_time_adjusted.head()

Memory usage of dataframe is 2126.29 MB
Memory usage after optimization is: 531.68 MB
Decreased by 75.0%


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C5,C6,C8,C9,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D10,D11,D13,D14,D15,M2,M3,M4,M5,M6,M7,M8,M9,V2,V3,V4,V5,V7,V10,V12,V13,V19,V20,V21,V35,V36,V37,V38,V40,V44,V45,V47,V48,V49,V53,V54,V55,V56,V61,V62,V64,V67,V70,V74,V75,V76,V77,V78,V79,V82,V83,V86,V87,V91,V99,V100,V105,V110,V112,V115,V117,V122,V126,V127,V128,V129,V130,V131,V134,V136,V137,V141,V144,V145,V149,V150,V152,V162,V165,V166,V171,V173,V176,V188,V189,V191,V194,V201,V202,V203,V206,V208,V209,V221,V223,V224,V225,V230,V243,V244,V245,V247,V249,V254,V257,V258,V263,V264,V269,V271,V281,V282,V283,V285,V288,V291,V296,V298,V301,V302,V303,V304,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V320,V333,id_01,id_02,id_05,id_06,id_13,id_14,id_16,id_17,id_18,id_19,id_20,id_25,id_26,id_36,id_38,DeviceType,DeviceInfo,is_train_df,P_emaildomain_2,P_emaildomain_3,P_emaildomain_4,id_31_edge,id_31_ie,id_31_chrome_version_newness,id_31_safari_version_newness,id_30_android,id_33_resolution,TransactionAmt_decimal,TransactionAmt_decimal_length,weekday,hours,id_18_15.0,R_emaildomain_2_com,id_14_-420.0,id_14_60.0,M4_M0,M4_M1,M8_F,id_13_19.0,id_13_49.0,R_emaildomain_anonymous.com,R_emaildomain_gmail.com,R_emaildomain_hotmail.com,P_emaildomain_4_com,id_19_193.0,id_19_271.0,id_19_312.0,id_19_321.0,id_19_infrequent_category,M6_F,M3_F,M5_F,M2_F,P_emaildomain_anonymous.com,P_emaildomain_bellsouth.net,P_emaildomain_comcast.net,P_emaildomain_gmail.com,P_emaildomain_hotmail.com,P_emaildomain_live.com,P_emaildomain_me.com,P_emaildomain_msn.com,P_emaildomain_optonline.net,P_emaildomain_outlook.com,P_emaildomain_verizon.net,P_emaildomain_yahoo.com,P_emaildomain_infrequent_category,M9_F,id_20_333.0,id_20_401.0,id_20_500.0,id_20_507.0,id_20_533.0,id_20_549.0,id_20_612.0,M7_F,P_emaildomain_1_outlook,P_emaildomain_1_infrequent_category,id_17_100.0,id_32_24.0,DeviceInfo_SM-J700
M Build/MMB29K,DeviceInfo_Trident/7.0,DeviceInfo_Windows
0,2987000,0.0,0,68.5,4,12696,-1,50,1,42,1,215,77,19.0,,,,0.452637,-56.375,0.0,-35.625,0.147949,1.0,-2.697266,-0.002918,-47.75,-44.46875,-75.1875,,13.0,,,,,,-129.625,13.0,,,-130.25,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,0.0,0.474121,0.28833,0.216187,0.882324,0.099915,,,,,,,,,,,0.330322,-0.563477,1.0,1.0,0.022949,1.0,0.001961,1.0,-0.691895,0.005024,0.390625,0.238159,1.0,1.0,-0.00102,0.0,0.0,-0.114136,-0.073059,-0.507324,-1.301758,-0.109802,-4.191406,1.0,1.0,1.0,1.0,1.0,-6.71961,-2.708824,-0.523158,0.0,-22.091198,0.0,142.647385,-364.04425,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.055756,1.0,-1.576172,0.0,-0.236084,-1.376953,-2.455078,0.0,-0.002861,0.022232,-0.046967,-115.976929,10.262924,-84.031685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.73419,-52.337128,0.007775,-62.248173,,,,,,,,,,,,,,,,,,,1,,,,,,,,,,180.125,-1.407227,0,0,0.038452,0.02977,-0.018692,0,-0.879883,0,-0.131226,-8.4e-05,-0.020798,-0.04126,0.000748,-0.002371,-0.665039,0,0,0,0,-0.005356,-0.512695,0,0.73291,0,0,0,0,0,-0.235229,0,0,0,0,0,0,0,0,0,-0.005093,0,0.037262,-0.00355,0.008629,-0.000116,0,-0.264893,0,0,-1e-06,-0.028214,0,0,-0.014145
1,2987001,0.0,1,29.0,4,1726,304,50,2,2,1,225,77,,,gmail.com,,-3.621094,-56.25,0.0,-4.152344,-0.184937,0.0,-6.273438,-0.002895,-86.625,-55.84375,-145.875,,,0.0,,,,,-96.375,,,,-135.375,,,M0,T,T,,,,,,,,,,-0.320801,-0.305664,0.181641,0.692383,0.099915,-0.458008,-0.699707,1.0,1.0,-0.002575,-0.247437,-0.52002,1.0,-0.254395,-0.387939,-0.820801,-1.00293,1.0,1.0,0.055237,1.0,0.000696,1.0,-0.649414,-0.006096,-0.626465,-0.745117,1.0,1.0,0.000196,1.0,1.0,0.004551,-0.052765,-0.647949,-0.977539,-0.225464,-1.176758,1.0,1.0,1.0,1.0,1.0,-38.638889,-94.207375,-902.065979,0.0,-93.548965,0.0,43.854446,-15.204865,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.3396,1.0,-1.37793,0.0,-0.244263,-0.557617,0.029922,0.0,-0.012245,-0.038361,-0.000793,-10.440208,293.314484,-1116.605957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-229.436371,-226.169266,-2985.060303,-25.625862,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,-325.25,-0.394287,0,0,-0.023315,-0.002499,0.002174,0,0.26001,0,-0.186523,9e-05,-0.010605,-0.001632,-0.004669,-0.001369,0.352295,0,0,0,0,-0.00362,-0.378662,0,-0.421143,0,0,0,0,1,-0.161499,0,0,0,0,0,0,0,0,0,0.001272,0,0.05072,-0.003147,0.000129,-0.021179,0,-0.254639,0,0,-0.000525,0.007568,0,0,-0.012245
2,2987002,0.0,2,59.0,4,3597,390,50,3,66,2,230,77,287.0,,outlook.com,,-4.550781,-57.46875,0.0,-5.875,-0.109131,1.0,-11.515625,-0.004707,-96.25,-50.1875,-195.875,,,0.0,,,,,-61.25,315.0,,,185.125,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,0.0,0.641113,0.500977,0.154785,0.53125,0.100281,0.530273,0.296631,1.0,1.0,0.000102,-0.133911,-0.03537,1.0,-0.310791,-0.292969,0.321289,-0.009979,1.0,1.0,0.103821,1.0,-0.000404,1.0,-0.699707,0.006508,0.378418,0.282471,1.0,1.0,-0.000632,1.0,1.0,0.086731,0.009644,-0.401611,-0.425537,-0.027618,0.019165,1.0,1.0,1.0,1.0,1.0,-7.368867,-0.263193,-244.418457,0.0,-0.95438,0.0,19.340231,-34.404888,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.182739,1.0,-1.493164,0.0,-0.298584,-0.222412,-0.129395,0.0,-0.001134,-0.000715,-0.000592,-37.330769,-614.068054,-277.90387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-22.479866,-90.550117,-206.577148,-73.767731,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,-305.75,-0.377441,0,0,0.001814,0.001918,0.002615,0,0.341064,0,0.8125,-5.5e-05,-0.010605,0.000667,-0.001724,-0.001341,0.600586,0,0,0,0,-0.004463,0.348145,0,0.717773,0,0,0,0,0,-0.182739,0,0,0,0,1,0,0,0,1,0.000941,0,0.051544,0.001185,-0.010506,-0.00198,0,0.714355,1,0,-1e-06,0.008881,0,0,-0.006023
3,2987003,0.0,3,50.0,4,16830,467,50,2,17,2,376,77,,,yahoo.com,,-28.25,-45.15625,0.0,-2.703125,0.60791,1.0,-9.507812,-0.026459,-60.09375,-68.4375,-133.625,112.0,0.0,94.0,0.0,,,,27.0625,,,,-65.25,,,M0,T,F,,,,,,,,,,0.525391,0.487061,-0.162354,0.500977,0.099915,0.549805,0.529297,1.0,1.0,-0.001173,-0.154907,-0.088623,1.0,-0.380371,-0.116394,0.073975,-0.660645,1.0,1.0,0.352539,1.0,-0.000169,1.0,-0.196655,0.008858,0.376953,0.255859,1.0,1.0,-0.000588,1.0,1.0,-0.017181,-0.16626,-0.763184,6.957031,3.398438,-2.183594,1.0,1.0,1.0,1.0,1.0,-215.648071,1497.818481,997.392578,0.0,267.860382,135.0,750.674011,-17.736492,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,-0.694824,0.0,6.558594,0.0,-0.22168,-0.200317,-0.29834,0.0,-0.001918,0.003338,0.000517,-108.145111,1642.264282,836.696655,0.0,354.0,0.0,135.0,0.0,0.0,0.0,13.099121,1192.914551,730.83606,13.065339,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,-130.5,-1.326172,0,0,-0.018738,0.007774,0.001982,0,0.117798,0,-0.146484,-6.9e-05,-0.010605,0.002058,-0.003838,-6.8e-05,0.62207,0,0,0,0,-0.005562,0.494873,0,-0.255615,0,0,0,0,0,-0.575684,0,0,0,0,0,0,1,0,0,0.000138,0,0.050507,0.000483,0.000549,-0.002802,0,-0.351562,0,0,-0.000494,0.000222,0,0,-0.003872
4,2987004,0.0,4,50.0,1,3434,414,50,2,2,1,320,77,,,gmail.com,,1.00293,-29.203125,0.0,0.961426,1.349609,0.0,-1.40625,-0.000846,-62.84375,-4.117188,-20.203125,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.176025,-0.041412,0.508789,1.0,1.0,1.0,1.0,1.0,-15.045835,-102.59185,-1001.933533,0.0,-20.846397,0.0,36.575062,-34.830391,0.0,0.0,6.152344,149.75,0.0,339.5,30.484375,0.0,-1002.35498,678.003906,-0.648926,0.0,-0.025162,1.0,1.0,1.0,1.0,1.0,-3861.406006,11.781294,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.146484,1.0,1.0,1.0,1.0,1.0,1.0,-0.083069,-0.462158,0.0,0.0,0.0,0.0,0.0,-2.042969,1.0,-0.128906,0.0,-0.019287,0.03717,-0.468018,0.0,0.04837,-0.059753,0.145386,41.349895,-2149.368652,-2181.003662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-613.816833,-80.648102,-4166.310547,-11.695937,-7.323485,0.0,-31713.632812,,,,-480.0,NotFound,166.0,,542.0,144.0,,,F,T,mobile,SAMSUNG SM-G892A Build/NRD90M,1,com,,com,False,False,,,True,14.6875,-38.3125,-0.008095,0,0,-0.057343,-0.190918,-0.003691,0,-0.145752,0,0.012894,-0.000199,-0.221191,-0.083923,-0.282227,-0.047821,0.28125,0,0,0,0,-0.233276,0.064392,0,-0.085144,0,0,0,0,1,-0.436279,0,0,0,0,0,0,0,0,0,-0.059479,0,0.051178,-0.004326,0.002384,-0.021576,0,-0.073669,0,0,-0.000525,-0.536621,0,0,-0.344727


In [8]:
# Force a garbage-collection pass; the displayed value is the number of unreachable objects found.
gc.collect()

0

In [9]:
# Remove these columns from the time-adjusted frame before merging; master_df
# supplies them, so they appear without an _x/_y suffix in the merged frame.
# The drop is in place, so re-running this cell raises KeyError because the
# columns are already gone.
master_df_time_adjusted.drop(columns = [ 'hours', 'weekday', 'TransactionDT', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5'
                                                      , 'card6', 'addr1', 'addr2', 'is_train_df', 'isFraud'], inplace = True)

In [10]:
# Left-join the time-adjusted columns onto master_df by TransactionID.
# validate='1:1' makes pandas raise if TransactionID is duplicated on either side.
# Column names present in both frames get the default _x (master_df) and
# _y (time-adjusted) suffixes.
master_df_merged = pd.merge(master_df, master_df_time_adjusted, on='TransactionID', how='left', validate = '1:1')

In [11]:
# (rows, columns) of the merged frame.
master_df_merged.shape

(1097231, 587)

In [12]:
# Preview the first rows of the merged frame, including the _x/_y column pairs.
master_df_merged.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt_x,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1_x,dist2_x,P_emaildomain_x,R_emaildomain_x,C1_x,C2_x,C5_x,C6_x,C8_x,C9_x,C10,C11_x,C12_x,C13_x,C14_x,D1_x,D2_x,D3_x,D4_x,D5_x,D6_x,D7_x,D8_x,D9,D10_x,D11_x,D13_x,D14_x,D15_x,M2_x,M3_x,M4_x,M5_x,M6_x,M7_x,M8_x,M9_x,V2_x,V3_x,V4_x,V5_x,V7_x,V10_x,V12_x,V13_x,V19_x,V20_x,V21_x,V35_x,V36_x,V37_x,V38_x,V40_x,V44_x,V45_x,V47_x,V48_x,V49_x,V53_x,V54_x,V55_x,V56_x,V61_x,V62_x,V64_x,V67_x,V70_x,V74_x,V75_x,V76_x,V77_x,V78_x,V79_x,V82_x,V83_x,V86_x,V87_x,V91_x,V94,V96,V97,V99_x,V100_x,V102,V103,V105_x,V110_x,V112_x,V115_x,V117_x,V122_x,V126_x,V127_x,V128_x,V129_x,V130_x,V131_x,V132,V133,V134_x,V136_x,V137_x,V141_x,V144_x,V145_x,V149_x,V150_x,V152_x,V162_x,V165_x,V166_x,V171_x,V173_x,V176_x,V177,V188_x,V189_x,V191_x,V194_x,V201_x,V202_x,V203_x,V206_x,V208_x,V209_x,V211,V221_x,V223_x,V224_x,V225_x,V230_x,V243_x,V244_x,V245_x,V247_x,V249_x,V254_x,V257_x,V258_x,V263_x,V264_x,V269_x,V271_x,V280,V281_x,V282_x,V283_x,V285_x,V288_x,V291_x,V293,V294,V296_x,V298_x,V301_x,V302_x,V303_x,V304_x,V306_x,V307_x,V308_x,V309_x,V310_x,V311_x,V312_x,V313_x,V314_x,V315_x,V316_x,V317_x,V318_x,V320_x,V322,V323,V324,V329,V333_x,id_01_x,id_02_x,id_05_x,id_06_x,id_13_x,id_14_x,id_16_x,id_17_x,id_18_x,id_19_x,id_20_x,id_25_x,id_26_x,id_36_x,id_38_x,DeviceType_x,DeviceInfo_x,is_train_df,P_emaildomain_2_x,P_emaildomain_3_x,P_emaildomain_4_x,id_31_edge_x,id_31_ie_x,id_31_chrome_version_newness_x,id_31_safari_version_newness_x,id_30_android_x,id_33_resolution_x,TransactionAmt_decimal_x,TransactionAmt_decimal_length_x,weekday,hours,id_18_15.0_x,card3_143.0,card3_144.0,card3_150.0,card3_185.0,R_emaildomain_2_com_x,id_14_-420.0_x,id_14_60.0_x,M4_M0_x,M4_M1_x,card6_credit,M8_F_x,hours_1.0,hours_3.0,hours_5.0,hours_11.0,hours_15.0,hours_16.0,hours_17.0,hours_18.0,hours_19.0,hours_20.0,hours_21.0,hours_22.0,card4_american 
express,card4_discover,card4_mastercard,id_13_19.0_x,id_13_49.0_x,...,card5_226.0,card5_229.0,card5_236.0,P_emaildomain_1_outlook_x,P_emaildomain_1_infrequent_category_x,id_17_100.0_x,id_17_166.0,id_32_24.0_x,DeviceInfo_SM-J700M Build/MMB29K_x,DeviceInfo_Trident/7.0_x,DeviceInfo_Windows_x,TransactionAmt_y,dist1_y,dist2_y,P_emaildomain_y,R_emaildomain_y,C1_y,C2_y,C5_y,C6_y,C8_y,C9_y,C11_y,C12_y,C13_y,C14_y,D1_y,D2_y,D3_y,D4_y,D5_y,D6_y,D7_y,D8_y,D10_y,D11_y,D13_y,D14_y,D15_y,M2_y,M3_y,M4_y,M5_y,M6_y,M7_y,M8_y,M9_y,V2_y,V3_y,V4_y,V5_y,V7_y,V10_y,V12_y,V13_y,V19_y,V20_y,V21_y,V35_y,V36_y,V37_y,V38_y,V40_y,V44_y,V45_y,V47_y,V48_y,V49_y,V53_y,V54_y,V55_y,V56_y,V61_y,V62_y,V64_y,V67_y,V70_y,V74_y,V75_y,V76_y,V77_y,V78_y,V79_y,V82_y,V83_y,V86_y,V87_y,V91_y,V99_y,V100_y,V105_y,V110_y,V112_y,V115_y,V117_y,V122_y,V126_y,V127_y,V128_y,V129_y,V130_y,V131_y,V134_y,V136_y,V137_y,V141_y,V144_y,V145_y,V149_y,V150_y,V152_y,V162_y,V165_y,V166_y,V171_y,V173_y,V176_y,V188_y,V189_y,V191_y,V194_y,V201_y,V202_y,V203_y,V206_y,V208_y,V209_y,V221_y,V223_y,V224_y,V225_y,V230_y,V243_y,V244_y,V245_y,V247_y,V249_y,V254_y,V257_y,V258_y,V263_y,V264_y,V269_y,V271_y,V281_y,V282_y,V283_y,V285_y,V288_y,V291_y,V296_y,V298_y,V301_y,V302_y,V303_y,V304_y,V306_y,V307_y,V308_y,V309_y,V310_y,V311_y,V312_y,V313_y,V314_y,V315_y,V316_y,V317_y,V318_y,V320_y,V333_y,id_01_y,id_02_y,id_05_y,id_06_y,id_13_y,id_14_y,id_16_y,id_17_y,id_18_y,id_19_y,id_20_y,id_25_y,id_26_y,id_36_y,id_38_y,DeviceType_y,DeviceInfo_y,P_emaildomain_2_y,P_emaildomain_3_y,P_emaildomain_4_y,id_31_edge_y,id_31_ie_y,id_31_chrome_version_newness_y,id_31_safari_version_newness_y,id_30_android_y,id_33_resolution_y,TransactionAmt_decimal_y,TransactionAmt_decimal_length_y,id_18_15.0_y,R_emaildomain_2_com_y,id_14_-420.0_y,id_14_60.0_y,M4_M0_y,M4_M1_y,M8_F_y,id_13_19.0_y,id_13_49.0_y,R_emaildomain_anonymous.com_y,R_emaildomain_gmail.com_y,R_emaildomain_hotmail.com_y,P_emaildomain_4_com_y,id_19_193.0_y,id_19_271.0_y,id_19_312.0_y,id_19_321.0_y,id_19_in
frequent_category_y,M6_F_y,M3_F_y,M5_F_y,M2_F_y,P_emaildomain_anonymous.com_y,P_emaildomain_bellsouth.net_y,P_emaildomain_comcast.net_y,P_emaildomain_gmail.com_y,P_emaildomain_hotmail.com_y,P_emaildomain_live.com_y,P_emaildomain_me.com_y,P_emaildomain_msn.com_y,P_emaildomain_optonline.net_y,P_emaildomain_outlook.com_y,P_emaildomain_verizon.net_y,P_emaildomain_yahoo.com_y,P_emaildomain_infrequent_category_y,M9_F_y,id_20_333.0_y,id_20_401.0_y,id_20_500.0_y,id_20_507.0_y,id_20_533.0_y,id_20_549.0_y,id_20_612.0_y,M7_F_y,P_emaildomain_1_outlook_y,P_emaildomain_1_infrequent_category_y,id_17_100.0_y,id_32_24.0_y,DeviceInfo_SM-J700M Build/MMB29K_y,DeviceInfo_Trident/7.0_y,DeviceInfo_Windows_y
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,13.0,,,0.0,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,,,,,,,,,,500,1,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,68.5,19.0,,,,0.452637,-56.375,0.0,-35.625,0.147949,1.0,-2.697266,-0.002918,-47.75,-44.46875,-75.1875,,13.0,,,,,,-129.625,13.0,,,-130.25,T,T,M2,F,T,,,,1.0,1.0,1.0,1.0,1.0,0.0,0.474121,0.28833,0.216187,0.882324,0.099915,,,,,,,,,,,0.330322,-0.563477,1.0,1.0,0.022949,1.0,0.001961,1.0,-0.691895,0.005024,0.390625,0.238159,1.0,1.0,-0.00102,0.0,0.0,-0.114136,-0.073059,-0.507324,-1.301758,-0.109802,-4.191406,1.0,1.0,1.0,1.0,1.0,-6.71961,-2.708824,-0.523158,0.0,-22.091198,0.0,142.647385,-364.04425,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.055756,1.0,-1.576172,0.0,-0.236084,-1.376953,-2.455078,0.0,-0.002861,0.022232,-0.046967,-115.976929,10.262924,-84.031685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.73419,-52.337128,0.007775,-62.248173,,,,,,,,,,,,,,,,,,,,,,,,,,,,180.125,-1.407227,0.038452,0.02977,-0.018692,0,-0.879883,0,-0.131226,-8.4e-05,-0.020798,-0.04126,0.000748,-0.002371,-0.665039,0,0,0,0,-0.005356,-0.512695,0,0.73291,0,0,0,0,0,-0.235229,0,0,0,0,0,0,0,0,0,-0.005093,0,0.037262,-0.00355,0.008629,-0.000116,0,-0.264893,0,0,-1e-06,-0.028214,0,0,-0.014145
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,,,,0.0,,,M0,T,T,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,0,1,0.0,0.0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,29.0,,,gmail.com,,-3.621094,-56.25,0.0,-4.152344,-0.184937,0.0,-6.273438,-0.002895,-86.625,-55.84375,-145.875,,,0.0,,,,,-96.375,,,,-135.375,,,M0,T,T,,,,,,,,,,-0.320801,-0.305664,0.181641,0.692383,0.099915,-0.458008,-0.699707,1.0,1.0,-0.002575,-0.247437,-0.52002,1.0,-0.254395,-0.387939,-0.820801,-1.00293,1.0,1.0,0.055237,1.0,0.000696,1.0,-0.649414,-0.006096,-0.626465,-0.745117,1.0,1.0,0.000196,1.0,1.0,0.004551,-0.052765,-0.647949,-0.977539,-0.225464,-1.176758,1.0,1.0,1.0,1.0,1.0,-38.638889,-94.207375,-902.065979,0.0,-93.548965,0.0,43.854446,-15.204865,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.3396,1.0,-1.37793,0.0,-0.244263,-0.557617,0.029922,0.0,-0.012245,-0.038361,-0.000793,-10.440208,293.314484,-1116.605957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-229.436371,-226.169266,-2985.060303,-25.625862,,,,,,,,,,,,,,,,,,,com,,com,,,,,,,-325.25,-0.394287,-0.023315,-0.002499,0.002174,0,0.26001,0,-0.186523,9e-05,-0.010605,-0.001632,-0.004669,-0.001369,0.352295,0,0,0,0,-0.00362,-0.378662,0,-0.421143,0,0,0,0,1,-0.161499,0,0,0,0,0,0,0,0,0,0.001272,0,0.05072,-0.003147,0.000129,-0.021179,0,-0.254639,0,0,-0.000525,0.007568,0,0,-0.012245
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,315.0,,,315.0,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,0,1,0.0,0.0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,59.0,287.0,,outlook.com,,-4.550781,-57.46875,0.0,-5.875,-0.109131,1.0,-11.515625,-0.004707,-96.25,-50.1875,-195.875,,,0.0,,,,,-61.25,315.0,,,185.125,T,T,M0,F,F,F,F,F,1.0,1.0,1.0,1.0,1.0,0.0,0.641113,0.500977,0.154785,0.53125,0.100281,0.530273,0.296631,1.0,1.0,0.000102,-0.133911,-0.03537,1.0,-0.310791,-0.292969,0.321289,-0.009979,1.0,1.0,0.103821,1.0,-0.000404,1.0,-0.699707,0.006508,0.378418,0.282471,1.0,1.0,-0.000632,1.0,1.0,0.086731,0.009644,-0.401611,-0.425537,-0.027618,0.019165,1.0,1.0,1.0,1.0,1.0,-7.368867,-0.263193,-244.418457,0.0,-0.95438,0.0,19.340231,-34.404888,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.182739,1.0,-1.493164,0.0,-0.298584,-0.222412,-0.129395,0.0,-0.001134,-0.000715,-0.000592,-37.330769,-614.068054,-277.90387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-22.479866,-90.550117,-206.577148,-73.767731,,,,,,,,,,,,,,,,,,,com,,com,,,,,,,-305.75,-0.377441,0.001814,0.001918,0.002615,0,0.341064,0,0.8125,-5.5e-05,-0.010605,0.000667,-0.001724,-0.001341,0.600586,0,0,0,0,-0.004463,0.348145,0,0.717773,0,0,0,0,0,-0.182739,0,0,0,0,1,0,0,0,1,0.000941,0,0.051544,0.001185,-0.010506,-0.00198,0,0.714355,1,0,-1e-06,0.008881,0,0,-0.006023
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,,,,111.0,,,M0,T,F,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,48.0,28.0,10.0,4.0,38.0,24.0,0.0,1.0,1.0,1.0,1.0,1.0,50.0,1758.0,925.0,0.0,354.0,135.0,50.0,1404.0,790.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.0,0.0,0.0,0.0,10.0,0.0,1.0,1.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1,com,,com,,,,,,,0,1,0.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,50.0,,,yahoo.com,,-28.25,-45.15625,0.0,-2.703125,0.60791,1.0,-9.507812,-0.026459,-60.09375,-68.4375,-133.625,112.0,0.0,94.0,0.0,,,,27.0625,,,,-65.25,,,M0,T,F,,,,,,,,,,0.525391,0.487061,-0.162354,0.500977,0.099915,0.549805,0.529297,1.0,1.0,-0.001173,-0.154907,-0.088623,1.0,-0.380371,-0.116394,0.073975,-0.660645,1.0,1.0,0.352539,1.0,-0.000169,1.0,-0.196655,0.008858,0.376953,0.255859,1.0,1.0,-0.000588,1.0,1.0,-0.017181,-0.16626,-0.763184,6.957031,3.398438,-2.183594,1.0,1.0,1.0,1.0,1.0,-215.648071,1497.818481,997.392578,0.0,267.860382,135.0,750.674011,-17.736492,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,-0.694824,0.0,6.558594,0.0,-0.22168,-0.200317,-0.29834,0.0,-0.001918,0.003338,0.000517,-108.145111,1642.264282,836.696655,0.0,354.0,0.0,135.0,0.0,0.0,0.0,13.099121,1192.914551,730.83606,13.065339,,,,,,,,,,,,,,,,,,,com,,com,,,,,,,-130.5,-1.326172,-0.018738,0.007774,0.001982,0,0.117798,0,-0.146484,-6.9e-05,-0.010605,0.002058,-0.003838,-6.8e-05,0.62207,0,0,0,0,-0.005562,0.494873,0,-0.255615,0,0,0,0,0,-0.575684,0,0,0,0,0,0,1,0,0,0.000138,0,0.050507,0.000483,0.000549,-0.002802,0,-0.351562,0,0,-0.000494,0.000222,0,0,-0.003872
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,140.0,0.0,1803.0,64.0,0.0,5155.0,2840.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70787.0,,,,-480.0,NotFound,166.0,,542.0,144.0,,,F,T,mobile,SAMSUNG SM-G892A Build/NRD90M,1,com,,com,False,False,,,True,14.6875,0,1,0.0,0.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0,0,50.0,,,gmail.com,,1.00293,-29.203125,0.0,0.961426,1.349609,0.0,-1.40625,-0.000846,-62.84375,-4.117188,-20.203125,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.176025,-0.041412,0.508789,1.0,1.0,1.0,1.0,1.0,-15.045835,-102.59185,-1001.933533,0.0,-20.846397,0.0,36.575062,-34.830391,0.0,0.0,6.152344,149.75,0.0,339.5,30.484375,0.0,-1002.35498,678.003906,-0.648926,0.0,-0.025162,1.0,1.0,1.0,1.0,1.0,-3861.406006,11.781294,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.146484,1.0,1.0,1.0,1.0,1.0,1.0,-0.083069,-0.462158,0.0,0.0,0.0,0.0,0.0,-2.042969,1.0,-0.128906,0.0,-0.019287,0.03717,-0.468018,0.0,0.04837,-0.059753,0.145386,41.349895,-2149.368652,-2181.003662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-613.816833,-80.648102,-4166.310547,-11.695937,-7.323485,0.0,-31713.632812,,,,-480.0,NotFound,166.0,,542.0,144.0,,,F,T,mobile,SAMSUNG SM-G892A 
Build/NRD90M,com,,com,False,False,,,True,14.6875,-38.3125,-0.008095,-0.057343,-0.190918,-0.003691,0,-0.145752,0,0.012894,-0.000199,-0.221191,-0.083923,-0.282227,-0.047821,0.28125,0,0,0,0,-0.233276,0.064392,0,-0.085144,0,0,0,0,1,-0.436279,0,0,0,0,0,0,0,0,0,-0.059479,0,0.051178,-0.004326,0.002384,-0.021576,0,-0.073669,0,0,-0.000525,-0.536621,0,0,-0.344727


In [13]:
# Remove the names master_df and master_df_time_adjusted, then run a
# garbage-collection pass so objects that are no longer referenced can be freed.
# NOTE(review): presumably both frames were already combined into
# master_df_merged by an earlier cell - confirm before re-running out of order.
del master_df, master_df_time_adjusted
gc.collect()

0

In [14]:
# Disabled step: everything below is wrapped in a string literal, so none of it
# runs (the cell merely echoes the string). If re-enabled it would call
# drop_correlated_cols on master_df_merged with a 0.9 threshold while keeping
# the columns listed in cols_to_keep.
'''cols_to_keep = ['TransactionID', 'hours', 'weekday', 'TransactionDT', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5'
                                                      , 'card6', 'addr1', 'addr2', 'is_train_df', 'isFraud']
drop_correlated_cols(master_df_merged, 0.9, cols_to_keep = cols_to_keep, sample_frac = 1)
master_df_merged.head()
'''

"cols_to_keep = ['TransactionID', 'hours', 'weekday', 'TransactionDT', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5'\n                                                      , 'card6', 'addr1', 'addr2', 'is_train_df', 'isFraud']\ndrop_correlated_cols(master_df_merged, 0.9, cols_to_keep = cols_to_keep, sample_frac = 1)\nmaster_df_merged.head()\n"

In [15]:
# Run a garbage-collection pass, then display the (rows, columns) shape of the
# merged frame as the cell output.
gc.collect()
master_df_merged.shape

(1097231, 587)

In [16]:
# Rebind the merged frame to the name master_df and drop the old name.
# This is a rename only: both names refer to the same object, nothing is copied.
master_df = master_df_merged
del master_df_merged

In [17]:
# Replace every non-numeric column with the integer codes of its categorical
# encoding, so the frame holds numbers only.
# Note: pandas assigns the code -1 to missing values in `.cat.codes`.
non_numeric_cols = master_df.select_dtypes(exclude='number').columns
for col_name in non_numeric_cols:
    as_category = master_df[col_name].astype('category')
    master_df[col_name] = as_category.cat.codes

In [18]:
# Disabled experiment: the whole cell body is a string literal, so nothing here
# executes (the cell merely echoes the string). The text it holds would:
#   1. build a balanced sample by keeping all isFraud==1 rows and sampling the
#      same number of isFraud==0 rows,
#   2. make a stratified 75/25 train/test split without isFraud, TransactionID
#      and TransactionDT as features,
#   3. fit XGBClassifier(max_depth=5, n_estimators=1000), print the ROC AUC on
#      the held-out part and plot the top-20 feature importances (gain, weight).
'''
length_ones = len(master_df[master_df['isFraud']==1])
train_balanced = pd.concat([master_df[master_df['isFraud']==1], (master_df[master_df['isFraud']==0]).sample(length_ones)], axis=0)

#train_balanced = train_balanced.sample(10000)


X_train, X_test, y_train, y_test = train_test_split(
    train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT']), train_balanced['isFraud'], 
    test_size=1/4, stratify =train_balanced['isFraud'],  random_state=0)

print(X_train.shape)
print(X_test.shape)

clf = XGBClassifier(max_depth=5, n_estimators=1000, verbosity=1)
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')
'''

'\nlength_ones = len(master_df[master_df[\'isFraud\']==1])\ntrain_balanced = pd.concat([master_df[master_df[\'isFraud\']==1], (master_df[master_df[\'isFraud\']==0]).sample(length_ones)], axis=0)\n\n#train_balanced = train_balanced.sample(10000)\n\n\nX_train, X_test, y_train, y_test = train_test_split(\n    train_balanced.drop(columns=[\'isFraud\', \'TransactionID\', \'TransactionDT\']), train_balanced[\'isFraud\'], \n    test_size=1/4, stratify =train_balanced[\'isFraud\'],  random_state=0)\n\nprint(X_train.shape)\nprint(X_test.shape)\n\nclf = XGBClassifier(max_depth=5, n_estimators=1000, verbosity=1)\nclf.fit(X_train, y_train)\npred_prob = clf.predict_proba(X_test)\npred_prob[:, 1]\nroc_score = roc_auc_score(y_test, pred_prob[:, 1])\nprint("roc_auc score %.4f" % roc_score)\nxgboost.plot_importance(clf, max_num_features=20, importance_type=\'gain\')\nxgboost.plot_importance(clf, max_num_features=20, importance_type=\'weight\')\n'

In [19]:
# Fit the model on every row that carries a label.
# NOTE(review): despite its name, train_balanced is not resampled here - it is
# simply all rows of master_df whose isFraud is not null.
train_balanced = master_df.loc[master_df['isFraud'].notna()]

# Columns excluded from the feature matrix (the target plus three others).
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']

print(train_balanced.shape)

# Only max_depth is overridden; every other hyper-parameter is the library default.
clf = XGBClassifier(max_depth=50)
# Keep fit() as the last expression so the fitted estimator is echoed as output.
clf.fit(
    train_balanced.drop(columns=temp_list_to_drop),
    train_balanced['isFraud'],
)

(590540, 587)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [20]:
# Garbage-collection pass after fitting the model; the number echoed by the
# cell is gc.collect()'s return value.
gc.collect()

19

In [21]:
# prepare submission
# Score the non-training rows chunk by chunk (so only one chunk of features is
# materialised at a time) and write the predicted fraud probability back into
# master_df['isFraud'].
PREDICT_CHUNK_SIZE = 10000  # rows scored per predict_proba call

# Same columns that were dropped when fitting `clf`, so the model receives the
# identical feature set, in the same column order, at prediction time.
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']
feature_cols = [col for col in master_df.columns if col not in temp_list_to_drop]

# Index label of the first row flagged as not belonging to the training data.
# NOTE(review): the loop compares index labels with len(master_df), which
# assumes a default 0..n-1 integer index and that every row from this label
# onwards is a test row - confirm against the cell that builds master_df.
counter_from = master_df.loc[master_df['is_train_df'] == 0, 'isFraud'].index[0]

len_master_df = len(master_df)

print(counter_from)
print(len_master_df)
print('start!!')
while counter_from < len_master_df:
    print(counter_from)
    # .loc slices by label and includes BOTH end points, so stop one label
    # short of the next chunk's start to avoid scoring the boundary row twice.
    counter_to = counter_from + PREDICT_CHUNK_SIZE - 1
    # Assign through a single .loc call on the frame itself: the previous
    # chained form (master_df['isFraud'].loc[...] = ...) may write to a
    # temporary and silently leave master_df unchanged.
    master_df.loc[counter_from:counter_to, 'isFraud'] = clf.predict_proba(
        master_df.loc[counter_from:counter_to, feature_cols]
    )[:, 1]

    counter_from += PREDICT_CHUNK_SIZE
    gc.collect()

590540
1097231
start!!
590540
600540
610540
620540
630540
640540
650540
660540
670540
680540
690540
700540
710540
720540
730540
740540
750540
760540
770540
780540
790540
800540
810540
820540
830540
840540
850540
860540
870540
880540
890540
900540
910540
920540
930540
940540
950540
960540
970540
980540
990540
1000540
1010540
1020540
1030540
1040540
1050540
1060540
1070540
1080540
1090540


In [22]:
#sample_submission.head()

In [23]:
# Find the index label of the first row whose is_train_df flag is 0, then take
# TransactionID and the isFraud column from that label to the end of the frame,
# renumbering the rows from zero.
is_test_row = master_df['is_train_df'].eq(0)
counter_from = is_test_row[is_test_row].index[0]
submission = (
    master_df.loc[counter_from:, ['TransactionID', 'isFraud']]
    .reset_index(drop=True)
)
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000278
1,3663550,0.000418
2,3663551,0.000972
3,3663552,0.001447
4,3663553,0.000229


In [24]:
# Summary statistics of the submission columns, as a sanity check on the
# range and distribution of the values in isFraud.
submission.describe()

Unnamed: 0,TransactionID,isFraud
count,506691.0,506691.0
mean,3916894.0,0.020447
std,146269.2,0.11676
min,3663549.0,7.5e-05
25%,3790222.0,0.000557
50%,3916894.0,0.001143
75%,4043566.0,0.002911
max,4170239.0,0.999512


In [25]:
# Write the submission frame to submission.csv (relative to the working
# directory) without the index column.
submission.to_csv('submission.csv', index=False)