### Overview
This notebook works on the IEEE-CIS Fraud Detection competition. Here I build a simple XGBoost model based on a balanced dataset.

In [None]:
# all imports necessary for this notebook
%matplotlib inline
import copy
import gc
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from xgboost import XGBClassifier, XGBRegressor

import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Helpers

def seed_everything(seed=0):
    """
    Seed the random number generators used in this notebook.

    Parameters:
    -----------
    seed : int (default=0)
        Value passed to `random.seed` and `np.random.seed`, and stored
        (as a string) in the PYTHONHASHSEED environment variable.
    """
    # NOTE(review): setting PYTHONHASHSEED from inside a running interpreter
    # presumably only affects child processes started later -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
seed_everything()
pd.set_option('display.max_columns', 500)

In [None]:
# Read the five competition CSVs, then left-join identity onto transactions
folder_path = '/kaggle/input/ieee-fraud-detection/'
table_names = ('train_identity', 'train_transaction', 'test_identity',
               'test_transaction', 'sample_submission')
tables = {name: pd.read_csv(f'{folder_path}{name}.csv') for name in table_names}
sample_submission = tables.pop('sample_submission')

# Every transaction is kept; identity columns are NaN where no identity row exists
train_df = tables['train_transaction'].merge(tables['train_identity'], on='TransactionID', how='left')
test_df = tables['test_transaction'].merge(tables['test_identity'], on='TransactionID', how='left')

# Release the raw tables now that the merged frames exist
del tables
gc.collect()

In [None]:
print(train_df.shape)
print(test_df.shape)
gc.collect()

In [None]:
# Percentage of missing values per feature, in train and in test
train_missing = train_df.isnull().mean().mul(100).rename('missing_perc_train')
test_missing = test_df.isnull().mean().mul(100).rename('missing_perc_test')

df_missing = (
    train_missing.to_frame()
    .join(test_missing)                      # features absent from test get NaN
    .rename_axis('Feature')
    .reset_index()
    .assign(missing_percent_avg=lambda d: (d['missing_perc_train'] + d['missing_perc_test']) / 2)
    .sort_values(by=['missing_percent_avg', 'missing_perc_train', 'missing_perc_test'])
)
print(df_missing.shape)
df_missing.head()

In [None]:
df_missing[~df_missing.Feature.str.contains('isFraud')].set_index('Feature').plot(figsize=(15,7.5), grid=True)

In [None]:
df_missing[~df_missing.Feature.str.contains('isFraud')].loc[df_missing.missing_perc_train<50].set_index('Feature').plot(figsize=(15,7.5), grid=True)
#df_missing.loc[df_missing.missing_perc_train<50].shape

In [None]:
gc.collect()

In [None]:
train_df['is_train_df'] = 1
test_df['is_train_df'] = 0
print(train_df.shape)
print(test_df.shape)

In [None]:
cols_orig_train = train_df.columns
master_df = pd.concat([train_df, test_df], ignore_index=True, sort =True).reindex(columns=cols_orig_train)
print(master_df.shape)
#master_df.head()

In [None]:
del test_df, train_df
gc.collect()

In [None]:
'''
train_df_row_indices = list(master_df.loc[master_df['is_train_df'] == 1].index)
train_df_col_names = list(master_df.columns)
train_df_col_names.remove('is_train_df')
print(master_df.loc[train_df_row_indices, train_df_col_names].describe())

test_df_row_indices = list(master_df.loc[master_df['is_train_df'] == 0].index)
test_df_col_names = list(master_df.columns)
test_df_col_names.remove('is_train_df')
test_df_col_names.remove('isFraud')
print(master_df.loc[test_df_row_indices, test_df_col_names].describe())
'''

In [None]:
cols_all = set(master_df.columns)

cols_target = 'isFraud'

# Categorical features: id_12..id_38, card1..card6, M1..M9 plus a few named columns
cols_cat = (
    {f'id_{i}' for i in range(12, 39)}
    | {f'card{i}' for i in range(1, 7)}
    | {f'M{i}' for i in range(1, 10)}
    | {'DeviceType', 'DeviceInfo', 'ProductCD', 'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2'}
)

# Everything else, except the target, is treated as continuous
cols_cont = cols_all - cols_cat - {cols_target}

n_cat, n_cont = len(cols_cat), len(cols_cont)
print(n_cat)
print(n_cont)
print(n_cat + n_cont)

In [None]:
# A set is not a valid DataFrame indexer on recent pandas; a sorted list also fixes the column order
msno.matrix(master_df[sorted(cols_cat)].sample(10000))

In [None]:
# A set is not a valid DataFrame indexer on recent pandas; a sorted list also fixes the column order
msno.matrix(master_df[sorted(cols_cont)].sample(10000))

In [None]:
# Feature engineering on the e-mail domains.
# Split each domain on '.' into its parts.
# NOTE(review): the assignment targets exactly three columns, so this assumes no
# domain has more than three dot-separated parts -- confirm against the data.
master_df[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = master_df['P_emaildomain'].str.split('.', expand=True)
master_df[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = master_df['R_emaildomain'].str.split('.', expand=True)
# *_4 strips the leading label, leaving what follows the first dot.
# NOTE(review): the trailing '.' in the pattern is unescaped, so it matches any
# character; a value containing no dot at all is consumed entirely and becomes ''.
# Confirm that this is the intended result for such values.
master_df['P_emaildomain_4'] = master_df['P_emaildomain'].str.replace('^[^.]+.', '', regex=True)
master_df['R_emaildomain_4'] = master_df['R_emaildomain'].str.replace('^[^.]+.', '', regex=True)
# Register the derived columns as categorical features
cols_cat.update(['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'P_emaildomain_4', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3', 'R_emaildomain_4'])


In [None]:
print('P_emaildomain_1', master_df['P_emaildomain_1'].unique())
print(80 * '-')
print('P_emaildomain_2', master_df['P_emaildomain_2'].unique())
print(80 * '-')
print('P_emaildomain_3', master_df['P_emaildomain_3'].unique())
print(80 * '-')
print('P_emaildomain_4', master_df['P_emaildomain_4'].unique())

In [None]:
# Cast the categorical features to pandas 'category' dtype.
# Index with a list (set indexers are rejected by recent pandas) and assign whole
# columns: `.loc[:, cols] = ...` writes into the existing columns and may leave
# their dtype unchanged.
temp_cols_cat_list = list(cols_cat)
master_df[temp_cols_cat_list] = master_df[temp_cols_cat_list].astype('category')

In [None]:
# Report, for every categorical feature, its missing rate (overall / train / test),
# its number of distinct values and its ten most frequent values.
print(master_df.loc[:, master_df.dtypes == object].shape)
print(len(cols_cat))

# A sorted list is a valid indexer (a set is not) and gives a reproducible order
cat_cols = sorted(cols_cat)
is_train = master_df['is_train_df'] == 1
is_test = master_df['is_train_df'] == 0

temp_missing_cat = master_df[cat_cols].isnull().sum().sort_values()
temp_missing_cat_train = master_df.loc[is_train, cat_cols].isnull().sum()
temp_missing_cat_test = master_df.loc[is_test, cat_cols].isnull().sum()

temp_len = len(master_df)
temp_len_train = int(is_train.sum())
temp_len_test = int(is_test.sum())

for col in temp_missing_cat.index:

    temp_missing_percent = temp_missing_cat[col] * 100 / temp_len
    temp_missing_percent_train = temp_missing_cat_train[col] * 100 / temp_len_train
    temp_missing_percent_test = temp_missing_cat_test[col] * 100 / temp_len_test
    print("\n%s, missing is: %.1f%% (train: %.1f%%, test: %.1f%%), n_unique is: %s\n"
          %(col, temp_missing_percent, temp_missing_percent_train, temp_missing_percent_test,  len(master_df.loc[:, col].unique()) ))
    print(master_df.loc[:, col].value_counts().iloc[0:10])
    print(80* '-')
    print(80* '-')

### Further feature engineering (FE)

In [None]:
#focus on id_31
master_df.id_31.astype(str).value_counts()[0:20]

In [None]:
#lowercase the whole column
master_df['id_31'] = master_df['id_31'].loc[master_df['id_31'].notnull()].str.lower()

In [None]:
# Build the list of candidate keywords found in the distinct id_31 values.
# dropna() removes missing values explicitly instead of relying on
# list.remove(np.nan), which raises ValueError when no matching NaN is present.
unique_id_31 = master_df['id_31'].dropna().unique()

# Split every distinct value into word tokens (letters, digits, '_' and apostrophes)
new_temp = [token for value in unique_id_31 for token in re.findall(r"[\w']+", value)]

# Keep the 1000 most frequent tokens, then discard pure numbers and single characters
most_common_words = [
    word
    for word, _ in Counter(new_temp).most_common(1000)
    if not word.isdigit() and len(word) > 1
]

print(most_common_words)

In [None]:
# Add one 'id_31_<word>' flag column per keyword that occurs often enough
temp_min_n_in_cat_to_keep = 1000

temp_added_cols = set()

for word in most_common_words:
    # True/False where id_31 is present, NaN where it is missing
    contains_word = master_df['id_31'].str.contains(word)
    temp_len = int((contains_word == True).sum())
    if temp_len < temp_min_n_in_cat_to_keep:
        continue

    print("%s: %d \n" %(word, temp_len))
    temp_new_col_name = 'id_31' + '_' + word
    master_df[temp_new_col_name] = contains_word
    temp_added_cols.add(temp_new_col_name)
    print(master_df[temp_new_col_name].describe())
    print(80* '-')

In [None]:
cols_cat = cols_cat.union(temp_added_cols)
#cols_cat

In [None]:
# Correlation between the keyword flags; index with a list because a set is not
# a valid DataFrame indexer on recent pandas (sorted for a stable column order)
corr = master_df[sorted(temp_added_cols)].astype('float16').corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
gc.collect()

In [None]:
master_df['id_31'].loc[master_df['id_31_chrome']== True].loc[master_df['id_31_android']== True].value_counts()

In [None]:
# Extract the two characters that follow 'chrome ' for non-generic chrome rows.
is_versioned_chrome = (master_df['id_31_chrome'] == True) & (master_df['id_31_generic'] == False)
chrome_version = master_df.loc[is_versioned_chrome, 'id_31'].str.slice(start=7, stop=9)

# Empty slices (no version text) become NaN. Done on the Series before assignment
# to avoid chained `df[col].loc[mask] = ...`, which may write to a temporary copy.
master_df['id_31_chrome_version'] = chrome_version.mask(chrome_version == '')
master_df['id_31_chrome_version'].value_counts()

In [None]:
# "Newness" of the chrome version: each row's version divided by a centered
# rolling mean of the version over neighbouring rows that have a version.
# NOTE(review): the window runs over master_df row order (train rows followed by
# test rows); this presumably approximates time order -- confirm.
rolling_window = 1000
min_rolling_window = 10
# Keep only rows with a parsed version and convert the version strings to floats
temp_df = master_df[['id_31_chrome_version']].loc[master_df['id_31_chrome_version'].notnull()].astype('float16')
temp_df['id_31_chrome_version_newness'] = temp_df['id_31_chrome_version'] / temp_df['id_31_chrome_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean()
#train_df[new_col_name] = train_df[col] / train_df[col].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean().interpolate()

In [None]:
plt.plot(temp_df['id_31_chrome_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())
plt.show()
plt.plot(temp_df['id_31_chrome_version_newness'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())

In [None]:
master_df['id_31_chrome_version_newness'] = temp_df['id_31_chrome_version_newness']
master_df.drop(columns=['id_31_chrome_version'], inplace=True)
del temp_df
gc.collect()
cols_cont.add('id_31_chrome_version_newness')

In [None]:
master_df['id_31'].loc[master_df['id_31_safari']== True].value_counts()

In [None]:
# Major safari version (8-12) parsed from id_31; NaN when no listed version matches.
master_df['id_31_safari_version'] = np.nan
for safari_major in (8, 9, 10, 11, 12):
    # na=False treats missing id_31 as "no match"
    has_version = master_df['id_31'].str.contains(f'safari {safari_major}.0', na=False)
    # Single .loc call on the frame: chained `df[col].loc[mask] = ...` may write to a copy
    master_df.loc[has_version, 'id_31_safari_version'] = safari_major

In [None]:
master_df['id_31_safari_version'].plot()

In [None]:
rolling_window = 20
min_rolling_window = 10
temp_df = master_df[['id_31_safari_version']].loc[master_df['id_31_safari_version'].notnull()].astype('float16')
temp_df['id_31_safari_version_newness'] = temp_df['id_31_safari_version'] / temp_df['id_31_safari_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean()
#train_df[new_col_name] = train_df[col] / train_df[col].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean().interpolate()

plt.plot(temp_df['id_31_safari_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())
plt.show()
plt.plot(temp_df['id_31_safari_version_newness'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())

In [None]:
temp_df['id_31_safari_version_newness'].hist()

In [None]:
master_df['id_31_safari_version_newness'] = temp_df['id_31_safari_version_newness']
master_df.drop(columns=['id_31_safari_version'], inplace=True)
del temp_df
gc.collect()
cols_cont.add('id_31_safari_version_newness')

In [None]:
# id_31 values excluding chrome and safari
master_df['id_31'].loc[(master_df['id_31_chrome']==False) & 
                       (master_df['id_31_safari']== False)].astype(str).value_counts()[0:16]

In [None]:
master_df.id_31.loc[master_df['id_31_edge']==True].value_counts()

In [None]:
# Major edge version (13-18) parsed from id_31; NaN when no listed version matches.
master_df['id_31_edge_version'] = np.nan
for edge_major in (13, 14, 15, 16, 17, 18):
    # na=False treats missing id_31 as "no match"
    has_version = master_df['id_31'].str.contains(f'edge {edge_major}.0', na=False)
    # Single .loc call on the frame: chained `df[col].loc[mask] = ...` may write to a copy
    master_df.loc[has_version, 'id_31_edge_version'] = edge_major

In [None]:
master_df['id_31_edge_version'].plot()

In [None]:
rolling_window = 100
min_rolling_window = 10
temp_df = master_df[['id_31_edge_version']].loc[master_df['id_31_edge_version'].notnull()].astype('float16')
temp_df['id_31_edge_version_newness'] = temp_df['id_31_edge_version'] / temp_df['id_31_edge_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean()
#train_df[new_col_name] = train_df[col] / train_df[col].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean().interpolate()

plt.plot(temp_df['id_31_edge_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())
plt.show()
plt.plot(temp_df['id_31_edge_version_newness'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())

In [None]:
master_df['id_31_edge_version_newness'] = temp_df['id_31_edge_version_newness']
master_df.drop(columns=['id_31_edge_version'], inplace=True)
del temp_df
gc.collect()
cols_cont.add('id_31_edge_version_newness')

In [None]:
master_df.id_31.loc[master_df['id_31_firefox']==True].value_counts()

In [None]:
# Take the two characters before the trailing two of id_31 on firefox rows.
firefox_version = master_df.loc[master_df['id_31_firefox'] == True, 'id_31'].str.slice(start=-4, stop=-2)

# 'ef' and 'er' are slices that are not version numbers; turn them into NaN on the
# Series itself instead of chained `df[col].loc[mask] = ...`, which may write to a copy.
master_df['id_31_firefox_version'] = firefox_version.mask(firefox_version.isin(['ef', 'er']))
master_df['id_31_firefox_version'].value_counts()

In [None]:
master_df['id_31_firefox_version'].astype('float16').plot()

In [None]:
rolling_window = 1000
min_rolling_window = 100
temp_df = master_df[['id_31_firefox_version']].loc[master_df['id_31_firefox_version'].notnull()].astype('float16')
temp_df['id_31_firefox_version_newness'] = temp_df['id_31_firefox_version'] / temp_df['id_31_firefox_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean()
#train_df[new_col_name] = train_df[col] / train_df[col].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean().interpolate()

plt.plot(temp_df['id_31_firefox_version'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())
plt.show()
plt.plot(temp_df['id_31_firefox_version_newness'].rolling(rolling_window, center= True, min_periods=min_rolling_window).mean())

In [None]:
master_df['id_31_firefox_version_newness'] = temp_df['id_31_firefox_version_newness']
master_df.drop(columns=['id_31_firefox_version'], inplace=True)
del temp_df
gc.collect()
cols_cont.add('id_31_firefox_version_newness')

That's enough feature engineering on the `id_31` variable (the same version treatment could also be applied to the Samsung entries).

In [None]:
# check 'DeviceInfo' variable
master_df['DeviceInfo'].astype(str).value_counts()[0:20]

No clear feature engineering to do on `DeviceInfo`, so moving on.

In [None]:
# check 'id_30'
master_df['id_30'].astype(str).value_counts()[0:30]

In [None]:
#lowercase the whole column
master_df['id_30'] = master_df['id_30'].loc[master_df['id_30'].notnull()].str.lower()

In [None]:
# Build the list of candidate keywords found in the distinct id_30 values.
# dropna() removes missing values explicitly instead of relying on
# list.remove(np.nan), which raises ValueError when no matching NaN is present.
unique_id_30 = master_df['id_30'].dropna().unique()

# Split every distinct value into word tokens (letters, digits, '_' and apostrophes)
new_temp = [token for value in unique_id_30 for token in re.findall(r"[\w']+", value)]

# Keep the 1000 most frequent tokens, then discard pure numbers and single characters
most_common_words = [
    word
    for word, _ in Counter(new_temp).most_common(1000)
    if not word.isdigit() and len(word) > 1
]

print(most_common_words)

In [None]:
# Hard code most common words
most_common_words = ['mac', 'ios', 'android', 'windows', 'linux']
most_common_words

In [None]:
# Add one 'id_30_<word>' flag column per keyword that occurs often enough
temp_min_n_in_cat_to_keep = 1000

temp_added_cols = set()

for word in most_common_words:
    # True/False where id_30 is present, NaN where it is missing
    contains_word = master_df['id_30'].str.contains(word)
    temp_len = int((contains_word == True).sum())
    if temp_len < temp_min_n_in_cat_to_keep:
        continue

    print("%s: %d \n" %(word, temp_len))
    temp_new_col_name = 'id_30' + '_' + word
    master_df[temp_new_col_name] = contains_word
    temp_added_cols.add(temp_new_col_name)
    print(master_df[temp_new_col_name].describe())
    print(80* '-')

In [None]:
cols_cat = cols_cat.union(temp_added_cols)
#cols_cat

In [None]:
# Correlation between the keyword flags; index with a list because a set is not
# a valid DataFrame indexer on recent pandas (sorted for a stable column order)
corr = master_df[sorted(temp_added_cols)].astype('float16').corr()
corr.style.background_gradient(cmap='coolwarm')

Enough feature engineering on `id_30` (though "newness" could be coded as was done for `id_31`).

In [None]:
# FE of id_33
master_df['id_33'].loc[master_df['id_33'].notnull()].astype(str).value_counts()[0:30]

In [None]:
gc.collect()

In [None]:
# Split the 'id_33' strings on 'x' into two numeric columns.
temp_df = master_df.loc[master_df['id_33'].notnull(), 'id_33'].str.split('x', expand=True)
temp_df.columns = ['id_33_1', 'id_33_2']   # raises if the split yields anything but two parts
temp_df = temp_df.astype('float64')

# Zeros become NaN. mask() on the frame replaces the chained
# `temp_df[col].loc[mask] = np.nan`, which may write to a temporary copy.
temp_df = temp_df.mask(temp_df == 0)

In [None]:
temp_df['id_33_resolution'] = temp_df['id_33_1'] * temp_df['id_33_2']
temp_df['id_33_resolution'] = np.log(temp_df['id_33_resolution'])
temp_df.describe()

In [None]:
master_df['id_33_resolution'] = temp_df['id_33_resolution']
cols_cont.add('id_33_resolution')
del temp_df
gc.collect()

In [None]:
master_df['id_33_resolution'].hist()

Moving now to some FE of continuous variables

In [None]:
# Decimal part of the 'TransactionAmt' feature, in thousandths.
# Binary floats make e.g. (29.99 - 29) * 1000 evaluate to 989.9999..., which a
# plain int cast truncates to 989. The small epsilon before flooring absorbs that
# representation error while still truncating digits beyond the third decimal.
amt = master_df['TransactionAmt']
frac_thousandths = (amt - amt.astype(int)) * 1000
master_df['TransactionAmt_decimal'] = np.floor(frac_thousandths + 1e-6).astype(int)

# Number of characters after the decimal point in the string form of 'TransactionAmt'
master_df['TransactionAmt_decimal_length'] = master_df['TransactionAmt'].astype(str).str.split('.', expand=True)[1].str.len()

cols_cont.update(['TransactionAmt_decimal', 'TransactionAmt_decimal_length'])

In [None]:
master_df['TransactionAmt_decimal_length'].hist()

In [None]:
gc.collect()

In [None]:
## Thanks to FChmiel (https://www.kaggle.com/fchmiel) for these two functions
def make_day_feature(df, offset=0, tname='TransactionDT'):
    """
    Build a day-of-week feature with values 0-6 from a time column in seconds.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame holding the time column.
    offset : float (default=0)
        Shift, in days, applied before flooring (moves where a day starts/ends).
        The notebook author reports 0.58 as a good value.
    tname : str
        Name of the time column in df.

    Returns the floored day count modulo 7, one value per row.
    """
    seconds_per_day = 3600 * 24
    shifted_days = df[tname] / seconds_per_day - 1 + offset
    return np.floor(shifted_days) % 7

def make_hour_feature(df, tname='TransactionDT'):
    """
    Build an hour-of-day feature with values 0-23 from a time column in seconds.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame holding the time column.
    tname : str
        Name of the time column in df.

    Returns the floored hour count modulo 24, one value per row.
    """
    seconds_per_hour = 3600
    whole_hours = np.floor(df[tname] / seconds_per_hour)
    return whole_hours % 24

In [None]:
master_df['weekday'] = make_day_feature(master_df, offset=0.58)
master_df['hours'] = make_hour_feature(master_df)
                                     
cols_cat.update(['weekday', 'hours'])

In [None]:
# check all cols in either cols_cat or cols_cont
print(set(master_df.columns).difference(cols_cat.union(cols_cont)))
print(cols_cat.intersection(cols_cont))

In [None]:
master_df.memory_usage().sum()

In [None]:
# Cast every categorical feature (original and engineered) to 'category' dtype
temp_cols_cat_list = list(cols_cat)
master_df[temp_cols_cat_list] = master_df[temp_cols_cat_list].astype('category')
gc.collect()
# Reuse the list: a set is not a valid DataFrame indexer on recent pandas
master_df[temp_cols_cat_list].describe()

In [None]:
master_df.memory_usage().sum()

In [None]:
# One-hot encode every categorical feature. Features with more than
# n_categories_to_keep distinct values keep only their most frequent values;
# all other non-missing values are pooled into 'infrequent_category'.
cols_cat_dummified = set()
n_categories_to_keep = 24

for col in cols_cat:
    print("%s, " %col, end="")

    temp_col = master_df.loc[:, [col]]
    len_categories = temp_col[col].nunique()   # distinct non-missing values

    if n_categories_to_keep < len_categories:
        top_cats = list(temp_col[col].value_counts(ascending = False, normalize=False).iloc[:n_categories_to_keep].index)
        top_cats.append('infrequent_category')

        # The `inplace=` argument of the .cat methods is gone from recent pandas,
        # so each step returns a new Series that is assigned explicitly.
        collapsed = temp_col[col].cat.add_categories(['infrequent_category'])
        is_rare = collapsed.notnull() & ~collapsed.isin(top_cats)
        collapsed.loc[is_rare] = 'infrequent_category'
        temp_col[col] = collapsed.cat.remove_categories(
            [cat for cat in collapsed.cat.categories if cat not in top_cats])

    # dummy_na=True adds an indicator column for missing values
    temp_col = pd.get_dummies(temp_col, dummy_na=True)

    cols_cat_dummified.update(list(temp_col.columns))
    master_df[temp_col.columns] = temp_col

    del temp_col
    gc.collect()


In [None]:
# Summarise the dummy columns; a sorted list replaces the set indexer, which
# recent pandas rejects
master_df[sorted(cols_cat_dummified)].astype('category').describe()

In [None]:
# master_df[[col for col in master_df.columns if len(master_df[col].loc[master_df[col].notnull()].unique()) ==  2 ]].dtypes

In [None]:
master_df.shape

In [None]:
'''
from scipy import stats

cols_cont_transformed = set()

for col in cols_cont:
    print("%s, " %col, end="")
    new_col_name = col + "_rel_mean"
    master_df[new_col_name] = np.float16(stats.zscore(master_df[col]))
    cols_cont_transformed.add(new_col_name)
    gc.collect()
'''

In [None]:
for col in cols_cat:
    master_df[col] = master_df[col].astype('category').cat.codes

In [None]:
# Balanced sample: every fraud row plus an equally sized random draw of non-fraud rows
fraud_rows = master_df[master_df['isFraud']==1]
length_ones = len(fraud_rows)
non_fraud_rows = master_df[master_df['isFraud']==0].sample(length_ones)
train_balanced = pd.concat([fraud_rows, non_fraud_rows], axis=0)

# Work on a 10000-row subsample
train_balanced = train_balanced.sample(10000)

# Stratified hold-out split: 2/3 train, 1/3 test
features = train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT'])
target = train_balanced['isFraud']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=1/3, stratify=target, random_state=0)

print(X_train.shape)
print(X_test.shape)

# Fit a deep XGBoost classifier and score it with ROC AUC on the hold-out part
clf = XGBClassifier(max_depth=40)
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')
#cols_cat_dummified.difference_update(set(added_list))

In [None]:
# Gain-based feature importances of the fitted model, highest first
temp = clf.get_booster().get_score(importance_type='gain')
df = pd.DataFrame(
    {'Feature': list(temp.keys()), 'Feature_importance': list(temp.values())}
).sort_values(by=['Feature_importance'], ascending = False)
print(df.shape)
df

In [None]:
temp_list = df.Feature.index
#print(temp_list)
df.loc[[index for index in temp_list if df.Feature[index] in cols_cat_dummified], :]

In [None]:
temp_list = [col for col in master_df.columns if ('addr' in col or 'card' in col or 'hour' in col or 'week' in col) and '_' in col ]
print(temp_list)

In [None]:
del X_train, X_test, y_train, y_test 
gc.collect()

In [None]:
#master_df.drop(columns = ['TransactionAmt_to_predicted'], inplace=True)

In [None]:
# Fit a shallow regressor that predicts 'TransactionAmt' from the other features.
clf = XGBRegressor(max_depth=3)

rand_sample_indeces = master_df.sample(100000).index

# Columns excluded from the regressors: label, identifiers and the target itself.
# The list is created here; it was previously extended without ever being
# defined, which raises NameError on a fresh kernel.
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'TransactionAmt']

# rand_sample_indeces holds index labels, so select with .loc for both X and y
sampled_rows = master_df.loc[rand_sample_indeces]
X_train, X_test, y_train, y_test = train_test_split(sampled_rows.drop(columns= temp_list_to_drop),
                                                    sampled_rows['TransactionAmt'],
                                                    test_size=1/3,  random_state=0)

print(X_train.shape)
print(X_test.shape)

clf.fit(X_train, y_train)
pred_y = clf.predict(X_test)
r2Score = r2_score(y_test, pred_y)
print("R2 score %.4f" % r2Score)

xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')


In [None]:
del X_train, X_test, y_train, y_test 
gc.collect()

In [None]:
len(master_df)

In [None]:
# Residual feature: actual 'TransactionAmt' minus the regressor's prediction,
# computed in chunks of 40000 rows.
master_df['TransactionAmt_to_predicted'] = np.nan
# The model was fitted without this column, so it must be dropped before predicting;
# the guard keeps the list unchanged when the cell is re-run.
if 'TransactionAmt_to_predicted' not in temp_list_to_drop:
    temp_list_to_drop.append('TransactionAmt_to_predicted')

len_master_df = len(master_df)
chunk_size = 40000
residuals = np.empty(len_master_df, dtype='float64')

# Purely positional slicing throughout: the earlier version mixed an inclusive
# label slice (.loc) on the left with an exclusive positional slice (.iloc) on the right.
for counter_from in range(0, len_master_df, chunk_size):
    print(counter_from)
    counter_to = counter_from + chunk_size
    chunk = master_df.iloc[counter_from:counter_to]
    pred_y = clf.predict(chunk.drop(columns= temp_list_to_drop))
    residuals[counter_from:counter_to] = chunk['TransactionAmt'].to_numpy() - pred_y
    gc.collect()

master_df['TransactionAmt_to_predicted'] = residuals

In [None]:
master_df['TransactionAmt_to_predicted'].describe()

In [None]:
master_df['TransactionAmt_to_predicted'].hist()

In [None]:
# Balanced sample: every fraud row plus an equally sized random draw of non-fraud rows
fraud_rows = master_df[master_df['isFraud']==1]
length_ones = len(fraud_rows)
non_fraud_rows = master_df[master_df['isFraud']==0].sample(length_ones)
train_balanced = pd.concat([fraud_rows, non_fraud_rows], axis=0)

# Work on a 10000-row subsample
train_balanced = train_balanced.sample(10000)

# Stratified hold-out split: 2/3 train, 1/3 test
features = train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT'])
target = train_balanced['isFraud']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=1/3, stratify=target, random_state=0)

print(X_train.shape)
print(X_test.shape)

# Fit the classifier and score it with ROC AUC on the hold-out part
clf = XGBClassifier(max_depth=5)
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')
#cols_cat_dummified.difference_update(set(added_list))

In [None]:
del X_train, X_test, y_train, y_test 
gc.collect()

In [None]:
# (Stray expression removed: the bare name `d` is not defined anywhere in this
# notebook and raised NameError, stopping "Restart Kernel -> Run All".)

In [None]:
# Fit a shallow regressor that predicts 'dist1' from the other features,
# using only rows where 'dist1' is known.
clf = XGBRegressor(max_depth=3)

rand_sample_indeces = master_df[master_df['dist1'].notnull()].sample(30000).index

# Columns excluded from the regressors: label, identifiers and the target itself
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'dist1']

# rand_sample_indeces holds index labels, so select with .loc for both X and y
# (passing labels to .iloc only works while labels happen to equal positions)
sampled_rows = master_df.loc[rand_sample_indeces]
X_train, X_test, y_train, y_test = train_test_split(sampled_rows.drop(columns= temp_list_to_drop),
                                                    sampled_rows['dist1'],
                                                    test_size=1/3,  random_state=0)

print(X_train.shape)
print(X_test.shape)

clf.fit(X_train, y_train)
pred_y = clf.predict(X_test)
r2Score = r2_score(y_test, pred_y)
print("R2 score %.4f" % r2Score)

xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')


In [None]:
del X_train, X_test, y_train, y_test 
gc.collect()

In [None]:
# Residual feature: actual 'dist1' minus the regressor's prediction,
# computed in chunks of 40000 rows (NaN wherever 'dist1' itself is missing).
master_df['dist1_to_predicted'] = np.nan
# The model was fitted without this column, so it must be dropped before predicting;
# the guard keeps the list unchanged when the cell is re-run.
if 'dist1_to_predicted' not in temp_list_to_drop:
    temp_list_to_drop.append('dist1_to_predicted')

len_master_df = len(master_df)
chunk_size = 40000
residuals = np.empty(len_master_df, dtype='float64')

# Purely positional slicing throughout: the earlier version mixed an inclusive
# label slice (.loc) on the left with an exclusive positional slice (.iloc) on the right.
for counter_from in range(0, len_master_df, chunk_size):
    print(counter_from)
    counter_to = counter_from + chunk_size
    chunk = master_df.iloc[counter_from:counter_to]
    pred_y = clf.predict(chunk.drop(columns= temp_list_to_drop))
    residuals[counter_from:counter_to] = chunk['dist1'].to_numpy() - pred_y
    gc.collect()

master_df['dist1_to_predicted'] = residuals

In [None]:
master_df[['dist1', 'dist1_to_predicted']].describe()

In [None]:
master_df['dist1_to_predicted'].hist()

In [None]:
# Balanced sample: every fraud row plus an equally sized random draw of non-fraud rows
fraud_rows = master_df[master_df['isFraud']==1]
length_ones = len(fraud_rows)
non_fraud_rows = master_df[master_df['isFraud']==0].sample(length_ones)
train_balanced = pd.concat([fraud_rows, non_fraud_rows], axis=0)

# Work on a 30000-row subsample
train_balanced = train_balanced.sample(30000)

# Stratified hold-out split: 2/3 train, 1/3 test
features = train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT'])
target = train_balanced['isFraud']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=1/3, stratify=target, random_state=0)

print(X_train.shape)
print(X_test.shape)

# Fit the classifier and score it with ROC AUC on the hold-out part
clf = XGBClassifier(max_depth=5)
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')
#cols_cat_dummified.difference_update(set(added_list))

In [None]:
del  X_test, X_train, y_test, y_train
gc.collect()

In [None]:
master_df[['TransactionAmt', 'addr2']].groupby('addr2').mean()

In [None]:
gc.collect()
temp_df_1 = master_df[['TransactionAmt', 'addr2']]
temp_df_2 = master_df[['TransactionAmt', 'addr2']].groupby('addr2').mean()
temp_df_2.rename(columns={"TransactionAmt": "TransactionAmt_by_addr2_mean"}, inplace = True)
temp_df_1 = pd.merge(temp_df_1, temp_df_2, on='addr2', how='left')
master_df['TransactionAmt_by_addr2_mean'] = temp_df_1['TransactionAmt_by_addr2_mean']
master_df['TransactionAmt_to_TransactionAmt_by_addr2_mean'] = master_df['TransactionAmt'] / temp_df_1['TransactionAmt_by_addr2_mean']

In [None]:
del temp_df_1, temp_df_2
gc.collect()

In [None]:
plt.plot(master_df['TransactionAmt_by_addr2_mean'])

In [None]:
# Balanced sample: every fraud row plus an equally sized random draw of non-fraud rows
fraud_rows = master_df[master_df['isFraud']==1]
length_ones = len(fraud_rows)
non_fraud_rows = master_df[master_df['isFraud']==0].sample(length_ones)
train_balanced = pd.concat([fraud_rows, non_fraud_rows], axis=0)

# Work on a 30000-row subsample
train_balanced = train_balanced.sample(30000)

# Stratified hold-out split: 2/3 train, 1/3 test
features = train_balanced.drop(columns=['isFraud', 'TransactionID', 'TransactionDT'])
target = train_balanced['isFraud']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=1/3, stratify=target, random_state=0)

print(X_train.shape)
print(X_test.shape)

# Fit the classifier and score it with ROC AUC on the hold-out part
clf = XGBClassifier(max_depth=5)
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')
#cols_cat_dummified.difference_update(set(added_list))

In [None]:
# Drop the train/hold-out split of the quick model and reclaim its memory.
del X_train, y_train
del X_test, y_test
gc.collect()

In [None]:
# The balanced subsample is no longer needed; delete it and run the garbage collector.
del train_balanced
gc.collect()

In [None]:
# Tabulate the 'gain' importance score of every feature the booster reports,
# sorted from highest to lowest.
gain_by_feature = clf.get_booster().get_score(importance_type='gain')
df = pd.DataFrame({
    'Feature': list(gain_by_feature.keys()),
    'Feature_importance': list(gain_by_feature.values()),
})
df = df.sort_values(by='Feature_importance', ascending=False)
print(df.shape)
df.head()

In [None]:
# Keep every feature that received an importance score, plus the bookkeeping
# columns that later cells still read.
temp_feature_to_keep = df['Feature'].tolist() + [
    'isFraud',
    'TransactionID',
    'TransactionDT',
    'is_train_df',
]
temp_feature_to_keep

In [None]:
# Every master_df column that is not on the keep list, in column order.
kept_columns = set(temp_feature_to_keep)
temp_features_to_drop = [name for name in master_df.columns if name not in kept_columns]
temp_features_to_drop

In [None]:
# Remove the unselected columns from master_df in place, then reclaim memory.
master_df.drop(temp_features_to_drop, axis=1, inplace=True)
gc.collect()

In [None]:
if False:
    %%time
    temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']

    X_train, X_test, y_train, y_test = train_test_split(
        master_df.loc[master_df['is_train_df']==1].drop(columns=temp_list_to_drop),
        master_df.loc[master_df['is_train_df']==1, ['isFraud']], test_size=1/3,
        stratify =master_df.loc[master_df['is_train_df']==1, ['isFraud']],  random_state=0)



    print(X_train.shape)
    print(X_test.shape)

    clf = XGBClassifier(max_depth=20)
    clf.fit(X_train, y_train)
    pred_prob = clf.predict_proba(X_test)
    pred_prob[:, 1]
    roc_score = roc_auc_score(y_test, pred_prob[:, 1])
    print("roc_auc score %.4f" % roc_score)
    xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
    xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')

In [None]:
# Disabled clean-up (literal False guard): would delete the train/hold-out
# split variables and run the garbage collector.
if False:
    del X_train, y_train
    del X_test, y_test
    gc.collect()

In [None]:
%%time
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']
clf = XGBClassifier(max_depth=20)
clf.fit(master_df.loc[master_df['is_train_df']==1].drop(columns=temp_list_to_drop), master_df.loc[master_df['is_train_df']==1, ['isFraud']])

In [None]:
%%time
master_df.drop(master_df[master_df['is_train_df']==1].index, inplace = True)
gc.collect()

In [None]:
# Show which is_train_df values are still present in master_df.
pd.unique(master_df['is_train_df'])

In [None]:
%%time
# prepare submission
temp_list_to_drop = ['isFraud', 'TransactionID', 'TransactionDT', 'is_train_df']
len_master_df = len(master_df)

counter_from = master_df.loc[master_df['is_train_df']==0, 'isFraud'].index[0]
counter_final = master_df.loc[master_df['is_train_df']==0, 'isFraud'].index[len_master_df - 1]
while counter_from <= counter_final:
    print(counter_from)
    counter_to = counter_from + 10000
    pred = pd.DataFrame()
    #print(len(master_df['isFraud'].loc[counter_from:counter_to]))
    #print(len(master_df.loc[counter_from:counter_to, [col for col in master_df.columns if col not in temp_list_to_drop]]))
    master_df['isFraud'].loc[counter_from:counter_to] =  clf.predict_proba(master_df.loc[counter_from:counter_to].drop(columns=temp_list_to_drop))[:, 1]
    
    counter_from += 10000
    gc.collect()
#print(temp_list_to_include)

In [None]:
# Preview the first rows of the ID / prediction pair with a fresh 0-based index.
master_df.loc[:, ['TransactionID', 'isFraud']].head().reset_index(drop=True)

In [None]:
# Show the first rows of sample_submission (read from sample_submission.csv).
sample_submission.head()

In [None]:
# Build the submission frame: TransactionID plus the isFraud column, re-indexed from 0.
submission_columns = ['TransactionID', 'isFraud']
submission = master_df.loc[:, submission_columns].reset_index(drop=True)
submission.head()

In [None]:
# Summary statistics of the submission columns (display only).
submission.describe()

In [None]:
# Write the submission frame to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)