In [None]:
# all imports necessary for this notebook
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import gc
import copy
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.metrics import roc_auc_score
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the competition CSVs from the Kaggle input directory.
folder_path = '/kaggle/input/ieee-fraud-detection/'
sample_submission = pd.read_csv(f'{folder_path}sample_submission.csv')

# Left-join the identity table onto the transaction table so that every
# transaction row is kept even when it has no identity record. The raw
# tables are only temporaries here and are not kept as notebook variables.
train_df = pd.merge(
    pd.read_csv(f'{folder_path}train_transaction.csv'),
    pd.read_csv(f'{folder_path}train_identity.csv'),
    on='TransactionID',
    how='left',
)
test_df = pd.merge(
    pd.read_csv(f'{folder_path}test_transaction.csv'),
    pd.read_csv(f'{folder_path}test_identity.csv'),
    on='TransactionID',
    how='left',
)

In [None]:
# Keep the label together with its key in a separate frame, then remove the
# label from the feature frame.
target_df = train_df.loc[:, ['TransactionID', 'isFraud']]
train_df.drop(columns='isFraud', inplace=True)

In [None]:
random.seed(12345)
np.random.seed(12345)

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
target_df.head()

In [None]:
print(train_df.shape)
print(test_df.shape)
print(target_df.shape)

In [None]:
# Column bookkeeping: each feature is treated as categorical or continuous.
cols_all = train_df.columns

cols_target = 'isFraud'

# Hand-maintained set of the columns handled as categorical.
cols_cat = {'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 
            'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 
            'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 
            'card6', 'M4','P_emaildomain',  'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 
            'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9'}

# Whatever is neither categorical nor the target counts as continuous.
cols_cont = {col for col in cols_all if not (col in cols_cat or col == cols_target)}

# Sizes of both groups and their total, one number per line.
print(len(cols_cat), len(cols_cont), len(cols_cat) + len(cols_cont), sep='\n')

In [None]:
def get_adverserial_df(df_1, df_2, n_sample, label = 'test_data', random_state = None):
    """
    Build a balanced data set for adversarial validation.

    Draws ``n_sample`` random rows from each input, marks the rows taken from
    ``df_1`` with 0 and the rows taken from ``df_2`` with 1 in a column named
    ``label``, and stacks both samples under a fresh 0..2*n_sample-1 index.
    The inputs themselves are not modified.

    Parameters:
    -----------
    df_1 : pd.DataFrame
        Source of the rows labelled 0.
    df_2 : pd.DataFrame
        Source of the rows labelled 1.
    n_sample : int
        Number of rows drawn (without replacement) from each input.
    label : str (default='test_data')
        Name of the indicator column that is added.
    random_state : int or np.random.RandomState, optional
        Seed (or generator) for the two draws. One generator is shared by
        both draws, so passing the same frame twice still yields two
        different samples. ``None`` keeps the previous behaviour of using
        numpy's global random state.

    Returns:
    --------
    pd.DataFrame or None
        The stacked, labelled sample. When both inputs are two-dimensional
        and their column counts differ, "Error!!" is printed and None is
        returned.
    """
    if len(df_1.shape) > 1 and len(df_2.shape) > 1 and df_1.shape[1] != df_2.shape[1]:
        print("Error!!")
        return

    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Copy each sample before adding the label so the write can never reach
    # (or warn about) the frame the sample was drawn from.
    first_sample = df_1.sample(n_sample, random_state=rng).copy()
    first_sample[label] = 0
    second_sample = df_2.sample(n_sample, random_state=rng).copy()
    second_sample[label] = 1
    return pd.concat([first_sample, second_sample], ignore_index=True)

In [None]:
def encode_cat_columns(df, cat_cols):
    """
    Replace categorical columns by their integer category codes, in place.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame to encode; it is modified and also returned.
    cat_cols : iterable of str
        Candidate column names. Names that are not columns of df are ignored.

    Returns:
    --------
    pd.DataFrame
        The same object that was passed in. Codes are those assigned by
        pandas' ``category`` dtype for this particular frame, with missing
        values encoded as -1.
    """
    present = [name for name in cat_cols if name in df.columns]
    for name in present:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.codes
    return df

In [None]:
def run_adversrial_analysis(data, target, test_size=1/3, print_result = True):
    """
    Fit an XGBoost classifier on a stratified split and score it with ROC AUC.

    Parameters:
    -----------
    data : pd.DataFrame
        Feature columns.
    target : pd.Series
        Labels to predict; also used to stratify the split.
    test_size : float (default=1/3)
        Share of the rows held out for scoring.
    print_result : bool (default=True)
        When True, print the score and draw two feature-importance charts
        (by gain and by weight, top 20 features each).

    Returns:
    --------
    (float, XGBClassifier)
        The ROC AUC on the held-out rows and the fitted classifier.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, stratify=target, random_state=0)

    clf = XGBClassifier()
    clf.fit(X_train, y_train)
    # Second column of predict_proba is used as the score of the positive
    # class (assumes 0/1 labels, as produced by get_adverserial_df).
    pred_prob = clf.predict_proba(X_test)[:, 1]
    roc_score = roc_auc_score(y_test, pred_prob)
    if print_result:
        print("roc_auc score %.2f" % roc_score)
        # Title each chart with its importance type; with the default title
        # the two charts cannot be told apart.
        xgboost.plot_importance(clf, max_num_features=20, importance_type='gain',
                                title='Feature importance (gain)')
        xgboost.plot_importance(clf, max_num_features=20, importance_type='weight',
                                title='Feature importance (weight)')
    return roc_score, clf

In [None]:
def reset_cols_excluded(print_excluded = False):
    """
    Return a fresh copy of the default set of columns kept out of the models.

    Parameters:
    -----------
    print_excluded : bool (default=False)
        When True, also print the set.

    Returns:
    --------
    set of str
        A new set on every call, so callers may add to or remove from it
        without affecting later calls.
    """
    baseline = {'test_data', 'TransactionID', 'TransactionDT'}
    if not print_excluded:
        return baseline
    print("Excluded columns:\n", baseline, sep='')
    return baseline

In [None]:
cols_excluded = reset_cols_excluded(print_excluded=True)

In [None]:
# Baseline: both samples are drawn from train_df, so the classifier has no
# real difference to find (the expectation is presumably a score near 0.5).
print("Run adversarial: train vs. train")
adverserial_df = encode_cat_columns(
    get_adverserial_df(train_df, train_df, n_sample=1000),
    cols_cat,
)
run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)

In [None]:
# Same baseline as the train vs. train run, with both samples drawn from test_df.
print("Run adversarial: test vs. test")
adverserial_df = encode_cat_columns(
    get_adverserial_df(test_df, test_df, n_sample=1000),
    cols_cat,
)
run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)

In [None]:
# Order both frames chronologically (TransactionID breaks ties) and renumber
# the rows 0..n-1. Later cells mix positional slices (train_df[:k]) with
# label-based ones (train_df.loc[:k]); without the renumbering a label slice
# would follow the pre-sort row labels instead of the temporal order.
train_df = train_df.sort_values(by=['TransactionDT', 'TransactionID']).reset_index(drop=True)
test_df = test_df.sort_values(by=['TransactionDT', 'TransactionID']).reset_index(drop=True)

In [None]:
# Compare the first quarter of train_df with its last quarter.
print("Run adversarial: train vs. train (temporally ordered)")
cols_excluded = reset_cols_excluded()
# Row offsets of the first-quarter, half and third-quarter boundaries.
temp_q1_len = len(train_df) // 4
temp_half_len = len(train_df) // 2
temp_q3_len = temp_q1_len + temp_half_len
adverserial_df = encode_cat_columns(
    get_adverserial_df(train_df[:temp_q1_len], train_df[temp_q3_len:], n_sample=1000),
    cols_cat,
)
run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)

In [None]:
# Compare the first quarter of test_df with its last quarter.
print("Run adversarial: test vs. test (temporally ordered)")
cols_excluded = reset_cols_excluded()
# Row offsets of the first-quarter, half and third-quarter boundaries.
temp_q1_len = len(test_df) // 4
temp_half_len = len(test_df) // 2
temp_q3_len = temp_q1_len + temp_half_len
adverserial_df = encode_cat_columns(
    get_adverserial_df(test_df[:temp_q1_len], test_df[temp_q3_len:], n_sample=1000),
    cols_cat,
)
run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)

In [None]:
# The actual adversarial validation: can a classifier tell train rows from
# test rows? Uses a larger sample (10000 per side) than the baseline runs.
print("Run adversarial: train vs. test ")
cols_excluded = reset_cols_excluded()

adverserial_df = encode_cat_columns(
    get_adverserial_df(train_df[:], test_df[:], n_sample=10000),
    cols_cat,
)
rocScore, clf = run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)

In [None]:
# First quarter vs. last quarter of train_df again, this time keeping the
# score and the fitted classifier for the cells that follow.
print("Run adversarial: train vs. train (temporally ordered)")
cols_excluded = reset_cols_excluded()
temp_q1_len = len(train_df) // 4
temp_half_len = len(train_df) // 2
temp_q3_len = temp_q1_len + temp_half_len

adverserial_df = encode_cat_columns(
    get_adverserial_df(train_df[:temp_q1_len], train_df[temp_q3_len:], n_sample=1000),
    cols_cat,
)
rocScore, clf = run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)

In [None]:
# Row offsets of the first-quarter, half and third-quarter boundaries.
temp_q1_len = len(train_df) // 4
temp_half_len = len(train_df) // 2
temp_q3_len = temp_q1_len + temp_half_len

# Positional slices, matching the train_df[:k] / train_df[k:] slices the
# adversarial classifier was trained on. They are copied so the encoding
# cannot write into train_df, and encoded directly: indexing .loc with the
# cols_cat set is rejected by recent pandas versions.
# NOTE(review): category codes are assigned per frame, so a code here need not
# mean the same category as in the sample clf was fitted on - confirm.
temp_train_df = encode_cat_columns(train_df.iloc[:temp_q1_len].copy(), cols_cat)
temp_test_df = encode_cat_columns(train_df.iloc[temp_q3_len:].copy(), cols_cat)

# Adversarial score of every first-quarter row (clf comes from the preceding cell).
pred_prob_q1 = clf.predict_proba(temp_train_df.drop(columns=['TransactionDT', 'TransactionID']))

In [None]:
temp_train_df['pred_prob_q1'] = np.nan
temp_train_df.loc[:, ['pred_prob_q1']] = pred_prob_q1[:, 1]
#temp_train_df['pred_prob_q1'].head()

In [None]:
# Lookup from TransactionID to the fraud label.
temp_label_by_id = target_df.set_index('TransactionID')['isFraud']

# Draw 10000 rows from each side and fetch their labels by TransactionID.
# Mapping through the lookup keeps each label on the index of its feature row,
# which the boolean masks in later cells (e.g. y_train == 1) rely on; the
# previous pd.merge(..., on=..., left_index=True) combined two options that
# pandas treats as mutually exclusive.
X_train = temp_train_df.drop(columns=['TransactionDT']).sample(10000)
y_train = X_train['TransactionID'].map(temp_label_by_id).rename('isFraud')

X_test = temp_test_df.drop(columns=['TransactionDT']).sample(10000)
y_test = X_test['TransactionID'].map(temp_label_by_id).rename('isFraud')

# The key has served its purpose and must not be used as a feature.
X_train = X_train.drop(columns=['TransactionID'])
X_test = X_test.drop(columns=['TransactionID'])

In [None]:
# Reference model: fit on the first-quarter sample without the adversarial
# score column and evaluate on the last-quarter sample.
clf = XGBClassifier(max_depth=5)
clf.fit(X_train.drop(columns=['pred_prob_q1']), y_train)
pred_prob = clf.predict_proba(X_test)
# The next line only evaluates a slice and discards it (no effect).
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
# Top-20 feature importances, first by gain and then by weight.
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')

In [None]:
# Turn the adversarial score into a sample weight: every row's score is
# divided by 1.5, then rows labelled as fraud are multiplied by 1.5 again
# (i.e. restored to their original score).
# NOTE: this rescales X_train in place, so running the cell a second time
# shrinks the non-fraud weights again.
X_train.loc[:, ['pred_prob_q1']] = X_train.loc[:, ['pred_prob_q1']] / 1.5 
X_train.loc[y_train==1, ['pred_prob_q1']] = X_train.loc[y_train==1, ['pred_prob_q1']] * 1.5 

# Same model as the reference run, now fitted with the weights above.
clf = XGBClassifier(max_depth=5)
clf.fit(X_train.drop(columns=['pred_prob_q1']), y_train, sample_weight  = X_train['pred_prob_q1'] )
pred_prob = clf.predict_proba(X_test)
# The next line only evaluates a slice and discards it (no effect).
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')

In [None]:
X_train['pred_prob_q1'].hist()

In [None]:
adverserial_df['predict_train'] = np.nan
# Predict with exactly the columns clf was fitted on: 'pred_prob_q1' was
# dropped before fitting, and 'predict_train' does not exist the first time
# this cell runs (errors='ignore' also makes the cell safe to re-run).
X_train['predict_train'] = clf.predict(
    X_train.drop(columns=['pred_prob_q1', 'predict_train'], errors='ignore'))
# Distribution of the predicted class among the rows that are not fraud.
plt.hist(X_train['predict_train'].loc[y_train == 0] )

In [None]:
# Collect the index labels of X_train rows whose predicted class is 1.
# NOTE(review): the first mask is taken from adverserial_df, which is built
# with pd.concat(..., ignore_index=True), while X_train is a sample of
# temp_train_df - the two indexes look unrelated, so this boolean selection
# may fail to align or select unintended rows. Confirm the intended mask.
temp_df = X_train['predict_train'].loc[adverserial_df['test_data'] == 0]
indeces = temp_df[X_train['predict_train'] == 1].index
# NOTE(review): indeces are X_train labels being looked up in adverserial_df - confirm.
adverserial_df.loc[indeces].shape

In [None]:
# Refit the existing classifier on the selected rows only. The helper columns
# ('pred_prob_q1', 'predict_train') are removed before fitting because X_test
# does not contain them; fitting with them would make the predict_proba call
# below fail on mismatching features.
#clf = XGBClassifier(max_depth=5)
clf.fit(
    X_train.drop(columns=['pred_prob_q1', 'predict_train'], errors='ignore').loc[indeces],
    y_train.loc[indeces],
)
pred_prob = clf.predict_proba(X_test)
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.4f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')
xgboost.plot_importance(clf, max_num_features=20, importance_type='weight')

Focus on one variable that shows temporal dependence.
For example, M7.

In [None]:
# First quarter vs. last quarter of train_df, using the single column M7.
print("Run adversarial: train vs. train (temporally ordered) on M7")
cols_excluded = reset_cols_excluded()
temp_q1_len = len(train_df) // 4
temp_half_len = len(train_df) // 2
temp_q3_len = temp_q1_len + temp_half_len
adverserial_df = encode_cat_columns(
    get_adverserial_df(
        train_df.loc[:temp_q1_len, ['M7']],
        train_df.loc[temp_q3_len:, ['M7']],
        n_sample=1000,
    ),
    cols_cat,
)
run_adversrial_analysis(
    data=adverserial_df.drop(columns='test_data'),
    target=adverserial_df['test_data'],
)

In [None]:
# Default exclusions plus every column whose name starts with V, M, D, C or id.
cols_excluded = reset_cols_excluded() | set(train_df.filter(regex = '^V|^M|^D|^C|^id').columns)
print("Run adversarial: train vs. train (temporally ordered) **Without many variables**")
temp_q1_len = len(train_df) // 4
temp_half_len = len(train_df) // 2
temp_q3_len = temp_q1_len + temp_half_len

adverserial_df = encode_cat_columns(
    get_adverserial_df(train_df[:temp_q1_len], train_df[temp_q3_len:], n_sample=1000),
    cols_cat,
)

run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)
# Restore the default exclusions for the cells that follow.
cols_excluded = reset_cols_excluded()

In [None]:
# Default exclusions plus every column whose name starts with V, M, D, C or id.
cols_excluded = reset_cols_excluded() | set(test_df.filter(regex = '^V|^M|^D|^C|^id').columns)
print("Run adversarial: test vs. test (temporally ordered) **Without many variables**")
temp_q1_len = len(test_df) // 4
temp_half_len = len(test_df) // 2
temp_q3_len = temp_q1_len + temp_half_len
adverserial_df = encode_cat_columns(
    get_adverserial_df(test_df[:temp_q1_len], test_df[temp_q3_len:], n_sample=1000),
    cols_cat,
)
run_adversrial_analysis(
    data=adverserial_df.drop(columns=cols_excluded),
    target=adverserial_df['test_data'],
)
# Restore the default exclusions for the cells that follow.
cols_excluded = reset_cols_excluded()

In [None]:
# Fraud model trained on the first quarter of train_df and scored on the last
# quarter, with the V*, M*, D*, C* and id* columns removed.
cols_excluded = reset_cols_excluded()
cols_excluded.update(train_df.filter(regex = '^V|^M|^D|^C|^id').columns )
# train_df has no 'test_data' column, and 'TransactionID' is still needed as
# the merge key below, so neither may be in the drop list yet.
cols_excluded.remove('test_data')
cols_excluded.remove('TransactionID')


temp_q1_len = int(len(train_df) / 4) 
temp_half_len = int(len(train_df) / 2)
temp_q3_len = int(len(train_df) / 4) + temp_half_len

# Label-based slices: .loc includes the end label.
X_train = train_df.drop(columns=cols_excluded).loc[:temp_q1_len ]
y_train = pd.merge(X_train, target_df, on='TransactionID', how='left').loc[:, 'isFraud']

X_test = train_df.drop(columns=cols_excluded).loc[temp_q3_len: ]
y_test = pd.merge(X_test, target_df, on='TransactionID', how='left').loc[:, 'isFraud']

# The key is not a feature.
X_train.drop(columns= 'TransactionID', inplace= True)
X_test.drop(columns= 'TransactionID', inplace= True)

# NOTE(review): codes are assigned separately for each frame - confirm that
# the same category gets the same code in X_train and X_test.
X_train = encode_cat_columns(X_train, cols_cat)
X_test = encode_cat_columns(X_test, cols_cat)

clf = XGBClassifier()
print("Start fitting!")
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
# The next line only evaluates a slice and discards it (no effect).
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.2f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')

# Restore the default exclusions for the cells that follow.
cols_excluded = reset_cols_excluded()

In [None]:
# Same experiment as the reduced-feature fraud model, but keeping the V*, M*,
# D*, C* and id* columns (the line that excludes them is commented out).
cols_excluded = reset_cols_excluded()
#cols_excluded.update(train_df.filter(regex = '^V|^M|^D|^C|^id').columns )
# train_df has no 'test_data' column, and 'TransactionID' is still needed as
# the merge key below, so neither may be in the drop list yet.
cols_excluded.remove('test_data')
cols_excluded.remove('TransactionID')


temp_q1_len = int(len(train_df) / 4) 
temp_half_len = int(len(train_df) / 2)
temp_q3_len = int(len(train_df) / 4) + temp_half_len

# Label-based slices: .loc includes the end label.
X_train = train_df.drop(columns=cols_excluded).loc[:temp_q1_len ]
y_train = pd.merge(X_train, target_df, on='TransactionID', how='left').loc[:, 'isFraud']

X_test = train_df.drop(columns=cols_excluded).loc[temp_q3_len: ]
y_test = pd.merge(X_test, target_df, on='TransactionID', how='left').loc[:, 'isFraud']

# The key is not a feature.
X_train.drop(columns= 'TransactionID', inplace= True)
X_test.drop(columns= 'TransactionID', inplace= True)

# NOTE(review): codes are assigned separately for each frame - confirm that
# the same category gets the same code in X_train and X_test.
X_train = encode_cat_columns(X_train, cols_cat)
X_test = encode_cat_columns(X_test, cols_cat)

clf = XGBClassifier()
print("Start fitting!")
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
# The next line only evaluates a slice and discards it (no effect).
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.2f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20, importance_type='gain')

# Restore the default exclusions for the cells that follow.
cols_excluded = reset_cols_excluded()

# Conclusion

Removing the time-correlated features seems to hurt (rather than improve) the model's quality. Whether or not there is any benefit in re-engineering these time-dependent features is, of course, another matter.

In [None]:
# Drop the scratch frame only. test_df is still read by later cells (the
# train vs. test comparisons further down), so deleting it here would make
# them fail with a NameError on a top-to-bottom run.
del temp_df

In [None]:
# One-hot encode every categorical column of train_df: the most frequent
# values keep their own indicator column, all other non-missing values share
# an 'infrequent_category' bucket, and missing values get their own indicator.
cols_cat_dummified = set()
n_max_top_categories_to_keep = 20
for col in cols_cat:
    # Work on an object-dtype copy so the string bucket label can also be
    # written into columns that hold numbers.
    temp_col = train_df.loc[:, [col]].astype(object)
    top_cats = temp_col[col].value_counts(ascending = False, normalize=False).iloc[:n_max_top_categories_to_keep].index
    temp_col.loc[temp_col[col].notnull() & ~temp_col[col].isin(top_cats), [col]] = 'infrequent_category'
    # Keep the converted frame: get_dummies only expands object/category
    # columns by default, and the conversion result used to be thrown away.
    temp_col = temp_col.astype('category')
    temp_col = pd.get_dummies(temp_col, dummy_na = True)
    # Swap the original column for its indicator columns.
    train_df.drop(columns=col, inplace=True)
    cols_cat_dummified.update(list(temp_col.columns))
    train_df[temp_col.columns] = temp_col

In [None]:
train_df.head()

Engineer this feature to remove its time dependence.

In [None]:
# Build a rolling z-score of D15 over the rows where D15 is present.
#temp_df = train_df.loc[:, ['D15', 'TransactionDT']]
temp_df = train_df[train_df['D15'].notnull()].loc[:, ['D15', 'TransactionDT', 'TransactionID']]
# Attach the fraud label by TransactionID.
temp_df = pd.merge(temp_df, target_df, on='TransactionID', how='left')
#temp_df = train_df[train_df['D15'].notnull()].loc[:, ['D15', 'TransactionDT']]
# Window length in rows. NOTE: the column names below say "prev_100", but the
# window is 2000 rows, and a pandas rolling window ends at (and includes) the
# current row rather than covering only previous rows.
temp_rolling_size = 2000
#temp_min_periods = int(temp_rolling_size * 0.5)
#temp_min_periods = temp_rolling_size -1
temp_df.reset_index(drop=True, inplace=True)
# With the default min_periods the first temp_rolling_size - 1 rows are NaN.
temp_df['D15_mean_prev_100'] =  temp_df.loc[:, ['D15']].rolling(temp_rolling_size).mean()
temp_df['D15_std_prev_100'] =  temp_df.loc[:, ['D15']].rolling(temp_rolling_size).std()
# D15 expressed in rolling standard deviations from the rolling mean.
temp_df['D15_z_rel_prev_100'] = (temp_df['D15'] - temp_df['D15_mean_prev_100']) / temp_df['D15_std_prev_100']
#temp_df.head(20)

In [None]:
# Remove the rows without a complete rolling window, then renumber the rows
# 0..n-1. The quartile offsets below are used with label-based .loc slices in
# the following cells; without the renumbering the labels would still start
# at the first complete window and the slices would be shifted.
temp_df.dropna(inplace=True)
temp_df.reset_index(drop=True, inplace=True)


temp_q1_len = int(len(temp_df) / 4) 
temp_half_len = int(len(temp_df) / 2)
temp_q3_len = int(len(temp_df) / 4) + temp_half_len

In [None]:
temp_trials =20

In [None]:
# Average adversarial AUC of raw D15 (second quarter vs. last quarter) over
# temp_trials random samples.
auc_sum = 0
for i in range(temp_trials):
    adverserial_df = get_adverserial_df(temp_df.loc[temp_q1_len:temp_half_len, ['D15']], temp_df.loc[temp_q3_len:, ['D15']], n_sample=1000)
    # run_adversrial_analysis returns (score, classifier); only the score is summed.
    auc_sum += run_adversrial_analysis(adverserial_df.drop(columns='test_data'), adverserial_df['test_data'], print_result = False)[0]

print(auc_sum / temp_trials)

In [None]:
# Average adversarial AUC of the rolling z-score of D15 (second quarter vs.
# last quarter) over temp_trials random samples.
auc_sum = 0
for i in range(temp_trials):
    adverserial_df = get_adverserial_df(temp_df.loc[temp_q1_len:temp_half_len, ['D15_z_rel_prev_100']], temp_df.loc[temp_q3_len:, ['D15_z_rel_prev_100']], n_sample=1000)
    # run_adversrial_analysis returns (score, classifier); only the score is summed.
    auc_sum += run_adversrial_analysis(adverserial_df.drop(columns='test_data'), adverserial_df['test_data'], print_result=False)[0]

print(auc_sum / temp_trials)

In [None]:
temp_df['D15'].isnull().sum() * 100 / len(temp_df['D15'])

In [None]:
temp_df['D15_z_rel_prev_100'].isnull().sum() * 100 / len(temp_df['D15_z_rel_prev_100'])

In [None]:
# Fraud model on the rolling z-score of D15 alone: fit on the first quarter,
# score on the last quarter.
temp_df.dropna(inplace=True)
X_train = temp_df.loc[:temp_q1_len, ['D15_z_rel_prev_100']]
# Labels are selected as one-column frames (list indexer), not as Series.
y_train = temp_df.loc[:temp_q1_len, ['isFraud']]
X_test = temp_df.loc[temp_q3_len:, ['D15_z_rel_prev_100']]
y_test = temp_df.loc[temp_q3_len:, ['isFraud']]
clf = XGBClassifier()
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
# The next line only evaluates a slice and discards it (no effect).
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.2f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20)

In [None]:
# Counterpart of the z-score model using raw D15: fit on the first quarter,
# score on the last quarter.
X_train = temp_df.loc[:temp_q1_len, ['D15']]
# Labels are selected as one-column frames (list indexer), not as Series.
y_train = temp_df.loc[:temp_q1_len, ['isFraud']]
X_test = temp_df.loc[temp_q3_len:, ['D15']]
y_test = temp_df.loc[temp_q3_len:, ['isFraud']]
clf = XGBClassifier()
clf.fit(X_train, y_train)
pred_prob = clf.predict_proba(X_test)
# The next line only evaluates a slice and discards it (no effect).
pred_prob[:, 1]
roc_score = roc_auc_score(y_test, pred_prob[:, 1])
print("roc_auc score %.2f" % roc_score)
xgboost.plot_importance(clf, max_num_features=20)

To start with, I will exclude two features: the target of the adversarial analysis (`test_data`) and `TransactionID`.

In [None]:
excluded_cols =  ['test_data', 'TransactionID']
print("Excluded columns:\n", excluded_cols, sep='')
temp_half_len = int(len(train_df) / 2)
# Working set of categorical column names for this section. It starts as a
# copy of cols_cat (cat_cols was never defined before this point) and is a
# set because later cells grow it with add()/update().
cat_cols = set(cols_cat)
# NOTE(review): get_adverserial_df returns None when the two frames have a
# different number of columns - confirm train_df and test_df still match here.
adverserial_df = get_adverserial_df(train_df, test_df, n_sample=1000)
adverserial_df = encode_cat_columns(adverserial_df, cat_cols)
run_adversrial_analysis(adverserial_df.drop(columns=excluded_cols), adverserial_df['test_data'])

Now, I'll re-engineer the TransactionDT and exclude it from the main models

In [None]:
excluded_cols += ['TransactionDT']
excluded_cols

In [None]:
## Thanks to FChmiel (https://www.kaggle.com/fchmiel) for these two functions
def make_day_feature(df, offset=0, tname='TransactionDT'):
    """
    Derive a day-of-week style feature (values 0-6) from a time column.

    The time column is divided by 3600*24, shifted by ``offset - 1``, floored
    and reduced modulo 7 (this presumes the column counts seconds).

    Parameters:
    -----------
    df : pd.DataFrame
        Frame that holds the time column.
    offset : float (default=0)
        Shift, in days, applied before flooring; it moves the point at which
        one day ends and the next one starts.
    tname : str
        Name of the time column in df.

    Returns:
    --------
    The floored day number modulo 7, one value per row of df.
    """
    day_length = 3600 * 24
    elapsed_days = df[tname] / day_length
    # an offset of 0.58 was found to work well
    shifted_days = elapsed_days - 1 + offset
    return np.floor(shifted_days) % 7

def make_hour_feature(df, tname='TransactionDT'):
    """
    Derive an hour-of-day style feature (values 0-23) from a time column.

    The time column is divided by 3600, floored and reduced modulo 24 (this
    presumes the column counts seconds).

    Parameters:
    -----------
    df : pd.DataFrame
        Frame that holds the time column.
    tname : str
        Name of the time column in df.

    Returns:
    --------
    The floored hour number modulo 24, one value per row of df.
    """
    elapsed_hours = df[tname] / (3600)
    whole_hours = np.floor(elapsed_hours)
    return whole_hours % 24

In [None]:
# Replace the raw timestamp by cyclic features on both frames.
train_df['weekday'] = make_day_feature(train_df, offset=0.58)
test_df['weekday'] = make_day_feature(test_df, offset=0.58)
train_df['hours'] = make_hour_feature(train_df)
test_df['hours'] = make_hour_feature(test_df)

# Register the new columns as categorical. cat_cols is kept as a set because
# later cells use add()/update()/discard() on it; this form works whether it
# currently is a list or a set.
cat_cols = set(cat_cols) | {'weekday', 'hours'}

In [None]:
print("Excluded columns:\n", excluded_cols, sep='')
adverserial_df = get_adverserial_df(train_df, test_df, n_sample=1000)
adverserial_df = encode_cat_columns(adverserial_df, cat_cols)
run_adversrial_analysis(adverserial_df.drop(columns=excluded_cols), adverserial_df['test_data'])

In [None]:
print(pd.concat([test_df['card1'].astype('category'),train_df['card1'].astype('category')], axis=1).describe())

In [None]:
# How much of train and test is covered by the card1 values that are among
# the temp_top_n most frequent ones in both frames?
temp_top_n = 1000
# Relative frequency of every card1 value, most frequent first.
temp_cat_test_list = test_df['card1'].value_counts(normalize=True)
# Frequencies are fractions; they are scaled by 100 wherever the message says percent.
print("Percentage of samples in top %d categories in test is %.2f" %(temp_top_n, 100 * temp_cat_test_list.iloc[:temp_top_n].sum()))
temp_cat_test_set = set(temp_cat_test_list.iloc[:temp_top_n].index)

temp_cat_train_list = train_df['card1'].value_counts(normalize=True)
print("Percentage of samples in top %d categories in train is %.2f" %(temp_top_n, 100 * temp_cat_train_list.iloc[:temp_top_n].sum()))
temp_cat_train_set = set(temp_cat_train_list.iloc[:temp_top_n].index)

temp_cat_set = temp_cat_train_set.intersection(temp_cat_test_set)
print("No. of categories where both intersect is %d. Which cover:" %len(temp_cat_set))

# Share (as a fraction) of each frame covered by the common values. Explicit
# .iloc (position) above and .loc (label) here avoid the ambiguity of plain
# [] on a Series whose index is made of integers.
sum_top_test = temp_cat_test_list.loc[list(temp_cat_set)].sum()
sum_top_train = temp_cat_train_list.loc[list(temp_cat_set)].sum()
print("%.4f %% of test cases" % (100 * sum_top_test))
print("%.4f %% of train cases" % (100 * sum_top_train))

#temp_cat_set = temp_cat_train_set.difference(set(temp_cat_test_set))
#print("No. of categories in train but not in test is %d" %len(set(temp_cat_set)))
#temp_cat_set = temp_cat_test_set.difference(set(temp_cat_train_set))
#print("No. of categories in test but not in train is %d" %len(set(temp_cat_set)))

In [None]:
print("Excluded columns:\n", excluded_cols, sep='')
adverserial_df = get_adverserial_df(train_df, test_df, n_sample=1000)
#adverserial_df.card1[adverserial_df['card1'].isin(temp_cat_set)].shape
adverserial_df['card1_other'] = ~adverserial_df['card1'].isin(temp_cat_set)
print(adverserial_df['card1_other'].describe())
cat_cols.add('card1_other')

In [None]:
adverserial_df.loc[adverserial_df['card1_other'], 'card1'] = np.nan

In [None]:
adverserial_df = pd.concat([adverserial_df.drop(columns='card1'), pd.get_dummies(adverserial_df['card1'], prefix='card1')], axis=1)
adverserial_df.head()

In [None]:
cat_cols.update([col for col in adverserial_df.columns if 'card1' in col ])
cat_cols.discard('card1')

In [None]:
adverserial_df = encode_cat_columns(adverserial_df, cat_cols)
run_adversrial_analysis(adverserial_df.drop(columns=excluded_cols), adverserial_df['test_data'])

In [None]:
adverserial_df.head()

In [None]:
run_adversrial_analysis(adverserial_df[[col for col in adverserial_df.columns if 'card1' in col ]], adverserial_df['test_data'])

In [None]:
adverserial_df = get_adverserial_df(train_df, test_df, n_sample=1000)
adverserial_df = pd.concat([adverserial_df.drop(columns='card1'), pd.get_dummies(adverserial_df['card1'], prefix='card1')], axis=1)
adverserial_df = encode_cat_columns(adverserial_df, [col for col in adverserial_df.columns if 'card1' in col ])
run_adversrial_analysis(adverserial_df[[col for col in adverserial_df.columns if 'card1' in col ]], adverserial_df['test_data'])

In [None]:
adverserial_df = get_adverserial_df(train_df, test_df, n_sample=1000)
adverserial_df = encode_cat_columns(adverserial_df, cat_cols)

In [None]:
adverserial_df

In [None]:
print("Excluded columns:\n", excluded_cols, sep='')
run_adversrial_analysis(adverserial_df.drop(columns=excluded_cols), adverserial_df['test_data'])

In [None]:
adverserial_df = pd.concat([adverserial_df.drop(columns='card2'), pd.get_dummies(adverserial_df['card2'], prefix='card2')], axis=1)
adverserial_df.head()

In [None]:
print("Excluded columns:\n", excluded_cols, sep='')
run_adversrial_analysis(adverserial_df.drop(columns=excluded_cols), adverserial_df['test_data'])

In [None]:
temp_cat_train_list[0:10]

In [None]:
sum([temp_cat_test_list[index] for index in temp_cat_set])



In [None]:
print(set(temp_cat_set))

In [None]:
set(test_df['card1'].astype('category'))
set(test_df['id_31']).difference(train_df['id_31'])

print(test_df['card1'].astype('category').value_counts(normalize=True)[0:1000].sum())
print(80 * '-')
print(train_df['card1'].astype('category').value_counts(normalize=True)[0:1000].sum())

In [None]:
print(pd.concat([test_df['id_31'].astype('category'),train_df['id_31'].astype('category')], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['id_13'].astype('category'),train_df['id_13'].astype('category')], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['addr1'].astype('category'),train_df['addr1'].astype('category')], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['card1'].astype('category'),train_df['card1'].astype('category')], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['D11'],train_df['D11']], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['D15'],train_df['D15']], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['D10'],train_df['D10']], axis=1).describe())
print(80 * '-')
print(pd.concat([test_df['dist1'],train_df['dist1']], axis=1).describe())

In [None]:
# id_31 values that occur in test but never in train. Missing values are
# removed first: a NaN among the strings would make the sort fail.
temp_list = sorted(set(test_df['id_31'].dropna()).difference(train_df['id_31'].dropna()))
print(temp_list)

In [None]:
# Every id_31 value seen in either frame. Missing values are removed
# explicitly; slicing off the first element of a set does not reliably remove
# NaN because sets have no defined order.
temp_list = sorted(set(test_df['id_31'].dropna()).union(train_df['id_31'].dropna()))
print(temp_list)

In [None]:
# Most frequent id_31 values among fraudulent and among legitimate
# transactions. The label was moved out of train_df into target_df at the top
# of the notebook, so it is looked up by TransactionID.
temp_is_fraud = train_df['TransactionID'].map(target_df.set_index('TransactionID')['isFraud'])
print(train_df.loc[temp_is_fraud == 1, 'id_31'].value_counts().head(10))
print(80 * '-')
print(train_df.loc[temp_is_fraud == 0, 'id_31'].value_counts().head(10))

In [None]:
train_df.id_31.value_counts()[0:20]

In [None]:
print((train_df.id_31.str.contains('hrome')).sum() / train_df.id_31.count())
print((train_df.id_31.str.contains('afari')).sum() / train_df.id_31.count())
print((train_df.id_31.str.contains('amsun')).sum() / train_df.id_31.count())
print((train_df.id_31.str.contains('^ie')).sum() / train_df.id_31.count())
print((train_df.id_31.str.contains('android')).sum() / train_df.id_31.count())

In [None]:
from scipy import stats
stats.ks_2samp(train_df['TransactionAmt'].sample(1000), test_df['TransactionAmt'].sample(1000))

In [None]:
# D15 has missing values; they are removed before sampling so that the
# two-sample KS test compares 1000 actual observations from each frame.
stats.ks_2samp(train_df['D15'].dropna().sample(1000), test_df['D15'].dropna().sample(1000))

In [None]:
# Log-transformed transaction amounts, kept in their own scratch frames
# (created here; they were previously assigned to without being defined).
temp_train = pd.DataFrame({'TransactionAmt': np.log(train_df['TransactionAmt'])})
temp_test = pd.DataFrame({'TransactionAmt': np.log(test_df['TransactionAmt'] )})

'''
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
temp_train['TransactionAmt'] = pd.DataFrame(scaler.fit_transform(train_df['TransactionAmt'].values.reshape(-1, 1)))
temp_test['TransactionAmt'] = pd.DataFrame(scaler.fit_transform(test_df['TransactionAmt'].values.reshape(-1, 1)))
'''

In [None]:
temp_test['TransactionAmt'].describe()

In [None]:
# Repeat the KS comparison on fresh random samples and count how often the
# transformed amounts yield a larger p-value than the raw amounts.
n_experiments = 100
n_transformed_better = 0
for i in range(n_experiments):
    orig_result = stats.ks_2samp(train_df['TransactionAmt'].sample(1000), test_df['TransactionAmt'].sample(1000))
    transformed_result = stats.ks_2samp(temp_train['TransactionAmt'].sample(1000), temp_test['TransactionAmt'].sample(1000))
    # Element 1 of the result is the p-value.
    if transformed_result[1] > orig_result[1]:
        n_transformed_better += 1

# Scale the fraction by 100 so the printed number matches the "%" in the message.
print("percentage where transformed was better is %.2f %%" %(100 * n_transformed_better / n_experiments))

In [None]:
train_df['TransactionDT'].describe()