In [1]:
import pandas as pd
import numpy as np
from random import sample
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, \
recall_score, accuracy_score, precision_score, confusion_matrix, f1_score
import seaborn as sns
import catboost

In [12]:
df_train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
df_train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')

df_test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
df_test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')

# Merge transaction and identity dataset

In [13]:
df_train = pd.merge(
    df_train_identity, df_train_transaction, how='left', on='TransactionID')

df_test = pd.merge(
    df_test_identity, df_test_transaction, how='left', on='TransactionID')

# Fill NaNs with -999 to avoid overfitting on NaN

In [14]:
for col in df_train:
    df_train[col].fillna(-999, inplace=True)
for col in df_test:
    df_test[col].fillna(-999, inplace=True)

# V columns correlated with each other so pick 5

In [15]:
V_list = []
for i in range(94, 433):
    V_list.append(df_train.columns[i])

In [16]:
corr_matrix = df_train[V_list].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_train = df_train.drop(df_train[to_drop], axis=1)

In [18]:
df_train

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,V285,V291,V292,V298,V313,V314,V319,V320,V337,V338
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2987008,-5.0,98945.0,-999.0,-999.0,0.0,-5.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,2.0,4.0,4.0,1.0,90.327904,90.327904,75.887497,75.887497,-999.0,-999.0
2987011,-5.0,221832.0,-999.0,-999.0,0.0,-6.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,-999.0,-999.0
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3577521,-15.0,145955.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,1.0,9.0,9.0,0.0,60.066002,60.066002,0.000000,0.000000,-999.0,-999.0
3577526,-5.0,172059.0,-999.0,-999.0,1.0,-5.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,-999.000000,-999.000000,0.000000,0.000000,0.0,0.0
3577529,-20.0,632381.0,-999.0,-999.0,-1.0,-36.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,-999.0,-999.0
3577531,-5.0,55528.0,0.0,0.0,0.0,-7.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,-999.000000,-999.000000,0.000000,0.000000,0.0,0.0


In [17]:
corr_matrix = df_test[V_list].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_test = df_test.drop(df_test[to_drop], axis=1)

In [19]:
df_test

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,V281,V284,V306,V307,V311,V314,V317,V319,V320,V334
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663586,-45.0,280290.0,-999.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,0.0,0.000,0.0000,0.0,-999.0000,0.0,0.000,0.000,-999.0
3663588,0.0,3579.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,0.0,0.000,270.0000,0.0,310.0000,0.0,0.000,0.000,0.0
3663597,-5.0,185210.0,-999.0,-999.0,1.0,0.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,0.0,0.000,0.0000,0.0,-999.0000,0.0,0.000,0.000,-999.0
3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,-999.0,0.0,6.271,6.2710,0.0,-999.0000,0.0,6.271,6.271,-999.0
3663602,-95.0,328680.0,-999.0,-999.0,7.0,-33.0,-999.0,-999.0,-999.0,-999.0,...,-999.0,0.0,0.000,0.0000,0.0,-999.0000,0.0,0.000,0.000,-999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,-20.0,473365.0,-999.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0,-999.0,...,0.0,0.0,0.000,0.0000,0.0,0.0000,0.0,0.000,0.000,-999.0
4170233,-5.0,489917.0,0.0,0.0,-4.0,-32.0,-999.0,-999.0,0.0,0.0,...,0.0,0.0,0.000,13.4026,0.0,0.0000,0.0,0.000,0.000,-999.0
4170234,-5.0,110081.0,-999.0,-999.0,22.0,-31.0,-999.0,-999.0,-999.0,-999.0,...,0.0,0.0,0.000,0.0000,0.0,0.0000,0.0,0.000,0.000,0.0
4170236,-45.0,266704.0,-999.0,-999.0,-3.0,-10.0,-999.0,-999.0,-999.0,-999.0,...,1.0,0.0,0.000,31.7237,0.0,31.7237,0.0,0.000,0.000,-999.0


# Balance the dataset

In [20]:
mask = df_train['isFraud'] == 1

train_fraud = df_train[mask]
train_non = df_train[-mask]

train_not = train_non.sample(n = int(len(train_fraud)))

df_train_balanced = pd.concat([train_fraud, train_not])

print (len(train_fraud))
print (len(train_not))
print (len(df_train_balanced))

11318
11318
22636


In [23]:
df_train = df_train.iloc[0:0]
df_train = df_train_balanced.copy()
df_train

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,V285,V291,V292,V298,V313,V314,V319,V320,V337,V338
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987240,0.0,169947.0,0.0,0.0,3.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987243,0.0,222455.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987245,0.0,271870.0,0.0,0.0,3.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987288,-20.0,258138.0,-999.0,-999.0,0.0,-1.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987367,-5.0,141271.0,-999.0,-999.0,9.0,-81.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3006108,0.0,40639.0,-999.0,-999.0,4.0,-7.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3548252,-10.0,1109.0,0.0,0.0,6.0,0.0,-999.0,-999.0,0.0,0.0,...,26.0,2.0,1.0,71.0,100.0,740.0,1165.0,7226.0,1165.0,7709.0
3096593,-80.0,345950.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3525881,-5.0,19087.0,0.0,0.0,0.0,-1.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,150.0,150.0


# Bit of Preprocessing and Sanity Check

#### card6: type of card; we drop the 3 unique rows of 'debit or credit' and 'charge card'

In [24]:
mask = df_train['card6'] != 'debit or credit'
df_train = df_train[mask]
mask2 = df_train['card6'] != 'charge card'
df_train = df_train[mask2]

#### create into category type

#### encode into numeric categories; we opt for dummy variables (One Hot Encoding)

In [25]:
df_train['card6'] = df_train['card6'].astype('category')
df_train['card4'] = df_train['card4'].astype('category')
df_train['ProductCD'] = df_train['ProductCD'].astype('category')
train_dummies = pd.get_dummies(df_train, columns=['card4', 'card6', 'ProductCD'])

In [27]:
df_train

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,V285,V291,V292,V298,V313,V314,V319,V320,V337,V338
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987240,0.0,169947.0,0.0,0.0,3.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987243,0.0,222455.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987245,0.0,271870.0,0.0,0.0,3.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987288,-20.0,258138.0,-999.0,-999.0,0.0,-1.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-999.0,-999.0
2987367,-5.0,141271.0,-999.0,-999.0,9.0,-81.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3006108,0.0,40639.0,-999.0,-999.0,4.0,-7.0,-999.0,-999.0,-999.0,-999.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3548252,-10.0,1109.0,0.0,0.0,6.0,0.0,-999.0,-999.0,0.0,0.0,...,26.0,2.0,1.0,71.0,100.0,740.0,1165.0,7226.0,1165.0,7709.0
3096593,-80.0,345950.0,0.0,0.0,0.0,0.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3525881,-5.0,19087.0,0.0,0.0,0.0,-1.0,-999.0,-999.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,150.0,150.0


In [29]:
# initialize scaler
scaler = preprocessing.MinMaxScaler()

# drop target column
df_train = df_train.drop('isFraud', axis=1)
y_train = train_dummies['isFraud'].copy()

In [31]:
x = df_train.values
x = scaler.fit_transform(x)

ValueError: could not convert string to float: 'NotFound'

In [None]:
# Fit the data with scaler (scales data)
x = df_train.values
x = scaler.fit_transform(x)
df_train = df_train.iloc[0:0]
df_train = pd.DataFrame(x_scaled)
training_scaled = training_scaled.set_axis([x for x in training_v2.columns], axis=1, inplace=False)
training_scaled.insert(loc=0, column='TransactionID', value=training_id)
training_scaled['y'] = training_y
training_scaled = training_scaled.dropna()
training_scaled['TransactionID'] = training_scaled['TransactionID'].astype(int)
training_scaled.head()

# CatBoost