In [1]:
import pandas as pd
import numpy as np
from random import sample
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, \
recall_score, accuracy_score, precision_score, confusion_matrix, f1_score
import seaborn as sns
from catboost import CatBoostClassifier

In [2]:
df_train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
df_train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')

df_test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
df_test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')

# Merge transaction and identity dataset

In [3]:
df_train = pd.merge(
    df_train_identity, df_train_transaction, how='left', on='TransactionID')

df_test = pd.merge(
    df_test_identity, df_test_transaction, how='left', on='TransactionID')

# Balance the dataset

In [4]:
mask = df_train['isFraud'] == 1

train_fraud = df_train[mask]
train_non = df_train[-mask]

train_not = train_non.sample(n = int(len(train_fraud)))

df_train_balanced = pd.concat([train_fraud, train_not])

print (len(train_fraud))
print (len(train_not))
print (len(df_train_balanced))

11318
11318
22636


In [5]:
df_train = df_train.iloc[0:0]
df_train = df_train_balanced.copy()
df_train

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987240,0.0,169947.0,0.0,0.0,3.0,0.0,,,0.0,0.0,...,,,,,,,,,,
2987243,0.0,222455.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,,,,,,,,,,
2987245,0.0,271870.0,0.0,0.0,3.0,0.0,,,0.0,0.0,...,,,,,,,,,,
2987288,-20.0,258138.0,,,0.0,-1.0,,,,,...,,,,,,,,,,
2987367,-5.0,141271.0,,,9.0,-81.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021134,-5.0,73174.0,0.0,0.0,0.0,-6.0,,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3463503,-5.0,323695.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,,,,,,,,,,
3041595,-5.0,343144.0,,,2.0,0.0,,,,,...,,,,,,,,,,
3288868,-5.0,165453.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,,,,,,,,,,


# Bit of Preprocessing and Sanity Check

#### card6: type of card; we drop the 3 unique rows of 'debit or credit' and 'charge card'

In [6]:
mask = df_train['card6'] != 'debit or credit'
df_train = df_train[mask]
mask2 = df_train['card6'] != 'charge card'
df_train = df_train[mask2]

In [7]:
# drop target column
target = df_train['isFraud'].copy()
df_train = df_train.drop('isFraud', axis=1)

# V columns correlated with each other so we filter

In [8]:
V_list = []
for i in range(93, 432):
    V_list.append(df_train.columns[i])

In [9]:
corr_matrix = df_train[V_list].corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_train = df_train.drop(df_train[to_drop], axis=1)

In [10]:
df_train

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,V290,V291,V297,V300,V302,V303,V304,V305,V313,V314
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987240,0.0,169947.0,0.0,0.0,3.0,0.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2987243,0.0,222455.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2987245,0.0,271870.0,0.0,0.0,3.0,0.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2987288,-20.0,258138.0,,,0.0,-1.0,,,,,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
2987367,-5.0,141271.0,,,9.0,-81.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021134,-5.0,73174.0,0.0,0.0,0.0,-6.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3463503,-5.0,323695.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3041595,-5.0,343144.0,,,2.0,0.0,,,,,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
3288868,-5.0,165453.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0


#### transformation pipeline

#### MinMaxScale our numeric features

#### OneHotEncode our categoric features

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
class OneHotEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self,col):
        self.col = col
                
    def transform(self, X):
        return pd.get_dummies(X[self.col]) 

    def fit(self, X, y=None):
        return self  
    
# Transformer that does min-max transform on the specified columns
class MinMaxTransform(BaseEstimator, TransformerMixin):

    def __init__(self,cols=None):
        self.cols = cols
        self.minVec = np.zeros(len(cols))
        self.maxVec = np.zeros(len(cols))
       
    def transform(self, X):
        X[self.cols] = ((X[self.cols] - self.minVec)/(self.maxVec-self.minVec))
        return X

    def fit(self, X, y=None):
        self.minVec = (df_train[self.cols].min())
        self.maxVec = (df_train[self.cols].max())
        return self

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
numeric_features = df_train.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(
    steps=[('minmax', MinMaxTransform(numeric_features))])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

In [9]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# numeric_features = df_train.select_dtypes(include=['int64', 'float64']).columns
# # categorical_features = df_train.select_dtypes(include=['object']).columns

# numeric_transformer = Pipeline(
#     steps=[('minmax', MinMaxTransform(numeric_features))])
# categorical_transformer = Pipeline(
#     steps=[('onehot', OneHotEncoder(categorical_features))])

# preprocessor = ColumnTransformer(
#     transformers=[('num', numeric_transformer, numeric_features),
#                   ('cat', categorical_transformer, categorical_features)])

# Fill NaNs with -999 to avoid overfitting on NaN

In [13]:
for col in df_train:
    df_train[col].fillna(-999, inplace=True)

# CatBoost

In [17]:
x_train, x_val, target_train, target_val = train_test_split(
    df_train, target, test_size=0.2, random_state=1)
categorical_features_indices = np.where(df_train.dtypes != np.float)[0]
clf = CatBoostClassifier(random_seed=1, custom_metric='AUC', silent=True)
clf.fit(x_train, target_train, cat_features=categorical_features_indices)
# clf.score(x_val, target_val)
clf.eval_metrics(x_val)

0.9273249392533687