In [1]:
import pandas as pd
import numpy as np

from src.data_preprocessing import DataPreprocessingTrain, DataPreprocessingInference
from src.data_loader import DataLoader
from src.model_training import ModelTraining
from src.model_inference import ModelInference
from src.model_evaluation import ModelEvaluation

In [2]:
# Wrap each competition CSV (identity + transaction, train + test) in a DataLoader.
# Construction order matches the path order below.
train_id, train_trans, test_id, test_trans = (
    DataLoader(csv_path)
    for csv_path in (
        "data/train_identity.csv",
        "data/train_transaction.csv",
        "data/test_identity.csv",
        "data/test_transaction.csv",
    )
)

In [3]:
# Show the shape of each raw test table, one per line (identity first, then transaction).
print(test_id.dataset.shape, test_trans.dataset.shape, sep="\n")

(141907, 41)
(506691, 393)


In [4]:
# Turn the raw training tables into train/validation features and targets.
# create_val_set=True is what makes transform() hand back the four-way split unpacked here.
X_train, X_val, y_train, y_val = (
    DataPreprocessingTrain(create_val_set=True)
    .transform(train_id.dataset, train_trans.dataset)
)

In [5]:
# Shapes of the four split pieces on one line, then the class balance of each target.
print(*(piece.shape for piece in (X_train, X_val, y_train, y_val)))
print(y_train.value_counts(), y_val.value_counts(), sep="\n")

(472432, 400) (118108, 400) (472432,) (118108,)
0    455902
1     16530
Name: isFraud, dtype: int64
0    113975
1      4133
Name: isFraud, dtype: int64


In [6]:
# Collect the names of all object-dtype columns; these are passed to the model
# as categorical features in the training cell.
cat_cols = list(X_train.select_dtypes(include=['object']).columns)
print(cat_cols)

['id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']


In [7]:
# Model training
# Constructor-level settings handed to ModelTraining(params=...).
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
)

# Fit-time settings handed to train(..., fit_params=...); the validation split
# produced during preprocessing serves as the evaluation set.
fit_params = dict(
    cat_features=cat_cols,
    early_stopping_rounds=25,
    eval_set=(X_val, y_val),
    verbose=100,
    use_best_model=True,
)

model = ModelTraining(params=params)
# NOTE(review): in the recorded run the best iteration was the last one logged (999);
# consider whether the iteration budget is large enough.
# The call stays as the cell's final expression so its return value is still displayed.
model.train(X_train, y_train, fit_params=fit_params)

Learning rate set to 0.145034
0:	test: 0.6959991	best: 0.6959991 (0)	total: 620ms	remaining: 10m 19s
100:	test: 0.8954296	best: 0.8954296 (100)	total: 53.6s	remaining: 7m 57s
200:	test: 0.9099658	best: 0.9099658 (200)	total: 1m 49s	remaining: 7m 13s
300:	test: 0.9163182	best: 0.9163182 (300)	total: 2m 47s	remaining: 6m 28s
400:	test: 0.9212292	best: 0.9212292 (400)	total: 3m 46s	remaining: 5m 38s
500:	test: 0.9244568	best: 0.9244846 (499)	total: 4m 45s	remaining: 4m 44s
600:	test: 0.9269106	best: 0.9269106 (600)	total: 5m 46s	remaining: 3m 50s
700:	test: 0.9289656	best: 0.9289656 (700)	total: 6m 47s	remaining: 2m 53s
800:	test: 0.9310598	best: 0.9310598 (800)	total: 7m 48s	remaining: 1m 56s
900:	test: 0.9329882	best: 0.9329882 (900)	total: 8m 49s	remaining: 58.2s
999:	test: 0.9347366	best: 0.9347366 (999)	total: 9m 57s	remaining: 0us

bestTest = 0.934736592
bestIteration = 999



<src.model_training.ModelTraining at 0x13cf15e70>

In [8]:
# Build the inference feature matrix from the raw test identity and transaction tables.
X_test = (
    DataPreprocessingInference()
    .transform(test_id.dataset, test_trans.dataset)
)

In [9]:
# Sanity check: the row count printed here has to equal the number of rows
# the submission file requires (one prediction per row).
print(X_test.shape)

(506691, 400)


In [10]:
# Score the test features with the trained estimator held on the ModelTraining wrapper
# (model.model); predict_proba yields one probability column per class.
y_pred = (
    ModelInference(model_object=model.model)
    .predict_proba(X_test)
)

In [13]:
# Inspect the predicted probabilities; column index 1 is the one written to
# the submission's isFraud column.
y_pred

array([[0.99624146, 0.00375854],
       [0.99087893, 0.00912107],
       [0.99423535, 0.00576465],
       ...,
       [0.99088957, 0.00911043],
       [0.99013737, 0.00986263],
       [0.99219942, 0.00780058]])

In [15]:
# Create the submission file: take the sample submission as a template and
# overwrite isFraud with the second probability column (index 1) of y_pred.
# NOTE(review): values are assigned by position — assumes X_test rows are in the
# same order as data/sample_submission.csv; confirm before submitting.
sample_submission = (
    pd.read_csv("data/sample_submission.csv")
    .assign(isFraud=y_pred[:, 1])
)
sample_submission.to_csv("data/submission.csv", index=False)

In [None]:
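# The submission requires 506691 rows, but the test_id dataset has only 141907 rows.
# TODO: I still need to figure out how to handle this mismatch.
# NOTE(review): X_test printed above already has 506691 rows (the same as test_trans),
# so the row counts line up; what presumably remains is confirming how transactions
# without a matching identity record are filled in during preprocessing.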