In [1]:
import pandas as pd
import numpy as np

from src.data_preprocessing import DataPreprocessingTrain, DataPreprocessingInference
from src.data_loader import DataLoader
from src.model_training import ModelTraining
from src.model_inference import ModelInference
from src.model_evaluation import ModelEvaluation

In [2]:
# get the data
# One DataLoader per raw CSV; later cells read each table via `.dataset`.
train_id, train_trans = (
    DataLoader("data/train_identity.csv"),
    DataLoader("data/train_transaction.csv"),
)

test_id, test_trans = (
    DataLoader("data/test_identity.csv"),
    DataLoader("data/test_transaction.csv"),
)

In [3]:
# preprocess the training data
# `create_val_set=True` makes transform() hand back a train/validation split
# (four objects: features and targets for each part).
train_preprocessor = DataPreprocessingTrain(create_val_set=True)
X_train, X_val, y_train, y_val = train_preprocessor.transform(
    train_id.dataset,
    train_trans.dataset,
)

In [4]:
# Sanity-check the split: all four shapes on one line, then the class
# balance of each target vector.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))
for target in (y_train, y_val):
    print(target.value_counts())

(115386, 400) (28847, 400) (115386,) (28847,)
0    106332
1      9054
Name: isFraud, dtype: int64
0    26583
1     2264
Name: isFraud, dtype: int64


In [5]:
# get categorical columns
# Every object-dtype column of the training matrix; this list is later passed
# to the model as `cat_features`.
cat_cols = list(X_train.select_dtypes(include=['object']).columns)
print(cat_cols)

['id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']


In [6]:
# Model training
# Settings handed to the ModelTraining constructor.
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
)

# Settings forwarded through `train(..., fit_params=...)`; the validation
# split is used as the evaluation set.
# NOTE(review): in the recorded run the best iteration is the final one (999),
# so early stopping never triggered — a larger iteration budget may help; confirm.
fit_params = dict(
    cat_features=cat_cols,
    early_stopping_rounds=50,
    eval_set=(X_val, y_val),
    verbose=100,
    use_best_model=True,
)

model = ModelTraining(params=params)
# Left as a bare last expression so the cell still echoes train()'s return value.
model.train(X_train, y_train, fit_params=fit_params)

Learning rate set to 0.102391
0:	test: 0.7985765	best: 0.7985765 (0)	total: 190ms	remaining: 3m 10s
100:	test: 0.9570759	best: 0.9570759 (100)	total: 13.4s	remaining: 1m 59s
200:	test: 0.9639872	best: 0.9639872 (200)	total: 26.3s	remaining: 1m 44s
300:	test: 0.9676430	best: 0.9676430 (300)	total: 39.3s	remaining: 1m 31s
400:	test: 0.9697342	best: 0.9697342 (400)	total: 52.7s	remaining: 1m 18s
500:	test: 0.9710195	best: 0.9710195 (500)	total: 1m 5s	remaining: 1m 5s
600:	test: 0.9719973	best: 0.9719973 (600)	total: 1m 19s	remaining: 52.5s
700:	test: 0.9728146	best: 0.9728146 (700)	total: 1m 32s	remaining: 39.6s
800:	test: 0.9734620	best: 0.9734620 (800)	total: 1m 46s	remaining: 26.6s
900:	test: 0.9744132	best: 0.9744132 (900)	total: 2m	remaining: 13.3s
999:	test: 0.9749630	best: 0.9749630 (999)	total: 2m 15s	remaining: 0us

bestTest = 0.9749630433
bestIteration = 999



<src.model_training.ModelTraining at 0x147af2140>

In [7]:
# preprocessing the test data
# Inference-side counterpart of the training preprocessing: identity and
# transaction tables in, a single feature matrix out.
inference_preprocessor = DataPreprocessingInference()
X_test = inference_preprocessor.transform(
    test_id.dataset,
    test_trans.dataset,
)

In [8]:
# Show the dimensions of the preprocessed test matrix.
test_shape = X_test.shape
print(test_shape)

(141907, 400)


In [9]:
# Make inference
# Wrap the fitted estimator (`model.model`) and score the test matrix.
inference = ModelInference(model_object=model.model)
y_pred = inference.predict_proba(X_test)

In [10]:
# create submission file
# Pair IDs and scores by position. `pd.concat(axis=1)` aligns on index labels,
# so a non-default index on the identity frame would silently produce NaN or
# misaligned rows instead of raising.
# NOTE(review): assumes DataPreprocessingInference keeps rows in the same order
# as test_id.dataset — TODO confirm.
transaction_ids = test_id.dataset['TransactionID'].to_numpy()
# Column 1 is what gets submitted as `isFraud` (presumably P(isFraud = 1);
# verify against ModelInference.predict_proba).
fraud_scores = y_pred[:, 1]
assert len(transaction_ids) == len(fraud_scores), (
    f"{len(transaction_ids)} TransactionIDs vs {len(fraud_scores)} predictions"
)

submission = pd.DataFrame({'TransactionID': transaction_ids, 'isFraud': fraud_scores})
submission.to_csv("data/submission.csv", index=False)

In [11]:
# Show (rows, columns) of the submission frame.
submission_shape = submission.shape
print(submission_shape)

(141907, 2)


In [12]:
# Count the distinct TransactionID values in the test identity table.
identity_transaction_ids = test_id.dataset['TransactionID']
print(identity_transaction_ids.nunique())

141907


In [None]:
# TODO: the submission requires 506691 rows,
# but the test_id dataset has only 141907 rows (364784 fewer).
# Hence, I need to figure out how to produce predictions for the remaining rows.