In [1]:
# debugging and formatter
from IPython.core.debugger import set_trace

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold
import time
import lightgbm as lgb
import xgboost as xgb

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the bare "seaborn" alias, so prefer the new name
# when it is available and fall back to the old one on older installs.
plt.style.use(
    style="seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
)
%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
from os import listdir
from os.path import isfile, join

# Anchor every data path to the directory the notebook is executed from;
# the input files are read from its "data" sub-folder.
loc = os.path.abspath("")
data_loc = loc + "/data"

<IPython.core.display.Javascript object>

Check data loading time and memory usage...

In [11]:
%%time
# Read the four input tables in a fixed order (identity, then transaction,
# for train and then test) and bind each one to its own name.
train_id, train_tr, test_id, test_tr = map(
    pd.read_parquet,
    (
        f"{data_loc}/train_identity.parquet.gzip",
        f"{data_loc}/train_transaction.parquet.gzip",
        f"{data_loc}/test_identity.parquet.gzip",
        f"{data_loc}/test_transaction.parquet.gzip",
    ),
)

CPU times: user 8.53 s, sys: 4.91 s, total: 13.4 s
Wall time: 1.51 s


<IPython.core.display.Javascript object>

In [24]:
# Left-join the identity table onto the transaction table by the
# "TransactionID" key; how="left" keeps every transaction row.
# Only `on` is passed: combining it with left_index/right_index is a
# conflicting join specification that current pandas rejects with a
# MergeError.
train = pd.merge(train_tr, train_id, how="left", on="TransactionID")

<IPython.core.display.Javascript object>

In [25]:
# Left-join the identity table onto the transaction table by the
# "TransactionID" key; how="left" keeps every transaction row.
# Only `on` is passed: combining it with left_index/right_index is a
# conflicting join specification that current pandas rejects with a
# MergeError.
test = pd.merge(test_tr, test_id, how="left", on="TransactionID")

<IPython.core.display.Javascript object>

In [26]:
# Report the (rows, columns) shape of the merged train and test frames.
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

Train shape: (590540, 434), Test shape: (506691, 433)


<IPython.core.display.Javascript object>

In [27]:
# Column names to keep for modelling; any train/test column whose name is not
# listed here is dropped further down.
# NOTE: despite the variable name, the last entry "isFraud" is the target
# label rather than a feature -- it is listed so the label survives the
# column pruning of the train frame.
# NOTE(review): how this subset was selected is not shown in this notebook.
imp_features = [
    "TransactionAmt",
    "ProductCD",
    "card1",
    "card2",
    "card3",
    "card5",
    "card6",
    "addr1",
    "addr2",
    "dist1",
    "dist2",
    "P_emaildomain",
    "R_emaildomain",
    "C1",
    "C2",
    "C4",
    "C5",
    "C6",
    "C7",
    "C8",
    "C9",
    "C10",
    "C11",
    "C12",
    "C13",
    "C14",
    "D1",
    "D2",
    "D3",
    "D4",
    "D5",
    "D10",
    "D11",
    "D15",
    "M1",
    "M2",
    "M3",
    "M4",
    "M6",
    "M7",
    "M8",
    "M9",
    "V1",
    "V3",
    "V4",
    "V6",
    "V8",
    "V11",
    "V13",
    "V14",
    "V17",
    "V20",
    "V23",
    "V26",
    "V27",
    "V30",
    "V36",
    "V37",
    "V40",
    "V41",
    "V44",
    "V47",
    "V48",
    "V54",
    "V56",
    "V59",
    "V62",
    "V65",
    "V67",
    "V68",
    "V70",
    "V76",
    "V78",
    "V80",
    "V82",
    "V86",
    "V88",
    "V89",
    "V91",
    "V107",
    "V108",
    "V111",
    "V115",
    "V117",
    "V120",
    "V121",
    "V123",
    "V124",
    "V127",
    "V129",
    "V130",
    "V136",
    "V138",
    "V139",
    "V142",
    "V147",
    "V156",
    "V160",
    "V162",
    "V165",
    "V166",
    "V169",
    "V171",
    "V173",
    "V175",
    "V176",
    "V178",
    "V180",
    "V182",
    "V185",
    "V187",
    "V188",
    "V198",
    "V203",
    "V205",
    "V207",
    "V209",
    "V210",
    "V215",
    "V218",
    "V220",
    "V221",
    "V223",
    "V224",
    "V226",
    "V228",
    "V229",
    "V234",
    "V235",
    "V238",
    "V240",
    "V250",
    "V252",
    "V253",
    "V257",
    "V258",
    "V260",
    "V261",
    "V264",
    "V266",
    "V267",
    "V271",
    "V274",
    "V277",
    "V281",
    "V283",
    "V284",
    "V285",
    "V286",
    "V289",
    "V291",
    "V294",
    "V296",
    "V297",
    "V301",
    "V303",
    "V305",
    "V307",
    "V309",
    "V310",
    "V314",
    "V320",
    "DeviceType",
    "DeviceInfo",
    "isFraud",
]

<IPython.core.display.Javascript object>

In [28]:
# Number of retained column names (the count includes the "isFraud" target).
len(imp_features)

165

<IPython.core.display.Javascript object>

In [29]:
# Every column whose name is absent from imp_features is marked for removal.
# A boolean mask over the column Index keeps the original column order.
cols_to_drop_train = train.columns[~train.columns.isin(imp_features)].tolist()
cols_to_drop_test = test.columns[~test.columns.isin(imp_features)].tolist()

print(f"{len(cols_to_drop_train)} features from train are going to be dropped.")
print(f"{len(cols_to_drop_test)} features from test are going to be dropped.")

269 features from train are going to be dropped.
269 features from test are going to be dropped.


<IPython.core.display.Javascript object>

In [30]:
# Remove the columns selected above; drop() returns a new frame each time.
train = train.drop(columns=cols_to_drop_train)
test = test.drop(columns=cols_to_drop_test)

<IPython.core.display.Javascript object>

In [31]:
def clean_inf_nan(df):
    """Return a copy of ``df`` with positive and negative infinity set to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; ``replace`` returns a new object.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``df``, with every ``+inf`` / ``-inf`` replaced by NaN.
    """
    infinities = [np.inf, -np.inf]
    cleaned = df.replace(infinities, np.nan)
    return cleaned

<IPython.core.display.Javascript object>

In [32]:
# Replace +/-inf with NaN in both frames, train first and then test.
train, test = (clean_inf_nan(frame) for frame in (train, test))

<IPython.core.display.Javascript object>

In [33]:
# Replace every remaining missing value with 0, in all columns of both frames.
train, test = train.fillna(0), test.fillna(0)

<IPython.core.display.Javascript object>

In [34]:
# Integer-encode each object-dtype column of train, applying the same mapping
# to the matching test column. The encoder is fitted on the string values of
# train and test together so that transform() knows every label in both.
for col in train.columns:
    if train[col].dtype != "object":
        continue
    train_labels = train[col].astype(str).tolist()
    test_labels = test[col].astype(str).tolist()
    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)

<IPython.core.display.Javascript object>

In [35]:
# Shapes after column pruning, missing-value filling and label encoding.
print(train.shape)
print(test.shape)

(590540, 165)
(506691, 164)


<IPython.core.display.Javascript object>

In [36]:
# Separate the target from the predictors. Copies are taken so later edits
# to these objects do not touch `train` / `test`.
y_train = train["isFraud"].copy()
X_train = train.drop(columns="isFraud").copy()
X_test = test.copy()

<IPython.core.display.Javascript object>

In [37]:
# Print the shapes to eyeball that X_train and X_test have the same number of
# columns and that y_train has one entry per X_train row.
print(X_train.shape, X_test.shape, y_train.shape)

(590540, 164) (506691, 164) (590540,)


<IPython.core.display.Javascript object>

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# shuffle reproducible.
split_options = {"test_size": 0.3, "random_state": 7}
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, **split_options
)

<IPython.core.display.Javascript object>