In [1]:
# debugging and formatter
from IPython.core.debugger import set_trace

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold
import time
import lightgbm as lgb
import xgboost as xgb

plt.style.use(style="seaborn")
%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
from os import listdir
from os.path import isfile, join

loc = os.path.abspath("")
data_loc = f"{loc}/data"

<IPython.core.display.Javascript object>

Check data loading time and memory usage...

In [5]:
%%time
train_id = pd.read_csv(f"{data_loc}/train_identity.csv")
train_tr = pd.read_csv(f"{data_loc}/train_transaction.csv")
test_id = pd.read_csv(f"{data_loc}/test_identity.csv")
test_tr = pd.read_csv(f"{data_loc}/test_transaction.csv")

CPU times: user 21.9 s, sys: 3.15 s, total: 25.1 s
Wall time: 25.1 s


<IPython.core.display.Javascript object>

In [6]:
print(train_id.info())
print(train_tr.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
TransactionID    144233 non-null int64
id_01            144233 non-null float64
id_02            140872 non-null float64
id_03            66324 non-null float64
id_04            66324 non-null float64
id_05            136865 non-null float64
id_06            136865 non-null float64
id_07            5155 non-null float64
id_08            5155 non-null float64
id_09            74926 non-null float64
id_10            74926 non-null float64
id_11            140978 non-null float64
id_12            144233 non-null object
id_13            127320 non-null float64
id_14            80044 non-null float64
id_15            140985 non-null object
id_16            129340 non-null object
id_17            139369 non-null float64
id_18            45113 non-null float64
id_19            139318 non-null float64
id_20            139261 non-null float64
id_21            5159 non-null float64
id_2

<IPython.core.display.Javascript object>

#### Downcasts types to reduce memory usage.

In [7]:
def downcast_dtypes(df):
    _start = df.memory_usage(deep=True).sum() / 1024 ** 2
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    _end = df.memory_usage(deep=True).sum() / 1024 ** 2
    saved = (_start - _end) / _start * 100
    print(f"Saved {saved:.2f}%")
    return df

<IPython.core.display.Javascript object>

In [8]:
train_id = downcast_dtypes(train_id)
train_tr = downcast_dtypes(train_tr)
test_id = downcast_dtypes(test_id)
test_tr = downcast_dtypes(test_tr)

Saved 8.32%
Saved 40.53%
Saved 8.38%
Saved 40.20%


<IPython.core.display.Javascript object>

In [9]:
print(train_id.info())
print(train_tr.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
TransactionID    144233 non-null int16
id_01            144233 non-null float32
id_02            140872 non-null float32
id_03            66324 non-null float32
id_04            66324 non-null float32
id_05            136865 non-null float32
id_06            136865 non-null float32
id_07            5155 non-null float32
id_08            5155 non-null float32
id_09            74926 non-null float32
id_10            74926 non-null float32
id_11            140978 non-null float32
id_12            144233 non-null object
id_13            127320 non-null float32
id_14            80044 non-null float32
id_15            140985 non-null object
id_16            129340 non-null object
id_17            139369 non-null float32
id_18            45113 non-null float32
id_19            139318 non-null float32
id_20            139261 non-null float32
id_21            5159 non-null float32
id_2

<IPython.core.display.Javascript object>

In [10]:
train_id.to_parquet(f"{data_loc}/train_identity.parquet.gzip", compression="gzip")
train_tr.to_parquet(f"{data_loc}/train_transaction.parquet.gzip", compression="gzip")
test_id.to_parquet(f"{data_loc}/test_identity.parquet.gzip", compression="gzip")
test_tr.to_parquet(f"{data_loc}/test_transaction.parquet.gzip", compression="gzip")

<IPython.core.display.Javascript object>

In [11]:
%%time
train_id = pd.read_parquet(f"{data_loc}/train_identity.parquet.gzip")
train_tr = pd.read_parquet(f"{data_loc}/train_transaction.parquet.gzip")
test_id = pd.read_parquet(f"{data_loc}/test_identity.parquet.gzip")
test_tr = pd.read_parquet(f"{data_loc}/test_transaction.parquet.gzip")

CPU times: user 8.53 s, sys: 4.91 s, total: 13.4 s
Wall time: 1.51 s


<IPython.core.display.Javascript object>

In [24]:
train = pd.merge(
    train_tr,
    train_id,
    how="left",
    on="TransactionID",
    left_index=True,
    right_index=True,
)

<IPython.core.display.Javascript object>

In [25]:
test = pd.merge(
    test_tr, test_id, how="left", on="TransactionID", left_index=True, right_index=True
)

<IPython.core.display.Javascript object>

In [26]:
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

Train shape: (590540, 434), Test shape: (506691, 433)


<IPython.core.display.Javascript object>