### 1. Clean Parquet data from Kaggle (integer dtypes parquet)
- This notebook further cleans the Parquet dataset published by a Kaggle user, without degrading data quality

In [1]:
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
pd.options.display.float_format = "{:,.4f}".format
import random
import sys
import time
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")

In [2]:
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably the repository root that holds the `utils` package — confirm).
rootpath = Path.cwd().parent
# Guard the append so that re-running this cell does not keep growing sys.path.
if str(rootpath) not in sys.path:
    sys.path.append(str(rootpath))

In [3]:
from utils.constants import *
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, plot_target_check, 
    plot_int_feature_distribution, plot_train_test_distribution, check_overlap_missing,
    insert_row_number, plot_sampled_time_series
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.extraction_helpers import read_file
from utils.feature_engineering_helpers import integerize
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, BINARY_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, NON_FEATURE_COLUMNS
)

In [4]:
%load_ext autoreload
%autoreload

In [5]:
START = time.time()

In [8]:
gc.collect()

18

### Select Column

In [9]:
columns = ["P_4", "S_11", "S_15"]

### Replace Train

In [None]:
%%time
train = read_file(f"{RAW_DATA_PATH}/train_data.csv")

In [None]:
%%time
# tqdm is imported here because this cell runs before the notebook's
# `from tqdm import tqdm` cell further down.
from tqdm import tqdm

train_ = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train.parquet", replace_negative_one=False)
# The columns are copied positionally (`.values` discards the index), so both
# frames must have the same number of rows. Check once, before mutating anything.
# NOTE(review): this also assumes both files list their rows in the same order — confirm.
if len(train) != len(train_):
    raise ValueError(
        f"Row count mismatch: raw train has {len(train)} rows, parquet train has {len(train_)}"
    )
for column in tqdm(columns):
    train_[column] = train[column].values
# Overwrites the same parquet file that was read above.
train_.to_parquet(f"{RAW_TRAIN_PARQUET_PATH}/train.parquet")

In [None]:
# Drop both train frames from the namespace.
# NOTE(review): later cells reference `train` and `train_` again, so they must
# be reloaded before those cells can run on a fresh kernel.
del train, train_

### Replace Test

In [22]:
%%time
test = read_file(f"{RAW_DATA_PATH}/test_data_sp.parquet")

Shape of data: (11363762, 26)
CPU times: user 2.38 s, sys: 5 s, total: 7.38 s
Wall time: 6.66 s


In [25]:
%%time
# tqdm is imported here because this cell runs before the notebook's
# `from tqdm import tqdm` cell further down.
from tqdm import tqdm

test_ = read_file(f"{RAW_TEST_PARQUET_PATH}/test.parquet", replace_negative_one=False)
# The columns are copied positionally (`.values` discards the index), so both
# frames must have the same number of rows. Check once, before mutating anything.
# NOTE(review): this also assumes both files list their rows in the same order — confirm.
if len(test) != len(test_):
    raise ValueError(
        f"Row count mismatch: raw test has {len(test)} rows, parquet test has {len(test_)}"
    )
for column in tqdm(columns):
    test_[column] = test[column].values
# Overwrites the same parquet file that was read above.
test_.to_parquet(f"{RAW_TEST_PARQUET_PATH}/test.parquet")

Shape of data: (5531451, 192)
CPU times: user 20.3 s, sys: 15.2 s, total: 35.5 s
Wall time: 52.3 s


In [26]:
# Drop both test frames from the namespace.
# NOTE(review): later cells reference `test` again, so it must be reloaded
# before those cells can run on a fresh kernel.
del test, test_

Shape of data: (11363762, 190)


KeyboardInterrupt: 

In [None]:
%%time
# Load the labels in this cell: the cell that otherwise defines `labels` comes
# after this one, so the name would be undefined here on a top-to-bottom run.
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")
# Merge only when the target is absent, so re-running the cell does not add a
# second, suffixed copy of the column.
if "target" not in train_.columns:
    train_ = train_.merge(labels, on="customer_ID", how="left")

In [None]:
%%time
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")
if "target" not in train.columns:
    train = train.merge(labels, on="customer_ID", how="left")

In [None]:
train_cid_list = train["customer_ID"].unique().tolist()

In [None]:
print(sorted(get_cols(train, ["P_", "S_"])))

In [None]:
train_["S_17"].describe()

In [None]:
train["S_17"].describe()

In [None]:
plot_target_check(train, "S_15", q=50, use_raw_bin=True)

In [None]:
plot_target_check(train_, "S_15", q=50, use_raw_bin=True)

In [None]:
t = train["S_11"].round(2).value_counts()

In [None]:
t.reset_index().sort_values(by="S_11")

### P_2

In [None]:
plot_target_check(train, "P_2", q=50, use_raw_bin=True)

In [None]:
train["P_2"] = train["P_2"].round(2)
test["P_2"] = test["P_2"].round(2)

In [None]:
plot_target_check(train, "P_2", q=50, use_raw_bin=True, nunique_thr=150, strfy_x=True, figsize=(25, 10))

In [None]:
train_test_ = plot_train_test_distribution(train, test, "P_2", nunique_thr=150, figsize=(24, 10), return_df=True)

In [None]:
from scipy.stats import ks_2samp
import numpy as np

In [None]:
ks_2samp(train["S_6"].dropna(), test["S_6"].dropna())

In [None]:
# plot_sampled_time_series(train, labels, "S_3", 50)

### P_3

In [None]:
plot_target_check(train, "P_3", q=50, use_raw_bin=True)

In [None]:
train["P_3"] = train["P_3"].round(2)
test["P_3"] = test["P_3"].round(2)

In [None]:
p3_summary = plot_target_check(train, "P_3", q=50, use_raw_bin=True, nunique_thr=340, strfy_x=True, figsize=(25, 10), 
                               drop_outlier=True, return_df=True)

In [None]:
plot_train_test_distribution(train, test, "P_3", nunique_thr=340, figsize=(24, 10))

### P_4

In [None]:
plot_target_check(train, "P_4", q=50, use_raw_bin=True)

In [None]:
train["P_4"] = train["P_4"].round(2)
test["P_4"] = test["P_4"].round(2)

In [None]:
plot_target_check(train, "P_4", q=50, use_raw_bin=True, strfy_x=True)

In [None]:
train["P_4"] = train["P_4"].replace(0, np.nan)
test["P_4"] = test["P_4"].replace(0, np.nan)

In [None]:
plot_target_check(train, "P_4", q=50, use_raw_bin=True, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "P_4")

**Train Test Skew: Slight**

### S_3

In [None]:
plot_target_check(train, "S_3", q=50, use_raw_bin=True)

In [None]:
train["S_3"] = train["S_3"].round(2)
test["S_3"] = test["S_3"].round(2)

In [None]:
plot_target_check(train, "S_3", q=50, use_raw_bin=True, nunique_thr=400, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_3", nunique_thr=500)

### S_5

In [None]:
plot_target_check(train, "S_5", q=50, use_raw_bin=True)

In [None]:
train["S_5"] = train["S_5"].round(2)
test["S_5"] = test["S_5"].round(2)

In [None]:
plot_target_check(train, "S_5", q=50, use_raw_bin=True, nunique_thr=5000, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_5", nunique_thr=5000)

In [None]:
# a = train.groupby("customer_ID")["S_5"].max() - train.groupby("customer_ID")["S_5"].min()
# b = train.groupby("customer_ID")["S_5"].min()
# c = pd.concat([a, b], axis=1)
# c.columns = ["range", "min"]
# c.loc[(c["range"] > 0) & (c["min"] <= 0.01)]

In [None]:
# train.loc[train["customer_ID"] == "0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a", "S_5"].replace((0, 0.01), np.nan)

### S_6

In [None]:
plot_target_check(train, "S_6", q=50, use_raw_bin=True, nunique_thr=5000, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_6", nunique_thr=5000)

### S_7

In [None]:
plot_target_check(train, "S_7", q=50, use_raw_bin=True)

In [None]:
train["S_7"] = train["S_7"].round(2)
test["S_7"] = test["S_7"].round(2)

In [None]:
plot_target_check(train, "S_7", q=50, use_raw_bin=True, nunique_thr=360, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_7", nunique_thr=5000)

### S_8

In [None]:
plot_target_check(train, "S_8", q=50, use_raw_bin=True, nunique_thr=1000, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_8", nunique_thr=500)

In [None]:
# Rank-encode S_8: each distinct non-missing value maps to its 0-based position
# in ascending order. NaN is dropped first because `unique()` keeps it while
# `nunique()` ignores it, which left the zipped ranks one entry short, and NaN
# has no well-defined position in a sort.
s8_mapping_dict = {
    value: rank
    for rank, value in enumerate(np.sort(train["S_8"].dropna().unique()))
}

In [None]:
train["S_8"] = train["S_8"].map(s8_mapping_dict).replace(0, np.nan)
test["S_8"] = test["S_8"].map(s8_mapping_dict).replace(0, np.nan)

In [None]:
plot_target_check(train, "S_8", q=50, use_raw_bin=True, nunique_thr=100, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_8", nunique_thr=100)

### S_9

In [None]:
plot_target_check(train, "S_9", q=50, use_raw_bin=True)

In [None]:
train["S_9"] = train["S_9"].round(2)
test["S_9"] = test["S_9"].round(2)

In [None]:
plot_target_check(train, "S_9", q=50, use_raw_bin=True, nunique_thr=300, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_9", nunique_thr=300)

**Train Test Skew: Slight**

### S_11

In [None]:
plot_target_check(train, "S_11", q=50, use_raw_bin=True)

**Check: identify whether some of the values equal to 12 in the raw data actually represent missing values (NaN)**

In [None]:
plot_train_test_distribution(train, test, "S_11", nunique_thr=500)

**Train Test Skew: Heavy**

### S_12

In [None]:
plot_target_check(train, "S_12", q=50, use_raw_bin=True)

In [None]:
temp = train.loc[train["S_12"].between(0.17, 0.21)]
plot_target_check(temp, "S_12", q=100, use_raw_bin=True)

In [None]:
train.loc[train["S_12"].between(0.1835, 0.1945), "S_12"] = np.nan
test.loc[test["S_12"].between(0.1835, 0.1945), "S_12"] = np.nan

In [None]:
train["S_12"] = train["S_12"].round(2)
test["S_12"] = test["S_12"].round(2)

In [None]:
s12_summary = plot_target_check(train, "S_12", q=50, use_raw_bin=True, nunique_thr=10000, strfy_x=True, return_df=True)

### S_13

In [None]:
plot_target_check(train, "S_13", q=50, use_raw_bin=True, nunique_thr=100, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_13", nunique_thr=500)

In [None]:
# Rank-encode S_13: each distinct non-missing value maps to its 0-based position
# in ascending order. NaN is dropped first because `unique()` keeps it while
# `nunique()` ignores it, which left the zipped ranks one entry short, and NaN
# has no well-defined position in a sort.
s13_mapping_dict = {
    value: rank
    for rank, value in enumerate(np.sort(train["S_13"].dropna().unique()))
}

In [None]:
train["S_13"] = train["S_13"].map(s13_mapping_dict)
test["S_13"] = test["S_13"].map(s13_mapping_dict)

In [None]:
plot_target_check(train, "S_13", q=50, use_raw_bin=True, nunique_thr=100, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_13", nunique_thr=500)

### S_15

In [None]:
plot_target_check(train, "S_15", q=50, use_raw_bin=True)

**Check: identify whether some of the values equal to 8 in the raw data actually represent missing values (NaN)**

In [None]:
plot_train_test_distribution(train, test, "S_15", nunique_thr=500)

**Train Test Skew: Moderate**

### S_16

In [None]:
plot_target_check(train, "S_16", q=50, use_raw_bin=True)

In [None]:
train["S_16"] = train["S_16"].round(2).replace(0, 0.01)
test["S_16"] = test["S_16"].round(2).replace(0, 0.01)

In [None]:
plot_target_check(train, "S_16", q=50, use_raw_bin=True, nunique_thr=15000, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_16", nunique_thr=15000)

### S_17

In [None]:
plot_target_check(train, "S_17", q=50, use_raw_bin=True)

In [None]:
train["S_17"] = train["S_17"].round(2)
test["S_17"] = test["S_17"].round(2)

In [None]:
s17_summary = plot_target_check(train, "S_17", q=50, use_raw_bin=True, nunique_thr=15000, strfy_x=True, return_df=True)

In [None]:
train["S_17"] = train["S_17"].replace(0.01, 0)
test["S_17"] = test["S_17"].replace(0.01, 0)

In [None]:
train["S_17"] = train["S_17"].replace(0, np.nan)
test["S_17"] = test["S_17"].replace(0, np.nan)

In [None]:
plot_target_check(train, "S_17", q=50, use_raw_bin=True, nunique_thr=15000, strfy_x=True)

In [None]:
plot_train_test_distribution(
    train.loc[train["S_17"] <= 1], 
    test.loc[test["S_17"] <= 1], 
    "S_17", 
    nunique_thr=15000
)

**Train Test Skew: Heavy**

### S_18

In [None]:
plot_target_check(train, "S_18", q=50, use_raw_bin=True)

In [None]:
plot_train_test_distribution(train, test, "S_18", nunique_thr=150)

### S_19

In [None]:
plot_target_check(train, "S_19", q=50, use_raw_bin=True)

In [None]:
train["S_19"] = train["S_19"].round(2)
test["S_19"] = test["S_19"].round(2)

In [None]:
train["S_19"] = train["S_19"].replace((0, 0.01), np.nan)
test["S_19"] = test["S_19"].replace((0, 0.01), np.nan)

In [None]:
plot_target_check(train, "S_19", q=50, use_raw_bin=True, strfy_x=True, nunique_thr=50)

In [None]:
plot_train_test_distribution(train, test, "S_19", nunique_thr=50)

**Train Test Skew: Moderate**

### S_20

In [None]:
plot_target_check(train, "S_20", q=50, use_raw_bin=True, nunique_thr=200)

In [None]:
plot_train_test_distribution(train, test, "S_20", nunique_thr=50)

### S_22

In [None]:
plot_target_check(train, "S_22", q=50, use_raw_bin=True)

In [None]:
train["S_22"] = train["S_22"].round(2)
test["S_22"] = test["S_22"].round(2)

In [None]:
plot_target_check(train, "S_22", q=50, use_raw_bin=True, nunique_thr=777, strfy_x=True)

**Relationship is ambiguous**

In [None]:
plot_train_test_distribution(train, test, "S_22", nunique_thr=777)

### S_23

In [None]:
plot_target_check(train, "S_23", q=50, use_raw_bin=True)

In [None]:
train["S_23"] = train["S_23"].round(2)
test["S_23"] = test["S_23"].round(2)

In [None]:
plot_target_check(train, "S_23", q=50, use_raw_bin=True, strfy_x=True, nunique_thr=1600)

In [None]:
plot_train_test_distribution(train, test, "S_23", nunique_thr=1600)

In [None]:
train.loc[train["S_23"].between(0.129, 0.141), "S_23"] = np.nan
test.loc[test["S_23"].between(0.129, 0.141), "S_23"] = np.nan

In [None]:
plot_target_check(train, "S_23", q=50, use_raw_bin=True, strfy_x=True, nunique_thr=1600)

In [None]:
plot_train_test_distribution(train, test, "S_23", nunique_thr=1600)

### S_23a

In [None]:
train["S_23a"] = 0
test["S_23a"] = 0

In [None]:
train.loc[~train["S_23"].isnull(), "S_23a"] = 1
test.loc[~test["S_23"].isnull(), "S_23a"] = 1

In [None]:
plot_target_check(train, "S_23a", q=50, use_raw_bin=True, strfy_x=True, nunique_thr=1600)

In [None]:
plot_train_test_distribution(train, test, "S_23a", nunique_thr=1600)

### S_24

In [None]:
plot_target_check(train, "S_24", q=50, use_raw_bin=True)

In [None]:
train["S_24"] = train["S_24"].round(2)
test["S_24"] = test["S_24"].round(2)

In [None]:
plot_target_check(train, "S_24", q=50, use_raw_bin=True, nunique_thr=650, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_24", nunique_thr=650)

**Train Test Skew: Heavy**

### S_25

In [None]:
plot_target_check(train, "S_25", q=50, use_raw_bin=True)

In [None]:
train["S_25"] = train["S_25"].round(2)
test["S_25"] = test["S_25"].round(2)

In [None]:
plot_target_check(train, "S_25", q=50, use_raw_bin=True)

In [None]:
train.loc[train["S_25"].between(0.979, 0.991), "S_25"] = np.nan
test.loc[test["S_25"].between(0.979, 0.991), "S_25"] = np.nan

In [None]:
plot_target_check(train, "S_25", q=50, use_raw_bin=True, nunique_thr=1500, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_25", nunique_thr=1500)

In [None]:
train.loc[train["S_25"].between(0.97, 0.99), "S_25"] = np.nan
test.loc[test["S_25"].between(0.97, 0.99), "S_25"] = np.nan

In [None]:
plot_target_check(train, "S_25", q=50, use_raw_bin=True, nunique_thr=1500, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_25", nunique_thr=1500)

**Train Test Skew: Moderate**

### S_26

In [None]:
plot_target_check(train, "S_26", q=50, use_raw_bin=True)

In [None]:
train["S_26"] = train["S_26"].round(2)
test["S_26"] = test["S_26"].round(2)

In [None]:
plot_target_check(train, "S_26", q=50, use_raw_bin=True, nunique_thr=2000, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_26", nunique_thr=2000)

### S_27

In [None]:
plot_target_check(train, "S_27", q=50, use_raw_bin=True)

In [None]:
train["S_27"] = train["S_27"].round(2)
test["S_27"] = test["S_27"].round(2)

In [None]:
plot_target_check(train, "S_27", q=50, use_raw_bin=True, nunique_thr=1500, strfy_x=True)

In [None]:
train.loc[train["S_27"].between(0, 0.02), "S_27"] = np.nan
test.loc[test["S_27"].between(0, 0.02), "S_27"] = np.nan

In [None]:
plot_target_check(train, "S_27", q=50, use_raw_bin=True, nunique_thr=1500, strfy_x=True)

In [None]:
plot_train_test_distribution(train, test, "S_27", nunique_thr=1500)

**Train Test Skew: Heavy**

### S_27a

In [None]:
train["S_27a"] = 0
test["S_27a"] = 0

In [None]:
train.loc[~train["S_27"].isnull(), "S_27a"] = 1
test.loc[~test["S_27"].isnull(), "S_27a"] = 1

In [None]:
plot_target_check(train, "S_27a", q=50, use_raw_bin=True, strfy_x=True, nunique_thr=5000)

In [None]:
plot_train_test_distribution(train, test, "S_27a", nunique_thr=1600)

### Column checking

In [None]:
from tqdm import tqdm

In [None]:
all_columns = get_cols(train, ["P_", "S_"])

In [None]:
# Re-code every -1 as -999 in each selected column of both frames, printing the
# frame label, column name and hit count wherever at least one -1 is found.
for col in tqdm(all_columns):
    train_hits = len(train.loc[train[col] == -1, col])
    if train_hits > 0:
        print("Train", col, train_hits)
        train[col] = train[col].replace(-1, -999)

    test_hits = len(test.loc[test[col] == -1, col])
    if test_hits > 0:
        print("Test", col, test_hits)
        test[col] = test[col].replace(-1, -999)

In [None]:
# Replace the missing values of every selected column with -127 in both frames.
for col in tqdm(all_columns):
    train_filled = train[col].fillna(-127)
    train[col] = train_filled
    test_filled = test[col].fillna(-127)
    test[col] = test_filled

In [None]:
# Turn every -999 back into -1 in each selected column of both frames, printing
# the frame label, column name and hit count wherever at least one -999 is found.
for col in tqdm(all_columns):
    train_hits = len(train.loc[train[col] == -999, col])
    if train_hits > 0:
        print("Train", col, train_hits)
        train[col] = train[col].replace(-999, -1)

    test_hits = len(test.loc[test[col] == -999, col])
    if test_hits > 0:
        print("Test", col, test_hits)
        test[col] = test[col].replace(-999, -1)

In [None]:
d_col_nunique = train[all_columns].nunique()

In [None]:
less_unique_d_cols = d_col_nunique[d_col_nunique < 64].index.tolist()

In [None]:
for d in less_unique_d_cols:
    print(d, ":", train[d].dtype, "\n", sorted(train[d].unique().tolist()), "\n")

In [None]:
# for col in tqdm(less_unique_d_cols):
#     train[col] = integerize(train[col])
#     test[col] = integerize(test[col])

In [None]:
train = train.drop(columns="temp", errors="ignore")
test = test.drop(columns="temp", errors="ignore")

In [None]:
train.shape, test.shape

In [None]:
%%time
train.to_parquet(f"{RAW_TRAIN_PARQUET_PATH}/train_ps.parquet")
test.to_parquet(f"{RAW_TEST_PARQUET_PATH}/test_ps.parquet")

In [None]:
END = time.time()

In [None]:
print(f"{END - START:.2f} seconds elapsed")

In [None]:
missing_df = plot_missing_proportion_barchart(train)

### Check Column

In [None]:
(train["D_106"].round(1) - train["D_106"]).fillna(0).sum()