In [4]:
import gc
import joblib
import pandas as pd
import numpy as np
import os
import seaborn as sns
import sys
sys.path.append("../")
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm import tqdm

In [5]:
from utils.common import (
    sigmoid, reverse_sigmoid,
    pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, get_final_metric_df,
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric, 
    plot_score_distribution, train_logistic_regression
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols, plot_heatmap
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [6]:
%load_ext autoreload
%autoreload

In [7]:
# Key columns kept alongside every score table; also used as the merge key
# when additional OOF score files are joined onto the master table.
ID_COLUMNS = [
    "customer_ID",
    "target",
]

### Read Data

In [9]:
# Load the training labels and keep the target column handy as its own Series.
labels_path = f"../{RAW_DATA_PATH}/train_labels.csv"
labels = read_file(labels_path)
target = labels["target"]
# Preview the first two rows as the cell output.
labels.head(2)

Shape of data: (458913, 2)


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0


In [11]:
# Load the per-customer statement counts.
# NOTE(review): num_statements_df is not referenced again in the visible part of
# this notebook — confirm whether this load is still needed.
num_statements_path = f"../{RAW_DATA_PATH}/train_num_statements.csv"
num_statements_df = read_file(num_statements_path)
num_statements_df.head(2)

Shape of data: (458913, 2)


Unnamed: 0,customer_ID,num_statements
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,13
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,13


In [13]:
# Load the master table of train scores, then keep only the ID columns plus the
# columns that get_cols returns for the "oof" key.
master_scores_path = f"../{EXP_PATH}/master_result/master_train_scores.csv"
master_train_score_df = read_file(master_scores_path)
kept_columns = ID_COLUMNS + get_cols(master_train_score_df, "oof")
master_train_score_df = master_train_score_df.loc[:, kept_columns]

Shape of data: (458913, 44)


In [15]:
# Load the additional OOF prediction file and give its prediction column a name
# that follows the `<experiment>_oof_score` pattern used by the master table.
chris_oof_path = f"../{EXP_PATH}/chris/0.792648_custom_emb_local_best_no_sigmoid_skf_oof.csv"
train_score_c1 = read_file(chris_oof_path).rename(
    columns={"oof_prediction": "chris_exp1_oof_score"}
)

Shape of data: (458913, 4)


In [16]:
# Attach the extra OOF score column to the master table (left join on the ID columns).
#
# - Any existing copy of the column is dropped first so the cell is idempotent:
#   re-running a plain self-referential merge would otherwise produce suffixed
#   `chris_exp1_oof_score_x` / `chris_exp1_oof_score_y` columns.
# - validate="many_to_one" makes pandas raise a MergeError if the right-hand table
#   has duplicate keys, instead of silently duplicating rows in the master table.
#
# NOTE(review): the source file name advertises 0.792648, while the metric cell of
# this notebook prints 0.78706 for this column — worth checking for rows left
# unmatched (NaN) by the left join.
chris_score_column = "chris_exp1_oof_score"
master_train_score_df = master_train_score_df.drop(
    columns=[chris_score_column], errors="ignore"
).merge(
    train_score_c1[ID_COLUMNS + [chris_score_column]],
    on=ID_COLUMNS,
    how="left",
    validate="many_to_one",
)

In [17]:
# Re-collect the OOF score columns now that the extra score column has been merged in.
# (get_cols presumably selects the column names containing "oof" — confirm in utils.eda_helpers.)
oof_columns = get_cols(master_train_score_df, "oof")

In [18]:
# Print the OOF amex metric of every score column against the true labels.
# The label shown is the first underscore-separated token of the column name.
# NOTE(review): amex_metric appears to return a sequence whose first element is
# the headline metric — confirm in utils.eval_helpers.
y_true = master_train_score_df["target"]
for col in oof_columns:
    metric = amex_metric(y_true, master_train_score_df[col])
    exp_name, *_ = col.split("_")
    print(f"{exp_name}: OOF amex metric - {metric[0]:.5f}")

exp1: OOF amex metric - 0.79868
exp2: OOF amex metric - 0.79871
exp3: OOF amex metric - 0.79779
exp4: OOF amex metric - 0.79880
exp5: OOF amex metric - 0.79717
exp9: OOF amex metric - 0.79732
exp10: OOF amex metric - 0.79549
chris: OOF amex metric - 0.78706


In [19]:
# Split the master score table by true label (0 -> normal_df, 1 -> default_df);
# each part gets a fresh 0..n-1 index.
target_values = master_train_score_df["target"]
normal_df = master_train_score_df[target_values == 0].reset_index(drop=True)
default_df = master_train_score_df[target_values == 1].reset_index(drop=True)

In [20]:
# Show (rows, columns) of both class-specific frames; the row counts add up to the
# size of master_train_score_df as long as `target` only holds 0 and 1.
normal_df.shape, default_df.shape

((340085, 10), (118828, 10))

In [None]:
# Experiments whose OOF scores are averaged to rank customers in the grouping
# sections; this cell must be run before those sections, which read selected_columns.
# NOTE(review): exp3 scores higher than exp9 in the OOF metric printout yet is
# left out of this selection — confirm that is intentional.
selected_experiment_ids = (1, 2, 4, 9)
selected_columns = ["exp{}_oof_score".format(exp_id) for exp_id in selected_experiment_ids]

### Get Normal Group

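- Normal users are split into five equal-sized groups by their mean OOF score: group 0 holds the lowest-scoring fifth (easiest to predict as NORMAL) and group 4 the highest-scoring fifth (hardest to predict as NORMAL).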

In [21]:
# List the columns available in normal_df before the mean OOF score is derived.
normal_df.columns

Index(['customer_ID', 'target', 'exp1_oof_score', 'exp2_oof_score',
       'exp3_oof_score', 'exp4_oof_score', 'exp5_oof_score', 'exp9_oof_score',
       'exp10_oof_score', 'chris_exp1_oof_score'],
      dtype='object')

In [23]:
# Row-wise mean of the selected experiments' OOF scores for each normal customer.
normal_selected_scores = normal_df.loc[:, selected_columns]
normal_df["train_oof"] = normal_selected_scores.mean(axis="columns")

In [24]:
# Cut the mean OOF score into quintiles; the category code (0 = lowest scores,
# 4 = highest scores) becomes the group id.
# Per pandas, a missing score would get code -1 rather than a 0-4 group.
normal_score_bins = pd.qcut(normal_df["train_oof"], q=5)
normal_df["group"] = normal_score_bins.cat.codes

In [25]:
# Keep only the identifiers and the group id for export. The score columns are
# dropped here, so the cells that derive train_oof / group cannot be re-run after
# this one without rebuilding normal_df.
normal_output_columns = ["customer_ID", "target", "group"]
normal_df = normal_df.loc[:, normal_output_columns]

In [26]:
# Write the normal-user group assignment next to this notebook (no index column).
normal_group_path = "./normal_predict_group.csv"
normal_df.to_csv(normal_group_path, index=False)

### Get Default Group

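- Default users are split into five equal-sized groups by their mean OOF score, with the order reversed: group 0 holds the highest-scoring fifth (easiest to predict as DEFAULT) and group 4 the lowest-scoring fifth (hardest to predict as DEFAULT).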

In [27]:
# List the columns available in default_df before the mean OOF score is derived.
default_df.columns

Index(['customer_ID', 'target', 'exp1_oof_score', 'exp2_oof_score',
       'exp3_oof_score', 'exp4_oof_score', 'exp5_oof_score', 'exp9_oof_score',
       'exp10_oof_score', 'chris_exp1_oof_score'],
      dtype='object')

In [29]:
# Row-wise mean of the selected experiments' OOF scores for each default customer.
default_selected_scores = default_df.loc[:, selected_columns]
default_df["train_oof"] = default_selected_scores.mean(axis="columns")

In [30]:
# Cut the mean OOF score into quintiles and reverse the order, so that group 0 is
# the highest-scoring fifth and group 4 the lowest-scoring fifth.
# Per pandas, a missing score would get code -1 and therefore land in group 5.
n_groups = 5
default_score_codes = pd.qcut(default_df["train_oof"], q=n_groups).cat.codes
default_df["group"] = (n_groups - 1) - default_score_codes

In [31]:
# Keep only the identifiers and the group id for export. The score columns are
# dropped here, so the cells that derive train_oof / group cannot be re-run after
# this one without rebuilding default_df.
default_output_columns = ["customer_ID", "target", "group"]
default_df = default_df.loc[:, default_output_columns]

In [32]:
# Sanity check: the quintile binning should yield five groups of near-equal size.
default_df["group"].value_counts()

2    23766
0    23766
4    23766
3    23765
1    23765
Name: group, dtype: int64

In [33]:
# Write the default-user group assignment next to this notebook (no index column).
default_group_path = "./default_predict_group.csv"
default_df.to_csv(default_group_path, index=False)