In [1]:
import joblib
import json
import matplotlib.pyplot as plt
import gc
import sys
sys.path.append("../")
sys.path.append("../../")
import optuna
from optuna.integration import TFKerasPruningCallback
import pandas as pd
import numpy as np
import time
import warnings
warnings.simplefilter("ignore")
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [2]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
plt.rcParams['axes.facecolor'] = '#0057b8' # blue
plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
plt.rcParams['text.color'] = 'w'

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [4]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH, EXP_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)

In [5]:
%load_ext autoreload
%autoreload

### Read Data

In [6]:
%%time
train = read_file(f"../{PROCESSED_DATA_PATH}/v3/train_agg.pkl")

Shape of data: (458913, 4733)
CPU times: user 3.97 s, sys: 9.88 s, total: 13.9 s
Wall time: 19.2 s


In [7]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [8]:
train.loc[:, "target"] = labels["target"].values

In [9]:
feature_imp = read_file(f"../{EXP_PATH}/5.lgbm_dart_923_half_fix/feature_importance.csv").set_index("base_feature")

Shape of data: (181, 39)


In [10]:
master_list = []
for col in feature_imp.columns:
    temp_list = [c + f"_{col}" for c in feature_imp.loc[feature_imp[col] > 150].index.tolist()]
    master_list.extend(temp_list)

In [11]:
len(master_list)

1937

In [12]:
train = train.loc[:, ["customer_ID", "target"] + master_list]

In [13]:
missing_counts = train.isnull().sum()

In [14]:
master_list = missing_counts[missing_counts <= int(0.2 * train.shape[0])].index.tolist()

In [15]:
train = train.loc[:, master_list]

In [16]:
train.columns[2:]

Index(['B_1_avg', 'B_10_avg', 'B_11_avg', 'B_12_avg', 'B_13_avg', 'B_14_avg',
       'B_18_avg', 'B_19_avg', 'B_2_avg', 'B_21_avg',
       ...
       'S_24a_velocity', 'S_27_velocity', 'S_3_velocity', 'S_37d_velocity',
       'S_37m_velocity', 'S_3a_velocity', 'S_5_velocity', 'S_7_velocity',
       'S_7a_velocity', 'S_8_velocity'],
      dtype='object', length=1392)

In [17]:
for col in tqdm(train.columns[2:]):
    try:
        train[col] = train[col].fillna(train[col].mean())
    except:
        train[col] = train[col].fillna(train[col].mode()[0])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1392/1392 [04:23<00:00,  5.28it/s]


In [19]:
train = train.set_index("customer_ID")

In [21]:
train.shape, labels.shape

((458913, 1393), (458913, 2))

In [35]:
train = train.replace(np.inf, 1e8).replace(-np.inf, -1e8)

In [54]:
master = train.drop(columns="target")
target = train["target"].values

In [None]:
train.to_pickle(f"./train_data_ready_to_train.pkl")

### Train Val Split

In [36]:
train_, validation_ = train_test_split(train, test_size=0.2, random_state=1020, stratify=train["target"])

In [37]:
X_train = train_.drop(columns="target")
y_train = train_["target"]
X_valid = validation_.drop(columns="target")
y_valid = validation_["target"]

In [None]:
mscaler = MinMaxScaler()

In [None]:
mscaler.fit(X_train)

In [73]:
def create_model(trial):
    n_layers = trial.suggest_int("n_layers", 2, 3)
    model = tf.keras.Sequential()
    hidden_units = []
    for i in range(n_layers):
        if len(hidden_units) == 0: 
            last = 2048
        else:
            last = hidden_units[-1]
        num_hidden = trial.suggest_int(f"n_units_l{i}", int(last/3), int(last/2), log=True)
        hidden_units.append(num_hidden)
        if i == n_layers - 1:
            model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
        elif i == 0:
            model.add(tf.keras.layers.Dense(num_hidden, 
                                            activation="relu", 
                                            kernel_initializer='he_normal', 
                                            input_shape=(master.shape[1],)))
            dropout_rate = trial.suggest_categorical(f"dropout_l{i}", [0.1, 0.2, 0.3])
            model.add(tf.keras.layers.Dropout(dropout_rate))
        else:
            model.add(tf.keras.layers.Dense(num_hidden, 
                                            activation="relu",
                                            kernel_initializer='he_normal'))
            dropout_rate = trial.suggest_categorical(f"dropout_l{i}", [0.1, 0.2, 0.3])
            model.add(tf.keras.layers.Dropout(dropout_rate))
    model.compile(optimizer="adam", 
                  loss="binary_crossentropy", 
                  metrics=['accuracy'])
    es = EarlyStopping(monitor='val_loss', patience=5)
    return model

In [74]:
EPOCHS = 15 # number of epochs per trial
BATCH_SIZE = 128

In [75]:
keras.backend.clear_session()

In [76]:
def objective(trial):
    kf = StratifiedKFold(n_splits=5)
    for fold, (idx_tr, idx_va) in zip(range(1, 5+1), kf.split(master, train["target"])):
        # Clear clutter from previous session graphs.
        keras.backend.clear_session()
        
        fold = str(fold)
        X_train, X_val, y_train, y_val, model = None, None, None, None, None
        X_train_array = mscaler.transform(master.iloc[idx_tr])
        X_valid_array = mscaler.transform(master.iloc[idx_va])
        X_train = tf.convert_to_tensor(X_train_array)
        X_val = tf.convert_to_tensor(X_valid_array)
        y_train = target[idx_tr]
        y_val = target[idx_va]
        
        class Metrics(keras.callbacks.Callback):
            def __init__(self, X_val, y_val):
                self.X_val = X_val
                self.y_val = y_val
                self.y_pred = None
                
            def on_train_begin(self, logs={}):
                self._data = []

            def on_epoch_end(self, batch, logs={}):
                self.y_pred = np.asarray(model.predict(self.X_val))
                self.y_val = np.argmax(self.y_val, axis=1)
                self.y_pred = np.argmax(self.y_pred, axis=1)

                self._data.append({
                    'val_rocauc': roc_auc_score(self.y_val, self.y_pred),
                })
                return

            def get_data(self):
                return self._data
        
        metrics = Metrics(X_val, y_val)
        
        # Generate our trial model.
        model = create_model(trial)
        print(model.summary())
        
        # learning rate scheduler
        scheduler = ExponentialDecay(1e-3, 400 * ((len(idx_tr) * 0.8) / BATCH_SIZE), 1e-5)
        lr = LearningRateScheduler(scheduler, verbose=0)

        # Fit the model on the training data.
        # The TFKerasPruningCallback checks for pruning condition every epoch.
        model.fit(
            X_train,
            y_train,
            batch_size=BATCH_SIZE,
            callbacks=[TFKerasPruningCallback(trial, "val_loss")], # metrics
            epochs=EPOCHS,
            validation_data=(X_val, y_val),
            verbose=1,
        )
        
        del metrics
        y_pred = model.predict(X_val)
        score = amex_metric(y_val, y_pred)
        return score[0]

In [77]:
study = optuna.create_study(direction="minimize", 
                            sampler=optuna.samplers.TPESampler(), 
                            pruner=optuna.pruners.HyperbandPruner())

[32m[I 2022-08-14 03:55:17,489][0m A new study created in memory with name: no-name-a500663f-2ac8-4f13-afac-be81a1bcaf26[0m


In [None]:
study.optimize(objective, n_trials=30)

In [16]:
# m = plot_missing_proportion_barchart(train)

In [17]:
# missing_columns = m.loc[m["missing_proportion"] > 2]["column"].tolist()

# %%time
# train = train.drop(columns=missing_columns, errors="ignore")

# m = plot_missing_proportion_barchart(train)

# missing_columns = m["column"].tolist()
# len(missing_columns)

# for col in tqdm(m["column"].tolist()):
#     try:
#         train[col] = train[col].fillna(train[col].mean())
#     except:
#         train[col] = train[col].fillna(train[col].mode()[0])

#### One Hot Encode Category Columns

In [19]:
# %%time
# temp = pd.get_dummies(train.select_dtypes("category"))
# train = train.drop(columns=train.select_dtypes("category").columns.tolist(), errors="ignore")
# train = pd.concat([train, temp], axis=1)

In [None]:
# train.to_pickle(f"{PROCESSED_DATA_PATH}/neural_network_agg_features/train_agg_final.pkl")

In [None]:
train_["target"].mean(), validation_["target"].mean()

In [None]:
del train_, validation_

In [None]:
final_df = train.drop(columns="target")
target = train["target"].values

### Tune Neural Network

In [7]:
%%time
train = read_file(f"{PROCESSED_DATA_PATH}/neural_network_agg_features/train_agg_final.pkl")

Shape of data: (458913, 3058)
CPU times: user 3.37 s, sys: 16.3 s, total: 19.7 s
Wall time: 29.8 s


In [14]:
target = train["target"].values
train = train.drop(columns="target")

In [15]:
CURRENT_EXP_PATH = f"{EXP_PATH}/neural_network_exp"
CURRENT_EXP_PATH

'../experiments/neural_network_exp'

In [16]:
with open(f"{CURRENT_EXP_PATH}/best_scores.json", "r+") as outfile:
    best_scores_json = json.load(outfile)
np.mean(list(best_scores_json["validation"].values()))

0.79

In [18]:
EPOCHS = 15 # number of epocs per trial
BATCH_SIZE = 128

In [19]:
keras.backend.clear_session()

In [23]:
def objective(trial):
    kf = StratifiedKFold(n_splits=5)
    for fold, (idx_tr, idx_va) in zip(range(1, 5+1), kf.split(train, target)):
        # Clear clutter from previous session graphs.
        keras.backend.clear_session()

        fold = str(fold)
        X_train, X_val, y_train, y_val, model = None, None, None, None, None
        X_train = tf.convert_to_tensor(train.iloc[idx_tr])
        X_val = tf.convert_to_tensor(train.iloc[idx_va])
        y_train = target[idx_tr]
        y_val = target[idx_va]
        
        class Metrics(keras.callbacks.Callback):
            def __init__(self, X_val, y_val):
                self.X_val = X_val
                self.y_val = y_val
                self.y_pred = None
                
            def on_train_begin(self, logs={}):
                self._data = []

            def on_epoch_end(self, batch, logs={}):
                self.y_pred = np.asarray(model.predict(self.X_val))
                self.y_val = np.argmax(self.y_val, axis=1)
                self.y_pred = np.argmax(self.y_pred, axis=1)

                self._data.append({
                    'val_rocauc': roc_auc_score(self.y_val, self.y_pred),
                })
                return

            def get_data(self):
                return self._data
        
        metrics = Metrics(X_val, y_val)
        
        # Generate our trial model.
        model = create_model(trial)
        print(model.summary())
        
        # learning rate scheduler
        scheduler = ExponentialDecay(1e-3, 400 * ((len(idx_tr) * 0.8) / BATCH_SIZE), 1e-5)
        lr = LearningRateScheduler(scheduler, verbose=0)

        # Fit the model on the training data.
        # The TFKerasPruningCallback checks for pruning condition every epoch.
        model.fit(
            X_train,
            y_train,
            batch_size=BATCH_SIZE,
            callbacks=[TFKerasPruningCallback(trial, "val_loss")], # metrics
            epochs=EPOCHS,
            validation_data=(X_val, y_val),
            verbose=1,
        )
        
        del metrics
        # Evaluate the model accuracy on the validation set.
        score = model.evaluate(X_val, y_val, verbose=0)
        return score[1]

In [24]:
study = optuna.create_study(direction="minimize", 
                            sampler=optuna.samplers.TPESampler(), 
                            pruner=optuna.pruners.HyperbandPruner())

[32m[I 2022-08-07 09:07:29,924][0m A new study created in memory with name: no-name-f63af961-c050-4c04-b0b3-beeb7eae45ec[0m


In [25]:
study.optimize(objective, n_trials=30)

Metal device set to: Apple M1 Pro


[33m[W 2022-08-07 09:07:52,765][0m Trial 0 failed because of the following error: InternalError()[0m
Traceback (most recent call last):
  File "/Users/wklee/miniconda3/envs/amex/lib/python3.10/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/rx/900hbpv51bsg7d73m0zmd11m0000gn/T/ipykernel_43674/567814545.py", line 9, in objective
    X_train = tf.convert_to_tensor(train.iloc[idx_tr])
  File "/Users/wklee/miniconda3/envs/amex/lib/python3.10/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/Users/wklee/miniconda3/envs/amex/lib/python3.10/site-packages/tensorflow/python/framework/constant_op.py", line 102, in convert_to_eager_tensor
    return ops.EagerTensor(value, ctx.device_name, dtype)
tensorflow.python.framework.errors_impl.InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [26]:
gc.collect()

1677

In [27]:
# %%time
# minmax_scaler = MinMaxScaler()
# final_df = pd.DataFrame(minmax_scaler.fit_transform(final_df), 
#                         columns=final_df.columns)

### Standard Neural Network

In [None]:
X_train = train.drop(columns=["target"])
X_val = val.drop(columns=["target"])

In [None]:
y_train = train["target"]
y_val = val["target"]

In [None]:
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=UserWarning)
    history = model.fit(
        X_train_, 
        y_train, 
        epochs=15, 
        batch_size=256, 
        validation_split=0.2, 
        callbacks=[es]
    )
y_train_pred = model.predict(X_train_).reshape(1, -1)[0]
y_val_pred = model.predict(X_val_).reshape(1, -1)[0]
train_metric, train_g, train_top4 = amex_metric_np(y_train_pred, y_train.values)
val_metric, val_g, val_top4 = amex_metric_np(y_val_pred, y_val.values)
# gc.collect()

In [None]:
# print(f"Train: {train_metric:.4f}, {train_g:.4f}, {train_top4:.4f}")
# print(f"Val  : {val_metric:.4f}, {val_g:.4f}, {val_top4:.4f}")

In [None]:
print(f"Train: {train_metric:.4f}, {train_g:.4f}, {train_top4:.4f}")
print(f"Val  : {val_metric:.4f}, {val_g:.4f}, {val_top4:.4f}")

### Optuna Tuning

In [28]:
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=1020, stratify=target)

In [29]:
def objective(trial):
    # Get hyperparams
    params = {
        'drop_out1': trial.suggest_categorical("drop_out1", np.arange(0.1, 0.6, 0.05)),
        'drop_out2': trial.suggest_categorical("drop_out2", np.arange(0.1, 0.6, 0.05)),
        'drop_out3': trial.suggest_categorical("drop_out3", np.arange(0.1, 0.6, 0.05)),
        'dense1': trial.suggest_categorical("dense1", [64, 128, 200, 256]),
        'epochs': trial.suggest_categorical("epochs", [15, 20, 25]),
        'batch_size': trial.suggest_categorical("batch_size", [64, 128, 256, 512, 1024]),
        'validation_split': trial.suggest_float("validation_split", 0.15, 0.3),
    }
    params["dense2"] = trial.suggest_int("dense2", int(params["dense1"] * 0.25), int(params["dense1"] * 0.75))
    params["dense3"] = trial.suggest_int("dense3", int(params["dense2"] * 0.25), int(params["dense2"] * 0.75))
    print(params)
    # Compile model
    model = Sequential()
    model.add(Dense(params["dense1"], input_shape=(X_train.shape[1],), kernel_initializer='he_normal', activation='relu'))
    model.add(Dropout(params["drop_out1"]))
    model.add(Dense(params["dense2"], activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(params["drop_out2"]))
    model.add(Dense(params["dense3"], activation='relu', kernel_initializer='he_normal'))
    model.add(Dropout(params["drop_out3"]))
    model.add(Dense(1, activation='sigmoid'))
    es = EarlyStopping(monitor='val_loss', patience=5)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        history = model.fit(
            X_train, 
            y_train, 
            epochs=params["epochs"], 
            batch_size=params["batch_size"], 
            validation_split=params["validation_split"], 
            callbacks=[es], 
            verbose=0
        )
    y_train_pred = model.predict(X_train).reshape(1, -1)[0]
    y_val_pred = model.predict(X_val).reshape(1, -1)[0]
    del model
    train_metric, train_g, train_top4 = amex_metric_np(y_train_pred, y_train.values)
    val_metric, val_g, val_top4 = amex_metric_np(y_val_pred, y_val.values)
    print(f"{Fore.GREEN}{Style.BRIGHT} Train Score = {train_metric:.5f} ({train_g}, {train_top4})")
    print(f"Val Score = {val_metric:.5f}{Style.RESET_ALL} ({val_g}, {val_top4})")
    
    return val_metric

In [30]:
X_train.shape, y_train.shape

((367130, 3057), (367130,))

In [31]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-08-07 09:08:21,641][0m A new study created in memory with name: no-name-66cf7d6c-eee6-465b-9562-9129e4bc262b[0m


In [None]:
study.optimize(objective, n_trials=10)

{'drop_out1': 0.5000000000000001, 'drop_out2': 0.45000000000000007, 'drop_out3': 0.5500000000000002, 'dense1': 256, 'epochs': 25, 'batch_size': 128, 'validation_split': 0.22621634286241138, 'dense2': 165, 'dense3': 69}


In [None]:
gc.collect()

In [None]:
del train

In [None]:
def objective(trial):
    classifier = create_classifier(trial)

    optuna_pruning_hook = optuna.integration.TensorFlowPruningHook(
        trial=trial,
        estimator=classifier,
        metric="accuracy",
        run_every_steps=PRUNING_INTERVAL_STEPS,
    )

    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=TRAIN_STEPS, hooks=[optuna_pruning_hook]
    )

    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, start_delay_secs=0, throttle_secs=0)

    eval_results, _ = tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)

    return float(eval_results["accuracy"])

### END

### Inference

In [None]:
%%time
test_agg = read_file(f"{PROCESSED_DATA_PATH}/test_agg_finer.pkl")

#### One Hot Encode Category Columns

In [None]:
temp = pd.get_dummies(test_agg.select_dtypes("category"))
test_agg = test_agg.drop(columns=test_agg.select_dtypes("category").columns.tolist(), errors="ignore")

In [None]:
test_agg = pd.concat([test_agg, temp], axis=1)
del temp

In [None]:
test_agg.shape

#### Impute Less Missing Columns

In [None]:
# pd.Series(train_col, name="feature").to_csv(f"{EXP_PATH}/neural_network/features.csv", index=False)

In [None]:
column_to_use = pd.read_csv(f"{EXP_PATH}/neural_network/features.csv")["feature"].tolist()

In [None]:
test_agg = test_agg.loc[:, column_to_use]
test_agg.shape

In [None]:
missing_prop_df = plot_missing_proportion_barchart(test_agg)

In [None]:
can_impute_columns = missing_prop_df.loc[missing_prop_df["missing_proportion"] > 0].column.tolist()

In [None]:
for col in tqdm(can_impute_columns):
    test_agg[col] = test_agg[col].fillna(test_agg[col].mean())

In [None]:
minmax_scaler = MinMaxScaler()

In [None]:
test_agg = pd.DataFrame(minmax_scaler.fit_transform(test_agg), columns=test_agg.columns)

In [None]:
test_agg.shape

In [None]:
tf.config.list_physical_devices('GPU')[0].name

In [None]:
tf.config.experimental.get_memory_usage(device='GPU:0')

In [None]:
tf.config.list_physical_devices("CPU")

In [None]:
with tf.device("GPU:0"):
    test_agg1 = tf.convert_to_tensor(test_agg.iloc[:460_000])
    # test_agg2 = tf.convert_to_tensor(test_agg.iloc[460_000:])

In [None]:
fold = 1
loaded_model = keras.models.load_model(f'{EXP_PATH}/neural_network/models/model{fold}')
pred_ = loaded_model.predict(test_agg1)
a = pred_.reshape(1, -1)[0]

In [None]:
del a, pred_

In [None]:
# result = pd.read_csv(f"{SUBMISSION_DATA_PATH}/submission25_0.799.csv")

In [None]:
for fold in tqdm(range(1, 6)):
    loaded_model = keras.models.load_model(f'{EXP_PATH}/neural_network/models/model{fold}')
    pred_ = loaded_model.predict(test_agg1)
    result.loc[:460_000-1, f"pred{fold}"] = pred_.reshape(1, -1)[0]
    tf.keras.backend.clear_session()

In [None]:
amex_metric_np(y_val_pred.reshape(1, -1)[0], y_val.values)

In [None]:
type(y_val.values), type(y_val_pred)

In [None]:
loss, acc = model.evaluate(X_train_agg_, y_train_agg, verbose=0)
loss, acc

In [None]:
loss, acc = model.evaluate(X_val_, y_val, verbose=0)
loss, acc

In [None]:
model.predict(X_val_)[:, 0]

In [None]:
train["nn_score"] = model.predict(X_train_)[:, 0]
val["nn_score"] = model.predict(X_val_)[:, 0]

In [None]:
train["nn_score"].mean(), val["nn_score"].mean()

In [None]:
y_train_pred = final_lgbm_clf.predict_proba(X_train)[:, 1]
y_val_pred = final_lgbm_clf.predict_proba(X_val)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Validation"], 
                title="Train Validation ROC AUC")

In [None]:
imp_df = plot_feature_importance(final_lgbm_clf.feature_name_, 
                                 final_lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=50)

In [None]:
feature_imp_thr = imp_df.loc[imp_df["feature"] == "dummy"]["feature_importance"].values[0]

In [None]:
selected_features = imp_df.loc[imp_df["feature_importance"] > feature_imp_thr]["feature"].tolist()
len(selected_features)