In [1]:
import matplotlib.pyplot as plt
import gc
import os
import sys

In [2]:
# Make the parent directory importable so the `utils.*` helpers resolve.
# The membership guard keeps repeated runs of this cell from stacking
# duplicate entries on sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In [3]:
import joblib
import pandas as pd
import numpy as np
import time
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from skopt import BayesSearchCV
from tqdm import tqdm

In [28]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
from utils.eval_helpers import plot_roc_curves, plot_feature_importance, amex_metric, amex_metric_np
from utils.eda_helpers import plot_missing_proportion_barchart

In [6]:
# Directory holding the raw input files, relative to this notebook.
DATA_PATH = "../raw_data"
# List the directory contents to confirm the expected files are present.
os.listdir(DATA_PATH)

['train_labels.csv',
 '.DS_Store',
 'test_data.ftr',
 'train_data.ftr',
 'sample_submission.csv']

In [7]:
# Where the aggregated feature files are read from.
PROCESSED_DATA_PATH = "../processed_data"
# Where submission CSVs are written to.
SUBMISSION_DATA_PATH = "../submissions"
# Names of the raw columns that are treated as categorical.
CATEGORY_COLUMNS = [
    'B_30',
    'B_38',
    'D_114',
    'D_116',
    'D_117',
    'D_120',
    'D_126',
    'D_63',
    'D_64',
    'D_66',
    'D_68',
]

In [8]:
%load_ext autoreload
%autoreload

### Read Data

In [9]:
# Load the pre-aggregated train and test feature tables from the processed-data directory.
train_data = pd.read_feather(f"{PROCESSED_DATA_PATH}/train_agg_data.ftr")
test_data = pd.read_feather(f"{PROCESSED_DATA_PATH}/test_agg_data.ftr")

In [10]:
# Compare (rows, columns) of the two feature tables.
train_data.shape, test_data.shape

((458913, 929), (924621, 929))

In [11]:
# Inspect the column names of the training feature table.
train_data.columns

Index(['customer_ID', 'P_2_mean', 'P_2_std', 'P_2_min', 'P_2_max', 'P_2_last',
       'D_39_mean', 'D_39_std', 'D_39_min', 'D_39_max',
       ...
       'days', 'record_per_day', 'has_D_110_mean', 'has_D_111_mean',
       'has_D_132_mean', 'has_D_134_mean', 'has_D_135_mean', 'has_D_136_mean',
       'has_D_137_mean', 'has_D_138_mean'],
      dtype='object', length=929)

In [12]:
# Load the label table from the raw-data directory.
train_labels = pd.read_csv(f"{DATA_PATH}/train_labels.csv")

In [13]:
# (rows, columns) of the label table.
train_labels.shape

(458913, 2)

In [14]:
# Column names of the label table.
train_labels.columns

Index(['customer_ID', 'target'], dtype='object')

### Train Val Split

In [39]:
# Attach the target to every feature row. `validate` makes pandas raise if a
# customer_ID appears more than once in the labels (which would silently
# duplicate training rows), and the assertion catches feature rows that found
# no label (a left merge would otherwise leave NaN targets behind).
train_ = train_data.merge(
    train_labels[["customer_ID", "target"]],
    on="customer_ID",
    how="left",
    validate="many_to_one",
)
assert train_["target"].notna().all(), "some training rows have no matching label"

In [40]:
# Hold out 15% of the labelled rows for validation, stratified on the target
# and seeded so the split is repeatable.
train, val = train_test_split(
    train_,
    test_size=0.15,
    stratify=train_["target"],
    random_state=1020,
)

In [41]:
# Mean of the target in each split; the split above is stratified on this column.
train["target"].mean(), val["target"].mean()

(0.25893415642080003, 0.2589305170184639)

### Standard Neural Network

In [44]:
# Names of all columns stored with the pandas "category" dtype.
cat_columns = list(train.select_dtypes(include="category").columns)

In [45]:
# Feature matrices: everything except the identifier, the label and the
# category-typed columns.
X_train = train.drop(columns=["customer_ID", "target", *cat_columns])
X_val = val.drop(columns=["customer_ID", "target", *cat_columns])

In [46]:
# Label vectors for the two splits.
y_train, y_val = train["target"], val["target"]

In [47]:
# (rows, features) of the training feature matrix.
X_train.shape

(390076, 917)

In [48]:
# Convert both feature frames to TensorFlow tensors (train first, then validation).
X_train_, X_val_ = (tf.convert_to_tensor(frame) for frame in (X_train, X_val))

In [49]:
# Feed-forward binary classifier: three ReLU hidden layers (100, 50, 10 units,
# He-normal initialised), each followed by 50% dropout, and a single sigmoid
# output unit.
model = Sequential([
    Dense(100, activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(50, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.5),
    Dense(10, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [50]:
# Stop training once validation loss has failed to improve for 5 consecutive
# epochs. restore_best_weights rolls the model back to the best epoch; without
# it the model keeps the weights from the final (already degraded) epoch.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [51]:
# Binary cross-entropy objective optimised with Adam; accuracy is tracked as
# the only extra metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [52]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 100)               91800     
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 50)                5050      
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_6 (Dense)             (None, 10)                510       
                                                                 
 dropout_5 (Dropout)         (None, 10)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                

In [54]:
# Train for at most 20 epochs in batches of 512, holding back 20% of the
# training tensor for the validation loss that early stopping monitors, and
# report the wall-clock duration.
start = time.time()
history = model.fit(
    X_train_,
    y_train,
    batch_size=512,
    epochs=20,
    validation_split=0.2,
    callbacks=[es],
    verbose=0,
)
end = time.time()
print(f"Done in {end - start:.2f} seconds")

2022-06-25 02:27:27.792358: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-06-25 02:27:28.141738: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-06-25 02:27:33.806708: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Done in 49.17 seconds


In [55]:
# Loss and accuracy on the training split.
# NOTE(review): the recorded run returned roughly (0.5719, 0.7411). 0.7411 is
# 1 minus the ~0.2589 positive rate, and 0.5719 matches the log-loss of always
# predicting that rate, which suggests the network collapsed to a constant
# output. Confirm by looking at the spread of model.predict results.
loss, acc = model.evaluate(x=X_train_, y=y_train, verbose=0)
(loss, acc)

(0.5719102025032043, 0.7410658597946167)

In [56]:
# Loss and accuracy on the held-out validation split.
# NOTE(review): the recorded values (~0.5719, ~0.7411) coincide with what a
# constant base-rate predictor would score on a ~0.2589 positive rate; confirm
# the model is producing non-constant scores.
loss, acc = model.evaluate(x=X_val_, y=y_val, verbose=0)
(loss, acc)

(0.5719364285469055, 0.741069495677948)

In [None]:
# Network outputs for the validation split; column 0 is the single sigmoid unit.
model.predict(X_val_)[:, 0]



In [None]:
# Attach the network's score to each split. `assign` builds a new frame rather
# than writing into the objects returned by train_test_split, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
train = train.assign(nn_score=model.predict(X_train_)[:, 0])
val = val.assign(nn_score=model.predict(X_val_)[:, 0])

In [None]:
# Average network score in each split.
train["nn_score"].mean(), val["nn_score"].mean()

In [None]:
# Scores for the ROC comparison come from the network trained above. The
# LightGBM estimator this cell used to reference is not defined at this point
# in the notebook, so the original raised NameError on a fresh run.
y_train_pred = model.predict(X_train_)[:, 0]
y_val_pred = model.predict(X_val_)[:, 0]

In [None]:
# Draw train and validation ROC curves on one figure.
plot_roc_curves(
    [y_train, y_val],
    [y_train_pred, y_val_pred],
    labels=["Train", "Validation"],
    title="Train Validation ROC AUC",
)

In [None]:
# Plot feature importances of the LightGBM classifier and keep the returned
# frame (later cells read its "feature" and "feature_importance" columns).
# limit=50 presumably caps how many features are drawn — confirm in utils.eval_helpers.
# NOTE(review): `final_lgbm_clf` is first assigned in a later cell of this
# notebook, so this cell fails on a fresh Restart & Run All.
imp_df = plot_feature_importance(final_lgbm_clf.feature_name_, 
                                 final_lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=50)

In [None]:
# Importance of the feature named "dummy", used as the selection threshold.
# Raises IndexError if no such feature is present in imp_df.
feature_imp_thr = imp_df.loc[imp_df["feature"] == "dummy", "feature_importance"].values[0]

In [None]:
# Keep only features whose importance is strictly above the threshold, then
# show how many survived.
selected_features = imp_df.loc[imp_df["feature_importance"] > feature_imp_thr, "feature"].tolist()
len(selected_features)

#### Final x2 LGBM

In [None]:
# Build a LightGBM classifier from the keyword arguments in `best_params`.
# NOTE(review): `best_params` is not defined anywhere in the visible part of
# this notebook — confirm where it is supposed to come from.
final_lgbm_clf = LGBMClassifier(**best_params)

In [None]:
# Restrict both feature frames to the selected columns.
# Note: this rebinds X_train_ / X_val_, which earlier in the notebook held the
# tensors fed to the neural network.
X_train_ = X_train[selected_features]
X_val_ = X_val[selected_features]

In [None]:
# Fit the classifier on the reduced training features.
final_lgbm_clf.fit(X_train_, y_train)

In [None]:
# Positive-class probability (column 1 of predict_proba) for train, then validation.
y_train_pred_, y_val_pred_ = (
    final_lgbm_clf.predict_proba(features)[:, 1] for features in (X_train_, X_val_)
)

In [None]:
# ROC curves for the classifier fitted on the selected features.
plot_roc_curves(
    [y_train, y_val],
    [y_train_pred_, y_val_pred_],
    labels=["Train", "Validation"],
    title="Train Validation ROC AUC",
)

In [None]:
# Shape labels and predictions as positionally aligned single-column frames
# for amex_metric: labels get a fresh 0..n-1 index, predictions a
# "prediction" column.
y_train_df = y_train.reset_index(drop=True).to_frame()
y_train_pred_df = pd.DataFrame({"prediction": y_train_pred_})

In [None]:
# Score the training predictions with the project's amex_metric helper.
amex_metric(y_train_df, y_train_pred_df)

In [None]:
# Same framing as for the training split: labels with a fresh 0..n-1 index,
# predictions in a "prediction" column.
y_val_df = y_val.reset_index(drop=True).to_frame()
y_val_pred_df = pd.DataFrame({"prediction": y_val_pred_})

In [None]:
# Score the validation predictions with the project's amex_metric helper.
amex_metric(y_val_df, y_val_pred_df)

In [None]:
# num_list, num_features_list, train_score_list, val_score_list = [], [], [], []
# for i in tqdm(range(30)):
#     selected_features = imp_df.loc[imp_df["feature_importance"] > i]["feature"].tolist()
#     print(f"# of features: {len(selected_features)}")
    
#     X_train_new = train.loc[:, selected_features]
#     X_val_new = val.loc[:, selected_features]
    
#     lgbm_clf = LGBMClassifier(random_state=1020)
#     lgbm_clf.fit(X_train_new, y_train)
    
#     y_train_pred = lgbm_clf.predict_proba(X_train_new)[:, 1]
#     y_val_pred = lgbm_clf.predict_proba(X_val_new)[:, 1]
    
#     y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
#     y_train_pred_df = pd.DataFrame(y_train_pred).rename(columns={0: "prediction"})
#     y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
#     y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})
    
#     train_score = amex_metric(y_train_df, y_train_pred_df)
#     val_score = amex_metric(y_val_df, y_val_pred_df)
#     num_list.append(i)
#     num_features_list.append(len(selected_features))
#     train_score_list.append(train_score)
#     val_score_list.append(val_score)

In [None]:
# eval_df = pd.DataFrame(dict(index_=num_list, 
#                             num_feature=num_features_list, 
#                             train_score=train_score_list, 
#                             val_score=val_score_list))

In [None]:
# plt.figure(figsize=(17, 6))
# plt.plot(eval_df["index_"], eval_df["train_score"], label="Train")
# plt.plot(eval_df["index_"], eval_df["val_score"], label="Validation")
# plt.legend()
# plt.show()

In [None]:
# save model
# joblib.dump(final_lgbm_clf, '../models/lgbm_version1.pkl')

In [None]:
# load model
# NOTE(review): the cell that writes this file is commented out, so the pickle
# must already exist on disk from an earlier run. joblib.load unpickles
# arbitrary objects, so only load files you produced yourself.
loaded_lgbm_model = joblib.load('../models/lgbm_version1.pkl')

### Inference

In [None]:
# Test features restricted to the columns chosen during feature selection.
X_test = test_data[selected_features]

In [None]:
# Positive-class probability (column 1 of predict_proba) for every test row.
y_test_pred = loaded_lgbm_model.predict_proba(X_test)[:, 1]

In [None]:
# Store the scores on the test frame (adds/overwrites the "prediction" column in place).
test_data["prediction"] = y_test_pred

In [None]:
# Preview the first rows of the test frame.
test_data.head()

#### Submission

In [None]:
# Load the sample submission file from the raw-data directory as the template.
submission = pd.read_csv(f"{DATA_PATH}/sample_submission.csv")

In [None]:
# Discard the template's "prediction" column so the merge below can supply it.
submission = submission.drop(columns=["prediction"])

In [None]:
# Join predictions onto the submission template. A left join keeps every
# template row (the default inner join would silently drop customers that have
# no prediction), `validate` rejects duplicated customer_IDs on either side,
# and the assertion fails loudly if any row ended up without a score.
result = submission.merge(
    test_data[["customer_ID", "prediction"]],
    on="customer_ID",
    how="left",
    validate="one_to_one",
)
assert result["prediction"].notna().all(), "some submission customers have no prediction"

In [None]:
# result.to_csv(f"{SUBMISSION_DATA_PATH}/submission4.csv", index=False)