In [None]:
import matplotlib.pyplot as plt
import gc
import os
import sys

In [None]:
sys.path.append("../")

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from tqdm import tqdm

In [None]:
from utils.eval_helpers import plot_roc_curves, plot_feature_importance, amex_metric, amex_metric_np
from utils.eda_helpers import plot_missing_proportion_barchart

In [None]:
DATA_PATH = "../raw_data"
os.listdir(DATA_PATH)

In [None]:
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
CATEGORY_COLUMNS = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [None]:
%load_ext autoreload
%autoreload

### Read Data

In [None]:
train_data = pd.read_feather(f"{DATA_PATH}/train_data.ftr")
test_data = pd.read_feather(f"{DATA_PATH}/test_data.ftr")

In [None]:
train_data["S_2"] = pd.to_datetime(train_data["S_2"])
test_data["S_2"] = pd.to_datetime(test_data["S_2"])

In [None]:
train_data.shape, test_data.shape

In [None]:
train_data.columns

In [None]:
train_labels = pd.read_csv(f"{DATA_PATH}/train_labels.csv")

In [None]:
train_labels.shape

In [None]:
train_labels.columns

### Simple Preprocessing

In [None]:
train_data = train_data.sort_values(by=["customer_ID", "S_2"])
test_data = test_data.sort_values(by=["customer_ID", "S_2"])

In [None]:
train_data = train_data.merge(train_labels, on="customer_ID", how="left")

In [None]:
train = train_data.copy()
test = test_data.copy()

#### EDA & Data Cleaning

#### Missing value analysis (Pre Simple Imputation)

In [None]:
missing_prop_df = plot_missing_proportion_barchart(train)

#### Simple Imputation

In [None]:
def create_has_col(df, col):
    """Add a 0/1 presence indicator for ``col`` to ``df``.

    The new column is named ``has_<col>`` and holds 1 where ``df[col]`` is
    non-null and 0 where it is null.

    Args:
        df: DataFrame to extend; it is modified in place.
        col: Name of the column whose presence is flagged.

    Returns:
        The same ``df`` object, with the indicator column added.
    """
    indicator_name = f"has_{col}"
    # Boolean "is present" mask cast straight to an integer 0/1 flag.
    df[indicator_name] = df[col].notnull().astype("int64")
    return df

In [None]:
def create_sign_col(df, col):
    """Add a ``<col>_sign`` column holding the sign of ``df[col]``.

    Values are -1.0 for negatives, 0.0 for zero and 1.0 for positives; missing
    values stay missing. The sign is computed with a vectorized ``np.sign``
    instead of a per-row Python lambda, which also yields +/-1 for infinite
    inputs (the former ``x / abs(x)`` produced NaN for them).

    Args:
        df: DataFrame to extend; it is modified in place.
        col: Name of the numeric column to take the sign of.

    Returns:
        The same ``df`` object, with the float64 sign column added.
    """
    sign_col = f"{col}_sign"
    # Cast to float64 so the output dtype does not depend on the input dtype.
    df[sign_col] = np.sign(df[col]).astype("float64")
    return df

In [None]:
def apply_all_fillna(df):
    """Run the notebook's simple imputation steps on ``df``.

    Three steps are applied in order:
      1. nulls in a fixed set of columns are replaced with 0;
      2. a ``has_<col>`` presence flag is added for a fixed set of columns
         (via ``create_has_col``);
      3. a ``<col>_sign`` column is added for ``B_39`` (via ``create_sign_col``).

    Args:
        df: DataFrame containing the referenced columns; it is modified in place.

    Returns:
        The DataFrame returned by the last helper call.
    """
    zero_fill_columns = ["D_87", "D_88", "B_39", "B_42"]
    presence_flag_columns = [
        "D_110", "D_111", "D_132", "D_134", "D_135", "D_136", "D_137", "D_138",
        "R_9",
    ]
    sign_columns = ["B_39"]

    # Step 1: plain zero fill.
    for name in zero_fill_columns:
        df[name] = df[name].fillna(0)

    # Step 2: presence indicators for the listed columns.
    for name in presence_flag_columns:
        df = create_has_col(df, col=name)

    # Step 3: sign feature (B_39 was zero-filled in step 1).
    for name in sign_columns:
        df = create_sign_col(df, col=name)

    return df

In [None]:
# ! Fill NA
train = apply_all_fillna(train)
test = apply_all_fillna(test)

In [None]:
train.loc[~train["D_134"].isnull()][["D_134", "D_135", "D_136", "D_137", "D_138"]]

In [None]:
train[["has_D_132", "has_D_134", "has_D_135", "has_D_136", "has_D_137", "has_D_138"]].corr()

In [None]:
amex_metric_np

In [None]:
train.shape

In [None]:
train.groupby("target")["B_29"].mean()

In [None]:
train.loc[train["R_9"].isnull()]["target"].mean()

In [None]:
train.loc[~train["R_9"].isnull()]["target"].mean()

In [None]:
train.loc[train["B_39"].apply(lambda x: abs(x - 0) < 1e-5)]["B_39"]

In [None]:
train["R_9"].describe()

In [None]:
test["D_73"].describe()

In [None]:
train["B_39"].value_counts()

#### Missing value analysis (Post Simple Imputation)

In [None]:
missing_prop_df = plot_missing_proportion_barchart(train)

In [None]:
for column in missing_prop_df.iloc[:20]["column"].tolist():
    print(column, "\n", high_missing_train_df[column].value_counts(), "\n")

In [None]:
high_missing_count_columns = 

In [None]:
high_missing_train_df = train_data.loc[:, high_missing_count_columns + ["target"]]

In [None]:
high_missing_train_df.loc[high_missing_train_df["D_87"].isnull()]["target"].mean()

In [None]:
high_missing_train_df.loc[~high_missing_train_df["D_87"].isnull()]["target"].mean()

In [None]:
train_cols_unique_count = train_data.nunique()

In [None]:
less_unique_columns = train_cols_unique_count[train_cols_unique_count <= 300].index.tolist()
print(less_unique_columns)

In [None]:
for column in less_unique_columns:
    print(f"Column {column}")
    print(train_data[column].unique(), end="\n\n")

In [None]:
# Columns that must never be treated as model features: the identifiers, the
# statement date, and the label. `target` was merged into train_data earlier and
# has only two distinct values, so without this exclusion it would be picked up
# as a "categorical feature" (leaking the label into the aggregates and failing
# on test_data, which has no such column).
non_feature_cols = {"customer_ID", "cid", "S_2", "target"}
all_cols = [c for c in train_data.columns if c not in non_feature_cols]
# Low-cardinality columns are treated as categorical, everything else as numeric.
cat_features = [col for col in less_unique_columns if col not in non_feature_cols]
num_features = [col for col in all_cols if col not in cat_features]

In [None]:
len(all_cols), len(cat_features), len(num_features)

In [None]:
def get_agg_summary(original_df):
    """Summarise the statement history of each customer.

    Args:
        original_df: DataFrame with a ``customer_ID`` column and a datetime
            ``S_2`` column.

    Returns:
        One row per ``customer_ID`` with columns ``num_records`` (count of
        non-null ``S_2``), ``max_date``, ``min_date``, ``days`` (whole days
        between first and last date) and ``record_per_day``.

    Note:
        ``record_per_day`` is computed as ``days / num_records``, i.e. days per
        record despite its name.
    """
    per_customer = (
        original_df.groupby("customer_ID")
        .agg(
            num_records=("S_2", "count"),
            max_date=("S_2", "max"),
            min_date=("S_2", "min"),
        )
        .reset_index()
    )
    history_span = per_customer["max_date"] - per_customer["min_date"]
    per_customer["days"] = history_span.dt.days
    per_customer["record_per_day"] = per_customer["days"] / per_customer["num_records"]
    return per_customer

In [None]:
train_agg_summary = get_agg_summary(train_data)
test_agg_summary = get_agg_summary(test_data)

In [None]:
def set_category_columns(df):
    """Placeholder: not implemented yet.

    Currently ignores ``df`` and returns ``None`` without modifying anything.
    """
    return None

In [None]:
# Preview the low-cardinality columns (the original expression ended in a
# dangling "." and did not parse).
train_data.loc[:, cat_features].head()

#### Aggregation

#### Numeric features

In [None]:
train_data_agg = train_data.groupby("cid")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])

In [None]:
train_data_agg.columns = ['_'.join(x) for x in train_data_agg.columns]

In [None]:
test_data_agg = test_data.groupby("cid")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])

In [None]:
test_data_agg.columns = ['_'.join(x) for x in test_data_agg.columns]

#### Categorical features

In [None]:
train_cat_agg = train_data.groupby("cid")[cat_features].agg(['count', 'last', 'nunique'])
test_cat_agg = test_data.groupby("cid")[cat_features].agg(['count', 'last', 'nunique'])

In [None]:
train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

In [None]:
train_agg = pd.concat([train_data_agg, train_cat_agg], axis=1)
del train_data_agg, train_cat_agg

In [None]:
train_agg = train_agg.reset_index()

In [None]:
train_agg.shape

In [None]:
test_agg = pd.concat([test_data_agg, test_cat_agg], axis=1)
del test_data_agg, test_cat_agg

In [None]:
test_agg = test_agg.reset_index()

In [None]:
test_agg.shape

### Train Val Split

In [None]:
train_labels.insert(0, "cid", train_labels['customer_ID'].apply(hash).astype('int64'))
train_labels.head(5)

In [None]:
train_labels = train_labels.sort_values(by="cid").reset_index(drop=True)

In [None]:
train_ = train_agg.merge(train_labels[["cid", "target"]], on="cid", how="left")

In [None]:
# Pure-noise column used as a baseline when judging feature importance. It is
# drawn from a seeded generator so the column (and everything trained on it) is
# identical on every Restart & Run All.
train_["dummy"] = np.random.default_rng(1020).standard_normal(train_.shape[0])

In [None]:
train, val = train_test_split(train_, test_size=0.2, random_state=1020, stratify=train_["target"])

In [None]:
train["target"].mean(), val["target"].mean()

### LGBM Model

In [None]:
X_train = train.drop(columns=["cid", "target"])
X_val = val.drop(columns=["cid", "target"])

In [None]:
y_train = train["target"]
y_val = val["target"]

In [None]:
# Bayesian hyper-parameter search over a LightGBM classifier.
# Each tuple is a (low, high) range for the named parameter.
# 10 candidate settings are evaluated with 5-fold CV and scored by F-beta
# (beta=2) on the predicted labels.
# random_state is fixed on the search itself (not only on the estimator) so the
# sampled candidates are the same on every run.
opt = BayesSearchCV(
    LGBMClassifier(random_state=1020),
    {
        'learning_rate': (0.01, 0.1),
        'num_leaves': (31, 127),
        'max_depth': (4, 15),
        'min_child_samples': (15, 63),
        'n_estimators': (50, 150),
        'subsample': (0.7, 0.9),
        'subsample_freq': (2, 5),
        'colsample_bytree': (0.7, 0.9),
        'reg_lambda': (0, 10),
        'min_split_gain': (0, 0.05),
    },
    n_iter=10,
    cv=5,
    scoring=make_scorer(fbeta_score, beta=2),
    random_state=1020,
)

In [None]:
opt.fit(X_train, y_train)

In [None]:
lgbm_clf = LGBMClassifier(random_state=1020)

In [None]:
lgbm_clf.fit(X_train, y_train)

In [None]:
y_train_pred = lgbm_clf.predict_proba(X_train)[:, 1]
y_val_pred = lgbm_clf.predict_proba(X_val)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Test"], 
                title="Train Test ROC AUC")

In [None]:
imp_df = plot_feature_importance(lgbm_clf.feature_name_, 
                                 lgbm_clf.feature_importances_, 
                                 title="Feature Importance",
                                 limit=50)

In [None]:
# Sweep an importance threshold i = 0..29: keep only the features whose
# importance (from imp_df) is strictly greater than i, refit a default
# LightGBM model on them and record the train/validation amex_metric scores.
# Note: this loop rebinds lgbm_clf, selected_features, X_*_new and y_*_pred;
# after it finishes they hold the values from the last iteration (i = 29).
num_list, num_features_list, train_score_list, val_score_list = [], [], [], []
for i in tqdm(range(30)):
    selected_features = imp_df.loc[imp_df["feature_importance"] > i]["feature"].tolist()
    print(f"# of features: {len(selected_features)}")
    
    X_train_new = train.loc[:, selected_features]
    X_val_new = val.loc[:, selected_features]
    
    lgbm_clf = LGBMClassifier(random_state=1020)
    lgbm_clf.fit(X_train_new, y_train)
    
    # Probability of the positive class.
    y_train_pred = lgbm_clf.predict_proba(X_train_new)[:, 1]
    y_val_pred = lgbm_clf.predict_proba(X_val_new)[:, 1]
    
    # Wrap labels and predictions in DataFrames with a fresh 0..n-1 index;
    # the prediction column is renamed to "prediction".
    # NOTE(review): presumably this is the layout amex_metric expects — confirm
    # against utils.eval_helpers.
    y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
    y_train_pred_df = pd.DataFrame(y_train_pred).rename(columns={0: "prediction"})
    y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
    y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})
    
    train_score = amex_metric(y_train_df, y_train_pred_df)
    val_score = amex_metric(y_val_df, y_val_pred_df)
    # One entry per threshold, consumed by eval_df in the next cell.
    num_list.append(i)
    num_features_list.append(len(selected_features))
    train_score_list.append(train_score)
    val_score_list.append(val_score)

In [None]:
eval_df = pd.DataFrame(dict(index_=num_list, 
                            num_feature=num_features_list, 
                            train_score=train_score_list, 
                            val_score=val_score_list))

In [None]:
# eval_df

In [None]:
plt.figure(figsize=(17, 6))
plt.plot(eval_df["index_"], eval_df["train_score"], label="Train")
plt.plot(eval_df["index_"], eval_df["val_score"], label="Validation")
plt.legend()
plt.show()

In [None]:
# Final feature set: importance strictly above 5. The random "dummy" column is
# dropped explicitly — it only exists on the training frame as a noise baseline,
# so keeping it would make the later `test_agg.loc[:, selected_features]` fail.
selected_features = [
    feature
    for feature in imp_df.loc[imp_df["feature_importance"] > 5, "feature"].tolist()
    if feature != "dummy"
]
len(selected_features)

In [None]:
X_train_new = train.loc[:, selected_features]
X_val_new = val.loc[:, selected_features]

In [None]:
lgbm_clf.fit(X_train_new, y_train)

In [None]:
y_train_pred = lgbm_clf.predict_proba(X_train_new)[:, 1]
y_val_pred = lgbm_clf.predict_proba(X_val_new)[:, 1]

In [None]:
plot_roc_curves([y_train, y_val], 
                [y_train_pred, y_val_pred], 
                labels=["Train", "Test"], 
                title="Train Test ROC AUC")

In [None]:
y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
y_train_pred_df = pd.DataFrame(y_train_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_train_df, y_train_pred_df)

In [None]:
y_val_df = pd.DataFrame(y_val).reset_index(drop=True)
y_val_pred_df = pd.DataFrame(y_val_pred).rename(columns={0: "prediction"})

In [None]:
amex_metric(y_val_df, y_val_pred_df)

### Inference

In [None]:
X_test = test_agg.drop(columns=["cid"])

In [None]:
X_test = test_agg.loc[:, selected_features]

In [None]:
y_test_pred = lgbm_clf.predict_proba(X_test)[:, 1]

In [None]:
test_agg["prediction"] = y_test_pred

#### Submission

In [None]:
submission = pd.read_csv(f"{DATA_PATH}/sample_submission.csv")

In [None]:
submission["cid"] = submission['customer_ID'].apply(hash).astype('int64')
submission = submission.drop(columns="prediction")

In [None]:
result = submission.merge(test_agg[["cid", "prediction"]], on="cid").drop(columns="cid")

In [None]:
result.to_csv(f"{SUBMISSION_DATA_PATH}/submission3.csv", index=False)