In [1]:
############################# IMPORT LIBRARY  #################################
import os
import random
import re
from tqdm.notebook import tqdm
from collections import Counter
from datetime import datetime
import argparse
import pickle
import logging
import numpy as np
import pandas as pd

# https://contrib.scikit-learn.org/category_encoders/index.html
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, precision_score, recall_score

import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

import pycaret
from pycaret.classification import *

pd.options.display.max_columns = 200
from utils import *

In [4]:
#######################   CONFIG  #######################
def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool("False")``, which is ``True``
    for every non-empty string, so a flag could never be switched off from the
    command line. This converter maps the usual spellings explicitly.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"true", "t", "yes", "y", "1"}:
        return True
    # the empty string stays falsy, as it was with ``type=bool``
    if token in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser(description='Anomaly Detection')

parser.add_argument('--data_path', type=str, default='./data')
parser.add_argument('--seed',type=int, default=110)

parser.add_argument('--model', type=str, default='cat')

parser.add_argument('-en', '--encoder', type=str, default='ohe')
parser.add_argument('-s', '--scaler', type=str, default='mms')

# Integer option -> resampling method name passed to the resampling helpers
downsample_options = {1:"nearmiss", 2:"cluster", 3:"allknn", 4:"oneside", 5:"tomek"}
parser.add_argument('-ds', '--downsampling', type=int, default=5) # 5 -> tomek

upsample_options = {1: "random", 2:"smote", 3:"adasyn", 4:"smotenc", 5:"smoten", 6:"borderline", 7:"kmeans", 8:"svm"}
# NOTE(review): the old comment here said "SMOTE-NC", but 3 maps to "adasyn" (SMOTE-NC is 4) — confirm the intended default
parser.add_argument('-us', '--upsampling', type=int, default=3) # 3 -> adasyn

parser.add_argument('--fs_mode', type=_str2bool, default=False, help='feature selection T:auto F:manual')
parser.add_argument('--estimator', type=str, default='extra', help="using for feature selection")
parser.add_argument('--selector', type=str, default='sfm', help='auto feature selector')

parser.add_argument('--check_all', type=_str2bool, default=False)
parser.add_argument('--tune_mode', type=_str2bool, default=True, help='optuna tuning')

# An empty argv keeps the defaults and stops argparse from reading the kernel's own arguments
config = parser.parse_args([])

# Tag describing the preprocessing combination; used in experiment and file names
exp_config = f"{config.encoder}_{config.scaler}_{downsample_options[config.downsampling]}_{upsample_options[config.upsampling]}"

random.seed(config.seed)
np.random.seed(config.seed)

In [3]:
#######################   LOAD DATA  #######################
# Read the train and test CSVs from the configured data directory.
df_tr, df_te = (
    pd.read_csv(os.path.join(config.data_path, csv_name))
    for csv_name in ("train_v2.csv", "test_v2.csv")
)
# Both frames in one list so the same change can be applied to each in a loop
df_list = [df_tr, df_te]

In [14]:
############################  FEATURE HANDLING  ###########################
## CATEGORICAL FEATURES
cat_features = ["Equipment_Dam",
                "Equipment_Fill1",
                "Equipment_Fill2",
                "Model.Suffix",
                "Workorder Category",
                "Chamber Temp. Judge Value_AutoClave"]

## BINNING FEATURES
# Every column whose name contains "Bins"
bins_features = df_tr.columns[df_tr.columns.str.contains(r".*Bins.*")].tolist()
# Columns the Bins columns were built from (the same name with "Bins" stripped out)
from_bins_features = [re.sub(r'\s*Bins\s*', '', f).strip() for f in bins_features]

cat_features.extend(bins_features)

# Cast the categorical columns in both the train and the test frame
for df in df_list:
    df[cat_features] = df[cat_features].astype("category")

## NUMERICAL FEATURES
# Everything that is not categorical, minus the label column
num_features = df_tr.select_dtypes(exclude=["category"]).columns.to_list()
num_features.remove("target")

## ALL FEATURES
all_features = num_features + cat_features

## TARGET ENCODING
# Guarded so the cell can be re-run: mapping the already-encoded 0/1 labels
# through the string dictionary again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df_tr["target"]):
    df_tr["target"] = df_tr["target"].map({"Normal": 0, "AbNormal": 1})

## DATA SPLITTING
X_tr, y_tr = df_tr.drop("target", axis=1), df_tr["target"]
X_te = df_te.drop("Set ID", axis=1)

## Disabled: replace spaces in column names with underscores
# for df in [X_tr, X_te]:
#     df.columns = df.columns.str.replace(' ', '_')

In [20]:
#############################  FEATURE ENCODING/SCALING ###########################
## ENCODING
# Encode the categorical columns with the encoder named in `config.encoder`.
# Encoders are fitted on the training frame only and then applied to the test frame.
if config.encoder == "le":
    le = LabelEncoder()
    for cat_feature in cat_features:
        # the same encoder object is re-fitted for every column
        X_tr[cat_feature] = le.fit_transform(X_tr[cat_feature])
        X_te[cat_feature] = le.transform(X_te[cat_feature])

elif config.encoder == "js":
    js = ce.JamesSteinEncoder(cols=cat_features)

    X_tr = js.fit_transform(X_tr, y_tr)
    X_te = js.transform(X_te)

elif config.encoder == "woe":
    woe = ce.WOEEncoder(cols=cat_features)

    X_tr = woe.fit_transform(X_tr, y_tr)
    X_te = woe.transform(X_te)

elif config.encoder == "ohe":
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    # The encoded frames reuse the index of the frame they came from; with a
    # default RangeIndex, `concat(axis=1)` would align on labels and produce
    # NaN-padded rows whenever the source index is not 0..n-1.
    encoded_tr = ohe.fit_transform(X_tr[cat_features])
    encoded_df_tr = pd.DataFrame(encoded_tr, columns=ohe.get_feature_names_out(), index=X_tr.index)
    X_tr = pd.concat([X_tr[num_features], encoded_df_tr], axis=1)

    encoded_te = ohe.transform(X_te[cat_features])
    encoded_df_te = pd.DataFrame(encoded_te, columns=ohe.get_feature_names_out(), index=X_te.index)
    X_te = pd.concat([X_te[num_features], encoded_df_te], axis=1)

In [23]:
## SCALING
# Build the scaler named in `config.scaler`; an unrecognised name leaves the data unscaled.
num_scaler = None
if config.scaler == "mms":
    num_scaler = mms = MinMaxScaler()
elif config.scaler == "ss":
    num_scaler = ss = StandardScaler()
elif config.scaler == "qt":
    num_scaler = qt = QuantileTransformer(
        n_quantiles=min(100, len(X_tr) // 5),  # n_quantiles = 1000
        output_distribution='normal',
        random_state=config.seed,
    )
elif config.scaler == "pt":
    num_scaler = pts = PowerTransformer(method='yeo-johnson')

# Fit on the training numerical columns, then apply the same transform to the test set
if num_scaler is not None:
    X_tr[num_features] = num_scaler.fit_transform(X_tr[num_features])
    X_te[num_features] = num_scaler.transform(X_te[num_features])

In [None]:
#################################  DOWN SAMPLING  ###############################
downsampled_df_tr = resampling.downsample(X_tr, y_tr, method=downsample_options[config.downsampling], random_seed=config.seed)

#################################  UP SAMPLING  ###############################
X_tr = downsampled_df_tr.drop("target", axis=1)
y_tr = downsampled_df_tr["target"]

# Positions are taken from the feature frame that is actually passed to
# `upsample`, so they stay valid wherever "target" sits in `downsampled_df_tr`.
if all(col in X_tr.columns for col in cat_features):
    cat_idx = [X_tr.columns.get_loc(col) for col in cat_features]
else:
    # After one-hot encoding the raw categorical columns are gone and looking
    # them up raised KeyError. Columns that are not numerical features are then
    # taken to be the encoded categorical ones.
    # NOTE(review): assumes `downsample` adds no columns besides "target" — confirm.
    numeric_cols = set(num_features)
    cat_idx = [pos for pos, col in enumerate(X_tr.columns) if col not in numeric_cols]

upsampled_df_tr = resampling.upsample(X_tr, y_tr, cat_idx=cat_idx, method=upsample_options[config.upsampling], random_seed=config.seed)

In [None]:
## RESAMPLED DATA
# Continue with the up-sampled frame, split back into features and label.
# Alternative (disabled): use `downsampled_df_tr` instead to skip up-sampling.
X_tr, y_tr = upsampled_df_tr.drop("target", axis=1), upsampled_df_tr["target"]

In [None]:
###############################  FEATURE SELECTION  ############################
if config.fs_mode:
    # Automatic selection driven by the estimator named in the config
    estimator = classifiers[config.estimator]
    estimator.fit(X_tr, y_tr)

    selectors = {
        'rfe': RFE(estimator=estimator, n_features_to_select=50),
        'sfm': SelectFromModel(estimator=estimator, threshold="mean"),
        'kbest': SelectKBest(score_func=f_classif,),
    }

    selector = selectors[config.selector]

    X_tr_selec = selector.fit_transform(X_tr, y_tr)
    X_te_selec = selector.transform(X_te)

else:
    # Manual selection: use the Bins columns instead of the original columns they were built from
    excluded = set(from_bins_features)
    selected_features = [feature for feature in all_features if feature not in excluded]

    if any(feature not in X_tr.columns for feature in selected_features):
        # One-hot encoding replaced the raw categorical columns, so indexing by
        # their names raised KeyError. Keep the features that still exist and
        # add the encoded columns that took their place.
        known = set(all_features)
        selected_features = [feature for feature in selected_features if feature in X_tr.columns]
        selected_features += [col for col in X_tr.columns if col not in known and col not in excluded]

    X_tr_selec = X_tr[selected_features]
    X_te_selec = X_te[selected_features]

print("FEATURE SELECTION")
print("Before ", X_tr.shape)
print("After ", X_tr_selec.shape, end='\n')

In [None]:
############################  AutoML  ###########################
exp_name = f"exp_{exp_config}_only_upsample"

# Options for the PyCaret experiment, run on the up-sampled training frame.
# Tried but currently disabled: transformation=True, low_variance_threshold=0.1,
# feature_selection_method="classic", n_features_to_select=0.7, normalize_method.
setup_options = dict(
    data=upsampled_df_tr,
    target="target",
    session_id=config.seed,
    log_experiment=True,
    experiment_name=exp_name,
    fold=10,
    normalize=True,
    remove_outliers=True,
    feature_selection=True,
    remove_multicollinearity=True,
)
classification = setup(**setup_options)
set_config("seed", config.seed)

# Rank the candidate models by F1 and keep the best three.
# Other knobs noted for later: sort="AUC", budget_time=0.5,
# probability_threshold=0.25, include=['lr', 'dt', 'lightgbm'].
best_model_top3 = compare_models(sort='F1', n_select=3)

# Disabled experiments:
# catboost = create_model('catboost',fold = 20, return_train_score = True)
# tune_catboost =tune_model(catboost, optimize = 'MAE')
# plot_model(best, plot = 'auc')
# plot_model(best, plot = 'confusion_matrix')
# blended = blend_models(estimator_list=best_model_top3, fold=5, method='soft')

In [None]:
# Keep whatever `pull()` returns at this point for later inspection
results = pull()
# ada = create_model("ada")

# Soft-voting blend of the three selected models, weighted in the order they were returned
blend_weights = [0.5, 0.3, 0.2]
blended_top3 = blend_models(
    estimator_list=best_model_top3,
    fold=10,
    method="soft",
    weights=blend_weights,
)

In [None]:
# Tune the blended model for F1. The blend is stored as `blended_top3`; the name
# used here before (`blended_soft_top3`) is not assigned anywhere in the visible
# notebook and raised NameError on a fresh kernel.
tuned_blended_soft_top3 = tune_model(blended_top3, optimize="F1")

In [None]:
# Finalize the tuned blend and predict the test set with the finalized model.
# Before, `final_model` was computed but the non-finalized estimator was passed
# to `predict_model`, so the finalization had no effect on the predictions.
final_model = finalize_model(tuned_blended_soft_top3)
final_preds = predict_model(final_model, data=X_te)

In [None]:
# Fill the submission template with the predicted labels, converted back to their string names
submission_template = os.path.join(config.data_path, "submission.csv")
df_sub = pd.read_csv(submission_template)

label_names = {0 : "Normal", 1 : "AbNormal"}
df_sub["target"] = final_preds["prediction_label"]
df_sub["target"] = df_sub["target"].map(label_names)

# Quick look at the predicted class balance
print('=============================')
print(df_sub["target"].value_counts())

# Timestamp and preprocessing tag make every submission file name distinct
curr_date = datetime.now().strftime("%m-%d_%H-%M-%S")
submission_name = f"submission_{curr_date}_{exp_config}.csv"

# Disabled: persist / reload the fitted classifier
# pickle.dump(final_clf, open(f"{config.model}_{curr_date}.pkl", "wb"))
# final_clf = pickle.load(open(".pkl", "rb"))
df_sub.to_csv(os.path.join(config.data_path, submission_name), index=False)

In [None]:
# Start the MLflow UI through an IPython shell escape, presumably to browse the
# runs recorded by the experiment set up with `log_experiment=True`.
# NOTE(review): a server started this way likely keeps the cell busy until it is interrupted — confirm.
!mlflow ui

In [None]:
# pred_holdout = predict_model(blended, data=X_te)
# save_model(best, 'my_best_pipeline')
# loaded_model = load_model('my_best_pipeline')