In [None]:
############################# IMPORT LIBRARY  #################################
import os
import random
import re
from tqdm.notebook import tqdm
from collections import Counter
from datetime import datetime
import argparse
import pickle
import logging
import numpy as np
import pandas as pd

# https://contrib.scikit-learn.org/category_encoders/index.html
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, precision_score, recall_score

import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

import pycaret
from pycaret.classification import *

pd.options.display.max_columns = 200
from utils import *

In [None]:
!pip install jupyter notebook

In [2]:
#######################   CONFIG  #######################
parser = argparse.ArgumentParser(description='Anomaly Detection')

parser.add_argument('--data_path', type=str, default='./data')
parser.add_argument('--seed',type=int, default=110)

parser.add_argument('--model', type=str, default='cat')

parser.add_argument('-en', '--encoder', type=str, default='js')
parser.add_argument('-s', '--scaler', type=str, default='mms')

downsample_options = {1:"nearmiss", 2:"cluster", 3:"allknn", 4:"oneside", 5:"tomek"}
parser.add_argument('-ds', '--downsampling', type=int, default=5) # TOMEK

upsample_options = {1: "random", 2:"smote", 3:"adasyn", 4:"smotenc", 5:"smoten", 6:"borderline", 7:"kmeans", 8:"svm"}
parser.add_argument('-us', '--upsampling', type=int, default=4) # SMOTEE - NC

parser.add_argument('--fs_mode', type=bool, default=False, help='feature selection T:auto F:manual')
parser.add_argument('--estimator', type=str, default='extra', help="using for feature selection")
parser.add_argument('--selector', type=str, default='sfm', help='auto feature selector')

parser.add_argument('--check_all', type=bool, default=False)
parser.add_argument('--tune_mode', type=bool, default=True, help='optuna tuning')

config = parser.parse_args([])

exp_config = f"{config.encoder}_{config.scaler}_{downsample_options[config.downsampling]}_{upsample_options[config.upsampling]}"

random.seed(config.seed)
np.random.seed(config.seed)

In [3]:
#######################   LOAD DATA  #######################
df_tr = pd.read_csv(os.path.join(config.data_path, "train_v2.csv"))
df_te = pd.read_csv(os.path.join(config.data_path, "test_v2.csv"))
df_list = [df_tr, df_te]

In [4]:
############################  FEATURE HANDLING  ###########################
## CATEGORICAL FEATURES
cat_features = ["Equipment_Dam",
                "Equipment_Fill1",
                "Equipment_Fill2",
                "Model.Suffix",
                "Workorder Category",
                "Chamber Temp. Judge Value_AutoClave"]

## BINNING FEATURES
bins_features = df_tr.columns[df_tr.columns.str.contains(r".*Bins.*")].tolist()
# Bins 열 만드는 데 사용된 열
from_bins_features = [re.sub(r'\s*Bins\s*', '', f).strip() for f in bins_features]

cat_features.extend(bins_features)

for df in df_list:
    df[cat_features] = df[cat_features].astype("category")

## NUMERICAL FEATURES
num_features = df_tr.select_dtypes(exclude=["category"]).columns.to_list()
num_features.remove("target")

## ALL FEATURES
all_features = num_features + cat_features

## TARGET ENCODING
df_tr["target"] = df_tr["target"].map({"Normal": 0, "AbNormal": 1})
    
## DATA SPLITTING 
X_tr, y_tr = df_tr.drop("target", axis=1), df_tr["target"]
X_te = df_te.drop("Set ID", axis=1)

In [5]:
#############################  FEATURE ENCODING/SCALING ###########################
## ENCODING
if config.encoder == "le":
    le = LabelEncoder()
    for cat_feature in cat_features:
        X_tr[cat_feature] = le.fit_transform(X_tr[cat_feature])
        X_te[cat_feature] = le.transform(X_te[cat_feature])
        
elif config.encoder == "js":
    js = ce.JamesSteinEncoder(cols=cat_features)
    
    X_tr = js.fit_transform(X_tr, y_tr)
    X_te = js.transform(X_te)
    
elif config.encoder == "woe":
    woe = ce.WOEEncoder(cols=cat_features)
    
    X_tr = woe.fit_transform(X_tr, y_tr)
    X_te = woe.transform(X_te)
    
elif config.encoder == "ohe": 
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    
    encoded_tr = ohe.fit_transform(X_tr[cat_features])
    encoded_df_tr = pd.DataFrame(encoded_tr, columns=ohe.get_feature_names_out())
    X_tr = pd.concat([X_tr[num_features], encoded_df_tr], axis=1)
    
    encoded_te = ohe.transform(X_te[cat_features])
    encoded_df_te = pd.DataFrame(encoded_te, columns=ohe.get_feature_names_out())
    X_te = pd.concat([X_te[num_features], encoded_df_te], axis=1)

In [6]:
## SCALING
if config.scaler == "mms":
    mms = MinMaxScaler()
    X_tr[num_features] = mms.fit_transform(X_tr[num_features])
    X_te[num_features] = mms.transform(X_te[num_features])
    
elif config.scaler == "ss":
    ss = StandardScaler()
    X_tr[num_features] = ss.fit_transform(X_tr[num_features])
    X_te[num_features] = ss.transform(X_te[num_features])
    
elif config.scaler == "qt":
    qt = QuantileTransformer(random_state=config.seed, output_distribution='normal', n_quantiles=min(100, len(X_tr) // 5)) # n_quantiles = 1000
    
    X_tr[num_features] = qt.fit_transform(X_tr[num_features])
    X_te[num_features] = qt.transform(X_te[num_features])

elif config.scaler == "pt":
    pts = PowerTransformer(method='yeo-johnson')
    
    X_tr[num_features] = pts.fit_transform(X_tr[num_features])
    X_te[num_features] = pts.transform(X_te[num_features])

In [5]:
#################################  DOWN SAMPLING  ###############################
# downsampled_df_tr = resampling.downsample(X_tr, y_tr, method=downsample_options[config.downsampling], random_seed=config.seed)

#################################  UP SAMPLING  ###############################
# cat_idx = [downsampled_df_tr.columns.get_loc(col) for col in cat_features]
cat_idx = [X_tr.columns.get_loc(col) for col in cat_features]
# X_tr = downsampled_df_tr.drop("target", axis=1)
# y_tr = downsampled_df_tr["target"]

upsampled_df_tr = resampling.upsample(X_tr, y_tr, cat_idx=cat_idx, method=upsample_options[config.upsampling], random_seed=config.seed)

UP SAMPLNG
Original dataset shape Counter({0: 38155, 1: 2350})
Resampled dataset shape Counter({0: 38155, 1: 38155})


In [6]:
## RESAMPLED DATA
# X_tr = downsampled_df_tr.drop("target", axis=1)
# y_tr = downsampled_df_tr["target"]

X_tr = upsampled_df_tr.drop("target", axis=1)
y_tr = upsampled_df_tr["target"]

In [10]:
################ MODEL ############### 
classifiers = {
    "cat": CatBoostClassifier(random_state=config.seed, auto_class_weights="Balanced"),
    "lgbm": LGBMClassifier(random_state=config.seed,),
    "xgb": XGBClassifier(random_state=config.seed, eval_metric='auc', objective="binary:logistic"),
    "ada": AdaBoostClassifier(random_state=config.seed),
    "rfc": RandomForestClassifier(random_state=config.seed, class_weight='balanced'),
    "lr": LogisticRegression(random_state=config.seed),
    "extra": ExtraTreesClassifier(random_state=config.seed)
}

In [9]:
###############################  FEATURE SELECTION  ############################
if config.fs_mode:
    estimator = classifiers[config.estimator]
    estimator.fit(X_tr, y_tr)
    
    selectors = {
        'rfe': RFE(estimator=estimator, n_features_to_select=50),
        'sfm': SelectFromModel(estimator=estimator, threshold="mean"),
        'kbest': SelectKBest(score_func=f_classif,),
    }
    
    selector = selectors[config.selector]
    
    X_tr_selec = selector.fit_transform(X_tr, y_tr)
    X_te_selec = selector.transform(X_te)
    
else:
    # 기존 열 대신 Bins 열 사용
    selected_features = [feature for feature in all_features if feature not in from_bins_features]
    
    X_tr_selec = X_tr[selected_features]
    X_te_selec = X_te[selected_features]
    
print("FEATRUE SELECTION")
print("Before ", X_tr.shape)
print("After ", X_tr_selec.shape, end='\n')

FEATRUE SELECTION
Before  (76310, 173)
After  (76310, 131)


In [10]:
upsampled_df_tr.columns = upsampled_df_tr.columns.str.replace(' ', '_')

In [18]:
############################  AutoML  ###########################
exp_name = f"exp_{exp_config}" 

# https://pycaret.readthedocs.io/en/latest/api/classification.html#pycaret.classification.setup
# encoding_method=category_encoders.target_encoder.TargetEncoder(smoothing=10)
clf = setup(data=upsampled_df_tr, target="target",
                       session_id=config.seed,
                       log_experiment=True, 
                       experiment_name=exp_name,
                       fold=10,
                       fold_shuffle=True,
                       normalize=True,
                       normalize_method="minmax",
                       remove_outliers=True,
                    #    transformation=True,
                       feature_selection=True,
                       # feature_selection_method="sequential",
                       # n_features_to_select=0.7
                       n_features_to_select=0.4,
                    #    low_variance_threshold = 0.1,
                       remove_multicollinearity=True,
                       )

set_config("seed", config.seed) 

# # sort="AUC" buget_time=0.5
# probability_threshold = 0.25
# include = ['lr', 'dt', 'lightgbm']
# plot_model(best, plot = 'auc')
# plot_model(best, plot = 'confusion_matrix')
best_model_top3 = compare_models(sort='F1', n_select=3)

Unnamed: 0,Description,Value
0,Session id,110
1,Target,target
2,Target type,Binary
3,Original data shape,"(74096, 174)"
4,Transformed data shape,"(71503, 35)"
5,Transformed train set shape,"(49274, 35)"
6,Transformed test set shape,"(22229, 35)"
7,Numeric features,173
8,Preprocess,True
9,Imputation type,simple


2024/08/26 23:46:28 INFO mlflow.tracking.fluent: Experiment with name 'exp_js_mms_tomek_smoten' does not exist. Creating a new experiment.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9568,0.9812,0.9217,0.9912,0.9552,0.9135,0.9158,9.552
catboost,CatBoost Classifier,0.9532,0.9756,0.9322,0.9731,0.9522,0.9065,0.9073,30.685
et,Extra Trees Classifier,0.9527,0.9801,0.9157,0.9889,0.9509,0.9055,0.908,10.302
xgboost,Extreme Gradient Boosting,0.9513,0.9749,0.9333,0.9682,0.9504,0.9027,0.9033,6.677
lightgbm,Light Gradient Boosting Machine,0.9364,0.9707,0.9193,0.9519,0.9353,0.8728,0.8733,6.774
dt,Decision Tree Classifier,0.9288,0.9288,0.9348,0.9237,0.9292,0.8576,0.8576,5.822
knn,K Neighbors Classifier,0.9235,0.9566,0.9294,0.9187,0.924,0.847,0.8471,13.606
gbc,Gradient Boosting Classifier,0.8617,0.933,0.8428,0.8759,0.859,0.7234,0.724,13.234
ada,Ada Boost Classifier,0.8028,0.8826,0.8096,0.7988,0.8042,0.6056,0.6058,7.076
lda,Linear Discriminant Analysis,0.7477,0.8237,0.7806,0.7325,0.7558,0.4954,0.4965,6.133




In [19]:
clf.pipeline

In [21]:
best_model_top3

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='sqrt',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        monotonic_cst=None, n_estimators=100, n_jobs=-1,
                        oob_score=False, random_state=110, verbose=0,
                        warm_start=False),
 <catboost.core.CatBoostClassifier at 0x21e0134b9a0>,
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='sqrt',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      monotonic_cst=None, n_e

In [33]:
# results = pull()
cat = create_model("catboost", fold=20, return_train_score = True)

# !pip install numpy==1.16.0
# blended_top3 = blend_models(estimator_list=best_model_top3, fold=10, method="soft", weights = [0.5,0.3,0.2])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9503,0.976,0.9283,0.971,0.9491,0.9005,0.9014
1,0.9528,0.975,0.931,0.9734,0.9517,0.9055,0.9064
2,0.951,0.9748,0.9287,0.9721,0.9499,0.9021,0.903
3,0.9481,0.9719,0.9248,0.9701,0.9469,0.8963,0.8973
4,0.9526,0.9754,0.9337,0.9704,0.9517,0.9051,0.9058
5,0.957,0.9782,0.9372,0.9759,0.9561,0.914,0.9147
6,0.9533,0.9707,0.9302,0.9753,0.9522,0.9067,0.9077
7,0.9587,0.9804,0.9433,0.9733,0.9581,0.9175,0.9179
8,0.9545,0.9751,0.9329,0.975,0.9535,0.909,0.9098
9,0.9539,0.9787,0.9321,0.9746,0.9529,0.9078,0.9087




In [36]:
# tuned_blended_soft_top3 = tune_model(blended_top3, optimize="F1")
# !pip install optuna-integration
tuned_cat = tune_model(cat, optimize="F1", search_library="optuna")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9107,0.9619,0.8835,0.9343,0.9082,0.8215,0.8227
1,0.9111,0.9632,0.8774,0.9409,0.908,0.8222,0.8241
2,0.9148,0.9626,0.8893,0.937,0.9125,0.8296,0.8306
3,0.9096,0.9597,0.8843,0.9314,0.9073,0.8192,0.8202
4,0.9138,0.9623,0.887,0.9373,0.9115,0.8276,0.8288
5,0.915,0.9669,0.8886,0.9381,0.9127,0.83,0.8311
6,0.921,0.9595,0.8975,0.9417,0.9191,0.8419,0.8428
7,0.9242,0.9709,0.9001,0.9457,0.9223,0.8484,0.8494
8,0.9159,0.9628,0.8916,0.9372,0.9138,0.8319,0.8328
9,0.9184,0.9675,0.8993,0.935,0.9168,0.8369,0.8375


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).




In [37]:
final_model = finalize_model(tuned_cat)
final_preds = predict_model(tuned_cat, data=X_te)



In [38]:
df_sub = pd.read_csv(os.path.join(config.data_path, "submission.csv"))
df_sub["target"] = final_preds["prediction_label"]
df_sub["target"] = df_sub["target"].map({0 : "Normal", 1 : "AbNormal"})

print('=============================')
print(df_sub["target"].value_counts())

curr_date = datetime.now().strftime("%m-%d_%H-%M-%S")

# pickle.dump(final_clf, open(f"{config.model}_{curr_date}.pkl", "wb"))
# final_clf = pickle.load(open(".pkl", "rb"))
df_sub.to_csv(os.path.join(config.data_path, f"submission_{curr_date}_{exp_config}.csv"), index=False)

Normal      16840
AbNormal      521
Name: target, dtype: int64


In [None]:
!mlflow ui

In [None]:
# pred_holdout = predict_model(blended, data=X_te)
# save_model(best, 'my_best_pipeline')
# loaded_model = load_model('my_best_pipeline')