In [1]:
############################# IMPORT LIBRARY  #################################
import os
import random
import re
from tqdm.notebook import tqdm
from collections import Counter
from datetime import datetime
import argparse
import pickle
import logging
import numpy as np
import pandas as pd

# https://contrib.scikit-learn.org/category_encoders/index.html
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, precision_score, recall_score
from gensim.models import Word2Vec

import optuna
from optuna.samplers import TPESampler, NSGAIISampler, NSGAIIISampler
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = 200

In [20]:
#######################   CONFIG  #######################
parser = argparse.ArgumentParser(description='Anomaly Detection')

parser.add_argument('--data_path', type=str, default='./Data')
parser.add_argument('--seed',type=int, default=881)

parser.add_argument('--model', type=str, default='cat')

parser.add_argument('-en', '--encoder', type=str, default='js')
parser.add_argument('-s', '--scaler', type=str, default='qt')

downsample_options = {1:"nearmiss", 2:"cluster", 3:"allknn", 4:"oneside", 5:"tomek"}
parser.add_argument('-ds', '--downsampling', type=int, default=4) # TOMEK

upsample_options = {1: "random", 2:"smote", 3:"adasyn", 4:"smotenc", 5:"smoten", 6:"borderline", 7:"kmeans", 8:"svm", 9: "ctgan", 10: "tvae"}
parser.add_argument('-us', '--upsampling', type=int, default=2) # SVMSMOTE

####### 추가 augmentation으로 CTGAN을 사용하는 경우 #############
augmentation_options = {1: "ctgan", 2: "tvae"}
parser.add_argument('-ag', '--augmentation', type=int, default=2)

parser.add_argument('--fs_mode', type=bool, default=True, help='feature selection T:auto F:manual')
parser.add_argument('--estimator', type=str, default='cat', help="using for feature selection")
parser.add_argument('--selector', type=str, default='kbest', help='auto feature selector')

parser.add_argument('--k', type=int, default=10, help='k fold split')
parser.add_argument('--check_all', type=bool, default=True)
parser.add_argument('--tune_mode', type=bool, default=True, help='optuna tuning')

config = parser.parse_args([])

exp_config = f"{config.encoder}_{config.scaler}_{downsample_options[config.downsampling]}_{upsample_options[config.upsampling]}"

random.seed(config.seed)
np.random.seed(config.seed)

In [21]:
config

Namespace(data_path='./Data', seed=881, model='cat', encoder='js', scaler='qt', downsampling=4, upsampling=2, augmentation=2, fs_mode=True, estimator='cat', selector='kbest', k=10, check_all=True, tune_mode=True)

In [22]:
#######################   LOAD DATA  #######################
df_tr = pd.read_csv(os.path.join(config.data_path, "train_v1.csv"))
df_te = pd.read_csv(os.path.join(config.data_path, "test_v1.csv"))
df_list = [df_tr, df_te]

# Workorder (test에 있는데, train에는 없는 경우가 있어 그냥 제외)
# 대신 Workorder Categeory 사용
for df in df_list:
    df.drop(["Workorder"], axis=1, inplace=True)

In [23]:
############################  FEATURE HANDLING  ###########################
## CATEGORICAL FEATURES
cat_features = ["Equipment_Dam",
                "Equipment_Fill1",
                "Equipment_Fill2",
                "Model.Suffix",
                "Workorder Category",
                "Chamber Temp. Judge Value_AutoClave"]

## BINNING FEATURES
bins_features = df_tr.columns[df_tr.columns.str.contains(r".*Bins.*")].tolist()
# Bins 열 만드는 데 사용된 열
from_bins_features = [re.sub(r'\s*Bins\s*', '', f).strip() for f in bins_features]

cat_features.extend(bins_features)

for df in df_list:
    df[cat_features] = df[cat_features].astype("category")

## NUMERICAL FEATURES
num_features = df_tr.select_dtypes(exclude=["category"]).columns.to_list()
num_features.remove("target")

## ALL FEATURES
all_features = num_features + cat_features

## TARGET ENCODING
df_tr["target"] = df_tr["target"].map({"Normal": 0, "AbNormal": 1})

## DATA SPLITTING 
X_tr, y_tr = df_tr.drop("target", axis=1), df_tr["target"]
X_te = df_te.drop("Set ID", axis=1)

In [24]:
#############################  FEATURE ENCODING/SCALING ###########################
## ENCODING
if config.encoder == "le":
    le = LabelEncoder()
    for cat_feature in cat_features:
        X_tr[cat_feature] = le.fit_transform(X_tr[cat_feature])
        X_te[cat_feature] = le.transform(X_te[cat_feature])
        
elif config.encoder == "js":
    js = ce.JamesSteinEncoder(cols=cat_features)
    
    X_tr = js.fit_transform(X_tr, y_tr)
    X_te = js.transform(X_te)
    
elif config.encoder == "woe":
    woe = ce.WOEEncoder(cols=cat_features)
    
    X_tr = woe.fit_transform(X_tr, y_tr)
    X_te = woe.transform(X_te)
    
elif config.encoder == "ohe":
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    
    X_tr[cat_features] = ohe.fit_transform(X_tr[cat_features])
    X_te[cat_features] = ohe.transform(X_te[cat_features])

In [25]:
## SCALING
if config.scaler == "mms":
    mms = MinMaxScaler()
    X_tr[num_features] = mms.fit_transform(X_tr[num_features])
    X_te[num_features] = mms.transform(X_te[num_features])
    
elif config.scaler == "ss":
    ss = StandardScaler()
    X_tr[num_features] = ss.fit_transform(X_tr[num_features])
    X_te[num_features] = ss.transform(X_te[num_features])
    
elif config.scaler == "qt":
    qt = QuantileTransformer(random_state=config.seed, output_distribution='normal', n_quantiles=min(100, len(X_tr) // 5)) # n_quantiles = 1000
    
    X_tr[num_features] = qt.fit_transform(X_tr[num_features])
    X_te[num_features] = qt.transform(X_te[num_features])

elif config.scaler == "pt":
    pts = PowerTransformer(method='yeo-johnson')
    
    X_tr[num_features] = pts.fit_transform(X_tr[num_features])
    X_te[num_features] = pts.transform(X_te[num_features])

In [26]:
from collections import Counter
from sklearn.utils import resample
from imblearn.under_sampling import (NearMiss,
                                     ClusterCentroids,
                                     AllKNN,
                                     OneSidedSelection,
                                     TomekLinks)
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN,
                                    SMOTENC,
                                    SMOTEN,
                                    BorderlineSMOTE,
                                    KMeansSMOTE,
                                    SVMSMOTE)
from imblearn.combine import SMOTEENN

# Import libraries for CTGAN and VAE
from ctgan import CTGAN, TVAE
from sklearn.neural_network import MLPRegressor

def random_downsample(df, random_seed, sample_ratio=1.0):
    df_normal = df[df["target"] == 0] 
    df_abnormal = df[df["target"] == 1]
    
    downsampled = resample(
        df_normal,
        replace=False,
        n_samples=int(len(df_abnormal) * sample_ratio),
        random_state=random_seed
    )
    
    downsampled_df = pd.concat([df_abnormal, downsampled])
    
    return downsampled_df

def downsample(X, y, method, random_seed):
    # NearMiss
    if method == "nearmiss":
        # sampling_strategy="auto"
        nm = NearMiss(sampling_strategy=0.4)
        X_downsampled, y_downsampled = nm.fit_resample(X, y)
    # ClusterCentroids
    elif method == "cluster":
        cc = ClusterCentroids(random_state=random_seed)
        X_downsampled, y_downsampled = cc.fit_resample(X, y)
    # AllKNN
    elif method == "allknn":
        allknn = AllKNN()
        X_downsampled, y_downsampled = allknn.fit_resample(X, y)
    # OneSidedSelection
    elif method == "oneside":
        oss = OneSidedSelection(random_state=random_seed)
        X_downsampled, y_downsampled = oss.fit_resample(X, y)
    # Tomeklinks
    elif method == "tomek":
        tl = TomekLinks()
        X_downsampled, y_downsampled = tl.fit_resample(X, y)
    
    X_downsampled_df= pd.DataFrame(X_downsampled, columns=X.columns)
    y_downsampled_df = pd.Series(y_downsampled, name="target") 
    downsampled_df = pd.concat([X_downsampled_df, y_downsampled_df], axis=1)
    
    print('DOWN SAMPLING')
    print('=============')
    print('Original dataset shape %s' % Counter(y))
    print('Resampled dataset shape %s' % Counter(y_downsampled), end='\n')
    
    return downsampled_df


def upsample(X, y, cat_idx, method, random_seed):
    
    if method == "random":
        ros = RandomOverSampler(random_state=random_seed)
        X_upsampled, y_upsampled = ros.fit_resample(X, y)
        
    # SMOTE
    elif method == "smote":
        smote = SMOTE(random_state=random_seed)
        X_upsampled, y_upsampled = smote.fit_resample(X, y)
        
    # ADASYN
    elif method == "adasyn":
        adasyn = ADASYN(random_state=random_seed)
        X_upsampled, y_upsampled = adasyn.fit_resample(X, y)
        
    # SMOTE-NC
    elif method == "smotenc":
        smotenc = SMOTENC(random_state=random_seed, sampling_strategy="auto", categorical_features=cat_idx)
        X_upsampled, y_upsampled = smotenc.fit_resample(X, y)
        
    elif method == "smoten":
        smoten = SMOTEN(random_state=random_seed, sampling_strategy="auto", k_neighbors=5)
        X_upsampled, y_upsampled = smoten.fit_resample(X, y)
        
    elif method == "borderline":
        borderline_smote = BorderlineSMOTE(random_state=random_seed)
        X_upsampled, y_upsampled = borderline_smote.fit_resample(X, y)
        
    elif method == "kmeans":
        kmeans_smote = KMeansSMOTE(random_state=random_seed, sampling_strategy="auto", k_neighbors=5)
        X_upsampled, y_upsampled = kmeans_smote.fit_resample(X, y)
        
    elif method == "svm":
        svm_smote = SVMSMOTE(random_state=42)
        X_upsampled, y_upsampled = svm_smote.fit_resample(X, y)
    
    elif method == "ctgan":
        X = pd.DataFrame(X)
        # Assuming cat_idx is the list of indices of categorical features
        discrete_columns = X.columns[cat_idx]  # Get the names of categorical columns

        # Fit the CTGAN model on the original data
        ctgan = CTGAN(epochs=100)
        ctgan.fit(X, discrete_columns=discrete_columns)

        # Generate synthetic samples
        synthetic_samples = ctgan.sample(X.shape[0])

        # Combine original and synthetic samples
        X_upsampled = pd.concat([X, synthetic_samples])
        y_upsampled = pd.concat([y, pd.Series([1] * synthetic_samples.shape[0], name="target")])
        
    elif method == "tvae":
            if isinstance(X, np.ndarray):
                X = pd.DataFrame(X)
            discrete_columns = X.columns[cat_idx]
            
            tvae = TVAE(epochs=100)
            tvae.fit(X, discrete_columns=discrete_columns)
            synthetic_samples = tvae.sample(X.shape[0])
            
            X_upsampled = pd.concat([X, synthetic_samples])
            y_upsampled = pd.concat([y, pd.Series([1] * synthetic_samples.shape[0], name="target")])
            

    upsampled_df = pd.concat([X_upsampled, y_upsampled], axis=1)
        
    print('UP SAMPLNG')
    print('==========')
    print('Original dataset shape %s' % Counter(y))
    print('Resampled dataset shape %s' % Counter(y_upsampled), end='\n')
    
    return upsampled_df

In [27]:
#################################  DOWN SAMPLING  ###############################
downsampled_df_tr = downsample(X_tr, y_tr, method=downsample_options[config.downsampling], random_seed=config.seed)

#################################  UP SAMPLING  ###############################
cat_idx = [downsampled_df_tr.columns.get_loc(col) for col in cat_features]
cat_idx = [X_tr.columns.get_loc(col) for col in cat_features]
X_tr = downsampled_df_tr.drop("target", axis=1)
y_tr = downsampled_df_tr["target"]

upsampled_df_tr = upsample(X_tr, y_tr, cat_idx=cat_idx, method=upsample_options[config.upsampling], random_seed=config.seed)

DOWN SAMPLING
Original dataset shape Counter({0: 36197, 1: 2210})
Resampled dataset shape Counter({0: 35239, 1: 2210})
UP SAMPLNG
Original dataset shape Counter({0: 35239, 1: 2210})
Resampled dataset shape Counter({0: 35239, 1: 35239})


In [28]:
## RESAMPLED DATA
# X_tr = downsampled_df_tr.drop("target", axis=1)
# y_tr = downsampled_df_tr["target"]

X_tr = upsampled_df_tr.drop("target", axis=1)
y_tr = upsampled_df_tr["target"]

In [29]:
X_tr.to_csv(os.path.join(config.data_path, f"train_smote.csv"), index=False)
y_tr.to_csv(os.path.join(config.data_path, f"test_smote.csv"), index=False)

In [30]:
def augmentation(X, y, method, random_seed):
    print('AUGMENTATION')
    print('==========')
    print('Original dataset shape %s' % Counter(y))
    
    if method == "ctgan":
        X = pd.DataFrame(X)
        # Assuming cat_idx is the list of indices of categorical features
        discrete_columns = X.columns[cat_idx]  # Get the names of categorical columns

        # Fit the CTGAN model on the original data
        ctgan = CTGAN(epochs=100)
        ctgan.fit(X, discrete_columns=discrete_columns)

        # Generate synthetic samples
        synthetic_samples = ctgan.sample(X.shape[0])

        # Combine original and synthetic samples
        X_upsampled = pd.concat([X, synthetic_samples])
        y_upsampled = pd.concat([y, pd.Series([1] * synthetic_samples.shape[0], name="target")])
        
    elif method == "tvae":
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        discrete_columns = X.columns[cat_idx]
        
        tvae = TVAE(epochs=100)
        tvae.fit(X, discrete_columns=discrete_columns)
        synthetic_samples = tvae.sample(X.shape[0])
        
        X_upsampled = pd.concat([X, synthetic_samples])
        y_upsampled = pd.concat([y, pd.Series([1] * synthetic_samples.shape[0], name="target")])
        
    upsampled_df = pd.concat([X_upsampled, y_upsampled], axis=1)
        
    
    print('Resampled dataset shape %s' % Counter(y_upsampled), end='\n')
    
            
    return augment_df

In [31]:
augmentation_df = augmentation(X_tr, y_tr, method=augmentation_options[config.augmentation], random_seed=config.seed)

In [None]:
X_tr = augmentation_df.drop("target", axis=1)
y_tr = augmentation_df["target"]

In [16]:
################ MODEL ############### 
classifiers = {
    "cat": CatBoostClassifier(random_state=config.seed, auto_class_weights="SqrtBalanced"),
    "lgbm": LGBMClassifier(random_state=config.seed,),
    "xgb": XGBClassifier(random_state=config.seed, eval_metric='auc', objective="binary:logistic"),
    "ada": AdaBoostClassifier(random_state=config.seed),
    "rfc": RandomForestClassifier(random_state=config.seed, class_weight='balanced'),
    "lr": LogisticRegression(random_state=config.seed),
    "extra": ExtraTreesClassifier(random_state=config.seed)
}

In [17]:
###############################  FEATURE SELECTION  ############################
if config.fs_mode:
    estimator = classifiers[config.estimator]
    estimator.fit(X_tr, y_tr)
    
    selectors = {
        'rfe': RFE(estimator=estimator, n_features_to_select=50),
        'sfm': SelectFromModel(estimator=estimator, threshold="mean"),
        'kbest': SelectKBest(score_func=f_classif, k=10),
    }
    
    selector = selectors[config.selector]
    
    # Fit the selector on the training data
    selector.fit(X_tr, y_tr)
    
    # Get the support mask of selected features
    support_mask = selector.get_support()
    
    # Retain selected features from the original DataFrame and preserve feature names
    X_tr_selec = X_tr.loc[:, support_mask]
    X_te_selec = X_te.loc[:, support_mask]
    
else:
    # 기존 열 대신 Bins 열 사용
    selected_features = [feature for feature in all_features if feature not in from_bins_features]
    
    X_tr_selec = X_tr[selected_features]
    X_te_selec = X_te[selected_features]
    
print("FEATRUE SELECTION")
print("Before ", X_tr.shape)
print("After ", X_tr_selec.shape, end='\n')

Learning rate set to 0.065063
0:	learn: 0.5559421	total: 75.5ms	remaining: 1m 15s
1:	learn: 0.4482026	total: 113ms	remaining: 56.6s
2:	learn: 0.3740895	total: 142ms	remaining: 47s
3:	learn: 0.3181010	total: 172ms	remaining: 42.8s
4:	learn: 0.2716881	total: 196ms	remaining: 39.1s
5:	learn: 0.2361149	total: 219ms	remaining: 36.2s
6:	learn: 0.2136012	total: 241ms	remaining: 34.2s
7:	learn: 0.1925237	total: 260ms	remaining: 32.2s
8:	learn: 0.1768716	total: 281ms	remaining: 31s
9:	learn: 0.1642499	total: 303ms	remaining: 30s
10:	learn: 0.1545142	total: 321ms	remaining: 28.9s
11:	learn: 0.1473661	total: 339ms	remaining: 27.9s
12:	learn: 0.1408403	total: 356ms	remaining: 27s
13:	learn: 0.1355074	total: 372ms	remaining: 26.2s
14:	learn: 0.1312999	total: 390ms	remaining: 25.6s
15:	learn: 0.1277541	total: 410ms	remaining: 25.2s
16:	learn: 0.1248652	total: 432ms	remaining: 25s
17:	learn: 0.1222227	total: 451ms	remaining: 24.6s
18:	learn: 0.1201507	total: 469ms	remaining: 24.2s
19:	learn: 0.118445

In [18]:
X_tr_selec.columns

Index(['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
       'PalletID Collect Result_Dam', 'Production Qty Collect Result_Dam',
       '1st Pressure Collect Result_AutoClave',
       'PalletID Collect Result_Fill1', 'Production Qty Collect Result_Fill1',
       'Production Qty Collect Result_Fill2', 'Receip No Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result Bins_Dam'],
      dtype='object')

In [None]:
# ###############################  EVALUATION  ############################
# stk = StratifiedKFold(n_splits=10, random_state=config.seed, shuffle=True)
# rstk = RepeatedStratifiedKFold(n_splits=10, random_state=config.seed)

# if config.check_all:
#     classifiers_lst = list(classifiers.values())
#     score_df = pd.DataFrame(columns=classifiers.keys())
    
#     for clf_name, clf in classifiers.items():
#         scores = cross_val_score(clf, X_tr_selec, y_tr, scoring="f1", cv=stk)
#         print(scores)
#         score_df.loc[0, clf_name] = scores.mean()
#         print(score_df)
    
    
# else:
#     metrics = ['accuracy', 'precision', 'recall', 'f1', 'f1_weighted', 'roc_auc']
#     score_df = pd.DataFrame(columns=metrics)
    
#     for metric in metrics:
#         scores = cross_val_score(classifiers["cat"], X_tr_selec, y_tr, scoring=metric, cv=stk)
#         score_df[metric] = scores.mean()
    
# print("MODEL CHECK")
# print(score_df, end='\n')

In [19]:
def catboost_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.0001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.5, 1.0),
        'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        'od_wait': trial.suggest_int("od_wait", 10, 50),
    }

    cat_clf = CatBoostClassifier(**params, random_state=config.seed, auto_class_weights="Balanced",) # eval_metric="TotalF1"
    
    stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
    f1_scores = np.empty(config.k)
    
    for idx, (tr_idx, val_idx) in enumerate(stk.split(X_tr_selec, y_tr)):
        X_tr_fold, X_val_fold = X_tr.iloc[tr_idx], X_tr.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]
        
        cat_clf.fit(X_tr_fold, y_tr_fold, eval_set=[(X_val_fold, y_val_fold)], early_stopping_rounds=50, verbose=False)
        y_pred_fold = cat_clf.predict(X_val_fold)
        f1_scores[idx] = f1_score(y_val_fold, y_pred_fold)

    return np.mean(f1_scores) 

In [None]:
def lgbm_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
    }
    
    lgbm_clf = LGBMClassifier(**params, random_state=config.seed)
    
    stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
    f1_scores = np.empty(config.k)
    
    for idx, (tr_idx, val_idx) in enumerate(stk.split(X_tr_selec, y_tr)):
        X_tr_fold, X_val_fold = X_tr_selec.iloc[tr_idx], X_tr_selec.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]
        
        lgbm_clf.fit(X_tr_fold, y_tr_fold)
        y_pred_fold = lgbm_clf.predict(X_val_fold)
        f1_scores[idx] = f1_score(y_val_fold, y_pred_fold)

    return np.mean(f1_scores)

In [None]:
def xgboost_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.0001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),  # colsample_bylevel -> colsample_bytree
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # XGBoost의 중요한 파라미터 중 하나
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),  # 트리의 분할을 조정하는 파라미터
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),  # L1 정규화 항
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),  # L2 정규화 항
    }

    xgb_clf = XGBClassifier(**params, random_state=config.seed, use_label_encoder=False,
                            eval_metric='auc',
                            early_stopping_rounds=50, 
                            objective = "binary:logistic",)
    
    stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
    f1_scores = np.empty(config.k)
    
    for idx, (tr_idx, val_idx) in enumerate(stk.split(X_tr_selec, y_tr)):
        X_tr_fold, X_val_fold = X_tr.iloc[tr_idx], X_tr.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]
        
        xgb_clf.fit(X_tr_fold, y_tr_fold, eval_set=[(X_val_fold, y_val_fold)], early_stopping_rounds=50, verbose=False)
        y_pred_fold = xgb_clf.predict(X_val_fold)
        f1_scores[idx] = f1_score(y_val_fold, y_pred_fold)

    return np.mean(f1_scores)

In [20]:
# logger = logging.getLogger()
    
# logger.setLevel(logging.INFO)
# logger.addHandler(logging.FileHandler(f"./log/{config.model}_optuna.log", mode="w"))
    
# optuna.logging.enable_propagation() 
# optuna.logging.disable_default_handler()
        
sampler = NSGAIISampler(seed=config.seed)
pruner = optuna.pruners.HyperbandPruner()

if config.tune_mode and config.model == "cat":
    
    cat_study = optuna.create_study(study_name="cat", direction='maximize', sampler=sampler, pruner=pruner)
    cat_study.optimize(catboost_objective, n_trials=10)
    
    cat_best_params = cat_study.best_params
    cat_best_score = cat_study.best_value
    
    print("CatBoost Best Hyperparams: ", cat_best_params)
    print("CatBoost Best F1 Score: ", cat_best_score, end='\n')
    
    final_clf = CatBoostClassifier(**cat_best_params, random_state=config.seed, auto_class_weights="Balanced",)
    
elif config.tune_mode and config.model == "lgbm":
    
    lgbm_study = optuna.create_study(study_name="lgbm", direction='maximize', sampler=sampler, pruner=pruner)
    lgbm_study.optimize(lgbm_objective, n_trials=15)
    
    lgbm_best_params = lgbm_study.best_params
    lgbm_best_score= lgbm_study.best_value
    
    print("LGBM Best Hyperparams: ",lgbm_best_params)
    print("LGBM Best F1 Score: ", lgbm_best_score, end='\n')
    
    final_clf = LGBMClassifier(**lgbm_best_params, random_state=config.seed,)

elif config.tune_mode and config.model == "xgb":
    
    xgb_study = optuna.create_study(study_name="xgb", direction='maximize', sampler=sampler, pruner=pruner)
    xgb_study.optimize(xgboost_objective, n_trials=15)
    
    xgb_best_params = xgb_study.best_params
    xgb_best_score = xgb_study.best_value
    print("XGBoost Best Hyperparams: ", xgb_best_params)
    print("XGBoost Best F1 Score: ", xgb_best_score)
    
else:
    final_clf = classifiers[config.model]

# with open(f"./log/{config.model}_optuna.log") as f:
#     assert f.readline().startswith("A new study created")
#     assert f.readline() == "Start optimization.\n"

[I 2024-08-26 11:05:24,973] A new study created in memory with name: cat
[I 2024-08-26 11:09:59,724] Trial 0 finished with value: 0.97235294558823 and parameters: {'n_estimators': 482, 'learning_rate': 0.007870322063188457, 'max_depth': 8, 'subsample': 0.7012836414644832, 'colsample_bylevel': 0.6720489300307877, 'od_type': 'Iter', 'od_wait': 14}. Best is trial 0 with value: 0.97235294558823.
[I 2024-08-26 11:10:38,885] Trial 1 finished with value: 0.9721665179225463 and parameters: {'n_estimators': 170, 'learning_rate': 0.015314909484200338, 'max_depth': 5, 'subsample': 0.5590724561439451, 'colsample_bylevel': 0.6158019907405197, 'od_type': 'IncToDec', 'od_wait': 43}. Best is trial 0 with value: 0.97235294558823.
[I 2024-08-26 11:12:50,441] Trial 2 finished with value: 0.9699354354751349 and parameters: {'n_estimators': 273, 'learning_rate': 0.00011578523132411305, 'max_depth': 7, 'subsample': 0.7665711157674153, 'colsample_bylevel': 0.8443308944666477, 'od_type': 'Iter', 'od_wait': 47

CatBoost Best Hyperparams:  {'n_estimators': 346, 'learning_rate': 0.05346913391764926, 'max_depth': 6, 'subsample': 0.6796526428168673, 'colsample_bylevel': 0.5344536110296659, 'od_type': 'IncToDec', 'od_wait': 41}
CatBoost Best F1 Score:  0.9726997880042536


In [22]:
X_tr_selec.columns

Index(['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
       'PalletID Collect Result_Dam', 'Production Qty Collect Result_Dam',
       '1st Pressure Collect Result_AutoClave',
       'PalletID Collect Result_Fill1', 'Production Qty Collect Result_Fill1',
       'Production Qty Collect Result_Fill2', 'Receip No Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result Bins_Dam'],
      dtype='object')

In [None]:
#cat_study.visualize()
#cat_study.trials_dataframe()

In [21]:
################################################################
#####################     SUBMISSION   #########################
################################################################
final_clf.fit(X_tr_selec, y_tr, ) # use_best_model=True
final_preds = final_clf.predict(X_te_selec)

0:	learn: 0.6084276	total: 12.3ms	remaining: 4.25s
1:	learn: 0.5578253	total: 20.6ms	remaining: 3.54s
2:	learn: 0.5161744	total: 28.4ms	remaining: 3.25s
3:	learn: 0.4773283	total: 37ms	remaining: 3.17s
4:	learn: 0.4329543	total: 45.3ms	remaining: 3.09s
5:	learn: 0.4033543	total: 53.4ms	remaining: 3.02s
6:	learn: 0.3822908	total: 61.8ms	remaining: 2.99s
7:	learn: 0.3576703	total: 69.6ms	remaining: 2.94s
8:	learn: 0.3392314	total: 77.8ms	remaining: 2.91s
9:	learn: 0.3245509	total: 86.1ms	remaining: 2.89s
10:	learn: 0.3060509	total: 94.5ms	remaining: 2.88s
11:	learn: 0.2938498	total: 103ms	remaining: 2.86s
12:	learn: 0.2796937	total: 111ms	remaining: 2.85s
13:	learn: 0.2723225	total: 119ms	remaining: 2.82s
14:	learn: 0.2647635	total: 127ms	remaining: 2.8s
15:	learn: 0.2553688	total: 135ms	remaining: 2.79s
16:	learn: 0.2486119	total: 144ms	remaining: 2.79s
17:	learn: 0.2400356	total: 152ms	remaining: 2.78s
18:	learn: 0.2333730	total: 162ms	remaining: 2.78s
19:	learn: 0.2264644	total: 170ms

FileNotFoundError: [Errno 2] No such file or directory: './Data/submission.csv'

In [25]:
df_sub = pd.read_csv("/home/work/Aimers/data/submission.csv")
df_sub["target"] = final_preds
df_sub["target"] = df_sub["target"].map({0 : "Normal", 1 : "AbNormal"})

print('=============================')
print(df_sub["target"].value_counts())

target
Normal      17320
AbNormal       41
Name: count, dtype: int64


In [None]:
curr_date = datetime.now().strftime("%m-%d_%H-%M-%S")

# pickle.dump(final_clf, open(f"{config.model}_{curr_date}.pkl", "wb"))
# final_clf = pickle.load(open(".pkl", "rb"))
df_sub.to_csv(os.path.join(config.data_path, f"submission_{curr_date}_{exp_config}.csv"), index=False)