In [58]:
############################# IMPORT LIBRARY  #################################
import os
import random
import re
from tqdm.notebook import tqdm
from collections import Counter
from datetime import datetime
import argparse
import pickle
import logging
import numpy as np
import pandas as pd

# https://contrib.scikit-learn.org/category_encoders/index.html
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, precision_score, recall_score

import optuna
from optuna.samplers import TPESampler, NSGAIISampler, NSGAIIISampler
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = 200

In [59]:
#######################   CONFIG  #######################
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` would call ``bool("False")``, which is ``True`` for every
    non-empty string, so boolean options need an explicit parser.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"true", "t", "yes", "y", "1"}:
        return True
    if token in {"false", "f", "no", "n", "0"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


parser = argparse.ArgumentParser(description='Anomaly Detection')

parser.add_argument('--data_path', type=str, default='./data')
parser.add_argument('--seed', type=int, default=110)

parser.add_argument('--model', type=str, default='cat')

parser.add_argument('-en', '--encoder', type=str, default='js')
parser.add_argument('-s', '--scaler', type=str, default='qt')

# Integer ids are mapped to sampler names; `choices` rejects ids that have no mapping.
downsample_options = {1:"nearmiss", 2:"cluster", 3:"allknn", 4:"oneside", 5:"tomek"}
parser.add_argument('-ds', '--downsampling', type=int, default=4,
                    choices=sorted(downsample_options))  # 4 -> "oneside" (OneSidedSelection)

upsample_options = {1: "random", 2:"smote", 3:"adasyn", 4:"smotenc", 5:"smoten", 6:"borderline", 7:"kmeans", 8:"svm"}
parser.add_argument('-us', '--upsampling', type=int, default=5,
                    choices=sorted(upsample_options))  # 5 -> "smoten" (SMOTEN)

parser.add_argument('--fs_mode', type=_str2bool, default=True, help='feature selection T:auto F:manual')
parser.add_argument('--estimator', type=str, default='rfc', help="using for feature selection")
parser.add_argument('--selector', type=str, default='kbest', help='auto feature selector')

parser.add_argument('--k', type=int, default=10, help='k fold split')
parser.add_argument('--check_all', type=_str2bool, default=True)
parser.add_argument('--tune_mode', type=_str2bool, default=True, help='optuna tuning')

# An empty argv is parsed on purpose: inside a notebook only the defaults above apply.
config = parser.parse_args([])

# Short tag describing the preprocessing combination of this run.
exp_config = f"{config.encoder}_{config.scaler}_{downsample_options[config.downsampling]}_{upsample_options[config.upsampling]}"

# Seed the stdlib and NumPy global generators.
random.seed(config.seed)
np.random.seed(config.seed)

In [60]:
# Echo the parsed configuration so the settings used for this run are recorded in the cell output.
config

Namespace(data_path='./data', seed=110, model='cat', encoder='js', scaler='qt', downsampling=4, upsampling=5, fs_mode=True, estimator='rfc', selector='kbest', k=10, check_all=True, tune_mode=True)

In [61]:
#######################   LOAD DATA  #######################
train_csv = os.path.join(config.data_path, "train_v1.csv")
test_csv = os.path.join(config.data_path, "test_v1.csv")

# "Workorder" is dropped from both frames: some values occur in test but not in train.
# The "Workorder Category" column is used in its place.
df_tr = pd.read_csv(train_csv).drop(columns=["Workorder"])
df_te = pd.read_csv(test_csv).drop(columns=["Workorder"])
df_list = [df_tr, df_te]

In [62]:
############################  FEATURE HANDLING  ###########################
## BINNING FEATURES
# Columns whose name contains "Bins" ...
is_bins_column = df_tr.columns.str.contains(r".*Bins.*")
bins_features = df_tr.columns[is_bins_column].tolist()
# ... and the names of the columns those "Bins" columns were built from.
from_bins_features = [re.sub(r'\s*Bins\s*', '', name).strip() for name in bins_features]

## CATEGORICAL FEATURES
# Hand-picked categorical columns followed by every "Bins" column.
cat_features = [
    "Equipment_Dam",
    "Equipment_Fill1",
    "Equipment_Fill2",
    "Model.Suffix",
    "Workorder Category",
    "Chamber Temp. Judge Value_AutoClave",
] + bins_features

for frame in df_list:
    frame[cat_features] = frame[cat_features].astype("category")

## NUMERICAL FEATURES
# Everything in the training frame that is not categorical, minus the label column.
non_categorical = df_tr.select_dtypes(exclude=["category"]).columns
num_features = [col for col in non_categorical if col != "target"]

## ALL FEATURES
all_features = [*num_features, *cat_features]

## TARGET ENCODING
label_to_int = {"Normal": 0, "AbNormal": 1}
df_tr["target"] = df_tr["target"].map(label_to_int)

## DATA SPLITTING
y_tr = df_tr["target"]
X_tr = df_tr.drop(columns="target")
X_te = df_te.drop(columns="Set ID")

In [63]:
#############################  FEATURE ENCODING/SCALING ###########################
## ENCODING
# Every encoder is fitted on the training frame only and then applied to the test frame.
# An encoder name that matches none of the branches leaves both frames unchanged.
if config.encoder == "le":
    le = LabelEncoder()
    for cat_feature in cat_features:
        # `fit_transform` refits the same encoder for each column before the test column is mapped.
        X_tr[cat_feature] = le.fit_transform(X_tr[cat_feature])
        X_te[cat_feature] = le.transform(X_te[cat_feature])

elif config.encoder == "js":
    # Target-based encoder: needs the labels when fitting.
    js = ce.JamesSteinEncoder(cols=cat_features)

    X_tr = js.fit_transform(X_tr, y_tr)
    X_te = js.transform(X_te)

elif config.encoder == "woe":
    # Target-based encoder: needs the labels when fitting.
    woe = ce.WOEEncoder(cols=cat_features)

    X_tr = woe.fit_transform(X_tr, y_tr)
    X_te = woe.transform(X_te)

elif config.encoder == "ohe":
    # The dense-output flag was renamed from `sparse` to `sparse_output`;
    # try the new spelling first and fall back for older scikit-learn releases.
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

    ohe.fit(X_tr[cat_features])
    ohe_columns = ohe.get_feature_names_out(cat_features)

    # One-hot encoding yields one column per category, so the result cannot be
    # assigned back into the original `cat_features` columns (the widths differ).
    # The raw categorical columns are replaced by the indicator columns instead.
    X_tr = pd.concat(
        [
            X_tr.drop(columns=cat_features),
            pd.DataFrame(ohe.transform(X_tr[cat_features]), columns=ohe_columns, index=X_tr.index),
        ],
        axis=1,
    )
    X_te = pd.concat(
        [
            X_te.drop(columns=cat_features),
            pd.DataFrame(ohe.transform(X_te[cat_features]), columns=ohe_columns, index=X_te.index),
        ],
        axis=1,
    )
    # NOTE(review): after this branch the `cat_features` names no longer exist in
    # X_tr / X_te; any later code that looks columns up by those names must be adapted.

In [64]:
## SCALING
# Build the scaler requested in the config; an unrecognised name leaves the
# numeric columns untouched.
scaler = None

if config.scaler == "mms":
    scaler = mms = MinMaxScaler()

elif config.scaler == "ss":
    scaler = ss = StandardScaler()

elif config.scaler == "qt":
    # Quantile count is capped at 100 and at a fifth of the training rows
    # (scikit-learn's own default would be 1000).
    scaler = qt = QuantileTransformer(
        random_state=config.seed,
        output_distribution='normal',
        n_quantiles=min(100, len(X_tr) // 5),
    )

elif config.scaler == "pt":
    scaler = pts = PowerTransformer(method='yeo-johnson')

if scaler is not None:
    # Fit on the training numeric columns only, then reuse the fitted scaler on test.
    X_tr[num_features] = scaler.fit_transform(X_tr[num_features])
    X_te[num_features] = scaler.transform(X_te[num_features])

In [65]:
from collections import Counter
from sklearn.utils import resample
from imblearn.under_sampling import (NearMiss,
                                     ClusterCentroids,
                                     AllKNN,
                                     OneSidedSelection,
                                     TomekLinks)
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN,
                                    SMOTENC,
                                    SMOTEN,
                                    BorderlineSMOTE,
                                    KMeansSMOTE,
                                    SVMSMOTE)
from imblearn.combine import SMOTEENN
# https://imbalanced-learn.org/stable/references/index.html

def random_downsample(df, random_seed, sample_ratio=1.0):
    """Randomly under-sample the rows labelled 0 relative to the rows labelled 1.

    Args:
        df: frame with a "target" column holding 0 / 1 labels.
        random_seed: seed passed to ``resample`` as ``random_state``.
        sample_ratio: number of label-0 rows to keep, as a multiple of the
            number of label-1 rows.

    Returns:
        A frame with all label-1 rows first, followed by the sampled label-0 rows.
    """
    majority = df[df["target"] == 0]
    minority = df[df["target"] == 1]

    keep_count = int(len(minority) * sample_ratio)

    # Sampling without replacement: no label-0 row is kept twice.
    majority_kept = resample(
        majority,
        replace=False,
        n_samples=keep_count,
        random_state=random_seed,
    )

    return pd.concat([minority, majority_kept])

def downsample(X, y, method, random_seed):
    """Under-sample ``(X, y)`` with the selected imbalanced-learn sampler.

    Args:
        X: feature DataFrame (its column names are reused for the result).
        y: labels aligned with ``X``.
        method: one of "nearmiss", "cluster", "allknn", "oneside", "tomek".
        random_seed: seed for the samplers that accept ``random_state``
            (ClusterCentroids and OneSidedSelection).

    Returns:
        A DataFrame with the resampled features plus a "target" column.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # Pick the sampler first so that an unsupported name fails with a clear
    # message instead of an unbound-variable error further down.
    if method == "nearmiss":
        # sampling_strategy="auto" is the alternative; 0.4 is the ratio used here.
        sampler = NearMiss(sampling_strategy=0.4)
    elif method == "cluster":
        sampler = ClusterCentroids(random_state=random_seed)
    elif method == "allknn":
        sampler = AllKNN()
    elif method == "oneside":
        sampler = OneSidedSelection(random_state=random_seed)
    elif method == "tomek":
        sampler = TomekLinks()
    else:
        raise ValueError(
            f"Unknown down-sampling method {method!r}; "
            "expected one of: nearmiss, cluster, allknn, oneside, tomek"
        )

    X_downsampled, y_downsampled = sampler.fit_resample(X, y)

    X_downsampled_df = pd.DataFrame(X_downsampled, columns=X.columns)
    y_downsampled_df = pd.Series(y_downsampled, name="target")
    downsampled_df = pd.concat([X_downsampled_df, y_downsampled_df], axis=1)

    # Report the class counts before and after resampling.
    print('DOWN SAMPLING')
    print('=============')
    print('Original dataset shape %s' % Counter(y))
    print('Resampled dataset shape %s' % Counter(y_downsampled), end='\n')

    return downsampled_df


def upsample(X, y, cat_idx, method, random_seed):
    """Over-sample ``(X, y)`` with the selected imbalanced-learn sampler.

    Args:
        X: feature DataFrame (its column names are reused for the result).
        y: labels aligned with ``X``.
        cat_idx: positional indices of the categorical columns; only the
            "smotenc" method reads it.
        method: one of "random", "smote", "adasyn", "smotenc", "smoten",
            "borderline", "kmeans", "svm".
        random_seed: ``random_state`` for every sampler.

    Returns:
        A DataFrame with the resampled features plus a "target" column.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # Pick the sampler first so that an unsupported name fails with a clear
    # message instead of an unbound-variable error further down.
    if method == "random":
        sampler = RandomOverSampler(random_state=random_seed)
    elif method == "smote":
        sampler = SMOTE(random_state=random_seed)
    elif method == "adasyn":
        sampler = ADASYN(random_state=random_seed)
    elif method == "smotenc":
        sampler = SMOTENC(random_state=random_seed, sampling_strategy="auto", categorical_features=cat_idx)
    elif method == "smoten":
        sampler = SMOTEN(random_state=random_seed, sampling_strategy="auto", k_neighbors=5)
    elif method == "borderline":
        sampler = BorderlineSMOTE(random_state=random_seed)
    elif method == "kmeans":
        sampler = KMeansSMOTE(random_state=random_seed, sampling_strategy="auto", k_neighbors=5)
    elif method == "svm":
        # Previously hard-coded to 42, which silently ignored the caller's seed.
        sampler = SVMSMOTE(random_state=random_seed)
    else:
        raise ValueError(
            f"Unknown up-sampling method {method!r}; expected one of: "
            "random, smote, adasyn, smotenc, smoten, borderline, kmeans, svm"
        )

    X_upsampled, y_upsampled = sampler.fit_resample(X, y)

    X_upsampled_df = pd.DataFrame(X_upsampled, columns=X.columns)
    y_upsampled_df = pd.Series(y_upsampled, name="target")
    upsampled_df = pd.concat([X_upsampled_df, y_upsampled_df], axis=1)

    # Report the class counts before and after resampling.
    print('UP SAMPLNG')
    print('==========')
    print('Original dataset shape %s' % Counter(y))
    print('Resampled dataset shape %s' % Counter(y_upsampled), end='\n')

    return upsampled_df

In [66]:
#################################  DOWN SAMPLING  ###############################
downsampled_df_tr = downsample(X_tr, y_tr, method=downsample_options[config.downsampling], random_seed=config.seed)

#################################  UP SAMPLING  ###############################
# NOTE(review): X_tr / y_tr are rebound here, so re-running this cell without
# re-running the preprocessing cells resamples data that was already resampled.
X_tr = downsampled_df_tr.drop("target", axis=1)
y_tr = downsampled_df_tr["target"]

# Positional indices of the categorical columns; `upsample` only uses them for "smotenc".
# (This was computed twice before, and the first result was immediately overwritten.)
cat_idx = [X_tr.columns.get_loc(col) for col in cat_features]

upsampled_df_tr = upsample(X_tr, y_tr, cat_idx=cat_idx, method=upsample_options[config.upsampling], random_seed=config.seed)

DOWN SAMPLING
Original dataset shape Counter({0: 36197, 1: 2210})
Resampled dataset shape Counter({0: 35225, 1: 2210})
UP SAMPLNG
Original dataset shape Counter({0: 35225, 1: 2210})
Resampled dataset shape Counter({0: 35225, 1: 35225})


In [67]:
## RESAMPLED DATA
# Training data from here on is the up-sampled frame. To train on the
# down-sampled data only, build X_tr / y_tr from `downsampled_df_tr` instead.
y_tr = upsampled_df_tr["target"]
X_tr = upsampled_df_tr.drop(columns="target")

In [68]:
################ MODEL ###############
# Candidate models keyed by the short names accepted by `config.model` and
# `config.estimator`; every model is seeded with `config.seed`.
classifiers = dict(
    cat=CatBoostClassifier(random_state=config.seed, auto_class_weights="Balanced"),
    lgbm=LGBMClassifier(random_state=config.seed),
    xgb=XGBClassifier(random_state=config.seed, eval_metric='auc', objective="binary:logistic"),
    ada=AdaBoostClassifier(random_state=config.seed),
    rfc=RandomForestClassifier(random_state=config.seed, class_weight='balanced'),
    lr=LogisticRegression(random_state=config.seed),
    extra=ExtraTreesClassifier(random_state=config.seed),
)

In [70]:
###############################  FEATURE SELECTION  ############################
if config.fs_mode:
    # Automatic selection. The estimator is handed over unfitted: RFE and
    # SelectFromModel fit their own clone inside `selector.fit`, and SelectKBest
    # does not use it, so a separate `estimator.fit` here only cost time.
    estimator = classifiers[config.estimator]

    selectors = {
        'rfe': RFE(estimator=estimator, n_features_to_select=50),
        'sfm': SelectFromModel(estimator=estimator, threshold="mean"),
        'kbest': SelectKBest(score_func=f_classif, k=20),
    }

    selector = selectors[config.selector]

    # Fit the selector on the training data
    selector.fit(X_tr, y_tr)

    # Boolean mask over X_tr's columns, and the names of the columns it keeps
    support_mask = selector.get_support()
    selected_features = X_tr.columns[support_mask].tolist()

    # Keep the selected columns. The test frame is filtered by column name so
    # the choice stays correct even if its column order differs from X_tr's.
    X_tr_selec = X_tr.loc[:, support_mask]
    X_te_selec = X_te[selected_features]

else:
    # Manual selection: use the "Bins" columns in place of the columns they were built from.
    selected_features = [feature for feature in all_features if feature not in from_bins_features]

    X_tr_selec = X_tr[selected_features]
    X_te_selec = X_te[selected_features]

print("FEATRUE SELECTION")
print("Before ", X_tr.shape)
print("After ", X_tr_selec.shape, end='\n')

FEATRUE SELECTION
Before  (70450, 173)
After  (70450, 20)


In [47]:
###############################  EVALUATION  ############################
# NOTE(review): X_tr_selec / y_tr were up-sampled and feature-selected on all
# rows before these folds are drawn, so validation folds contain resampled
# rows and the scores below are likely optimistic.
# The fold count follows `config.k` (default 10), as in the tuning objectives.
stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
rstk = RepeatedStratifiedKFold(n_splits=config.k, random_state=config.seed)

if config.check_all:
    # Compare every candidate model on cross-validated F1.
    classifiers_lst = list(classifiers.values())
    score_df = pd.DataFrame(columns=classifiers.keys())

    for clf_name, clf in classifiers.items():
        scores = cross_val_score(clf, X_tr_selec, y_tr, scoring="f1", cv=stk)
        print(scores)
        score_df.loc[0, clf_name] = scores.mean()

else:
    # Score only the configured model, on several metrics.
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'f1_weighted', 'roc_auc']
    score_df = pd.DataFrame(columns=metrics)

    for metric in metrics:
        scores = cross_val_score(classifiers[config.model], X_tr_selec, y_tr, scoring=metric, cv=stk)
        # `.loc[0, ...]` creates the row; `score_df[metric] = scalar` on a
        # frame with no rows stores nothing and left the table empty.
        score_df.loc[0, metric] = scores.mean()

print("MODEL CHECK")
print(score_df, end='\n')

Learning rate set to 0.060383
0:	learn: 0.6642521	total: 5.47ms	remaining: 5.46s
1:	learn: 0.6403070	total: 11.1ms	remaining: 5.53s
2:	learn: 0.6142507	total: 16.3ms	remaining: 5.42s
3:	learn: 0.5869110	total: 22.6ms	remaining: 5.62s
4:	learn: 0.5601658	total: 28.3ms	remaining: 5.64s
5:	learn: 0.5274678	total: 34.2ms	remaining: 5.67s
6:	learn: 0.5023339	total: 39.7ms	remaining: 5.64s
7:	learn: 0.4851807	total: 46.1ms	remaining: 5.72s
8:	learn: 0.4694051	total: 52ms	remaining: 5.72s
9:	learn: 0.4528567	total: 57.2ms	remaining: 5.67s
10:	learn: 0.4410299	total: 63ms	remaining: 5.66s
11:	learn: 0.4228134	total: 69.2ms	remaining: 5.7s
12:	learn: 0.4087026	total: 75.3ms	remaining: 5.71s
13:	learn: 0.3934441	total: 81ms	remaining: 5.71s
14:	learn: 0.3837385	total: 87.1ms	remaining: 5.72s
15:	learn: 0.3710082	total: 92.7ms	remaining: 5.7s
16:	learn: 0.3597929	total: 98.5ms	remaining: 5.69s
17:	learn: 0.3504595	total: 104ms	remaining: 5.67s
18:	learn: 0.3409833	total: 110ms	remaining: 5.66s
19

In [48]:
def catboost_objective(trial):
    """Optuna objective: mean stratified k-fold F1 of a CatBoost classifier.

    Reads the notebook-level ``X_tr_selec``, ``y_tr`` and ``config``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean F1 score over the ``config.k`` validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        'od_wait': trial.suggest_int("od_wait", 10, 50),
    }

    cat_clf = CatBoostClassifier(**params, random_state=config.seed, auto_class_weights="Balanced",) # eval_metric="TotalF1"

    stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
    f1_scores = np.empty(config.k)

    for idx, (tr_idx, val_idx) in enumerate(stk.split(X_tr_selec, y_tr)):
        # Slice the *selected* features: the folds were previously cut from the
        # full X_tr, so tuning ran on a different feature set than the one
        # produced by feature selection.
        X_tr_fold, X_val_fold = X_tr_selec.iloc[tr_idx], X_tr_selec.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]

        # The validation fold doubles as the early-stopping set.
        cat_clf.fit(X_tr_fold, y_tr_fold, eval_set=[(X_val_fold, y_val_fold)], early_stopping_rounds=50, verbose=False)
        y_pred_fold = cat_clf.predict(X_val_fold)
        f1_scores[idx] = f1_score(y_val_fold, y_pred_fold)

    return np.mean(f1_scores)

In [51]:
def lgbm_objective(trial):
    """Optuna objective: mean stratified k-fold F1 of a LightGBM classifier.

    Reads the notebook-level ``X_tr_selec``, ``y_tr`` and ``config``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean F1 score over the ``config.k`` validation folds.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # the sampled ranges and scales are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        # NOTE(review): LightGBM appears to apply `subsample` only when
        # `subsample_freq` > 0 - confirm that tuning it here has an effect.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    lgbm_clf = LGBMClassifier(**params, random_state=config.seed)

    stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
    f1_scores = np.empty(config.k)

    for idx, (tr_idx, val_idx) in enumerate(stk.split(X_tr_selec, y_tr)):
        X_tr_fold, X_val_fold = X_tr_selec.iloc[tr_idx], X_tr_selec.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]

        lgbm_clf.fit(X_tr_fold, y_tr_fold)
        y_pred_fold = lgbm_clf.predict(X_val_fold)
        f1_scores[idx] = f1_score(y_val_fold, y_pred_fold)

    return np.mean(f1_scores)

In [52]:
def xgboost_objective(trial):
    """Optuna objective: mean stratified k-fold F1 of an XGBoost classifier.

    Reads the notebook-level ``X_tr_selec``, ``y_tr`` and ``config``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean F1 score over the ``config.k`` validation folds.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # the sampled ranges and scales are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),  # colsample_bylevel -> colsample_bytree
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # one of XGBoost's key parameters
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),  # controls tree splitting
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),  # L1 regularisation term
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),  # L2 regularisation term
    }

    # Early stopping is configured once, on the estimator. It used to be
    # passed to `fit` as well, which duplicated the setting.
    xgb_clf = XGBClassifier(**params, random_state=config.seed, use_label_encoder=False,
                            eval_metric='auc',
                            early_stopping_rounds=50,
                            objective="binary:logistic",)

    stk = StratifiedKFold(n_splits=config.k, random_state=config.seed, shuffle=True)
    f1_scores = np.empty(config.k)

    for idx, (tr_idx, val_idx) in enumerate(stk.split(X_tr_selec, y_tr)):
        # Slice the *selected* features: the folds were previously cut from the
        # full X_tr, so tuning ran on a different feature set than the one
        # produced by feature selection.
        X_tr_fold, X_val_fold = X_tr_selec.iloc[tr_idx], X_tr_selec.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]

        # The validation fold doubles as the early-stopping set.
        xgb_clf.fit(X_tr_fold, y_tr_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=False)
        y_pred_fold = xgb_clf.predict(X_val_fold)
        f1_scores[idx] = f1_score(y_val_fold, y_pred_fold)

    return np.mean(f1_scores)

In [53]:
# Optional file logging for Optuna (disabled):
# logger = logging.getLogger()

# logger.setLevel(logging.INFO)
# logger.addHandler(logging.FileHandler(f"./log/{config.model}_optuna.log", mode="w"))

# optuna.logging.enable_propagation()
# optuna.logging.disable_default_handler()

# One sampler / pruner pair is shared by whichever study runs below.
# NOTE(review): the objectives never call `trial.report`, so the pruner
# presumably has nothing to act on - confirm.
sampler = NSGAIISampler(seed=config.seed)
pruner = optuna.pruners.HyperbandPruner()

# Tune the configured model with Optuna when requested; every branch must
# leave the model to train in `final_clf`.
if config.tune_mode and config.model == "cat":

    cat_study = optuna.create_study(study_name="cat", direction='maximize', sampler=sampler, pruner=pruner)
    cat_study.optimize(catboost_objective, n_trials=15)

    cat_best_params = cat_study.best_params
    cat_best_score = cat_study.best_value

    print("CatBoost Best Hyperparams: ", cat_best_params)
    print("CatBoost Best F1 Score: ", cat_best_score, end='\n')

    final_clf = CatBoostClassifier(**cat_best_params, random_state=config.seed, auto_class_weights="Balanced",)

elif config.tune_mode and config.model == "lgbm":

    lgbm_study = optuna.create_study(study_name="lgbm", direction='maximize', sampler=sampler, pruner=pruner)
    lgbm_study.optimize(lgbm_objective, n_trials=15)

    lgbm_best_params = lgbm_study.best_params
    lgbm_best_score = lgbm_study.best_value

    print("LGBM Best Hyperparams: ", lgbm_best_params)
    print("LGBM Best F1 Score: ", lgbm_best_score, end='\n')

    final_clf = LGBMClassifier(**lgbm_best_params, random_state=config.seed,)

elif config.tune_mode and config.model == "xgb":

    xgb_study = optuna.create_study(study_name="xgb", direction='maximize', sampler=sampler, pruner=pruner)
    xgb_study.optimize(xgboost_objective, n_trials=15)

    xgb_best_params = xgb_study.best_params
    xgb_best_score = xgb_study.best_value
    print("XGBoost Best Hyperparams: ", xgb_best_params)
    print("XGBoost Best F1 Score: ", xgb_best_score)

    # This branch used to end without defining `final_clf`. Early stopping is
    # left out here because it needs an evaluation set at fit time.
    final_clf = XGBClassifier(**xgb_best_params, random_state=config.seed,
                              eval_metric='auc', objective="binary:logistic")

else:
    # No tuning, or a model without an objective: use the default-configured model.
    final_clf = classifiers[config.model]

# with open(f"./log/{config.model}_optuna.log") as f:
#     assert f.readline().startswith("A new study created")
#     assert f.readline() == "Start optimization.\n"

[I 2024-08-26 13:50:08,985] A new study created in memory with name: lgbm


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001241 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 13:50:42,740] Trial 0 finished with value: 0.9659790583707679 and parameters: {'n_estimators': 204, 'learning_rate': 0.02074487219119474, 'max_depth': 11, 'num_leaves': 617, 'subsample': 0.8397452762602085, 'colsample_bytree': 0.8412315646112949}. Best is trial 0 with value: 0.9659790583707679.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 13:51:36,275] Trial 1 finished with value: 0.9696401344792551 and parameters: {'n_estimators': 960, 'learning_rate': 0.01881247253010721, 'max_depth': 13, 'num_leaves': 95, 'subsample': 0.6937342267656712, 'colsample_bytree': 0.6452218265334382}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 13:52:11,917] Trial 2 finished with value: 0.9252305992153472 and parameters: {'n_estimators': 221, 'learning_rate': 0.0010476378230807992, 'max_depth': 11, 'num_leaves': 498, 'subsample': 0.5106987880086793, 'colsample_bytree': 0.7797550347675031}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 13:52:33,449] Trial 3 finished with value: 0.9475129301439909 and parameters: {'n_estimators': 169, 'learning_rate': 0.0065363111684785015, 'max_depth': 10, 'num_leaves': 773, 'subsample': 0.8005098898589394, 'colsample_bytree': 0.708679084820276}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001274 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 13:52:46,821] Trial 4 finished with value: 0.8569198137941869 and parameters: {'n_estimators': 401, 'learning_rate': 0.0011222481062825112, 'max_depth': 6, 'num_leaves': 856, 'subsample': 0.7455556142668338, 'colsample_bytree': 0.6248022436840732}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 13:54:28,069] Trial 5 finished with value: 0.9689716217364763 and parameters: {'n_estimators': 785, 'learning_rate': 0.012035550442299522, 'max_depth': 11, 'num_leaves': 648, 'subsample': 0.5858344070520952, 'colsample_bytree': 0.8545928192567123}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 13:54:38,539] Trial 6 finished with value: 0.9481642820338824 and parameters: {'n_estimators': 136, 'learning_rate': 0.015843477466331905, 'max_depth': 8, 'num_leaves': 256, 'subsample': 0.6995544348694334, 'colsample_bytree': 0.8569301680959336}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000918 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 13:54:57,165] Trial 7 finished with value: 0.968987042370492 and parameters: {'n_estimators': 813, 'learning_rate': 0.06162306576319882, 'max_depth': 8, 'num_leaves': 22, 'subsample': 0.8804104152527709, 'colsample_bytree': 0.591331080622456}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 13:55:16,780] Trial 8 finished with value: 0.9690035303160064 and parameters: {'n_estimators': 814, 'learning_rate': 0.07970857876468943, 'max_depth': 5, 'num_leaves': 900, 'subsample': 0.9431995615478999, 'colsample_bytree': 0.9934701188538326}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 13:58:41,259] Trial 9 finished with value: 0.9692839476356774 and parameters: {'n_estimators': 873, 'learning_rate': 0.0038055245421162497, 'max_depth': 19, 'num_leaves': 492, 'subsample': 0.7456994408598546, 'colsample_bytree': 0.9530921624917044}. Best is trial 1 with value: 0.9696401344792551.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 14:00:09,085] Trial 10 finished with value: 0.9697264971796085 and parameters: {'n_estimators': 796, 'learning_rate': 0.021060014026957615, 'max_depth': 11, 'num_leaves': 841, 'subsample': 0.7026041309562342, 'colsample_bytree': 0.5333395145643103}. Best is trial 10 with value: 0.9697264971796085.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 14:01:40,276] Trial 11 finished with value: 0.963477894931394 and parameters: {'n_estimators': 671, 'learning_rate': 0.0021870816190847726, 'max_depth': 19, 'num_leaves': 235, 'subsample': 0.5409643172742791, 'colsample_bytree': 0.7782533892432042}. Best is trial 10 with value: 0.9697264971796085.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 14:05:14,562] Trial 12 finished with value: 0.9633338015953387 and parameters: {'n_estimators': 866, 'learning_rate': 0.001243585151923008, 'max_depth': 16, 'num_leaves': 517, 'subsample': 0.5254103588295729, 'colsample_bytree': 0.8535967369680808}. Best is trial 10 with value: 0.9697264971796085.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507


[I 2024-08-26 14:06:29,041] Trial 13 finished with value: 0.9272767252618449 and parameters: {'n_estimators': 923, 'learning_rate': 0.0010266983883039805, 'max_depth': 8, 'num_leaves': 993, 'subsample': 0.8811617473462106, 'colsample_bytree': 0.8801669199511248}. Best is trial 10 with value: 0.9697264971796085.


[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12495
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495873 -> initscore=-0.016507
[LightGBM] [Info] Start training from score -0.016507
[LightGBM] [Info] Number of positive: 31183, number of negative: 31702
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12409
[LightGBM] [Info] Number of data points in the train set: 62885, number of used features: 50
[LightGBM] [Info

[I 2024-08-26 14:08:01,483] Trial 14 finished with value: 0.962670303606064 and parameters: {'n_estimators': 528, 'learning_rate': 0.005422110530545388, 'max_depth': 11, 'num_leaves': 481, 'subsample': 0.6190652248684867, 'colsample_bytree': 0.9298349555985272}. Best is trial 10 with value: 0.9697264971796085.


LGBM Best Hyperparams:  {'n_estimators': 796, 'learning_rate': 0.021060014026957615, 'max_depth': 11, 'num_leaves': 841, 'subsample': 0.7026041309562342, 'colsample_bytree': 0.5333395145643103}
LGBM Best F1 Score:  0.9697264971796085


In [35]:
# Inspection of `cat_study` (left disabled).
# NOTE: the recorded output of this cell is
#   AttributeError: 'Study' object has no attribute 'visualize'
# i.e. `Study` exposes no `visualize()` method. Optuna's plotting helpers live in the
# `optuna.visualization` module (e.g. `plot_optimization_history(study)`).
#cat_study.visualize()
#cat_study.trials_dataframe()

AttributeError: 'Study' object has no attribute 'visualize'

In [54]:
################################################################
#####################     SUBMISSION   #########################
################################################################
# Fit the final classifier on the training features and predict the test features.
final_clf.fit(X_tr_selec, y_tr)  # use_best_model=True
final_preds = final_clf.predict(X_te_selec)

# Load the submission template and fill its "target" column with the predictions,
# translated from the numeric class ids to the string labels.
df_sub = pd.read_csv(os.path.join(config.data_path, "submission.csv"))
df_sub["target"] = final_preds
df_sub["target"] = df_sub["target"].map({0 : "Normal", 1 : "AbNormal"})

# Series.map() yields NaN for any prediction that is not a key of the mapping, and
# value_counts() drops NaN by default, so an unexpected label would otherwise slip
# into the submission file unnoticed. Fail loudly instead.
n_unmapped = int(df_sub["target"].isna().sum())
if n_unmapped:
    raise ValueError(
        f"{n_unmapped} prediction(s) are not 0/1 and could not be mapped to "
        "'Normal'/'AbNormal'"
    )

print('=============================')
print(df_sub["target"].value_counts())

[LightGBM] [Info] Number of positive: 34648, number of negative: 35225
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000811 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12501
[LightGBM] [Info] Number of data points in the train set: 69873, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495871 -> initscore=-0.016516
[LightGBM] [Info] Start training from score -0.016516
target
Normal      17276
AbNormal       85
Name: count, dtype: int64


In [55]:
# Timestamp tag (month-day_hour-minute-second) used to make the output filename unique.
now = datetime.now()
curr_date = now.strftime("%m-%d_%H-%M-%S")

# Optional model persistence / reload, currently disabled:
# pickle.dump(final_clf, open(f"{config.model}_{curr_date}.pkl", "wb"))
# final_clf = pickle.load(open(".pkl", "rb"))

# Write the submission next to the input data, tagged with the timestamp and exp_config.
submission_file = f"submission_{curr_date}_{exp_config}.csv"
submission_path = os.path.join(config.data_path, submission_file)
df_sub.to_csv(submission_path, index=False)