In [2]:
from tools.preprocess import LoadDataset

import pandas as pd
import numpy as np

from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from catboost import Pool,CatBoostClassifier

# Locations of the CSV splits, relative to the notebook directory.
train_path = "../data/train.csv"
# NOTE(review): test_path points at the same file as valid_path -- confirm this
# is intentional and not a placeholder for a separate test file.
test_path = "../data/valid.csv"
valid_path = "../data/valid.csv"

# Build the dataset wrapper; the paths are passed positionally in the
# order train, test, valid.
data = LoadDataset(train_path, test_path, valid_path)

# Preprocessing policies. The "ALL" key applies one strategy to every column
# (the processing log prints "<ALL> Columns" for each step).
nan_policy = {"ALL": "DROP COLUMN"}
scaling_policy = {"ALL": "MINMAX"}
encoding_policy = {"ALL": "LABEL"}

# The steps run in this order: missing values, sampling, scaling, encoding.
data.nan_processing(nan_policy)
data.sampling("OFF")  # "OFF": the logged class counts are identical before and after
data.scaling(scaling_policy)
data.encoding(encoding_policy)

# Disabled experiment, kept for reference:
# data.feature_engineering("PCA PER PROCESS", {"min_proba": 0.9999})

# Shape of the training feature matrix after preprocessing.
print(data.train.x.shape)

Missing Value Processing...

(before processing)The number of nan
- train: 59028
- valid: 13149

  1. Processing <ALL> Columns with 'DROP COLUMN'

Finish!
(after processing)The number of nan
- train: 0
- valid: 0

Sampling with OFF...

(before sampling)Value count
target
0    31156
1     2000
Name: count, dtype: int64 

Finish!
(after sampling)Value count
target
0    31156
1     2000
Name: count, dtype: int64

Scaling our dataset...

  1. Scaling <ALL> Columns with 'MINMAX'

Finish! (the number of columns: 130)

Encoding our dataset...

  1. Encoding <ALL> Columns with 'LABEL'

Finish! (the number of columns: 12)

(33156, 142)


In [3]:
# Seed shared by the random forests below (also read by later cells).
RANDOM_SEED = 42


def _fit_process_forest(process):
    """Fit a random forest on the training columns of one process.

    Parameters
    ----------
    process : str
        Process name passed to ``data.train.get`` to select the feature
        columns ('Dam', 'Fill1', 'Fill2' or 'AutoClave' in this notebook).

    Returns
    -------
    RandomForestClassifier
        The classifier fitted on ``data.train.get(process)`` against
        ``data.train.y``, seeded with ``RANDOM_SEED``.
    """
    forest = RandomForestClassifier(random_state=RANDOM_SEED)
    forest.fit(data.train.get(process), data.train.y)
    return forest


# One model per process. The helper replaces four copy-pasted stanzas so the
# process name and the seed are written only once per model.
Dam_RF = _fit_process_forest('Dam')
Fill1_RF = _fit_process_forest('Fill1')
Fill2_RF = _fit_process_forest('Fill2')
Auto_RF = _fit_process_forest('AutoClave')

# Keep the last fitted estimator as the cell's displayed value, as before.
Auto_RF

In [4]:
# CatBoost hyper-parameters shared by every per-process model.
params = {
    'iterations': 3000,                # at most 3000 boosting iterations
    'learning_rate': 0.05,             # learning rate: 0.05 (a reasonable starting value)
    'depth': 6,                        # tree depth: 6 (mid-range, to avoid an overly complex model)
    'l2_leaf_reg': 3,                  # L2 regularisation: 3 (controls model complexity)
    'one_hot_max_size': 10,            # max cardinality of a categorical feature that is one-hot encoded
    'random_seed': data.seed,          # fixed random seed for reproducibility
    'task_type': "CPU",                # run on CPU (can be switched to GPU)
    'loss_function': 'Logloss',        # log loss for binary classification
    'eval_metric': "F1",               # evaluation metric: F1 score
    'auto_class_weights': 'Balanced',  # automatic class weights for the imbalanced target
    'early_stopping_rounds': 500,      # patience for early stopping
    'verbose': 100                     # print progress every 100 iterations
}


def _fit_process_catboost(process):
    """Fit a CatBoost classifier on the columns of one process.

    The categorical features are the columns that are both categorical
    (``data.train.get_cat_cols()``) and part of the process
    (``data.train.get_cols(process)``), in sorted order. The matching slice of
    the validation split is passed as ``eval_set``.

    Parameters
    ----------
    process : str
        Process name used to select columns ('Dam', 'Fill1', 'Fill2' or
        'AutoClave' in this notebook).

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    process_columns = set(data.train.get_cols(process))
    cat_features = sorted(set(data.train.get_cat_cols()) & process_columns)

    model = CatBoostClassifier(**params, cat_features=cat_features)
    model.fit(
        data.train.get(process),
        data.train.y,
        eval_set=[(data.valid.get(process), data.valid.y)],
    )
    return model


# One model per process. The helper replaces four copy-pasted stanzas in which
# the process name had to be repeated three times each.
# NOTE(review): the validation split passed as eval_set here is scored again
# in the final F1 cell, so that score may be optimistic -- confirm.
Dam_CB = _fit_process_catboost('Dam')
Fill1_CB = _fit_process_catboost('Fill1')
Fill2_CB = _fit_process_catboost('Fill2')
Auto_CB = _fit_process_catboost('AutoClave')

# Keep the last fitted model as the cell's displayed value, as before.
Auto_CB

0:	learn: 0.5535732	test: 0.5099651	best: 0.5099651 (0)	total: 166ms	remaining: 8m 17s
100:	learn: 0.6213777	test: 0.5644570	best: 0.5688862 (98)	total: 1.29s	remaining: 37s
200:	learn: 0.6500290	test: 0.5776335	best: 0.5838492 (172)	total: 2.36s	remaining: 32.8s
300:	learn: 0.6981730	test: 0.5968074	best: 0.6006015 (297)	total: 3.43s	remaining: 30.8s
400:	learn: 0.7352738	test: 0.5798343	best: 0.6037606 (323)	total: 4.54s	remaining: 29.4s
500:	learn: 0.7601770	test: 0.5705596	best: 0.6037606 (323)	total: 5.62s	remaining: 28s
600:	learn: 0.7872784	test: 0.5598969	best: 0.6037606 (323)	total: 6.8s	remaining: 27.1s
700:	learn: 0.8089953	test: 0.5570174	best: 0.6037606 (323)	total: 8.02s	remaining: 26.3s
800:	learn: 0.8224466	test: 0.5462223	best: 0.6037606 (323)	total: 9.21s	remaining: 25.3s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.6037605619
bestIteration = 323

Shrink model to first 324 iterations.
0:	learn: 0.5044264	test: 0.4957772	best: 0.4957772 (0)	tota

<catboost.core.CatBoostClassifier at 0x2058bc36e20>

In [17]:
# Base learners paired with the process whose columns each one is given.
# The order of this table fixes the column order of the stacked matrix.
train_stack_members = (
    (Dam_RF, 'Dam'),
    (Fill1_RF, 'Fill1'),
    (Fill2_RF, 'Fill2'),
    (Auto_RF, 'AutoClave'),
    (Dam_CB, 'Dam'),
    (Fill1_CB, 'Fill1'),
    (Fill2_CB, 'Fill2'),
    (Auto_CB, 'AutoClave'),
)

# Despite its name, proba_RF holds the class-probability arrays of both the
# random forests and the CatBoost models, computed on the training split.
# NOTE(review): the base learners were fitted on this same training split, so
# these are in-sample probabilities; consider out-of-fold predictions.
proba_RF = tuple(
    member.predict_proba(data.train.get(process))
    for member, process in train_stack_members
)

# Place all probability arrays side by side to form the meta-feature table.
probability = pd.DataFrame(np.column_stack(proba_RF))

# Unsupervised detector fitted on the stacked probabilities.
# NOTE(review): the rationale for contamination=0.0275 is not shown; the
# sampling log reports 2000 positives out of 33156 training rows -- confirm.
Main_Model = IsolationForest(contamination=0.0275, random_state=RANDOM_SEED)
Main_Model.fit(probability)

In [19]:
# Same (model, process) pairing and order that was used to build the
# training meta-features, applied here to the validation split.
valid_stack_members = (
    (Dam_RF, 'Dam'),
    (Fill1_RF, 'Fill1'),
    (Fill2_RF, 'Fill2'),
    (Auto_RF, 'AutoClave'),
    (Dam_CB, 'Dam'),
    (Fill1_CB, 'Fill1'),
    (Fill2_CB, 'Fill2'),
    (Auto_CB, 'AutoClave'),
)

# Class-probability arrays from every base learner on the validation split.
_proba_RF = tuple(
    member.predict_proba(data.valid.get(process))
    for member, process in valid_stack_members
)

# This rebinds `probability`, which previously held the training meta-features.
probability = pd.DataFrame(np.column_stack(_proba_RF))

# Map the detector's output codes to label strings: 1 -> "Normal", -1 -> "AbNormal".
detector_labels = {1:"Normal", -1:"AbNormal"}
test_pred = pd.DataFrame(Main_Model.predict(probability)).replace(detector_labels)

# Show how many rows fall under each predicted label.
test_pred.value_counts()

Normal      7228
AbNormal     122
Name: count, dtype: int64

In [20]:
# Convert the numeric validation target to the label strings used by
# test_pred: 0 -> "Normal", 1 -> "AbNormal".
target_labels = {0:"Normal", 1:"AbNormal"}
answer = data.valid.y.replace(target_labels)

# F1 score with "AbNormal" treated as the positive class.
# NOTE(review): the CatBoost models used this same validation split as
# eval_set for early stopping, so this score may be optimistic -- confirm.
f1 = f1_score(y_true=answer, y_pred=test_pred, pos_label="AbNormal")
print(f1)

0.211864406779661
