# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import os
from pprint import pprint
from typing import Any, Dict

#import rtdl
import scipy.special
import zero
from collections import Counter
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks, CondensedNearestNeighbour
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import OneSidedSelection
from sklearn.mixture import GaussianMixture

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

### 데이터 읽어오기


In [7]:
ROOT_DIR = "./data"
RANDOM_STATE = 881

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

수치, 범주형 미리 나누어서 처리

In [8]:
def preprocess_train_data(train_data, target):
    cat_features = []
    num_features = []

    # 범주형 특성과 수치형 특성 분리 및 전처리
    for col in train_data.columns:
        if col != target:
            if train_data[col].dtype == 'object':
                le = LabelEncoder()
                train_data[col] = le.fit_transform(train_data[col])
                cat_features.append(col)
            else:
                if train_data[col].nunique() > 1: 
                    train_data[col] = train_data[col].astype(float)
                    num_features.append(col)
                    
    # 특정 컬럼 제거 (예시로 "Workorder"가 포함된 컬럼 제거)
    for col in cat_features[:]:  # 리스트의 복사본을 생성
        if "Workorder" in col:
            cat_features.remove(col)
    
    # 수치형과 범주형 데이터의 결합
    features = num_features + cat_features
    df_final = train_data[features + [target]]  # target 열을 포함하여 최종 데이터프레임 생성

    # 각 범주형 변수의 고유값 수 계산
    train_cardinalities = [train_data[cat].nunique() for cat in cat_features]

    return df_final, cat_features, num_features, train_cardinalities

# Example usage
target = 'target'  # target 열 이름을 설정
train_data, cat_features, num_features, train_cardinalities = preprocess_train_data(train_data, target)


# 결과 확인
print("최종 데이터의 크기:", train_data.shape)
print("범주형 특성:", cat_features)
print("수치형 특성:", num_features)
print("범주형 특성별 고유값 수:", train_cardinalities)

최종 데이터의 크기: (40506, 163)
범주형 특성: ['Wip Line_Dam', 'Process Desc._Dam', 'Equipment_Dam', 'Model.Suffix_Dam', 'Insp Judge Code_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave', 'Model.Suffix_AutoClave', 'Insp Judge Code_AutoClave', '1st Pressure Judge Value_AutoClave', '2nd Pressure Judge Value_AutoClave', '3rd Pressure Judge Value_AutoClave', 'Chamber Temp. Judge Value_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave', 'Wip Line_Fill1', 'Process Desc._Fill1', 'Equipment_Fill1', 'Model.Suffix_Fill1', 'Insp Judge Code_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1', 'Wip Line_Fill2', 'Process Desc._Fill2', 'Equipment_Fill2', 'Model.Suffix_Fill2', 'Insp Judge Code_Fill2', 'HEAD NORMAL COORDINATE X AX

### 샘플링


기존의 랜덤샘플링 코드


In [9]:
df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# normal_ratio = 14.5  # 1.0 means 1:1 ratio
normal_ratio = 1.0  # 1.0 means 1:1 ratio
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")
print('features of initial data: ', len(df_concat.columns))

Total: Normal: 38156, AbNormal: 2350
features of initial data:  163


새로 만든 oss, adasyn 코드

In [10]:
from imblearn.under_sampling import OneSidedSelection
from imblearn.over_sampling import ADASYN
import pandas as pd
import numpy as np

def oss_resample(features, target, RANDOM_STATE, step_ratio=0.02):
    """
    Perform One-Sided Selection (OSS) under-sampling.
    """
    oss = OneSidedSelection(random_state=RANDOM_STATE)
    X_oss, y_oss = oss.fit_resample(features, target)
    
    print('Performing OSS...')

    # Calculate the number of samples needed to achieve the step ratio
    initial_count_major = (y_oss == 'Normal').sum()
    initial_count_minor = (y_oss == 'AbNormal').sum()

    target_count_major = int(initial_count_major * (1 - step_ratio))
    count_major = initial_count_major
    print('Target ratio: ', target_count_major / initial_count_minor)

    while count_major > target_count_major:
        X_oss, y_oss = oss.fit_resample(X_oss, y_oss)
        count_major = (y_oss == 'Normal').sum()
        count_minor = (y_oss == 'AbNormal').sum()
        print('count_major: ', count_major, 'count_minor: ', count_minor, 'Updated ratio:', count_major / count_minor)

    return X_oss, y_oss

def adasyn_resample(features, target, RANDOM_STATE, step_ratio=0.05):
    """
    Perform ADASYN over-sampling.
    """
    adasyn = ADASYN(random_state=RANDOM_STATE)

    X_adasyn, y_adasyn = adasyn.fit_resample(features, target)
    
    print('Performing ADASYN...')
    
    # Calculate the number of samples needed to achieve the step ratio
    initial_count_major = (y_adasyn == 'Normal').sum()
    initial_count_minor = (y_adasyn == 'AbNormal').sum()

    target_count_minor = int(initial_count_minor * (1 + step_ratio))
    count_minor = initial_count_minor
    print('initial count minor: ', initial_count_minor)
    print('Target ratio: ', initial_count_major / target_count_minor)

    return X_adasyn, y_adasyn

def sampling(df, RANDOM_STATE, target_ratio=1.5):
    """
    Perform under-sampling and over-sampling to adjust the class distribution.

    Parameters:
    - df: DataFrame containing the data with 'target' column.
    - RANDOM_STATE: int, random state for reproducibility.
    - target_ratio: float, the desired ratio of Normal to AbNormal after sampling.
    - step_ratio: float, the step ratio to apply in each under-sampling and over-sampling step.

    Returns:
    - df_concat: DataFrame with the sampled data.
    """
    
    # Separate the target column
    target = df['target']
    features = df.drop(columns=['target'])

    # Initial class distribution
    count_major = (target == 'Normal').sum()
    count_minor = (target == 'AbNormal').sum()
    current_ratio = count_major / count_minor
    print('Initial class distribution - Normal:', count_major, 'AbNormal:', count_minor)
    print('Initial ratio:', current_ratio)

    while current_ratio > target_ratio:
        # Perform One-Sided Selection (OSS) under-sampling
        X_oss, y_oss = oss_resample(features, target, RANDOM_STATE, step_ratio=0.02)
        df_oss = pd.concat([pd.DataFrame(X_oss, columns=features.columns), pd.Series(y_oss, name='target')], axis=1)

        # Perform ADASYN over-sampling
        X_adasyn, y_adasyn = adasyn_resample(df_oss.drop(columns=['target']), df_oss['target'], RANDOM_STATE, step_ratio=0.05)
        df_adasyn = pd.concat([pd.DataFrame(X_adasyn, columns=features.columns), pd.Series(y_adasyn, name='target')], axis=1)

        features = df_adasyn.drop(columns=['target'])
        target = df_adasyn['target']

        # Update class distribution
        count_major = (target == 'Normal').sum()
        count_minor = (target == 'AbNormal').sum()
        current_ratio = count_major / count_minor
        print('Updated class distribution - Normal:', count_major, 'AbNormal:', count_minor)
        print('Updated ratio:', current_ratio)

    df_concat = df_adasyn

    # Reorder the columns to place 'target' at the front
    columns = ['target'] + [col for col in df_concat.columns if col != 'target']
    df_concat = df_concat[columns]

    # Print the final counts
    print(df_concat['target'].value_counts())

    return df_concat

# Example usage:
RANDOM_STATE = 42
df_concat = sampling(train_data, RANDOM_STATE, target_ratio=1.5)


Initial class distribution - Normal: 38156 AbNormal: 2350
Initial ratio: 16.23659574468085
Performing OSS...
Target ratio:  15.43531914893617
count_major:  36579 count_minor:  2350 Updated ratio: 15.565531914893617
count_major:  36459 count_minor:  2350 Updated ratio: 15.514468085106383
count_major:  36428 count_minor:  2350 Updated ratio: 15.501276595744681
count_major:  36394 count_minor:  2350 Updated ratio: 15.486808510638298
count_major:  36349 count_minor:  2350 Updated ratio: 15.467659574468085
count_major:  36329 count_minor:  2350 Updated ratio: 15.459148936170212
count_major:  36310 count_minor:  2350 Updated ratio: 15.451063829787234
count_major:  36269 count_minor:  2350 Updated ratio: 15.433617021276596
Performing ADASYN...
initial count minor:  36348
Target ratio:  0.9503209747150531
Updated class distribution - Normal: 36269 AbNormal: 36348
Updated ratio: 0.997826565423132
target
AbNormal    36348
Normal      36269
Name: count, dtype: int64


In [15]:
df_concat.shape

(72617, 163)

### 데이터 전처리

In [16]:
features = [col for col in df_concat.columns if col != target]
X = df_concat[features].values
y = df_concat[target].values
X.shape

(72617, 162)

### 데이터 분할


In [25]:
train_x = df_concat[features].values
train_y = df_concat["target"].values

In [26]:
scaler = StandardScaler()
train_x[:, :len(features)] = scaler.fit_transform(train_x[:, :len(features)])

In [18]:
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(train_y)

## 3. 모델 학습


### 모델 정의


In [19]:
# 모델 객체 생성
xgb_clf = XGBClassifier()
lgb_clf = LGBMClassifier()
cat_clf = CatBoostClassifier(verbose=0)
meta_clf = CatBoostClassifier(verbose=0)  # 메타 모델로 사용할 CatBoost

In [20]:
# 메타 모델을 위한 학습 및 검증 데이터 만들기
def get_stacking_base_datasets(model, X_train_n, y_train_n, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=11)
    # 빈 배열 생성
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    
    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        print(f'폴드 세트: {folder_counter + 1} 시작')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index] 
        
        # 폴드 내 모델 학습
        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)  # y_train 예측, 폴드 끝나면 concat해야함
        
    return train_fold_pred  # 하나의 모델에 대한 학습 데이터 생성

### 모델 학습


In [21]:
# 개별 모델로부터 메타 모델에 필요한 데이터 셋 만들기
xgb_train = get_stacking_base_datasets(xgb_clf, train_x, train_y, 7)
lgb_train = get_stacking_base_datasets(lgb_clf, train_x, train_y, 7)
cat_train = get_stacking_base_datasets(cat_clf, train_x, train_y, 7)

# 개별 모델로부터 나온 y_train 예측값들 옆으로 붙이기
Stack_final_X_train = np.concatenate((xgb_train, lgb_train, cat_train), axis=1)

# 메타 모델 학습 및 검증
kf = KFold(n_splits=7, shuffle=True, random_state=11)
val_accuracies = []
val_precisions = []
val_recalls = []
val_f1_scores = []

for train_index, val_index in kf.split(Stack_final_X_train):
    X_tr, X_val = Stack_final_X_train[train_index], Stack_final_X_train[val_index]
    y_tr, y_val = train_y[train_index], train_y[val_index]

    meta_clf.fit(X_tr, y_tr)
    stack_final = meta_clf.predict(X_val)
    
    accuracy = accuracy_score(y_val, stack_final)
    precision = precision_score(y_val, stack_final, average='weighted')
    recall = recall_score(y_val, stack_final, average='weighted')
    f1 = f1_score(y_val, stack_final, average='weighted')
    print('f1_score: ', f1)
    
    val_accuracies.append(accuracy)
    val_precisions.append(precision)
    val_recalls.append(recall)
    val_f1_scores.append(f1)

# 각 메트릭의 평균 출력
print(f'7-fold 교차 검증의 평균 메타 모델 정확도: {np.mean(val_accuracies):.4f}')
print(f'7-fold 교차 검증의 평균 메타 모델 Precision: {np.mean(val_precisions):.4f}')
print(f'7-fold 교차 검증의 평균 메타 모델 Recall: {np.mean(val_recalls):.4f}')
print(f'7-fold 교차 검증의 평균 메타 모델 F1-Score: {np.mean(val_f1_scores):.4f}')

폴드 세트: 1 시작
폴드 세트: 2 시작
폴드 세트: 3 시작
폴드 세트: 4 시작
폴드 세트: 5 시작
폴드 세트: 6 시작
폴드 세트: 7 시작
폴드 세트: 1 시작
[LightGBM] [Info] Number of positive: 31044, number of negative: 31199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18604
[LightGBM] [Info] Number of data points in the train set: 62243, number of used features: 146
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498755 -> initscore=-0.004980
[LightGBM] [Info] Start training from score -0.004980
폴드 세트: 2 시작
[LightGBM] [Info] Number of positive: 31060, number of negative: 31183
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18590
[LightGBM] [Info] Number of data points in the train set: 62243,

## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [22]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [23]:
print('features of test data:', len(test_data.columns))
print(f'Number of columns in test data: {test_data.shape[0]}')

features of test data: 465
Number of columns in test data: 17361


In [30]:
# 학습 데이터에서 사용된 features 리스트에 있는 열만 선택
df_test_x = test_data[features]
print("X_test 데이터의 크기:", df_test_x.shape)
# 범주형 및 수치형 특성 분리 및 전처리
cat_features_test = []
num_features_test = []

for col in df_test_x.columns:
    if col in cat_features:
        le = LabelEncoder()
        df_test_x[col] = le.fit_transform(df_test_x[col])
        cat_features_test.append(col)
    elif col in num_features:
        df_test_x[col] = df_test_x[col].astype(float)
        num_features_test.append(col)

# 수치형 특성에 대해 학습 데이터에서 사용된 스케일러 적용
X_test_num = df_test_x[num_features].values

# 범주형 특성에 대해 인코딩된 값을 가져옴
X_test_cat = df_test_x[cat_features].values

# 범주형 특성과 수치형 특성을 결합하여 최종 X_test 생성
X_test = np.concatenate([X_test_num, X_test_cat], axis=1)
X_test = scaler.transform(X_test)

# 최종 데이터 확인
print("범주형 특성:", cat_features_test)
print("수치형 특성:", num_features_test)
print("X_test 데이터의 크기:", X_test.shape)

X_test 데이터의 크기: (17361, 162)
범주형 특성: ['Wip Line_Dam', 'Process Desc._Dam', 'Equipment_Dam', 'Model.Suffix_Dam', 'Insp Judge Code_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave', 'Model.Suffix_AutoClave', 'Insp Judge Code_AutoClave', '1st Pressure Judge Value_AutoClave', '2nd Pressure Judge Value_AutoClave', '3rd Pressure Judge Value_AutoClave', 'Chamber Temp. Judge Value_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave', 'Wip Line_Fill1', 'Process Desc._Fill1', 'Equipment_Fill1', 'Model.Suffix_Fill1', 'Insp Judge Code_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1', 'Wip Line_Fill2', 'Process Desc._Fill2', 'Equipment_Fill2', 'Model.Suffix_Fill2', 'Insp Judge Code_Fill2', 'HEAD NORMAL COORDINATE 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_x[col] = df_test_x[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_x[col] = df_test_x[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_x[col] = df_test_x[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.


In [32]:
# 예측 결과
test_pred = meta_clf.predict(X_test)

# 원래 라벨로 되돌리기
test_pred_original = label_encoder.inverse_transform(test_pred)

# 요소의 개수 세기
counter = Counter(test_pred_original)

# 결과 출력
print(counter)


Counter({'AbNormal': 10734, 'Normal': 6627})


### 제출 파일 작성


In [33]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")

# 예측 결과를 원래 라벨로 변환된 값으로 설정
df_sub["target"] = test_pred_original

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)
print('successfully saved submission.csv')

successfully saved submission.csv


**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
