# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import os
from pprint import pprint
from typing import Any, Dict

import scipy.special
import zero
from collections import Counter
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from pycaret.classification import *

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 881

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

### 언더 샘플링


데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.


In [3]:
df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

normal_ratio = 1.1  # 1.0 means 1:1 ratio
# normal_ratio = 1.0  # 1.0 means 1:1 ratio
df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
Normal      2585
AbNormal    2350
Name: count, dtype: int64

### 데이터 분할


## 3. 모델 학습


### 모델 정의


In [4]:
# K-Fold
kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

### 모델 학습


In [5]:
# Label encoding for categorical features
features = []

# Preprocess df_concat
for col in df_concat.columns:
    if df_concat[col].dtype == 'object' and col != 'target' and df_concat[col].nunique() > 1:
        le = LabelEncoder()
        df_concat[col] = le.fit_transform(df_concat[col])
        features.append(col)
        if "Workorder" in col:
            features.remove(col)
for col in df_concat.columns:
    if df_concat[col].dtype != 'object':
        try:
            if df_concat[col].nunique() > 1: 
                df_concat[col] = df_concat[col].astype(int)
                features.append(col)
        except ValueError:
            try: 
                if df_concat[col].nunique() > 1:
                    df_concat[col] = df_concat[col].astype(float)
                    features.append(col)
            except:
                continue
                

features = list(set(features))

In [6]:
X = df_concat[features]
y = df_concat["target"]

In [7]:
# 언더샘플링 및 오버샘플링을 위한 파이프라인 구성
over = SMOTE(random_state=RANDOM_STATE)
under = RandomUnderSampler(random_state=RANDOM_STATE)
pipeline = Pipeline(steps=[('u', under),('o', over)])

In [8]:
# 데이터 전처리 (라벨 인코딩)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [9]:
# Initialize a list to store F1-scores for each fold
f1_scores_train = []
f1_scores_val = []

# Initialize variables to store the best model and best F1-score
best_model = None
best_f1_score = 0

for train_index, val_index in kf.split(X):
    df_train, df_val = X.iloc[train_index], X.iloc[val_index]
    train_y, val_y = y_encoded[train_index], y_encoded[val_index]
    
    # 데이터 불균형 처리 (예: SMOTE 사용)
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42))
    ])
    train_x_resampled, train_y_resampled = pipeline.fit_resample(df_train, train_y)
    
    # Gradient Boosting Classifier
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=RANDOM_STATE)
    
    # Train the model
    model.fit(train_x_resampled, train_y_resampled)
    
    # Predictions and F1-score for training set
    train_pred = model.predict(df_train)
    train_score = f1_score(train_y, train_pred)
    f1_scores_train.append(train_score)
    
    # Predictions and F1-score for validation set
    val_pred = model.predict(df_val)
    val_score = f1_score(val_y, val_pred)
    f1_scores_val.append(val_score)
    
    # Check if this model has the best validation F1-score
    if val_score > best_f1_score:
        best_f1_score = val_score
        best_model = model

# Calculate and print the average F1-scores
print(f"Average Train F1-Score: {np.mean(f1_scores_train)}")
print(f"Average Validation F1-Score: {np.mean(f1_scores_val)}")
print(f"Best Validation F1-Score: {best_f1_score}")

Average Train F1-Score: 0.6864513138663021
Average Validation F1-Score: 0.6275227804657881
Best Validation F1-Score: 0.6494464944649446


In [11]:
# 요소의 개수 세기
counter = Counter(val_pred)

# 결과 출력
print(counter)

Counter({1: 263, 0: 230})


## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [12]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [13]:
# Preprocess df_test_x similarly
for col in test_data.columns:
    if test_data[col].dtype == 'object':
        if test_data[col].nunique() > 1:
            le = LabelEncoder()
            test_data[col] = le.fit_transform(test_data[col])
            
for col in test_data.columns:
    if test_data[col].dtype != 'object':
        try:
            if test_data[col].nunique() > 1: 
                test_data[col] = test_data[col].astype(int)
                
        except ValueError:
            try: 
                if test_data[col].nunique() > 1:
                    test_data[col] = test_data[col].astype(float)
                   
            except:
                continue

In [14]:
df_test_x = test_data[features]

In [15]:
# 모델 예측
y_pred_encoded = model.predict(df_test_x)
y_pred_labels = np.where(y_pred_encoded == 1, 'Normal', 'AbNormal')  # 원래 레이블로 변환

In [16]:
y_pred_labels

array(['Normal', 'Normal', 'AbNormal', ..., 'Normal', 'AbNormal',
       'Normal'], dtype='<U8')

### 제출 파일 작성


In [17]:
# 제출 파일 작성
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = y_pred_labels.astype(object)
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
