# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [53]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import xgboost as xgb
from catboost import CatBoostClassifier

### 데이터 읽어오기


In [54]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


## 2. 데이터 전처리

### 데이터 변환

In [55]:
# 데이터 전처리 (1) - 오류 컬럼 'ok'값 변환
train_data.loc[:, 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace('OK', np.nan)
train_data.loc[:, 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].replace('OK', np.nan)
train_data.loc[:, 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = train_data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].replace('OK', np.nan)

# 데이터 전처리 (2) - 'OK'값 컬럼 원핫인코딩
ok_columns = train_data.columns[train_data.isin(['OK']).any()]
train_data.loc[:, ok_columns] = train_data[ok_columns].apply(lambda col: col.map(lambda x: 1 if x == 'OK' else 0))

# 데이터 전처리 (3) - Equipment_dam
def convert_value(x):
    if '#1' in x:
        return 1
    elif '#2' in x:
        return 2
    return x
train_data['Equipment_Dam'] = train_data['Equipment_Dam'].apply(convert_value)

# 데이터 전처리 (4) - Model.Suffix / Workorder 레이블 인코딩
columns_to_encode = ['Model.Suffix_Dam', 'Workorder_Dam']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    train_data[col] = label_encoder.fit_transform(train_data[col])

# 확인
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,1,3,657,1,1,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,1,3,283,1,1,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,2,0,589,1,1,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,2,0,251,1,1,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,1,0,142,1,1,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,1,0,238,1,1,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,2,0,643,1,1,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,1,0,540,1,1,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,2,0,164,1,1,1000.0,,,...,117,,,1,,,0,,,Normal


### 컬럼 삭제

In [56]:
# null 컬럼 삭제 (464개 -> 186개 컬럼)
train_data = train_data.dropna(axis=1, how='all')

# 오류 컬럼 삭제 (186개 -> 185개 컬럼)
train_data = train_data.drop(columns=['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'])

# 단일값 컬럼 제거 (185개 -> 150개 컬럼)
unique_values = train_data.nunique()
single_value_columns = unique_values[unique_values == 1].index
train_data = train_data.drop(columns=single_value_columns)

# 중복 컬럼 제거 (150개 -> 142개 컬럼)
# [Model.Suffix / Workorder/ Equipment]
cols_to_drop = ['Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2',
               'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2',
               'Equipment_Fill1', 'Equipment_Fill2']
train_data = train_data.drop(columns=cols_to_drop)

# 결측치 컬럼 삭제 (142개 -> 139개 컬럼)
columns_to_remove = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
]
train_data = train_data.drop(columns=columns_to_remove)

# 확인
train_data

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,1,3,657,240.0,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,1,3,283,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,2,0,589,1000.0,12.5,90,85,280,90,16,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,2,0,251,1000.0,12.5,90,70,280,90,10,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,1,0,142,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,1,0,238,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,2,0,643,1000.0,12.5,90,100,280,90,16,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,1,0,540,240.0,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,2,0,164,1000.0,12.5,90,70,280,90,10,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


### Target 컬럼 인코딩

In [57]:
# target 열의 값 인코딩
train_data['target'] = train_data['target'].map({'Normal': 1, 'AbNormal': 0})

### 'object' 컬럼 정수형으로 변환

In [58]:
object_columns = train_data.select_dtypes(include=['object'])
for col in object_columns:
    train_data[col] = train_data[col].astype('category').cat.codes

In [59]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 139 entries, Equipment_Dam to target
dtypes: float64(68), int32(2), int64(64), int8(5)
memory usage: 41.3 MB


### 언더 샘플링

데이타 불균형을 해결하기 위해 언더 샘플링을 진행합니다.

In [60]:
normal_ratio = 1.0  # 1.0 means 1:1 ratio

df_normal = train_data[train_data["target"] == 1] #Normal
df_abnormal = train_data[train_data["target"] == 0] #AbNormal

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

df_normal = df_normal.sample(n=int(num_abnormal * normal_ratio), replace=False, random_state=RANDOM_STATE)
df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
df_concat.value_counts("target")

  Total: Normal: 38156, AbNormal: 2350


target
0    2350
1    2350
Name: count, dtype: int64

### 데이터 분할


In [61]:
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)


def print_stats(df: pd.DataFrame):
    num_normal = len(df[df["target"] == 1]) #Normal
    num_abnormal = len(df[df["target"] == 0]) #AbNormal

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {num_abnormal/num_normal}")


# Print statistics
print(f"  \tAbnormal\tNormal")
print_stats(df_train)
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 1645, AbNormal: 1645 ratio: 1.0
  Total: Normal: 705, AbNormal: 705 ratio: 1.0


## 3. 모델 학습


### 모델 정의


In [62]:
model = CatBoostClassifier(
    iterations=500,       # 학습 반복 횟수
    learning_rate=0.1,    # 학습률
    depth=6,              # 트리의 깊이
    random_state=42,
    verbose=100           # 출력 로그의 빈도
)

### 모델 학습

In [63]:
features = []

for col in df_train.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue

train_x = df_train[features]
train_y = df_train["target"]

model.fit(train_x, train_y)

0:	learn: 0.4368775	total: 5.01ms	remaining: 2.5s
100:	learn: 0.0005824	total: 419ms	remaining: 1.66s
200:	learn: 0.0005813	total: 893ms	remaining: 1.33s
300:	learn: 0.0005801	total: 1.3s	remaining: 858ms
400:	learn: 0.0005791	total: 1.71s	remaining: 422ms
499:	learn: 0.0005782	total: 2.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x24407f49c90>

In [66]:
# 검증 데이터에 대한 예측
val_x = df_val[features]
val_y = df_val["target"]
val_predictions = model.predict(val_x)

# F1 점수 계산
f1 = f1_score(val_y, val_predictions, pos_label=0)

# F1 점수 출력
print(f"F1 Score: {f1:.4f}")

F1 Score: 1.0000


## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [67]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [68]:
df_test_x = test_data[features]

for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except:
        continue

In [69]:
# 'OK' 컬럼 전처리
okok_columns = df_test_x.columns[df_test_x.isin(['OK']).any()]
df_test_x.loc[:, okok_columns] = df_test_x.loc[:, okok_columns].apply(lambda col: col.map(lambda x: 1 if x == 'OK' else 0))

# equipment_dam 전처리
df_test_x.loc[:, 'Equipment_Dam'] = df_test_x['Equipment_Dam'].apply(convert_value)

# medel.suffix 와 workorder 전처리
columns_to_encode = ['Model.Suffix_Dam', 'Workorder_Dam']
for col in columns_to_encode:
    df_test_x.loc[:, col] = label_encoder.fit_transform(df_test_x[col])

# 'object'형 컬럼 정수형으로 변환
object_columns = df_test_x.select_dtypes(include=['object']).columns
for col in object_columns:
    df_test_x.loc[:, col] = df_test_x[col].astype('category').cat.codes

df_test_x

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,1,0,240,1000.0,12.0,90,70,280,90,10,...,91.0,270.0,50,85,19.0,13.0,195,1,0,
1,1,0,509,1000.0,12.0,90,70,280,90,16,...,50.0,91.0,270,50,85.0,19.0,14,256,1,
2,0,0,128,240.0,2.0,-90,70,1030,-90,10,...,91.0,270.0,50,85,19.0,1.0,98,1,0,
3,1,0,306,1000.0,12.0,90,70,280,90,10,...,50.0,91.0,270,50,85.0,20.0,14,0,1,
4,0,0,415,240.0,2.0,-90,70,1030,-90,16,...,50.0,91.0,270,50,85.0,19.0,1,215,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,1,0,277,1000.0,12.0,90,70,280,90,10,...,50.0,91.0,270,50,85.0,19.0,14,131,1,
17357,1,0,439,1000.0,12.0,90,70,280,90,16,...,50.0,91.0,270,50,85.0,19.0,12,279,1,
17358,0,0,314,240.0,2.0,-90,70,1030,-90,16,...,50.0,91.0,270,50,85.0,20.0,4,66,1,
17359,0,0,8,240.0,2.0,-90,70,1030,-90,10,...,91.0,270.0,50,85,18.0,1.0,117,1,0,


In [70]:
test_pred = model.predict(df_test_x)
test_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### 제출 파일 작성


In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
