# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification

In [2]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


### View summary of dataframe


In [3]:
train_data.info()

# 'target' 특성의 클래스별 개수 계산
class_counts = train_data['target'].value_counts()

# 클래스별 개수 출력
print(class_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 464 entries, Wip Line_Dam to target
dtypes: float64(350), int64(77), object(37)
memory usage: 143.4+ MB
target
Normal      38156
AbNormal     2350
Name: count, dtype: int64


### view unique values in dataset - feature의 고유값

In [4]:
train_data.nunique()

# 유일한 값이 1인 열 제거
df_cleaned = train_data.loc[:, train_data.nunique() > 1]

print("제거된 데이터프레임:")
df_cleaned

제거된 데이터프레임:


Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam dispenser #1,AJX75334505,4F1XA938-1,240.0,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,Dam dispenser #1,AJX75334505,3KPM0016-2,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.600,7.0,185,1,0,Normal
2,Dam dispenser #2,AJX75334501,4E1X9167-1,1000.0,12.5,90,85,280,90,16,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,Dam dispenser #2,AJX75334501,3K1X0057-1,1000.0,12.5,90,70,280,90,10,...,91.8,270.0,50,85,19.900,12.0,268,1,0,Normal
4,Dam dispenser #1,AJX75334501,3HPM0007-1,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.700,8.0,121,1,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,Dam dispenser #1,AJX75334501,3J1XF434-2,240.0,2.5,-90,70,1030,-90,10,...,91.8,270.0,50,85,19.200,1.0,318,1,0,Normal
40502,Dam dispenser #2,AJX75334501,4E1XC796-1,1000.0,12.5,90,100,280,90,16,...,50.0,91.8,270,50,114.612,20.5,14,197,1,Normal
40503,Dam dispenser #1,AJX75334501,4C1XD438-1,240.0,2.5,-90,100,1030,-90,16,...,50.0,91.8,270,50,85.000,19.7,1,27,1,Normal
40504,Dam dispenser #2,AJX75334501,3I1XA258-1,1000.0,12.5,90,70,280,90,10,...,91.8,270.0,50,85,20.100,13.0,117,1,0,Normal


In [5]:
# 타겟 변수 인코딩
df_cleaned['target_encoded'] = df_cleaned['target'].apply(lambda x: 1 if x == 'AbNormal' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['target_encoded'] = df_cleaned['target'].apply(lambda x: 1 if x == 'AbNormal' else 0)


In [6]:
# 모든 피처 사용
all_features = df_cleaned.drop(columns=['target', 'target_encoded']).columns

In [7]:
# 데이터프레임에서 특성과 레이블 분리
X = df_cleaned[all_features]
y = df_cleaned['target_encoded']

In [8]:
# 데이터 분할 (훈련 데이터와 테스트 데이터)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# 범주형 및 수치형 피처 분리
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(exclude=['object']).columns

In [10]:
# 파이프라인 구성 (수치형: 표준화, 범주형: One-Hot 인코딩)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # 결측값 처리
    ('scaler', StandardScaler()),    # 표준화
    ('normalizer', MinMaxScaler())  # 정규화
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # 결측값 처리
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [11]:
# ColumnTransformer를 통해 수치형 및 범주형 피처를 변환
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [12]:
# 전처리 데이터 얻기
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

### Trucated SVD

In [13]:
# TruncatedSVD 적용
svd = TruncatedSVD(n_components=60)
X_train_reduced = svd.fit_transform(X_train_processed)
X_test_reduced = svd.transform(X_test_processed)

## 3. 모델 학습


### 모델 정의


In [14]:
# CatBoostClassifier 정의 및 학습
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    class_weights=[1, 8]
)

In [None]:
from sklearn.model_selection import GridSearchCV

# 하이퍼파라미터의 가능한 조합을 정의
param_grid = {
    'depth': [6, 8, 10],            # 트리의 깊이
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'iterations': [100, 200, 300]   # 트리의 개수 (부스팅 단계의 수)
}

# CatBoost 모델 생성 (random_state와 class_weights 포함)
model_dkdk = CatBoostClassifier(
    random_state=42,
    class_weights=[1, 8]  # 클래스 불균형을 다루기 위한 가중치
)

# GridSearchCV를 사용하여 하이퍼파라미터 조합을 테스트합니다.
grid_search = GridSearchCV(
    estimator=model_dkdk,           # 최적의 하이퍼파라미터를 찾기 위한 모델
    param_grid=param_grid,     # 테스트할 하이퍼파라미터 조합
    scoring='f1',              # 평가 지표 (여기서는 F1 점수)
    cv=3)

# 학습 데이터를 사용하여 하이퍼파라미터 튜닝을 수행합니다.
grid_search.fit(X_train_reduced, y_train)

# 최적의 하이퍼파라미터 조합
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# 최적의 모델을 선택합니다.
best_model = grid_search.best_estimator_

# GridSearchCV 결과를 통해 모든 조합과 그에 대한 성능을 출력할 수도 있습니다.
results_df = pd.DataFrame(grid_search.cv_results_)

# 모든 하이퍼파라미터 조합과 성능을 출력
print(results_df[['params', 'mean_test_score', 'std_test_score']])

0:	learn: 0.6912475	total: 78.6ms	remaining: 7.78s
1:	learn: 0.6894933	total: 105ms	remaining: 5.13s
2:	learn: 0.6879176	total: 124ms	remaining: 4s
3:	learn: 0.6863526	total: 139ms	remaining: 3.34s
4:	learn: 0.6847903	total: 154ms	remaining: 2.93s
5:	learn: 0.6833153	total: 168ms	remaining: 2.63s
6:	learn: 0.6818008	total: 181ms	remaining: 2.4s
7:	learn: 0.6804773	total: 193ms	remaining: 2.22s
8:	learn: 0.6790254	total: 207ms	remaining: 2.09s
9:	learn: 0.6776756	total: 220ms	remaining: 1.98s
10:	learn: 0.6763480	total: 233ms	remaining: 1.89s
11:	learn: 0.6749352	total: 247ms	remaining: 1.81s
12:	learn: 0.6736975	total: 261ms	remaining: 1.75s
13:	learn: 0.6722682	total: 274ms	remaining: 1.68s
14:	learn: 0.6709801	total: 288ms	remaining: 1.63s
15:	learn: 0.6695936	total: 301ms	remaining: 1.58s
16:	learn: 0.6681735	total: 314ms	remaining: 1.53s
17:	learn: 0.6668653	total: 327ms	remaining: 1.49s
18:	learn: 0.6656521	total: 341ms	remaining: 1.45s
19:	learn: 0.6644032	total: 354ms	remaining:

### 모델 학습


In [15]:
# 파이프라인 학습
model.fit(X_train_reduced, y_train)
#best_model.fit(X_train_reduced, y_train)

0:	learn: 0.6775504	total: 64.5ms	remaining: 6.38s
1:	learn: 0.6636836	total: 81.5ms	remaining: 3.99s
2:	learn: 0.6535742	total: 97.2ms	remaining: 3.14s
3:	learn: 0.6448364	total: 111ms	remaining: 2.67s
4:	learn: 0.6376682	total: 126ms	remaining: 2.4s
5:	learn: 0.6313336	total: 142ms	remaining: 2.22s
6:	learn: 0.6259529	total: 158ms	remaining: 2.09s
7:	learn: 0.6207825	total: 174ms	remaining: 2s
8:	learn: 0.6157171	total: 190ms	remaining: 1.92s
9:	learn: 0.6121513	total: 205ms	remaining: 1.85s
10:	learn: 0.6096178	total: 220ms	remaining: 1.78s
11:	learn: 0.6066554	total: 236ms	remaining: 1.73s
12:	learn: 0.6048265	total: 253ms	remaining: 1.69s
13:	learn: 0.6021535	total: 270ms	remaining: 1.66s
14:	learn: 0.6000857	total: 287ms	remaining: 1.63s
15:	learn: 0.5976468	total: 303ms	remaining: 1.59s
16:	learn: 0.5956602	total: 320ms	remaining: 1.56s
17:	learn: 0.5941574	total: 335ms	remaining: 1.53s
18:	learn: 0.5924301	total: 351ms	remaining: 1.5s
19:	learn: 0.5912446	total: 366ms	remaining

<catboost.core.CatBoostClassifier at 0x7f1095724fd0>

In [16]:
y_pred =model.predict(X_test_reduced)
#y_pred1 =best_model.predict(X_test_reduced)

In [17]:
# 성능 평가
print(f"Original number of features: {X.shape[1]}")
print(f"Reduced number of features after TruncatedSVD: {svd.n_components}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Original number of features: 145
Reduced number of features after TruncatedSVD: 60
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      7644
           1       0.17      0.18      0.17       458

    accuracy                           0.90      8102
   macro avg       0.56      0.56      0.56      8102
weighted avg       0.91      0.90      0.90      8102

[[7232  412]
 [ 375   83]]


## 4. 제출하기


### 테스트 데이터 예측


테스트 데이터 불러오기


In [18]:
df_test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
df_test

Unnamed: 0,Set ID,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,0001be084fbc4aaa9d921f39e595961b,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3J1XF767-1,1,OK,1000.0,,...,195,,,1,,,0,,,
1,0005bbd180064abd99e63f9ed3e1ac80,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4B1XD472-2,1,OK,1000.0,,...,14,,,256,,,1,,,
2,000948934c4140d883d670adcb609584,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3H1XE355-1,1,OK,240.0,,...,98,,,1,,,0,,,
3,000a6bfd02874c6296dc7b2e9c5678a7,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3L1XA128-1,1,OK,1000.0,,...,14,,,0,,,1,,,
4,0018e78ce91343678716e2ea27a51c95,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4A1XA639-1,1,OK,240.0,,...,1,,,215,,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1XB597-1,1,OK,1000.0,,...,14,,,131,,,1,,,
17357,ffed8923c8a448a98afc641b770be153,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4A1XB974-1,1,OK,1000.0,,...,12,,,279,,,1,,,
17358,fff1e73734da40adbe805359b3efb462,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3L1XA998-1,1,OK,240.0,,...,4,,,66,,,1,,,
17359,fff8e38bdd09470baf95f71e92075dec,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3F1XC376-1,1,OK,240.0,,...,117,,,1,,,0,,,


In [21]:
df_test.nunique()

# 유일한 값이 1인 열 제거
df_test_cleaned = df_test.loc[:, df_test.nunique() > 1]

print("제거된 데이터프레임:")
df_test_cleaned

제거된 데이터프레임:


Unnamed: 0,Set ID,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,...,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2
0,0001be084fbc4aaa9d921f39e595961b,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,...,50,91.8,270.0,50,85,19.8,13.0,195,1,0
1,0005bbd180064abd99e63f9ed3e1ac80,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.8,14,256,1
2,000948934c4140d883d670adcb609584,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,...,50,91.8,270.0,50,85,19.7,1.0,98,1,0
3,000a6bfd02874c6296dc7b2e9c5678a7,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,20.0,14,0,1
4,0018e78ce91343678716e2ea27a51c95,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,...,119,50.0,91.8,270,50,85.0,19.8,1,215,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,Dam dispenser #2,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.5,14,131,1
17357,ffed8923c8a448a98afc641b770be153,Dam dispenser #2,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,...,119,50.0,91.8,270,50,85.0,19.8,12,279,1
17358,fff1e73734da40adbe805359b3efb462,Dam dispenser #1,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,...,119,50.0,91.8,270,50,85.0,20.5,4,66,1
17359,fff8e38bdd09470baf95f71e92075dec,Dam dispenser #1,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,...,50,91.8,270.0,50,85,18.9,1.0,117,1,0


In [22]:
# 모든 피처 사용
X_test = df_test_cleaned[all_features]

# 데이터 전처리
X_new_processed = preprocessor.transform(X_test)

X_new_reduced = svd.transform(X_new_processed)

In [23]:
test_pred = model.predict(X_new_reduced)

test_pred_labels = ['AbNormal' if y == 1 else 'Normal' for y in test_pred]

test_pred_labels

['Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',

In [24]:
test_pred_series = pd.Series(test_pred_labels)
count_labels = test_pred_series.value_counts()

print(count_labels)

Normal      16287
AbNormal     1074
Name: count, dtype: int64


### 제출 파일 작성


In [25]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred_labels

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**
