In [60]:
import os
from pprint import pprint

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from imblearn.under_sampling import RandomUnderSampler

from tqdm import tqdm

In [49]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
df_train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
df_train

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [50]:
# 타겟 변수 인코딩
df_train['target_encoded'] = df_train['target'].apply(lambda x: 1 if x == 'AbNormal' else 0)

In [51]:
# 모든 피처 사용
all_features = df_train.drop(columns=['target', 'target_encoded']).columns

In [52]:
# X와 y 정의
X = df_train[all_features]
y = df_train['target_encoded']

In [53]:
# 범주형 및 수치형 피처 분리
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(exclude=['object']).columns

In [54]:
# 파이프라인 구성 (수치형: 표준화, 범주형: One-Hot 인코딩)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # 결측값 처리
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # 결측값 처리
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [55]:
# ColumnTransformer를 통해 수치형 및 범주형 피처를 변환
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
# # 데이터를 전처리 (One-Hot 인코딩 및 표준화 적용)
# X_processed = preprocessor.fit_transform(X)

In [None]:
# # TruncatedSVD 적용 (다양한 n_components 값에 대해 분산 비율 계산)
# explained_variance_ratios = []
# components_range = range(1, min(300, X_processed.shape[1] + 1), 10)  # 10단위로 최대 300까지 테스트

# for n in components_range:
#     svd = TruncatedSVD(n_components=n, random_state=42)
#     svd.fit(X_processed)
#     explained_variance_ratios.append(svd.explained_variance_ratio_.sum())

In [None]:
# # 결과 시각화
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))
# plt.plot(components_range, explained_variance_ratios, marker='o')
# plt.title('Explained Variance Ratio by Number of Components')
# plt.xlabel('Number of Components')
# plt.ylabel('Explained Variance Ratio')
# plt.grid(True)
# plt.show()

In [56]:
# TruncatedSVD 구성
svd = TruncatedSVD(n_components=50)  # 사용할 주성분 개수 설정 (조정 가능)

In [57]:
# SVM 모델용 파이프라인 구성
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # 전처리 (표준화 및 One-Hot 인코딩)
    ('svd', TruncatedSVD(n_components=50, random_state=42)),  # 차원 축소
    ('classifier', SVC(kernel='rbf', class_weight='balanced', random_state=42))  # SVM 모델
])

In [58]:
# 데이터 분할 (학습용: 80%, 테스트용: 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [61]:
# 언더샘플링을 통해 다수 클래스의 수를 줄임
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [62]:
# StratifiedKFold 설정 (과적합 방지를 위한 폴드 설정)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [63]:
# GridSearchCV 설정
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],  # 파이프라인 내 SVM 모델의 하이퍼파라미터 설정
    'classifier__gamma': [1, 0.1, 0.01, 0.001],
}

In [68]:
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=skf, scoring='f1', n_jobs=-1, verbose=3)
grid_search.fit(X_resampled, y_resampled)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [69]:
# 최적의 하이퍼파라미터 출력
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'classifier__C': 100, 'classifier__gamma': 0.001}


In [70]:
# 최적의 모델로 예측 수행
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test)

In [71]:
# 성능 평가
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.64      0.77     11447
           1       0.08      0.54      0.15       705

    accuracy                           0.63     12152
   macro avg       0.52      0.59      0.46     12152
weighted avg       0.91      0.63      0.73     12152

[[7322 4125]
 [ 322  383]]


In [72]:
df_test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [73]:
# 모든 피처 사용
X_test = df_test[all_features]

In [74]:
test_pred = best_svm_model.predict(X_test)

In [75]:
test_pred_labels = ['AbNormal' if y == 1 else 'Normal' for y in test_pred]

test_pred_labels

['AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 

In [76]:
test_pred_series = pd.Series(test_pred_labels)
count_labels = test_pred_series.value_counts()

print(count_labels)

Normal      10844
AbNormal     6517
Name: count, dtype: int64


In [77]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred_labels

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

[CV 4/5] END classifier__C=0.1, classifier__gamma=1;, score=0.115 total time=   1.7s
[CV 5/5] END classifier__C=0.1, classifier__gamma=1;, score=0.127 total time=   1.7s
[CV 2/5] END classifier__C=0.1, classifier__gamma=0.01;, score=0.147 total time=   1.6s
[CV 1/5] END classifier__C=0.1, classifier__gamma=0.001;, score=0.112 total time=   1.6s
[CV 5/5] END classifier__C=0.1, classifier__gamma=0.001;, score=0.138 total time=   1.6s
[CV 4/5] END classifier__C=1, classifier__gamma=1;, score=0.167 total time=   1.6s
[CV 3/5] END classifier__C=1, classifier__gamma=0.1;, score=0.094 total time=   1.2s
[CV 1/5] END classifier__C=1, classifier__gamma=0.01;, score=0.108 total time=   1.5s
[CV 5/5] END classifier__C=1, classifier__gamma=0.01;, score=0.120 total time=   1.5s
[CV 4/5] END classifier__C=1, classifier__gamma=0.001;, score=0.110 total time=   1.6s
[CV 3/5] END classifier__C=10, classifier__gamma=1;, score=0.113 total time=   1.6s
[CV 2/5] END classifier__C=10, classifier__gamma=0.1;