In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import tensorflow as tf

from imblearn.combine import SMOTEENN
from sklearn.decomposition import TruncatedSVD

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    precision_recall_curve,
    roc_curve
)

from tqdm import tqdm

2024-08-14 10:49:24.429572: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-14 10:49:24.471570: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [3]:
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load data
data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [4]:
# 전체가 NaN인 컬럼만 제거
df_train = data.dropna(axis=1, how='all')

In [5]:
# 문자열 데이터를 식별하고 라벨 인코딩 수행
label_encoders = {}
for column in df_train.columns:
    if df_train[column].dtype == object:
        le = LabelEncoder()
        df_train.loc[:, column] = le.fit_transform(df_train[column])
        label_encoders[column] = le

In [6]:
X = df_train.drop('target', axis=1).values  # 피처
y = df_train['target'].values  # 레이블

In [7]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
y = y.astype(int)

In [9]:
# SMOTE-ENN 적용
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_scaled, y)

In [10]:
# TruncatedSVD로 차원 축소
svd = TruncatedSVD(n_components=50, random_state=42)
X_res_reduced = svd.fit_transform(X_res)

In [11]:
# LSTM 모델에 입력할 수 있도록 데이터 재구성
X_res_reduced = X_res_reduced.reshape(X_res_reduced.shape[0], 1, X_res_reduced.shape[1])

In [14]:
# 훈련 및 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_res_reduced, y_res, test_size=0.2, random_state=42)

In [15]:
# LSTM 모델 구축
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

2024-08-14 10:53:24.725470: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-14 10:53:24.727399: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-14 10:53:24.729581: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [16]:
# 모델 컴파일
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
# 모델 훈련
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test), verbose=2)

Epoch 1/20


2024-08-14 10:53:37.872114: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-14 10:53:37.874619: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-14 10:53:37.877369: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

823/823 - 6s - loss: 0.6355 - accuracy: 0.6361 - val_loss: 0.6102 - val_accuracy: 0.6596 - 6s/epoch - 8ms/step
Epoch 2/20
823/823 - 3s - loss: 0.5934 - accuracy: 0.6780 - val_loss: 0.5746 - val_accuracy: 0.6870 - 3s/epoch - 4ms/step
Epoch 3/20
823/823 - 4s - loss: 0.5682 - accuracy: 0.6939 - val_loss: 0.5483 - val_accuracy: 0.7063 - 4s/epoch - 5ms/step
Epoch 4/20
823/823 - 4s - loss: 0.5501 - accuracy: 0.7081 - val_loss: 0.5318 - val_accuracy: 0.7209 - 4s/epoch - 5ms/step
Epoch 5/20
823/823 - 4s - loss: 0.5347 - accuracy: 0.7172 - val_loss: 0.5155 - val_accuracy: 0.7294 - 4s/epoch - 5ms/step
Epoch 6/20
823/823 - 4s - loss: 0.5226 - accuracy: 0.7240 - val_loss: 0.5042 - val_accuracy: 0.7353 - 4s/epoch - 4ms/step
Epoch 7/20
823/823 - 4s - loss: 0.5117 - accuracy: 0.7321 - val_loss: 0.4985 - val_accuracy: 0.7389 - 4s/epoch - 5ms/step
Epoch 8/20
823/823 - 4s - loss: 0.5020 - accuracy: 0.7371 - val_loss: 0.4796 - val_accuracy: 0.7477 - 4s/epoch - 5ms/step
Epoch 9/20
823/823 - 4s - loss: 0.4

In [18]:
y_pred = model.predict(X_test)
y_pred_labels = ['AbNormal' if y >= 0.5 else 'Normal' for y in y_pred]

2024-08-14 10:55:39.166081: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-08-14 10:55:39.168537: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-08-14 10:55:39.170200: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [19]:
y_pred

array([[0.87646484],
       [0.52885866],
       [0.14103173],
       ...,
       [0.00283291],
       [0.383916  ],
       [0.0186939 ]], dtype=float32)

In [23]:
from sklearn.metrics import roc_curve, auc

# ROC 곡선과 AUC 계산
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# 적절한 임계값 선택
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold based on ROC curve: {optimal_threshold}")

Optimal threshold based on ROC curve: 0.4123239815235138


In [24]:
threshold = 0.4
y_pred_class = (y_pred >= threshold).astype(int)

In [25]:
# 성능 평가
print(confusion_matrix(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

[[5780 1297]
 [1055 5022]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83      7077
           1       0.79      0.83      0.81      6077

    accuracy                           0.82     13154
   macro avg       0.82      0.82      0.82     13154
weighted avg       0.82      0.82      0.82     13154



In [26]:
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [27]:
# 훈련 데이터에 존재하는 컬럼만 남기기
common_columns = df_train.columns.intersection(test_data.columns)
test_data = test_data[common_columns]

In [28]:
# 동일한 전처리를 테스트 데이터에도 적용 (라벨 인코딩)
label_encoders = {}
for column in test_data.columns:
    if test_data[column].dtype == object:
        le = LabelEncoder()
        test_data.loc[:, column] = le.fit_transform(test_data[column])
        label_encoders[column] = le

In [34]:
X_test = test_data.drop(columns=['target'])
X_test = scaler.transform(X_test.values)
X_test = svd.fit_transform(X_test)

In [35]:
# LSTM 모델에 입력할 수 있도록 데이터 재구성
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [36]:
test_pred = model.predict(X_test)



In [37]:
test_pred_labels = ['AbNormal' if y >= threshold else 'Normal' for y in test_pred.flatten()]

test_pred_labels

['Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'AbNormal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'A

In [38]:
test_pred_series = pd.Series(test_pred_labels)
count_labels = test_pred_series.value_counts()

print(count_labels)

AbNormal    11460
Normal       5901
Name: count, dtype: int64


In [39]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred_labels

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)