# History

### Ver2
* VIF 분석을 통해 피처 선택을 수행하고 테스트합니다.
* VIF 분석 결과 제거할 피처가 없어 permutation_importance 함수를 사용 후 제거할 피쳐를 선택합니다.
* Perform feature selection through VIF analysis and test it.
* After conducting the VIF analysis and finding no features to remove, I will now use the permutation_importance function to determine which features to remove.

### Ver3
* RNF와 Type컬럼을 제거한 데이터에 대해서 모델을 학습하고 튜닝합니다.
* Train and fine-tune the model using the data excluding the 'RNF' and 'Type' columns.

### Ver4
* Machine failure에 대해 오버샘플링을 수행합니다.
* perform oversampling for machine failures.

In [1]:
# !pip install git+https://github.com/tooha289/DataAnalysisLibrary.git

# 사용 모듈 Import

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from DataAnalysis import eda
from DataAnalysis import feature_engineering
from imblearn.over_sampling import ADASYN

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold

from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.python.keras.utils import np_utils
from tensorflow.keras.optimizers import Adam

import optuna

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score

# 0. Check GPU

In [6]:
import tensorflow as tf

# GPU 디바이스 목록 가져오기
gpu_devices = tf.config.experimental.list_physical_devices('GPU')

# GPU 사용 가능한 경우
if gpu_devices:
    for device in gpu_devices:
        print("GPU 사용 가능:", device)
else:
    print("GPU 사용 불가능")

GPU 사용 불가능


# 1. Load & Check Data

In [7]:
df = pd.read_csv('../data/train.csv')
df

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136424,136424,M22284,M,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,136425,H38017,H,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,136426,L54690,L,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,136427,L53876,L,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


In [8]:
test_df = pd.read_csv('../data/test.csv')
test_df

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,136429,L50896,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,136430,L53866,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,136431,L50498,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,136432,M21232,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,136433,M19751,M,303.4,312.3,1515,41.3,114,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90949,227378,L51130,L,302.3,311.4,1484,40.4,15,0,0,0,0,0
90950,227379,L47783,L,297.9,309.8,1542,33.8,31,0,0,0,0,0
90951,227380,L48097,L,295.6,306.2,1501,41.4,187,0,0,0,0,0
90952,227381,L48969,L,298.1,307.8,1534,40.3,69,0,0,0,0,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-null  int64  
 9   TWF                      136429 non-null  int64  
 10  HDF                      136429 non-null  int64  
 11  PWF                      136429 non-null  int64  
 12  OSF                      136429 non-null  int64  
 13  RNF                      136429 non-null  int64  
dtypes: f

In [10]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,136429.0,,,,68214.0,39383.804275,0.0,34107.0,68214.0,102321.0,136428.0
Product ID,136429.0,9976.0,L53257,139.0,,,,,,,
Type,136429.0,3.0,L,95354.0,,,,,,,
Air temperature [K],136429.0,,,,299.862776,1.862247,295.3,298.3,300.0,301.2,304.4
Process temperature [K],136429.0,,,,309.94107,1.385173,305.8,308.7,310.0,310.9,313.8
Rotational speed [rpm],136429.0,,,,1520.33111,138.736632,1181.0,1432.0,1493.0,1580.0,2886.0
Torque [Nm],136429.0,,,,40.348643,8.502229,3.8,34.6,40.4,46.1,76.6
Tool wear [min],136429.0,,,,104.408901,63.96504,0.0,48.0,106.0,159.0,253.0
Machine failure,136429.0,,,,0.015744,0.124486,0.0,0.0,0.0,0.0,1.0
TWF,136429.0,,,,0.001554,0.039389,0.0,0.0,0.0,0.0,1.0


In [11]:
df.isnull().sum()

id                         0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

# 2. Separate columns

In [12]:
feature_cols = df.columns.difference(['id', 'Product ID', 'Machine failure'])
numeric_cols = df.columns.difference(['id', 'Product ID', 'Machine failure', 'Type'])
continuos_cols = feature_cols.difference(["TWF", "HDF", "PWF", "OSF", "RNF", "Type"])
discrete_cols = feature_cols.difference(continuos_cols)

# 3. FeatureEngineering

## 3.1 Preprocessing

In [13]:
dfp = feature_engineering.DataFramePreprocessor()
le = LabelEncoder()
stds = StandardScaler()

### 3.1.1 Train set

In [14]:
x_data = df.copy()
x_data = x_data.drop(labels=['id', 'Product ID'], axis=1)
x_data, _ = dfp.fit_transform_multiple_transformer(x_data, [le, stds], [["Type"], continuos_cols])
x_data = pd.concat([x_data, df[discrete_cols.difference(['Type'])]], axis=1)

x_data[discrete_cols] = x_data[discrete_cols].astype("int8")
x_data

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Tool wear [min],Torque [Nm],HDF,OSF,PWF,RNF,TWF
0,1,0.395880,-0.246230,0.545416,0.556417,-0.499711,0,0,0,0,0
1,2,1.469856,1.558605,1.720308,1.494433,-1.323028,0,0,0,0,0
2,1,-0.302204,-1.040358,2.051873,-1.241447,-1.628831,0,0,0,0,0
3,1,0.610675,0.692284,0.026445,1.447532,0.464745,0,0,0,0,0
4,2,-1.000288,-0.679391,0.869773,-1.100744,-0.582043,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
136424,2,0.127386,1.053251,0.069693,1.650769,-0.335048,0,0,0,0,0
136425,0,-1.268781,-1.040358,-0.528565,-1.601020,1.029305,0,0,0,0,0
136426,1,0.342182,1.342025,0.026445,1.713303,-0.217431,0,0,0,0,0
136427,1,0.986567,0.692284,-0.528565,-0.975676,0.699979,0,0,0,0,0


In [15]:
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Type                     136429 non-null  int8   
 1   Air temperature [K]      136429 non-null  float64
 2   Process temperature [K]  136429 non-null  float64
 3   Rotational speed [rpm]   136429 non-null  float64
 4   Tool wear [min]          136429 non-null  float64
 5   Torque [Nm]              136429 non-null  float64
 6   HDF                      136429 non-null  int8   
 7   OSF                      136429 non-null  int8   
 8   PWF                      136429 non-null  int8   
 9   RNF                      136429 non-null  int8   
 10  TWF                      136429 non-null  int8   
dtypes: float64(5), int8(6)
memory usage: 6.0 MB


In [16]:
y_data = df.copy().pop('Machine failure')
y_data = y_data.astype("int8")
y_data

0         0
1         0
2         0
3         0
4         0
         ..
136424    0
136425    0
136426    0
136427    0
136428    0
Name: Machine failure, Length: 136429, dtype: int8

### 3.1.2 Test set

In [17]:
x_test = test_df.copy()
x_test = x_test.drop(labels=['id', 'Product ID'], axis=1)
x_test, _ = dfp.fit_transform_multiple_transformer(x_test, [le, stds], [["Type"], continuos_cols])
x_test = pd.concat([x_test, test_df[discrete_cols.difference(['Type'])]], axis=1)

x_test[discrete_cols] = x_test[discrete_cols].astype("int8")
x_test

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Tool wear [min],Torque [Nm],HDF,OSF,PWF,RNF,TWF
0,1,1.313830,1.126570,-0.153806,-0.693494,-0.274579,0,0,0,0,0
1,1,0.990824,0.765635,1.375097,-1.366729,-1.356341,0,0,0,0,0
2,1,0.775487,0.332512,0.031949,-0.129855,-0.309854,0,0,0,0,0
3,2,0.129475,-0.244985,-0.296694,-1.554608,0.854217,0,0,0,0,0
4,2,1.906008,1.704068,-0.039496,0.151964,0.113445,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
90949,1,1.313830,1.054383,-0.260972,-1.398042,0.007620,0,0,0,0,0
90950,1,-1.054879,-0.100611,0.153403,-1.147536,-0.768427,0,0,0,0,0
90951,1,-2.293068,-2.699348,-0.139517,1.294897,0.125203,0,0,0,0,0
90952,1,-0.947211,-1.544354,0.096248,-0.552584,-0.004138,0,0,0,0,0


In [18]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90954 entries, 0 to 90953
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     90954 non-null  int8   
 1   Air temperature [K]      90954 non-null  float64
 2   Process temperature [K]  90954 non-null  float64
 3   Rotational speed [rpm]   90954 non-null  float64
 4   Tool wear [min]          90954 non-null  float64
 5   Torque [Nm]              90954 non-null  float64
 6   HDF                      90954 non-null  int8   
 7   OSF                      90954 non-null  int8   
 8   PWF                      90954 non-null  int8   
 9   RNF                      90954 non-null  int8   
 10  TWF                      90954 non-null  int8   
dtypes: float64(5), int8(6)
memory usage: 4.0 MB


## 3.2 Feature selecting

### 3.2.1 VIF analysis

In [19]:
fs = feature_engineering.FeatureSelector()

In [20]:
formula_cols = pd.Series(feature_cols).apply(lambda col: f"Q('{col}')")
formula_cols

0         Q('Air temperature [K]')
1                         Q('HDF')
2                         Q('OSF')
3                         Q('PWF')
4     Q('Process temperature [K]')
5                         Q('RNF')
6      Q('Rotational speed [rpm]')
7                         Q('TWF')
8             Q('Tool wear [min]')
9                 Q('Torque [Nm]')
10                       Q('Type')
dtype: object

In [21]:
formula = "Q('Machine failure')~" + "+".join(formula_cols)
formula+="-1"
formula

"Q('Machine failure')~Q('Air temperature [K]')+Q('HDF')+Q('OSF')+Q('PWF')+Q('Process temperature [K]')+Q('RNF')+Q('Rotational speed [rpm]')+Q('TWF')+Q('Tool wear [min]')+Q('Torque [Nm]')+Q('Type')-1"

In [22]:
data = pd.concat([x_data, y_data], axis=1)

In [23]:
fs.get_vif_dataframe(formula, data)

Unnamed: 0,features,VIF Factor
0,Q('Air temperature [K]'),3.806145
4,Q('Process temperature [K]'),3.77247
9,Q('Torque [Nm]'),2.629339
6,Q('Rotational speed [rpm]'),2.608535
1,Q('HDF'),1.038264
3,Q('PWF'),1.037175
2,Q('OSF'),1.030903
10,Q('Type'),1.011599
8,Q('Tool wear [min]'),1.006448
7,Q('TWF'),1.006107


### 3.2.2 Get permutation importance

In [24]:
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=0.25, stratify=y_data)

In [25]:
dtc = DecisionTreeClassifier()
dtc.fit(x_train.values, y_train.values)

In [26]:
fs.get_permutation_importance(dtc, x_valid, y_valid)

Unnamed: 0,feature,perm_importance
0,Type,7.3e-05
1,Air temperature [K],0.008221
2,Process temperature [K],0.008037
3,Rotational speed [rpm],0.002595
4,Tool wear [min],0.0007
5,Torque [Nm],0.002802
6,HDF,0.00794
7,OSF,0.006712
8,PWF,0.00429
9,RNF,0.0


* 위 결과를 바탕으로 우선 RNF값을 제거하고 성능을 평가합니다.
* 그 후 Type에 대해서도 제거를 수행해봅니다.

* 모든 데이터 사용시

In [27]:
# Accuracy: 0.9960052774316499, f1_score: 0.8593548387096774, Roc-Auc: 0.8878698655479385
# Accuracy: 0.9960052774316499, f1_score: 0.858257477243173, Roc-Auc: 0.8835533699108423
# Accuracy: 0.9965916587260867, f1_score: 0.8824273072060682, Roc-Auc: 0.9055905397260844
# Accuracy: 0.9957120867844316, f1_score: 0.845442536327609, Roc-Auc: 0.8719626985611462
# Accuracy: 0.9958218801539307, f1_score: 0.8523316062176166, Roc-Auc: 0.8831892340605477
# array([0.99602724, 0.85956275, 0.88643314])

In [28]:
x_data.drop('RNF', axis=1, inplace=True)
x_test.drop('RNF', axis=1, inplace=True)

* RNF 제거 시 스코어

In [29]:
# Accuracy: 0.9960052774316499, f1_score: 0.8597168597168596, Roc-Auc: 0.8890167495931356
# Accuracy: 0.9960419262625523, f1_score: 0.859375, Roc-Auc: 0.8835719877243662
# Accuracy: 0.9966649563878912, f1_score: 0.883780332056194, Roc-Auc: 0.9021952567006809
# Accuracy: 0.9956021402917247, f1_score: 0.841688654353562, Roc-Auc: 0.8707626722364238
# Accuracy: 0.9958218801539307, f1_score: 0.8523316062176166, Roc-Auc: 0.8831892340605477
# array([0.99602724, 0.85937849, 0.88574718])

In [30]:
x_data.drop('Type', axis=1, inplace=True)
x_test.drop('Type', axis=1, inplace=True)

* 추가로 Type 제거 시 스코어

In [31]:
# Accuracy: 0.9960419262625523, f1_score: 0.8608247422680411, Roc-Auc: 0.8890353667134395
# Accuracy: 0.9960785750934545, f1_score: 0.8601307189542484, Roc-Auc: 0.8824464326537399
# Accuracy: 0.9966649563878912, f1_score: 0.8834827144686299, Roc-Auc: 0.9010510838165307
# Accuracy: 0.9955288426299201, f1_score: 0.8390501319261213, Roc-Auc: 0.8695812637252253
# Accuracy: 0.995968480850284, f1_score: 0.8567708333333334, Roc-Auc: 0.8832637053146436
# array([0.99605656, 0.86005183, 0.88507557])

In [84]:
# X_train과 y_train은 훈련 데이터의 특성과 레이블을 나타내는 배열이라고 가정합니다.
# X_train: 훈련 데이터의 특성
# y_train: 훈련 데이터의 레이블 (0 또는 1)

# ADASYN 객체 생성
oversampler = ADASYN(sampling_strategy=0.1)  # 적은 클래스를 90%로 맞출 수 있도록 설정

# 오버샘플링 적용
x_train_resampled, y_train_resampled = oversampler.fit_resample(x_data, y_data)

In [85]:
y_data.value_counts()

Machine failure
0    134281
1      2148
Name: count, dtype: int64

In [86]:
y_train_resampled.value_counts()

Machine failure
0    134281
1     13276
Name: count, dtype: int64

In [87]:
x_train_resampled.shape, y_train_resampled.shape

((147557, 9), (147557,))

# 4. Modeling

In [88]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train_resampled, y_train_resampled, test_size=0.25, stratify=y_train_resampled)

## 4.1 LogisticRegression

In [89]:
lr = LogisticRegression()
lr.fit(x_train.values, y_train.values)

In [90]:
acc_score = lr.score(x_train.values, y_train)
acc_score

0.9340905599681929

In [91]:
f1score = f1_score(y_train, lr.predict(x_train.values))
f1score

0.4308676654182272

In [92]:
acc_score = lr.score(x_valid.values, y_valid)
acc_score

0.9343182434264028

In [93]:
f1score = f1_score(y_valid, lr.predict(x_valid.values))
f1score

0.43532975996271267

## 4.2 XGBoost

In [98]:
# Define the objective function for Optuna
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'tree_method': 'gpu_hist',  # GPU 가속 사용
        'gpu_id': 0,  # 사용할 GPU 장치 ID 설정 (0 또는 다른 GPU ID)
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    valid_score = []

    for train_index, test_index in skf.split(x_train_resampled, y_train_resampled):
        x_train, x_valid = x_train_resampled.iloc[train_index], x_train_resampled.iloc[test_index]
        y_train, y_valid = y_train_resampled.iloc[train_index], y_train_resampled.iloc[test_index]

        xgb = XGBClassifier(**params)
        # Early stopping 설정
        xgb.fit(x_train.values, y_train.values, verbose=0, eval_set=[(x_valid.values, y_valid.values)],
                early_stopping_rounds=10)

        acc_score = xgb.score(x_valid, y_valid)
        predict_val = xgb.predict(x_valid)
        f1score = f1_score(y_valid, predict_val)
        roc_auc = roc_auc_score(y_valid, predict_val)
        valid_score.append((acc_score, f1score, roc_auc))

    # Calculate and report the mean F1-score for this trial
    mean_f1_score = sum([score[1] for score in valid_score]) / len(valid_score)
    trial.report(mean_f1_score, step=trial.number)

    return mean_f1_score

In [99]:
# Create an Optuna study and optimize
study = optuna.create_study(direction='maximize')  # Maximize the F1-score
study.optimize(objective, n_trials=50)  # Number of trials can be adjusted

# Get the best parameters
best_params = study.best_params
print(f"Best Parameters: {best_params}")

# Get the best F1-score achieved during the optimization
best_f1_score = study.best_value
print(f"Best F1-Score: {best_f1_score}")

[I 2023-09-22 15:02:59,247] A new study created in memory with name: no-name-97cc9584-0d48-44a1-8727-6487605efe98
[I 2023-09-22 15:03:05,450] Trial 0 finished with value: 0.5948066071306004 and parameters: {'n_estimators': 343, 'max_depth': 3, 'learning_rate': 0.04238040764012874, 'subsample': 0.6719045403896953, 'colsample_bytree': 0.8377409311830815}. Best is trial 0 with value: 0.5948066071306004.
[I 2023-09-22 15:03:49,921] Trial 1 finished with value: 0.020265764009305975 and parameters: {'n_estimators': 362, 'max_depth': 19, 'learning_rate': 0.001467481966499766, 'subsample': 0.7277640009054748, 'colsample_bytree': 0.8246449197040645}. Best is trial 0 with value: 0.5948066071306004.
[I 2023-09-22 15:04:03,112] Trial 2 finished with value: 0.848568429128527 and parameters: {'n_estimators': 731, 'max_depth': 5, 'learning_rate': 0.04280357197124076, 'subsample': 0.6884699916650306, 'colsample_bytree': 0.5692046147449066}. Best is trial 2 with value: 0.848568429128527.
[I 2023-09-22 

[I 2023-09-22 15:28:23,641] Trial 28 finished with value: 0.9399967905058428 and parameters: {'n_estimators': 927, 'max_depth': 11, 'learning_rate': 0.03235463876101333, 'subsample': 0.7175121997442296, 'colsample_bytree': 0.6600861652484128}. Best is trial 22 with value: 0.9508210475094018.
[I 2023-09-22 15:29:25,458] Trial 29 finished with value: 0.9475538636265621 and parameters: {'n_estimators': 752, 'max_depth': 13, 'learning_rate': 0.047924527707289305, 'subsample': 0.6983298604622747, 'colsample_bytree': 0.6091072288053421}. Best is trial 22 with value: 0.9508210475094018.
[I 2023-09-22 15:30:16,415] Trial 30 finished with value: 0.9500321255686973 and parameters: {'n_estimators': 447, 'max_depth': 17, 'learning_rate': 0.07593791490033783, 'subsample': 0.7603875354206797, 'colsample_bytree': 0.6775637072668355}. Best is trial 22 with value: 0.9508210475094018.
[I 2023-09-22 15:31:16,471] Trial 31 finished with value: 0.9509052582692149 and parameters: {'n_estimators': 945, 'max_

Best Parameters: {'n_estimators': 945, 'max_depth': 10, 'learning_rate': 0.0702163447946074, 'subsample': 0.7809047205154288, 'colsample_bytree': 0.702138691597362}
Best F1-Score: 0.9509052582692149


### 4.2.1 Create submission.csv

In [106]:
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=0.25, stratify=y_data)

In [107]:
xgb = XGBClassifier(**study.best_params)
xgb.fit(x_train.values, y_train.values, verbose=2)

In [108]:
prediction = xgb.predict_proba(x_test)[:,1]
len(prediction)

90954

In [109]:
result = test_df.copy()
result = result.drop(result.columns.difference(['id']), axis=1)
result['Machine failure'] = prediction
result

Unnamed: 0,id,Machine failure
0,136429,0.000008
1,136430,0.000180
2,136431,0.000002
3,136432,0.000014
4,136433,0.000029
...,...,...
90949,227378,0.000004
90950,227379,0.000061
90951,227380,0.000021
90952,227381,0.000003


In [110]:
result.to_csv('../data/submission.csv', index = False)

## 4.3 TensorFlow

In [40]:
# EarlyStopping 콜백 정의
early_stopping = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.001, restore_best_weights=True)

In [60]:
def objective(trial):
    # 하이퍼파라미터 추천
    num_hidden_layers = trial.suggest_int('num_hidden_layers', 1, 3)
    num_units = trial.suggest_int('num_units', 32, 256, log=True)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
    
    f1_scores = []  # 각 폴드의 F1 스코어를 저장할 리스트

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for train_index, test_index in skf.split(x_train_resampled, y_train_resampled):
        x_train, x_valid = x_train_resampled.iloc[train_index], x_train_resampled.iloc[test_index]
        y_train, y_valid = y_train_resampled.iloc[train_index], y_train_resampled.iloc[test_index]

        # 모델 구성
        model = Sequential()
        model.add(Dense(num_units, activation='relu', input_shape=(x_data.shape[1],)))
        for i in range(1, num_hidden_layers):
            model.add(Dense(num_units, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        # 모델 컴파일
        model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['binary_accuracy'])

        # 모델 훈련 및 평가
        history = model.fit(x_train, y_train, epochs=100, validation_data=(x_valid, y_valid), callbacks=[early_stopping], batch_size=len(x_train), verbose=0)

        # 검증 데이터에 대한 예측을 생성
        y_pred = model.predict(x_valid)
        y_pred = (y_pred > 0.5).astype(int)

        # F1 스코어 계산 및 저장
        val_f1_score = f1_score(y_valid, y_pred)
        f1_scores.append(val_f1_score)

    # 각 폴드의 F1 스코어 평균을 반환
    mean_f1_score = sum(f1_scores) / len(f1_scores)
    
    return mean_f1_score

In [None]:
study = optuna.create_study(direction='maximize')  # 최대화 문제로 설정
study.optimize(objective, n_trials=100)  # n_trials은 시도할 하이퍼파라미터 조합의 수

best_params = study.best_params
best_accuracy = study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy:", best_accuracy)

[I 2023-09-22 14:14:07,395] A new study created in memory with name: no-name-509102ac-b3cd-45a1-ae87-2e8468688903
[I 2023-09-22 14:14:28,029] Trial 0 finished with value: 0.028292493018307552 and parameters: {'num_hidden_layers': 1, 'num_units': 32, 'learning_rate': 1.8915553718164383e-05}. Best is trial 0 with value: 0.028292493018307552.
[I 2023-09-22 14:14:41,671] Trial 1 finished with value: 0.8618533868236232 and parameters: {'num_hidden_layers': 1, 'num_units': 130, 'learning_rate': 0.09262254274173692}. Best is trial 1 with value: 0.8618533868236232.
[I 2023-09-22 14:15:03,931] Trial 2 finished with value: 0.0 and parameters: {'num_hidden_layers': 3, 'num_units': 33, 'learning_rate': 0.0018050419395315598}. Best is trial 1 with value: 0.8618533868236232.
[I 2023-09-22 14:15:22,744] Trial 3 finished with value: 0.862770544775411 and parameters: {'num_hidden_layers': 2, 'num_units': 81, 'learning_rate': 0.027608821975327918}. Best is trial 3 with value: 0.862770544775411.
[I 2023-

[I 2023-09-22 14:26:24,461] Trial 37 finished with value: 0.721460062750244 and parameters: {'num_hidden_layers': 2, 'num_units': 52, 'learning_rate': 0.017158682763575385}. Best is trial 12 with value: 0.8628645456387151.
[I 2023-09-22 14:26:41,872] Trial 38 finished with value: 0.8618678257254999 and parameters: {'num_hidden_layers': 1, 'num_units': 47, 'learning_rate': 0.0977424639247716}. Best is trial 12 with value: 0.8628645456387151.
[I 2023-09-22 14:27:01,951] Trial 39 finished with value: 0.8613517803035607 and parameters: {'num_hidden_layers': 2, 'num_units': 77, 'learning_rate': 0.029072636064511676}. Best is trial 12 with value: 0.8628645456387151.
[I 2023-09-22 14:27:25,110] Trial 40 finished with value: 0.8545375185197773 and parameters: {'num_hidden_layers': 2, 'num_units': 95, 'learning_rate': 0.011542190235913647}. Best is trial 12 with value: 0.8628645456387151.
[I 2023-09-22 14:27:39,129] Trial 41 finished with value: 0.8621536433939028 and parameters: {'num_hidden_l

### 4.3.1 Create submission.csv

In [None]:
predicted_probabilities = model.predict(x_test)
len(prediction)

In [None]:
result = test_df.copy()
result = result.drop(result.columns.difference(['id']), axis=1)
result['Machine failure'] = predicted_probabilities
result

In [None]:
result.to_csv('../data/submission.csv', index = False)