In [1]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from pprint import pprint

from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
ROOT_DIR = "data"
RANDOM_STATE = 42

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
train_data

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,1,OK,240.0,,,...,318,,,1,,,0,,,Normal
40502,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1,OK,1000.0,,,...,14,,,197,,,1,,,Normal
40503,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,1,OK,240.0,,,...,1,,,27,,,1,,,Normal
40504,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1,OK,1000.0,,,...,117,,,1,,,0,,,Normal


In [4]:
# 모든 값이 동일할 때 컬럼을 제거
train_data = train_data.loc[:, (train_data != train_data.iloc[0]).any()]
test_data = test_data.loc[:, (test_data != test_data.iloc[0]).any()]

In [5]:
# 모든 값이 NAN인 열 제거
train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')

In [6]:
# 데이터프레임에서 문자열 데이터가 있는 열에 대해 Label Encoding 적용
label_encoders = {}

for column in train_data.columns:
    if train_data[column].dtype == 'object':  # 문자열 데이터를 확인
        le = LabelEncoder()
        train_data[column] = le.fit_transform(train_data[column].astype(str))  # Label Encoding 적용
        label_encoders[column] = le  # 나중에 디코딩할 수 있도록 레이블 인코더 저장

for column in test_data.columns:
    if test_data[column].dtype == 'object':  # 문자열 데이터를 확인
        le = LabelEncoder()
        test_data[column] = le.fit_transform(test_data[column].astype(str))  # Label Encoding 적용
        label_encoders[column] = le  # 나중에 디코딩할 수 있도록 레이블 인코더 저장

In [7]:
# correlation이 1인 컬럼들을 하나만 남기고 제거
cor = train_data.corr()
for i in range(len(cor)):
    for j in range(i+1, len(cor)):
        if cor.iloc[i, j] == 1:
            if cor.columns[i] in train_data.columns:
                train_data = train_data.drop(columns=cor.columns[i])
                test_data = test_data.drop(columns=cor.columns[i])
                break


In [8]:
train_data

Unnamed: 0,Equipment_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,0,240.0,2.5,100,1030,-90,16,14.9,8.4,14.7,...,50.0,91.8,270,50,114.612,19.9,7,127,1,1
1,0,240.0,2.5,70,1030,-90,10,21.3,4.9,21.3,...,91.8,270.0,50,85,19.600,7.0,185,1,0,1
2,1,1000.0,12.5,85,280,90,16,14.7,8.5,14.7,...,50.0,91.8,270,50,114.612,19.8,10,73,1,1
3,1,1000.0,12.5,70,280,90,10,21.3,8.4,21.3,...,91.8,270.0,50,85,19.900,12.0,268,1,0,1
4,0,240.0,2.5,70,1030,-90,10,9.7,4.9,9.6,...,91.8,270.0,50,85,19.700,8.0,121,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0,240.0,2.5,70,1030,-90,10,17.0,5.0,17.0,...,91.8,270.0,50,85,19.200,1.0,318,1,0,1
40502,1,1000.0,12.5,100,280,90,16,14.9,8.5,14.7,...,50.0,91.8,270,50,114.612,20.5,14,197,1,1
40503,0,240.0,2.5,100,1030,-90,16,14.2,8.2,14.3,...,50.0,91.8,270,50,85.000,19.7,1,27,1,1
40504,1,1000.0,12.5,70,280,90,10,9.7,4.9,9.7,...,91.8,270.0,50,85,20.100,13.0,117,1,0,1


In [9]:
train_data # normal:1, abnormal:0

Unnamed: 0,Equipment_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,0,240.0,2.5,100,1030,-90,16,14.9,8.4,14.7,...,50.0,91.8,270,50,114.612,19.9,7,127,1,1
1,0,240.0,2.5,70,1030,-90,10,21.3,4.9,21.3,...,91.8,270.0,50,85,19.600,7.0,185,1,0,1
2,1,1000.0,12.5,85,280,90,16,14.7,8.5,14.7,...,50.0,91.8,270,50,114.612,19.8,10,73,1,1
3,1,1000.0,12.5,70,280,90,10,21.3,8.4,21.3,...,91.8,270.0,50,85,19.900,12.0,268,1,0,1
4,0,240.0,2.5,70,1030,-90,10,9.7,4.9,9.6,...,91.8,270.0,50,85,19.700,8.0,121,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,0,240.0,2.5,70,1030,-90,10,17.0,5.0,17.0,...,91.8,270.0,50,85,19.200,1.0,318,1,0,1
40502,1,1000.0,12.5,100,280,90,16,14.9,8.5,14.7,...,50.0,91.8,270,50,114.612,20.5,14,197,1,1
40503,0,240.0,2.5,100,1030,-90,16,14.2,8.2,14.3,...,50.0,91.8,270,50,85.000,19.7,1,27,1,1
40504,1,1000.0,12.5,70,280,90,10,9.7,4.9,9.7,...,91.8,270.0,50,85,20.100,13.0,117,1,0,1


In [10]:
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.metrics import classification_report

X = train_data.drop("target", axis = 1)
y = train_data["target"]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

enn = EditedNearestNeighbours(n_neighbors=4)
X_data, y_data = enn.fit_resample(X, y)

In [11]:
normal = []
abnormal = []
for i in y_data:
    if i ==1:
        normal.append(i)
    else:
        abnormal.append(i)

print(len(normal) + len(abnormal))
print(len(normal))
print(len(abnormal))

33592
31242
2350


In [12]:
from imblearn.over_sampling import RandomOverSampler

# 랜덤 오버샘플링 적용
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_data, y_data)

In [13]:
normal = []
abnormal = []
for i in y_resampled:
    if i ==1:
        normal.append(i)
    else:
        abnormal.append(i)

print(len(normal) + len(abnormal))
print(len(normal))
print(len(abnormal))

62484
31242
31242


In [14]:
X_resampled = X_resampled

In [15]:
catboost_model = CatBoostClassifier(random_seed=42, verbose=False)
xgboost_model = XGBClassifier(random_seed = 42)
randomforest_model = RandomForestClassifier(random_state=42) # class_weight 적용 가능

In [16]:
# from sklearn.feature_selection import SelectFromModel
# sfm_cat = SelectFromModel(catboost_model,  threshold='1.25*median')
# sfm_xg = SelectFromModel(xgboost_model,  threshold='1.25*median')
# sfm_rf = SelectFromModel(randomforest_model,  threshold='1.25*median')

# selected_features = X_resampled.columns[sfm_cat.get_support()]
# selected_features = X_resampled.columns[sfm_xg.get_support()]
# selected_features = X_resampled.columns[sfm_rf.get_support()]

# selected_train_x_cat= X_resampled[selected_features]
# selected_train_x_xg= X_resampled[selected_features]
# selected_train_x_rf= X_resampled[selected_features]


# cat = catboost_model.fit(selected_train_x_cat, train_y)
# xg = xgboost_model.fit(selected_train_x_xg, train_y)
# rf = randomforest_model.fit(selected_train_x_rf, train_y)

In [17]:
voting_model = VotingClassifier(estimators=[
    ('catboost', catboost_model),
    ('xgboost', xgboost_model),
    ('randomforest', randomforest_model)
], voting='soft')# 'soft'는 각 모델의 예측 확률 평균을 사용, 'hard'는 각 모델의 최종 예측 결과의 다수결을 사용

In [18]:
voting_model.fit(X_resampled, y_resampled)

Parameters: { "random_seed" } are not used.



In [19]:
test_pred = voting_model.predict(test_data.drop(columns='Set ID'))
test_pred

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
result = ['Normal' if pred == 1 else 'AbNormal' for pred in test_pred]
result

['Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNormal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'AbNorm

In [21]:
nor = []
ab = []
for i in result:
    if i == 'Normal':
        nor.append(i)
    else:
        ab.append(i)
print(len(nor))
print(len(ab))

15746
1615


In [22]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = result

df_sub

Unnamed: 0,Set ID,target
0,0001be084fbc4aaa9d921f39e595961b,Normal
1,0005bbd180064abd99e63f9ed3e1ac80,Normal
2,000948934c4140d883d670adcb609584,Normal
3,000a6bfd02874c6296dc7b2e9c5678a7,Normal
4,0018e78ce91343678716e2ea27a51c95,Normal
...,...,...
17356,ffea508b59934d689b540f95eb3fa730,Normal
17357,ffed8923c8a448a98afc641b770be153,Normal
17358,fff1e73734da40adbe805359b3efb462,Normal
17359,fff8e38bdd09470baf95f71e92075dec,Normal


In [23]:
df_sub.to_csv("submission.csv", index=False)