# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold,train_test_split
from tqdm import tqdm
from catboost import CatBoostClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import random

from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

### 필요한 파일 불러오기

In [2]:
# Load X.csv; this table is later joined to the submission file on "Set ID"
# to build the test features. low_memory=False parses the file in one pass.
X = pd.read_csv(filepath_or_buffer="X.csv", low_memory=False)

In [3]:
# Load ORIGINAL.csv, the labelled table that sampling() reads its "target"
# and "Collect Date - Dam" columns from.
df_merged = pd.read_csv(filepath_or_buffer="ORIGINAL.csv", low_memory=False)

### 데이터 전처리

데이터 불균형을 해결하기 위해 언더/오버 샘플링을 진행합니다.

In [4]:
def sampling(RANDOM_STATE, df=None, normal_ratio=None):
    """Return the labelled rows with the "Normal" class optionally down-sampled.

    Args:
        RANDOM_STATE: Seed passed to ``DataFrame.sample`` when drawing the
            "Normal" rows.
        df: Frame holding a "target" column (values "Normal" / "AbNormal")
            and a "Collect Date - Dam" column. When omitted, the
            module-level ``df_merged`` is used.
        normal_ratio: How many "Normal" rows to keep per "AbNormal" row
            (1.0 means a 1:1 ratio). ``None`` keeps every "Normal" row,
            which is what the previous hard-wired ratio amounted to.

    Returns:
        A DataFrame containing the drawn "Normal" rows and all "AbNormal"
        rows, sorted by "Collect Date - Dam".
    """
    if df is None:
        df = df_merged

    df_normal = df[df["target"] == "Normal"]
    df_abnormal = df[df["target"] == "AbNormal"]

    num_normal = len(df_normal)
    num_abnormal = len(df_abnormal)
    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

    if normal_ratio is None:
        # Keep all Normal rows. Using the count directly avoids dividing by
        # num_abnormal (which may be 0) and the float round-trip.
        n_keep = num_normal
    else:
        # Sampling is without replacement, so never ask for more rows than
        # exist, and never for a negative count.
        n_keep = max(0, min(num_normal, int(round(num_abnormal * normal_ratio))))

    df_normal = df_normal.sample(
        n=n_keep, replace=False, random_state=RANDOM_STATE
    )
    df_concat = pd.concat([df_normal, df_abnormal], axis=0).reset_index(drop=True)
    return df_concat.sort_values(by=["Collect Date - Dam"])

### 데이터 분할

In [5]:
RANDOM_STATE = 1706
df_concat = sampling(RANDOM_STATE)

# Hold out 30% of the rows for validation, stratified on "target" so both
# partitions keep the same class proportions.
df_train, df_val = train_test_split(
    df_concat,
    test_size=0.3,
    stratify=df_concat["target"],
    random_state=RANDOM_STATE,
)

# A column becomes a feature only if it can be cast to int; columns that
# cannot (e.g. non-numeric strings or values containing NaN) are skipped.
# A unique-value filter (nunique() > 1) was tried here earlier and is disabled.
# NOTE(review): the cast is written to df_concat after the split, so df_train
# and df_val appear to keep their original dtypes - confirm this is intended.
features = []

for col in df_concat.columns:
    try:
        df_concat[col] = df_concat[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Only cast failures are skipped; anything else (including
        # KeyboardInterrupt) now propagates instead of being swallowed.
        continue
    features.append(col)

# "Set ID" is an identifier, not a model input.
if "Set ID" in features:
    features.remove("Set ID")


  Total: Normal: 38156, AbNormal: 2350


In [6]:
# Separate each partition into its feature columns and its "target" labels.
train_x, train_y = df_train[features], df_train["target"]
val_x, val_y = df_val[features], df_val["target"]

# Print the (rows, columns) shape of both feature frames as a sanity check.
print(train_x.shape, val_x.shape)

(28354, 149) (12152, 149)


In [7]:
# Encode the string labels as integers. The encoder is fitted on the
# training labels only; the validation labels reuse that mapping so both
# splits are guaranteed to share one label-to-integer assignment (and so
# le.inverse_transform later decodes with the training mapping).
le = LabelEncoder()

y_train_encoded = le.fit_transform(train_y)
y_val_encoded = le.transform(val_y)

In [8]:
# CatBoost with class weighting.
# Each weight is the share of training labels that belong to the named class.
# NOTE(review): if CatBoost's class order follows the LabelEncoder order
# (presumably "AbNormal" -> 0, "Normal" -> 1), the rarer class receives the
# smaller weight - confirm this is the intended direction.
weight_for_class_0 = (train_y == "AbNormal").sum() / len(train_y)
weight_for_class_1 = (train_y == "Normal").sum() / len(train_y)
model = CatBoostClassifier(
    class_weights=[weight_for_class_0, weight_for_class_1],
    random_state=80,
    verbose=0,
)

In [9]:
# model = xgb.XGBClassifier(
#     n_estimators=100,
#     learning_rate=0.1,
#     max_depth=6,
#     objective='binary:logistic',
#     eval_metric='logloss'
# )

## 3. 모델 학습 중 최고만 뽑기

In [10]:
# Read the submission template; its "Set ID" column is the key used to pick
# the test rows out of X.
df_test_y = pd.read_csv("submission.csv")

In [11]:
# Keep the rows of X whose "Set ID" also appears in the submission template.
df_test = pd.merge(X, df_test_y, how="inner", on="Set ID")
# Take an explicit copy so the per-column writes below act on an independent
# frame and do not raise SettingWithCopyWarning.
df_test_x = df_test[features].copy()

# Best-effort integer cast of every feature column; a column that cannot be
# cast is left as it is.
for col in df_test_x.columns:
    try:
        df_test_x.loc[:, col] = df_test_x[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Only cast failures are skipped; other errors now propagate.
        continue

### 모델 학습

In [17]:
RANDOM_STATE = 881

# Resampling pipeline: the random under-sampler ("u") runs first, then
# SMOTE over-sampling ("o").
under = RandomUnderSampler(random_state=RANDOM_STATE)
over = SMOTE(random_state=RANDOM_STATE)
pipeline = Pipeline(steps=[("u", under), ("o", over)])

# Resample the training split only; the validation split is left untouched.
train_x_resampled, train_y_resampled = pipeline.fit_resample(
    train_x,
    y_train_encoded,
)

# Fit the classifier on the resampled training data.
model.fit(train_x_resampled, train_y_resampled)

# Predict on the resampled training data and on the original validation data.
train_pred_resampled = model.predict(train_x_resampled)
val_pred_resampled = model.predict(val_x)

# Validation F1 with encoded label 1 treated as the positive class.
# NOTE(review): with LabelEncoder's sorted classes, label 1 presumably maps to
# "Normal" - confirm that is the class this score is meant to track.
now_best = f1_score(y_true=y_val_encoded, y_pred=val_pred_resampled, pos_label=1)
print("Resampled Validation F1 Score:", now_best)

Resampled Validation F1 Score: 0.9679563661155617


In [15]:
# Build the submission: predict encoded labels for the test features and map
# them back to their original string labels.
test_pred = model.predict(df_test_x)
y_pred_labels = le.inverse_transform(test_pred)

# Re-read the submission template and replace its "target" column with the
# predictions.
# NOTE(review): assumes df_test_x has the same row count and row order as
# submission.csv - confirm against the merge that produced df_test.
df_sub = pd.read_csv("submission.csv").assign(target=y_pred_labels)

# Overwrite the submission file in place, without the index column.
df_sub.to_csv("submission.csv", index=False)

0.9679563661155617 is the best validation F1 score.


## 4. 제출하기

### 제출 파일 작성

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**