# 제품 이상여부 판별 프로젝트 - AutoGluon + ADASYN

# 1. 사전 준비 사항

## [ 모두 ] 필수 라이브러리 로드

In [None]:
#!pip install -q autogluon
#!pip install -q imblearn
#!pip install -U ipywidgets

In [None]:
import os
import shutil
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score

from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

from imblearn.over_sampling import ADASYN

## 디스크 정리
### 필요 시 사용

In [None]:
# Query the root filesystem once and report total/used space in whole GiB.
usage = shutil.disk_usage("/")
total, used, free = usage.total, usage.used, usage.free

used_percent = used / total * 100
bytes_per_gb = 2**30

print(f"Total: {total // bytes_per_gb} GB")
print(f"Used: {used // bytes_per_gb} GB ({used_percent:.2f}%)")

In [None]:
# Delete the 'AutogluonModels' directory tree if it is present.
directory = "AutogluonModels"

# isdir() is False for a missing path, so it covers the existence check too.
if not os.path.isdir(directory):
    print(f"'{directory}' 디렉토리가 존재하지 않습니다.")
else:
    shutil.rmtree(directory)
    print(f"'{directory}' 디렉토리가 삭제되었습니다.")

In [None]:
# Delete the 'sampled_data' directory tree if it is present.
directory = "sampled_data"

# isdir() is False for a missing path, so it covers the existence check too.
if not os.path.isdir(directory):
    print(f"'{directory}' 디렉토리가 존재하지 않습니다.")
else:
    shutil.rmtree(directory)
    print(f"'{directory}' 디렉토리가 삭제되었습니다.")

## 데이터 읽어오기

In [None]:
# ROOT_DIR: folder the raw CSV files are read from.
# NEW_ROOT_DIR: not referenced by the visible cells; its value matches the
#   hard-coded 'sampled_data' folder the later cells write to.
# RANDOM_STATE: seed passed to StratifiedKFold and ADASYN further down.
ROOT_DIR = "data"
NEW_ROOT_DIR = "sampled_data"
RANDOM_STATE = 110

# Load the training set from <ROOT_DIR>/train.csv and display it.
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data

# 2. Preprocessing

In [None]:
# Fix data-entry errors.

# Columns in which the string 'OK' is replaced by a missing value.
columns_to_replace = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]
# Turn every 'OK' into a missing value and then parse each column as numeric,
# written as a single chained expression.
train_data[columns_to_replace] = (
    train_data[columns_to_replace]
    .replace('OK', pd.NA)
    .apply(pd.to_numeric)
)

## 결측치를 가지는 변수 삭제

In [None]:
# Drop every column (axis=1) that contains at least one missing value.
train_data = train_data.dropna(axis=1, how='any')

## 단일 값만 가지는 변수(상수 컬럼) 삭제

In [None]:
# Drop constant columns, i.e. columns whose number of distinct values is
# exactly one (such a column carries no information for the model).
train_data = train_data.drop(columns=train_data.columns[train_data.nunique() == 1])

## 중복 변수 제거

In [None]:
# Remove duplicated variables.
# Only the columns listed here are dropped. The Circle2 "Distance Speed"
# columns and the *_Dam variants of Model.Suffix / Workorder are deliberately
# NOT listed, so they stay in the frame.
columns_to_drop = [
    # Stage 1
    'Stage1 Circle3 Distance Speed Collect Result_Dam',
    'Stage1 Circle4 Distance Speed Collect Result_Dam',
    # Stage 2
    'Stage2 Circle3 Distance Speed Collect Result_Dam',
    'Stage2 Circle4 Distance Speed Collect Result_Dam',
    # Stage 3
    'Stage3 Circle3 Distance Speed Collect Result_Dam',
    'Stage3 Circle4 Distance Speed Collect Result_Dam',
    # Model suffix copies
    'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2',
    # Work order copies
    'Workorder_AutoClave',
    'Workorder_Fill1',
    'Workorder_Fill2',
]

train_data = train_data.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Alias, not a copy: train_df refers to the same DataFrame object as train_data.
train_df = train_data
train_df

## Feature Selection

In [None]:
# Separate the label column from the feature columns.
train_y = train_df['target']
train_x = train_df.drop(columns='target')

In [None]:
# Column labels of the categorical (object/category dtype) features and of
# the numeric features of train_x.
categorical = train_x.select_dtypes(include=['object', 'category']).columns
numerical = train_x.select_dtypes(include=['number']).columns

In [None]:
# Detect relationships between categorical features.
def cat_relation(df):
    """Map categorical columns of ``df`` to a categorical column they pin down.

    For every categorical column ``col1`` and every value of it that occurs
    in more than one row, the rows carrying that value are inspected; if some
    other categorical column ``col2`` holds a single distinct value across
    those rows, ``relation[col1] = col2`` is recorded.  A later match for the
    same ``col1`` overwrites an earlier one, so each key keeps only the last
    match found.

    The categorical columns are taken from ``df`` itself (object/category
    dtype) rather than from a notebook-level variable, so the result does not
    depend on what other cells have assigned in the meantime.

    NOTE(review): one matching value of ``col1`` is enough to record a
    relation here, whereas ``num_relation`` requires every group to match.
    Confirm this asymmetry is intended.

    Args:
        df: DataFrame holding the features to inspect.

    Returns:
        dict mapping a column label to the related column label.
    """
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    relation = {}

    for col1 in cat_cols:
        for value in df[col1].unique():
            subset = df[df[col1] == value]

            # A single row trivially has one value everywhere; skip it.
            if len(subset) <= 1:
                continue

            for col2 in cat_cols:
                if col2 == col1:
                    continue

                if len(subset[col2].unique()) == 1:
                    relation[col1] = col2

    return relation

# Detect relationships between numeric features.
def num_relation(df):
    """Map numeric columns of ``df`` to a numeric column they fully determine.

    ``relation[col1] = col2`` is recorded when, after grouping by ``col1``,
    every group holds exactly one distinct value of ``col2``.  Columns whose
    number of groups is not smaller than the number of rows are skipped,
    because they would trivially "determine" every other column.  A later
    match for the same ``col1`` overwrites an earlier one.

    The numeric columns are taken from ``df`` itself rather than from a
    notebook-level variable, and the grouping is built once per ``col1``
    instead of once per column pair.

    Args:
        df: DataFrame holding the features to inspect.

    Returns:
        dict mapping a column label to the column label it determines.
    """
    num_cols = df.select_dtypes(include=['number']).columns
    n_rows = len(df)
    relation = {}

    for col1 in num_cols:
        grouped = df.groupby(col1)

        # As many groups as rows: every group is a single row, skip.
        if grouped.size().size >= n_rows:
            continue

        for col2 in num_cols:
            if col2 == col1:
                continue

            if grouped[col2].nunique().max() == 1:
                relation[col1] = col2

    return relation

# Feature selection.
def select_features(aa):
    """Pick the columns to keep from a ``{column: related_column}`` mapping.

    Pairs are visited in dict order; a pair is accepted only if neither of
    its two columns was part of an earlier accepted pair.  From the accepted
    pairs, a column is kept when it never appears as a *value* anywhere in
    ``aa``.

    Membership is tested by exact equality of column labels.  A substring
    test would discard a column merely because its name is contained in
    another column's name.

    NOTE(review): when two columns point at each other (``a -> b`` and
    ``b -> a``) both appear as values and therefore neither is kept.
    Confirm that dropping both members of such a pair is intended.

    Args:
        aa: dict mapping a column label to its related column label.

    Returns:
        list of kept column labels, in the order their pairs were accepted.
    """
    dependents = set(aa.values())
    seen = set()
    selected = []

    for key, value in aa.items():
        if key in seen or value in seen:
            continue
        seen.update((key, value))

        # The value of a pair is in `dependents` by definition, so only the
        # key can survive.
        if key not in dependents:
            selected.append(key)

    return selected

In [None]:
# Build the column -> related-column mappings for the categorical and the
# numeric features of train_x.
cat_relationship = cat_relation(train_x)
num_relationship = num_relation(train_x)

# From each mapping, pick the columns to keep.
selected_categorical = select_features(cat_relationship)
selected_numerical = select_features(num_relationship)

In [None]:
# All feature columns, and the columns that take part in any relationship
# (as a key or as a value).
all_features = set(train_x.columns)
cat_related = set(cat_relationship) | set(cat_relationship.values())
num_related = set(num_relationship) | set(num_relationship.values())
related_features = cat_related | num_related

# Columns that take part in no relationship are always kept.
unrelated_features = all_features - related_features

# Final selection: the kept members of the related columns plus all
# unrelated columns.
features = set(selected_categorical) | set(selected_numerical)
# Emit the selection in train_x column order. Converting a set of strings to
# a list gives an order that depends on per-process string hashing, which
# would make the feature order (and everything trained on it) differ between
# kernel restarts even with a fixed random seed.
selected_features = [
    col for col in train_x.columns
    if col in features or col in unrelated_features
]

In [None]:
# Show how many features were selected, followed by their names.
print(f'{len(selected_features)}개')
print(selected_features)

In [None]:
# Independent copy of the training features restricted to the selection.
df = train_x.loc[:, selected_features].copy()

# Keep only the first three characters of the work order (reduces data size).
df['Workorder_Dam'] = df['Workorder_Dam'].str.slice(stop=3)

## One-Hot Encoding

In [None]:
# Split the selected features by dtype.
categorical = df.select_dtypes(include=['object', 'category'])
numerical = df.select_dtypes(include=['number'])

# Dense output; drop='first' removes the first category of every column.
encoder = OneHotEncoder(sparse_output=False, drop='first')

encoded_cat = encoder.fit_transform(categorical)

# Carry over the original row index. Without it the new frame gets a fresh
# 0..n-1 index, and the index-aligned concat below would silently pair up
# the wrong rows (or produce NaN rows) whenever df's index is not the
# default one.
encoded_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(categorical.columns),
    index=categorical.index,
)

# Numeric columns first, then the one-hot columns.
train_x = pd.concat([numerical, encoded_df], axis=1)

In [None]:
# Display the encoded training feature matrix.
train_x

## 불균형 데이터 샘플링 (ADASYN + K-Fold)

In [None]:
# Make sure the folder for the per-fold CSV files exists.
os.makedirs('sampled_data', exist_ok=True)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Stratified K-fold: only the training part of each fold is oversampled;
# the validation part is written out as it is.
for fold, (train_idx, val_idx) in enumerate(kf.split(train_x, train_y)):
    # Slice features and labels for this fold.
    X_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
    X_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

    # Oversample with ADASYN.
    adasyn = ADASYN(random_state=RANDOM_STATE)
    X_adasyn_train, y_adasyn_train = adasyn.fit_resample(X_train, y_train)

    # Wrap everything as DataFrames with explicit column labels.
    feature_columns = train_x.columns
    X_adasyn_train_df = pd.DataFrame(X_adasyn_train, columns=feature_columns)
    y_adasyn_train_df = pd.DataFrame(y_adasyn_train, columns=['target'])
    X_val_df = pd.DataFrame(X_val, columns=feature_columns)
    y_val_df = pd.DataFrame(y_val, columns=['target'])

    # Features and label side by side, for training and for validation.
    train_df = pd.concat([X_adasyn_train_df, y_adasyn_train_df], axis=1)
    val_df = pd.concat([X_val_df, y_val_df], axis=1)

    # Files are numbered from 1.
    train_df.to_csv(f'sampled_data/train_{fold + 1}.csv', index=False)
    val_df.to_csv(f'sampled_data/val_{fold + 1}.csv', index=False)

## Test 데이터 Preprocessing

In [None]:
# Load the test set from the same data folder the training set is read from
# (ROOT_DIR), instead of repeating the folder name as a literal.
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [None]:
# Independent copy of the test features restricted to the selection.
test_x = test_data.loc[:, selected_features].copy()

# Keep only the first three characters of the work order (reduces data size).
test_x['Workorder_Dam'] = test_x['Workorder_Dam'].str.slice(stop=3)

In [None]:
# One-hot encode the test features with the encoder that was already fitted
# on the training features. Fitting a new encoder on the test set would
# derive the dummy columns (and the dropped 'first' category) from the test
# categories, so the resulting columns could differ from the ones the models
# are trained on.
train_cat_cols = list(encoder.feature_names_in_)
categorical = test_x[train_cat_cols]
numerical = test_x.drop(columns=train_cat_cols).select_dtypes(include=['number'])

# A category that never occurred in training is encoded as all zeros instead
# of raising an error at transform time.
encoder.set_params(handle_unknown='ignore')

encoded_cat = encoder.transform(categorical)

# Keep the test row index so the index-aligned concat pairs up the same rows.
encoded_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(categorical.columns),
    index=categorical.index,
)

df_test = pd.concat([numerical, encoded_df], axis=1)

In [None]:
# Display the encoded test feature matrix.
df_test

In [None]:
# Persist the encoded test features (without the row index) next to the
# per-fold training files.
df_test.to_csv('sampled_data/preprocessed_test.csv', index=False)

# 3. Model

## preprocessed_test 데이터 불러오기

In [None]:
# Read the encoded test features back from disk and display them.
test_df = pd.read_csv('sampled_data/preprocessed_test.csv')
test_df

## 제출용 submission 데이터 불러오기

In [None]:
# Read the submission template; its 'Set ID' column is used below to key the
# per-fold predictions.
# NOTE: a file with this same name is written at the end of the notebook.
sub_df = pd.read_csv("submission.csv")
sub_df

## f1_scorer 생성

In [None]:
# Build a custom F1 metric object with AutoGluon's make_scorer, wrapping
# sklearn's f1_score. Higher is better and the best possible value is 1.
# NOTE(review): needs_class=True presumably makes AutoGluon score hard class
#   predictions rather than probabilities - confirm in the make_scorer docs.
# NOTE(review): pos_label=0 is presumably forwarded to f1_score, so F1 would
#   be computed for whichever class AutoGluon encodes as 0 - confirm which
#   label that is.
f1_scorer = make_scorer(
    name='f1',
    score_func=f1_score,
    greater_is_better=True,
    needs_class=True,
    optimum=1,
    pos_label=0
)

## Model 학습

In [None]:
# Per-fold outputs: test-set probabilities keyed by 'Set ID', and the
# decision threshold calibrated on each fold's validation split.
df_list = []
threshold_list = []

# Train one predictor per fold; the fold files on disk are numbered 1..5.
for fold in range(1, 6):
    train_df = pd.read_csv(f'sampled_data/train_{fold}.csv')
    val_df = pd.read_csv(f'sampled_data/val_{fold}.csv')

    train = TabularDataset(train_df)
    val = TabularDataset(val_df)

    # Fit on the training file only; the validation data is not passed to
    # fit() and is used solely for the threshold calibration below.
    predictor = TabularPredictor(label='target', eval_metric=f1_scorer).fit(train, num_cpus=4)

    # Keep the first probability column of the test predictions.
    # NOTE(review): which class the first column stands for is not visible
    #   here - check predictor.class_labels before relying on it.
    test_pred = predictor.predict_proba(test_df)
    sub_df['target_proba'] = test_pred.iloc[:, 0]
    # Selecting with a list of columns yields a new frame, so overwriting
    # sub_df['target_proba'] in the next iteration does not alter this entry.
    df_list.append(sub_df[['Set ID', 'target_proba']])

    # Decision threshold tuned for f1_scorer on this fold's validation data.
    threshold = predictor.calibrate_decision_threshold(val, f1_scorer)
    threshold_list.append(threshold)

In [None]:
# Display the per-fold probability frames.
df_list

## Fold(5개)에 대한 임계값 평균 구하기

In [None]:
# Display the per-fold calibrated thresholds.
threshold_list

In [None]:
# Arithmetic mean of the per-fold thresholds.
avr = sum(threshold_list) / len(threshold_list)
# Complement of the mean threshold; this is the cut-off later compared with
# the fold-averaged 'target_proba'.
# NOTE(review): presumably the complement is taken because the calibrated
#   threshold refers to the other class than the stored probability column -
#   confirm against predictor.class_labels / positive_class.
result = 1 - avr

print(result)

## 앙상블 (Soft Voting)

In [None]:
# Give every fold's probability column its own, predictable name
# (target_proba_1 .. target_proba_N) before joining. Relying on merge
# suffixes does not achieve this: suffixes are applied only when both sides
# share a column name, so every other fold would keep the bare name.
# Comprehension variables are used on purpose so that no notebook-level name
# (such as `df`) is overwritten by a loop variable.
fold_frames = [
    fold_df[['Set ID', 'target_proba']].rename(
        columns={'target_proba': f'target_proba_{fold_no}'}
    )
    for fold_no, fold_df in enumerate(df_list, start=1)
]

# Join the folds on 'Set ID'.
merged = fold_frames[0]
for fold_frame in fold_frames[1:]:
    merged = merged.merge(fold_frame, on='Set ID')

# Soft voting: average the per-fold probabilities row by row.
proba_cols = [f'target_proba_{fold_no}' for fold_no in range(1, len(df_list) + 1)]
merged['average_target'] = merged[proba_cols].mean(axis=1)

# Label a row 'Normal' when its averaged probability is below the cut-off
# `result`, otherwise 'AbNormal'.
merged['final_target'] = merged['average_target'].apply(lambda x: 'Normal' if x < result else 'AbNormal')

# Rename to the submission layout: label in 'target', average in 'target_proba'.
submission = merged[['Set ID', 'final_target', 'average_target']].rename(
    columns={'final_target': 'target', 'average_target': 'target_proba'}
)

submission_df = submission.copy()

In [None]:
# The submission only carries the label, so remove the probability column.
submission_df = submission_df.drop(columns='target_proba')

# Class balance of the final predictions.
submission_df['target'].value_counts()

In [None]:
# Display the final submission frame.
submission_df

# 4. 제출

In [None]:
# Write the final predictions without the row index.
# NOTE: this is the same file name the submission template was read from,
# so the template is overwritten.
submission_df.to_csv('submission.csv', index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**


## 학습된 Model, data 압축

In [None]:
# Pack the contents of the 'AutogluonModels' directory into AutogluonModels.zip.
shutil.make_archive('AutogluonModels', 'zip', 'AutogluonModels')

In [None]:
# Pack the contents of the 'sampled_data' directory into sampled_data.zip.
shutil.make_archive('sampled_data', 'zip', 'sampled_data')