# Semi_defect_finder
반도체 공정/계측 데이터 기반 **결함 여부(불량/정상)** 이진 분류 모델 구축 노트북


In [1]:
# =========================================
# 0) CONFIG (this is the only section you need to edit)
# =========================================
# Central settings dictionary for the notebook. Every key appears exactly
# once: a repeated key in a dict literal is silently overridden by its last
# occurrence, which makes edits to the earlier entry disappear.
CONFIG = {

    # Target column candidates (searched in priority order)
    "target_candidates": ["Defect"],

    # Column used to split the data by process (tool / equipment)
    "tool_type_candidates": ["Tool_Type", "tool_type"],

    # Set to True to use Join_Status as the target
    # (includes logic that checks for overlap/leakage with Defect)
    "use_join_status_as_target": False,
    "join_status_col_candidates": ["Join_Status"],
    "join_status_positive_values": ["Non-Joining", "FAIL", "Fail", "NG", "Bad", "Defect", "1"],  # adjust as needed

    # Leakage-prone / ID-like / time-like columns (removed from the features)
    "drop_cols_always": ["Process_ID", "Timestamp"],

    # Etch_Depth column candidates (branches between scenario A and B)
    "etch_depth_candidates": ["Etch_Depth"],

    # Group-split priority: wafer / lot level
    "group_candidates": ["Wafer_ID", "wafer_id", "Lot_ID", "lot_id", "die_id", "Die_ID"],

    # Time-split column candidates (used for splitting only)
    "time_candidates": ["Timestamp", "timestamp", "DateTime", "datetime", "time", "Time"],

    # Model / validation
    "random_state": 42,
    "test_size": 0.2,
    "cv_splits": 5,
    "n_iter_search": 30,
    "scoring_primary": "average_precision",  # PR-AUC (AP)
    "precision_constraint": 0.90,            # minimum precision when choosing the threshold
    "calibration": None,                     # None / "sigmoid" / "isotonic"

    # Preprocessing options
    "use_iterative_imputer": False,          # set to True to use IterativeImputer
    "winsorize_limits": (0.01, 0.01),        # winsorize the lower and upper 1%
    "use_isolation_forest": False,           # optional (principle: filter the training set only)

    # Output / saving
    "model_output_dir": "./artifacts",
    "model_name_prefix": "semi_defect_model",
}



In [2]:
# =========================================
# 1) Imports (preprocess & modeling)
# =========================================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from dataclasses import dataclass
from typing import List, Optional

from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit, RandomizedSearchCV, train_test_split
from sklearn.metrics import (
    average_precision_score, precision_recall_curve, roc_auc_score,
    f1_score, precision_score, recall_score, confusion_matrix,
    ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.base import BaseEstimator, TransformerMixin



In [1]:
# =========================================
# 2) Load data & basic checks
# =========================================
# Locates the project root, loads the SECOM CSV into `df`, and prints a few
# sanity checks (shape, column names, expected key columns, missing ratios).

from pathlib import Path
import pandas as pd

# Find the project root: the nearest directory at or above the working
# directory that contains `.git`. If none is found, fall back to the working
# directory instead of silently resolving the CSV against the filesystem root.
start_dir = Path.cwd()
root = next(
    (p for p in (start_dir, *start_dir.parents) if (p / ".git").exists()),
    start_dir,
)

csv_path = root / "SECOM" / "uci-secom.csv"
print("Resolved csv_path:", csv_path)

df = pd.read_csv(csv_path)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

target_col = "Defect"
tool_col = "Tool_Type"

print("Target col:", target_col)
print("Tool_Type col:", tool_col)

# NOTE(review): the captured preview of this file shows a "Pass/Fail" column
# and no "Defect" / "Tool_Type" columns -- confirm a later cell derives or
# renames them. Surface the mismatch here rather than failing later.
missing_expected = [c for c in (target_col, tool_col) if c not in df.columns]
if missing_expected:
    print("WARNING: expected columns not found in data:", missing_expected)

# Use one variable for both the label and the row count so they cannot drift.
n_top_missing = 16
print(f"Missing ratio (top {n_top_missing}):")
print(df.isna().mean().sort_values(ascending=False).head(n_top_missing))

df.head()


Resolved csv_path: /Users/parkjunbeom/Library/CloudStorage/GoogleDrive-yrtny70127@gmail.com/내 드라이브/Colab Notebooks/Github/Project_Semiconductor/Semiconductor_Quality_Prospect/SECOM/uci-secom.csv
Shape: (1567, 592)
Columns: ['Time', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124'

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1
