# 제품 이상여부 판별 프로젝트

## 1. 데이터 불러오기

### 필수 라이브러리

In [3]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import KFold,train_test_split
from tqdm import tqdm
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import random
from catboost import CatBoostClassifier
from imblearn.under_sampling import TomekLinks, CondensedNearestNeighbour
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import OneSidedSelection
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import joblib
random_state = 42  # Random state for reproducibility

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

### 필요한 파일 불러오기

In [1]:
X = pd.read_csv("MERGED.csv", low_memory = False)

NameError: name 'pd' is not defined

### 데이터 이상치 제거

In [7]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture

def remove_outliers_with_gmm(file_path, n_components, random_state):
    """
    Remove outliers using Gaussian Mixture Models (GMM).

    Parameters:
    - file_path: str, path to the CSV file.
    - n_components: int, the number of mixture components to use for GMM.
    - random_state: int, random state for reproducibility.

    Returns:
    - df_clean: DataFrame with the outliers removed.
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Store original column order
    original_columns = df.columns.tolist()

    # Separate features and target
    features = df.drop(columns=['target'])
    target = df['target']

    # Fit a GMM model to the data
    gmm = GaussianMixture(n_components=n_components, random_state=random_state)
    gmm.fit(features)

    # Get the log-likelihood of each sample
    log_likelihood = gmm.score_samples(features)

    # Define a threshold to identify outliers
    threshold = np.percentile(log_likelihood, 5)  # Removing the bottom 5% as outliers

    # Identify outliers
    outliers = log_likelihood < threshold

    # Remove outliers from the dataset
    df_clean = df[~outliers]

    # Ensure the column order is maintained
    df_clean = df_clean[original_columns]

    return df_clean, df, outliers

# Example usage:
file_path = 'PROCESSED.csv'  # Path to your CSV file
n_components = 2  # Number of mixture components

df_clean, df, outliers = remove_outliers_with_gmm(file_path, n_components, random_state)

# Verify that column order is maintained
print("Original columns:", df.columns.tolist())
print("Cleaned columns:", df_clean.columns.tolist())

#df_clean.to_csv('CLEANED_PROCESSED.csv', index=False)

# Print the number of removed outliers and the shape of the cleaned dataframe
print(f"Number of removed outliers: {len(df) - len(df_clean)}")
print(f"Shape of cleaned dataframe: {df_clean.shape}")

# Print class distribution before and after outlier removal
print("\nClass distribution before outlier removal:")
print(df['target'].value_counts())

print("\nClass distribution after outlier removal:")
print(df_clean['target'].value_counts())

# Print the total number of samples before and after outlier removal
print(f"\nTotal number of samples before outlier removal: {len(df)}")
print(f"Total number of samples after outlier removal: {len(df_clean)}")

Original columns: ['Dam dispensing_Equipment', 'Dam dispensing_Model.Suffix', 'Dam dispensing_Workorder', 'Dam dispensing_Collect Date', 'Dam dispensing_Collect Result', 'Dam dispensing_Collect Result.1', 'Dam dispensing_Collect Result.2', 'Dam dispensing_Collect Result.3', 'Dam dispensing_Collect Result.7', 'Dam dispensing_Collect Result.9', 'Dam dispensing_Collect Result.10', 'Dam dispensing_Collect Result.11', 'Dam dispensing_Collect Result.12', 'Dam dispensing_Collect Result.13', 'Dam dispensing_Collect Result.14', 'Dam dispensing_Collect Result.15', 'Dam dispensing_Collect Result.16', 'Dam dispensing_Collect Result.17', 'Dam dispensing_Collect Result.18', 'Dam dispensing_Collect Result.19', 'Dam dispensing_Collect Result.20', 'Dam dispensing_Collect Result.21', 'Dam dispensing_Collect Result.22', 'Dam dispensing_Collect Result.23', 'Dam dispensing_Collect Result.24', 'Dam dispensing_Collect Result.25', 'Dam dispensing_Collect Result.26', 'Dam dispensing_Collect Result.27', 'Dam di

### 데이터 보간

데이타 불균형을 해결하기 위해 언더/오버 샘플링을 진행합니다.

In [8]:
def oss_resample(features, target, RANDOM_STATE, step_ratio=0.02):
    """
    Perform One-Sided Selection (OSS) under-sampling.
    """
    oss = OneSidedSelection(random_state=RANDOM_STATE)
    X_oss, y_oss = oss.fit_resample(features, target)
    
    print('Performing OSS...')

    # Calculate the number of samples needed to achieve the step ratio
    initial_count_major = (y_oss == 'Normal').sum()
    initial_count_minor = (y_oss == 'AbNormal').sum()

    target_count_major = int(initial_count_major * (1 - step_ratio))
    count_major = initial_count_major
    print('Target ratio: ', target_count_major/initial_count_minor)

    while count_major > target_count_major:
        X_oss, y_oss = oss.fit_resample(X_oss, y_oss)
        count_major = (y_oss == 'Normal').sum()
        count_minor = (y_oss == 'AbNormal').sum()
        print('count_major: ', count_major, 'count_minor: ', count_minor, 'Updated ratio:', count_major/count_minor)

    return X_oss, y_oss

def adasyn_resample(features, target, RANDOM_STATE, step_ratio=0.05):
    """
    Perform ADASYN over-sampling.
    """
    adasyn = ADASYN(random_state=RANDOM_STATE)

    X_adasyn, y_adasyn = adasyn.fit_resample(features, target)
    
    print('Performing ADASYN...')
    
    # Calculate the number of samples needed to achieve the step ratio
    initial_count_major = (y_adasyn == 'Normal').sum()
    initial_count_minor = (y_adasyn == 'AbNormal').sum()

    target_count_minor = int(initial_count_minor * (1 + step_ratio))
    count_minor = initial_count_minor
    print('initial count minor: ', initial_count_minor)
    print('Target ratio: ', initial_count_major/target_count_minor)

    '''while count_minor < target_count_minor:
        X_adasyn, y_adasyn = adasyn.fit_resample(X_adasyn, y_adasyn)
        count_major = (y_adasyn == 'Normal').sum()
        count_minor = (y_adasyn == 'AbNormal').sum()
        print('count_major: ', count_major, 'count_minor: ', count_minor, 'Updated ratio:', count_major/count_minor)'''

    return X_adasyn, y_adasyn

def sampling(df, RANDOM_STATE, target_ratio=1.5):
    """
    Perform under-sampling and over-sampling to adjust the class distribution.

    Parameters:
    - df: DataFrame containing the data with 'target' column.
    - RANDOM_STATE: int, random state for reproducibility.
    - target_ratio: float, the desired ratio of Normal to AbNormal after sampling.
    - step_ratio: float, the step ratio to apply in each under-sampling and over-sampling step.

    Returns:
    - df_concat: DataFrame with the sampled data.
    """
    
    # Separate the target column
    target = df['target']
    features = df.drop(columns=['target'])

    # Initial class distribution
    count_major = (target == 'Normal').sum()
    count_minor = (target == 'AbNormal').sum()
    current_ratio = count_major / count_minor
    print('Initial class distribution - Normal:', count_major, 'AbNormal:', count_minor)
    print('Initial ratio:', current_ratio)

    while current_ratio > target_ratio:
        # Perform One-Sided Selection (OSS) under-sampling
        X_oss, y_oss = oss_resample(features, target, RANDOM_STATE, step_ratio=0.02)
        df_oss = pd.concat([pd.DataFrame(X_oss), pd.Series(y_oss, name='target')], axis=1)

        # Perform ADASYN over-sampling
        X_adasyn, y_adasyn = adasyn_resample(df_oss.drop(columns=['target']), df_oss['target'], RANDOM_STATE, step_ratio=0.05)
        df_adasyn = pd.concat([pd.DataFrame(X_adasyn), pd.Series(y_adasyn, name='target')], axis=1)

        features = df_adasyn.drop(columns=['target'])
        target = df_adasyn['target']

        # Update class distribution
        count_major = (target == 'Normal').sum()
        count_minor = (target == 'AbNormal').sum()
        current_ratio = count_major / count_minor
        print('Updated class distribution - Normal:', count_major, 'AbNormal:', count_minor)
        print('Updated ratio:', current_ratio)

    df_concat = df_adasyn

    # Reorder the columns to place 'target' at the front
    columns = ['target'] + [col for col in df_concat.columns if col != 'target']
    df_concat = df_concat[columns]

    # Sort by date if needed
    if 'Collect Date - Dam' in df_concat.columns:
        df_concat = df_concat.sort_values(by=["Collect Date - Dam"])

    # Print the final counts
    print(df_concat['target'].value_counts())

    return df_concat




In [9]:
# Example usage:
print('Data merging...')
sampled_df = sampling(df_clean, random_state, target_ratio=1.5)

#processed_file_path = './SAMPLED_PROCESSED.csv'  # Path to save the sampled CSV file
#sampled_df.to_csv(processed_file_path, index=False)
#print(f"Sampled data saved to {processed_file_path}")


Data merging...
Initial class distribution - Normal: 36400 AbNormal: 2080
Initial ratio: 17.5
Performing OSS...
Target ratio:  16.59326923076923
count_major:  34722 count_minor:  2080 Updated ratio: 16.693269230769232
count_major:  34536 count_minor:  2080 Updated ratio: 16.603846153846153
count_major:  34390 count_minor:  2080 Updated ratio: 16.533653846153847
Performing ADASYN...
initial count minor:  34297
Target ratio:  0.9549859765071783
Updated class distribution - Normal: 34390 AbNormal: 34297
Updated ratio: 1.0027116074292213
target
Normal      34390
AbNormal    34297
Name: count, dtype: int64


### 데이터 분할

In [10]:
#df = pd.read_csv("SAMPLED_PROCESSED.csv", low_memory=False)
df_train, df_val = train_test_split(
    sampled_df,
    test_size=0.1,
    stratify=sampled_df["target"],
    random_state=random_state,
)
features = sampled_df.drop(columns=['target'])
features = []

for col in sampled_df.columns:
    try:
        df_train[col] = df_train[col].astype(int)
        features.append(col)
    except:
        continue


In [11]:
train_x = df_train[features]
train_y = df_train["target"]
val_x = df_val[features]
val_y = df_val["target"]


print(train_x.shape, val_x.shape)

(61818, 152) (6869, 152)


In [12]:
# CatBoost with Class Weighting
weight_for_class_0 = sum(train_y == 'AbNormal') / len(train_y)
weight_for_class_1 = sum(train_y == 'Normal') / len(train_y)
model = CatBoostClassifier(class_weights=[weight_for_class_0, weight_for_class_1], random_state=random_state, verbose=0)

## 3. 모델 학습 중 최고만 뽑기

In [13]:
df_test_y = pd.read_csv(os.path.join("submission.csv"))
df_test_y.head()

Unnamed: 0,Set ID,target
0,OP753345013050000002,AbNormal
1,OP753345013050000005,AbNormal
2,OP753345013050000006,AbNormal
3,OP753345013050000008,AbNormal
4,OP753345013050000009,AbNormal


In [14]:
# 공통으로 존재하는 'Set ID' 값 확인
test = pd.read_csv(os.path.join("MERGED_TEST.csv"))
common_ids = set(test['Set ID']).intersection(set(df_test_y['Set ID']))
print("공통으로 존재하는 'Set ID' 값 개수:", len(common_ids))
print("공통으로 존재하는 'Set ID' 값 예시:", list(common_ids)[:10])

df_test_x = test.drop(columns=['target'])

공통으로 존재하는 'Set ID' 값 개수: 17361
공통으로 존재하는 'Set ID' 값 예시: ['OP753345013110001769', 'OP753345013100004575', 'OP753345083120000005', 'OP753345013050000332', 'OP753345014010000225', 'OP753345054040000518', 'OP753345013050003472', 'OP753345013120006858', 'OP753345013060003825', 'OP753345013100001433']


  test = pd.read_csv(os.path.join("MERGED_TEST.csv"))


In [15]:
# 가정: df_test_x는 이미 로드된 상태

# Step 1: Remove columns where all rows have the same value
df_test_x = df_test_x.loc[:, (df_test_x != df_test_x.iloc[0]).any()]

# Step 2: Remove columns where all rows have missing values
df_test_x = df_test_x.dropna(axis=1, how='all')

# Step 3: Categorize non-numerical columns
non_numeric_features = df_test_x.select_dtypes(exclude=[float, int])
for column in non_numeric_features.columns:
    df_test_x[column] = pd.Categorical(df_test_x[column]).codes

# Step 4: Scale numerical columns
numeric_features = df_test_x.select_dtypes(include=[float, int])
scaler = StandardScaler()
scaled_numeric_features = scaler.fit_transform(numeric_features)

# Replace original numeric columns with scaled columns
df_test_x[numeric_features.columns] = scaled_numeric_features

# 최종 처리된 DataFrame 출력
print(df_test_x.head())


   Dam dispensing_Equipment  Dam dispensing_Model.Suffix  \
0                         0                            0   
1                         0                            0   
2                         0                            0   
3                         0                            0   
4                         0                            0   

   Dam dispensing_Workorder  Dam dispensing_LOT ID  Set ID  \
0                         0                      0       0   
1                         0                      1       1   
2                         0                      2       2   
3                         0                      3       3   
4                         0                      4       4   

   Dam dispensing_Collect Date  Dam dispensing_Collect Result  \
0                            0                      -0.785738   
1                            1                      -0.785738   
2                            2                      -0.785738   
3     

### 모델 학습

In [16]:
# 언더샘플링 및 오버샘플링을 위한 파이프라인 구성
'''over = SMOTE(random_state=RANDOM_STATE)
under = RandomUnderSampler(random_state=RANDOM_STATE)
pipeline = Pipeline(steps=[('o', over), ('u', under)])

# 오버샘플링 및 언더샘플링 적용
train_x_resampled, train_y_resampled = pipeline.fit_resample(train_x, train_y)'''

# 클래스 가중치 계산
weight_for_class_0 = sum(train_y == 'AbNormal') / len(train_y)
weight_for_class_1 = sum(train_y == 'Normal') / len(train_y)


# 재학습
#model.fit(train_x_resampled, train_y_resampled)
model.fit(train_x, train_y)

# 재평가
train_pred = model.predict(train_x)
val_pred = model.predict(val_x)

# print(RANDOM_STATE)
now_best = f1_score(val_y, val_pred, pos_label='AbNormal')
print("Resampled Validation F1 Score:", now_best)

Resampled Validation F1 Score: 0.5349777598059038


### 테스트 데이터 불러오기 및 제출

In [21]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# RANDOM_STATE를 설정합니다.
RANDOM_STATE = 881

# 데이터셋 및 모델 설정
# df_concat: 전체 데이터프레임
# features: 사용하려는 특성(컬럼) 목록
# model: 사용할 모델 객체 (예: RandomForestClassifier, XGBoostClassifier 등)
X = sampled_df[features]
y = sampled_df["target"]

# StratifiedKFold 객체 생성
skf = StratifiedKFold(n_splits=9, shuffle=True, random_state=RANDOM_STATE)
best = 0
best_model = None

# 교차 검증을 위한 데이터셋 분할 및 반복
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}/{skf.n_splits}")
    
    train_x, val_x = X.iloc[train_index], X.iloc[val_index]
    train_y, val_y = y.iloc[train_index], y.iloc[val_index]
    print(train_x.shape, val_x.shape)
    
    # 언더샘플링 및 오버샘플링을 위한 파이프라인 구성
    over = SMOTE(random_state=RANDOM_STATE)
    under = RandomUnderSampler(random_state=RANDOM_STATE)
    pipeline = Pipeline(steps=[('o', over), ('u', under)])
    
    # 오버샘플링 및 언더샘플링 적용
    train_x_resampled, train_y_resampled = pipeline.fit_resample(train_x, train_y)
    
    # 클래스 가중치 계산
    weight_for_class_0 = sum(train_y == 'AbNormal') / len(train_y)
    weight_for_class_1 = sum(train_y == 'Normal') / len(train_y)
    
    # 모델 재학습
    model.fit(train_x_resampled, train_y_resampled)
    
    # 재평가
    train_pred_resampled = model.predict(train_x_resampled)
    val_pred_resampled = model.predict(val_x)
    
    # F1 Score 계산
    now_best = f1_score(val_y, val_pred_resampled, pos_label='AbNormal')
    print("Resampled Validation F1 Score:", now_best)
    
    # 최고의 모델 업데이트
    if now_best > best:
        best = now_best
        best_model = model
        print("Best model updated with F1 Score:", best)

# 최고의 모델을 저장하거나 필요한 후처리를 진행합니다.
# 예: 모델을 파일에 저장

joblib.dump(best_model, 'best_model.pkl')

print(f"\nBest F1 Score achieved: {best}")


Fold 1/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9701472376063758
Best model updated with F1 Score: 0.9701472376063758
Fold 2/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9687542269714595
Fold 3/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.971266693646297
Best model updated with F1 Score: 0.971266693646297
Fold 4/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9697379086733315
Fold 5/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9670865501828525
Fold 6/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9644021739130435
Fold 7/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9720836142953473
Best model updated with F1 Score: 0.9720836142953473
Fold 8/9
(61055, 152) (7632, 152)
Resampled Validation F1 Score: 0.9700162074554295
Fold 9/9
(61056, 152) (7631, 152)
Resampled Validation F1 Score: 0.9668066657634466

Best F1 Score achieved: 0.9720836142953473


In [22]:
# 저장된 모델을 불러와서 val_x와 val_y로 검증 F1 스코어를 측정합니다.
loaded_model = joblib.load('best_model.pkl')
val_pred_final = loaded_model.predict(val_x)
final_f1_score = f1_score(val_y, val_pred_final, pos_label='AbNormal')
print(f"Final Validation F1 Score with best model: {final_f1_score}")

Final Validation F1 Score with best model: 0.9668066657634466


In [23]:
test_pred = loaded_model.predict(df_test_x)
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)
print(now_best ,"is best")

0.9668066657634466 is best


## 4. 제출하기

### 제출 파일 작성

In [None]:
# # StratifiedKFold 객체 생성
# skf = KFold(n_splits=9, shuffle=True, random_state=RANDOM_STATE)
# best = 0
# # features와 target 설정
# X = df_concat[features]
# y = df_concat["target"]

# # 교차 검증을 위한 데이터셋 분할 및 반복
# for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
#     train_x, val_x = X.iloc[train_index], X.iloc[val_index]
#     train_y, val_y = y.iloc[train_index], y.iloc[val_index]
#     print(train_x.shape, val_x.shape)
#     RANDOM_STATE = 881
#     # 언더샘플링 및 오버샘플링을 위한 파이프라인 구성
#     over = SMOTE(random_state=RANDOM_STATE)
#     under = RandomUnderSampler(random_state=RANDOM_STATE)
#     pipeline = Pipeline(steps=[('o', over), ('u', under)])
    
#     # 오버샘플링 및 언더샘플링 적용
#     train_x_resampled, train_y_resampled = pipeline.fit_resample(train_x, train_y)
    
#     # 클래스 가중치 계산
#     weight_for_class_0 = sum(train_y == 'AbNormal') / len(train_y)
#     weight_for_class_1 = sum(train_y == 'Normal') / len(train_y)
    
    
#     # 재학습
#     model.fit(train_x_resampled, train_y_resampled)
    
#     # 재평가
#     train_pred_resampled = model.predict(train_x_resampled)
#     val_pred_resampled = model.predict(val_x)
    
#     # print(RANDOM_STATE)
#     now_best = f1_score(val_y, val_pred_resampled, pos_label='AbNormal')
#     print("Resampled Validation F1 Score:", now_best)
#     if now_best>best:
#         best = now_best
#         best_model = model
#         print("best updated!")

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**