### Import

In [9]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier

### Data Load

In [10]:
train = pd.read_csv('selected_features_cleaned.csv').drop(columns=['ID'])
test = pd.read_csv('test.csv').drop(columns=['ID'])

missing_cols = set(test.columns) - set(train.columns)
test = test.drop(columns=missing_cols)

In [11]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Sampling

In [12]:

# 필요한 라이브러리 설치 (첫 실행 시 필요)
# !pip install imbalanced-learn

import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

# 클래스 불균형 확인
print("Original class distribution:", Counter(y))

# 1. 범주형 데이터 인코딩 (문자형 데이터를 숫자로 변환)
X_encoded = pd.get_dummies(X)  # 원-핫 인코딩 적용

# 2. 오버샘플링 (SMOTE 적용)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)
print("After SMOTE oversampling:", Counter(y_resampled))

# 3. 가중치 조정 (클래스 불균형 고려)
class_weights = compute_class_weight("balanced", classes=np.unique(y_resampled), y=y_resampled)
class_weight_dict = {c: w for c, w in zip(np.unique(y_resampled), class_weights)}
print("Computed class weights:", class_weight_dict)


Original class distribution: Counter({0: 47866, 1: 16654})
After SMOTE oversampling: Counter({0: 47866, 1: 47866})
Computed class weights: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0)}


### Solve Overfitting

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

# 데이터 분할 (Train:Test = 80:20)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# 모델 학습 (과적합 방지 Hyperparameter 적용)
model = RandomForestClassifier(
    n_estimators=100,         # 트리 개수
    max_depth=10,             # 트리 최대 깊이 제한 (과적합 방지)
    min_samples_split=5,      # 노드를 나누기 위한 최소 샘플 수 (일반적으로 2~10 사이)
    class_weight="balanced",  # 클래스 불균형 보정
    random_state=42
)

# 모델 학습
model.fit(X_train, y_train)

# 예측 수행
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# 확률 예측 (ROC-AUC 계산용)
y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]

# 성능 평가
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
train_roc_auc = roc_auc_score(y_train, y_train_proba)
test_roc_auc = roc_auc_score(y_test, y_test_proba)

print(f"Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Train ROC-AUC: {train_roc_auc:.4f}, Test ROC-AUC: {test_roc_auc:.4f}")

# 교차 검증 수행 (K=5)
cv_scores = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring='roc_auc')
print(f"Cross Validation ROC-AUC Scores: {cv_scores}")
print(f"Mean CV ROC-AUC Score: {cv_scores.mean():.4f}")

Train Accuracy: 0.7405, Test Accuracy: 0.7308
Train ROC-AUC: 0.8384, Test ROC-AUC: 0.8278
Cross Validation ROC-AUC Scores: [0.65725217 0.73503059 0.90326879 0.90079608 0.90155341]
Mean CV ROC-AUC Score: 0.8196


### Submission

In [19]:
import pandas as pd

# 원본 테스트 데이터 불러오기
test = pd.read_csv('test.csv').drop(columns=['ID'])

# 원-핫 인코딩 (훈련 데이터와 동일한 방식 적용)
test_encoded = pd.get_dummies(test)

# train 데이터(`X_encoded`)와 컬럼 개수가 다른 경우 align으로 맞추기
X_train_encoded, test_encoded = X_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# 예측 수행 (테스트 데이터에 대한 확률 예측)
pred_proba = model.predict_proba(test_encoded)[:, 1]

# sample_submission.csv 불러오기
sample_submission = pd.read_csv('sample_submission.csv')

# 길이 확인 (에러 방지를 위해 크기 확인)
print(f"test_encoded shape: {test_encoded.shape}")
print(f"sample_submission shape: {sample_submission.shape}")

# 길이 불일치 해결: 크기 비교 후 적용
if len(sample_submission) == len(pred_proba):
    sample_submission['probability'] = pred_proba
    sample_submission.to_csv('./sampling_test_submit.csv', index=False)
    print("✅ 예측 결과 저장 완료: sampling_test_submit.csv")
else:
    print("❌ 오류 발생: 예측값 개수와 제출 파일 크기가 일치하지 않습니다.")
    print(f"예측값 개수: {len(pred_proba)}, 제출 파일 크기: {len(sample_submission)}")

test_encoded shape: (90067, 140)
sample_submission shape: (90067, 2)
✅ 예측 결과 저장 완료: sampling_test_submit.csv


In [None]:
pred_proba = model.predict_proba(test_)[:, 1]

NameError: name 'test_encoded' is not defined

In [16]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['probability'] = pred_proba

ValueError: Length of values (19147) does not match length of index (90067)

In [None]:
sample_submission.to_csv('./sampling_test_submit.csv', index=False)

### Train -> Overfitting

In [5]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_resampled, y_resampled)

In [8]:
from sklearn.metrics import accuracy_score, roc_auc_score

# 예측
y_train_pred = model.predict(X_resampled)
y_train_proba = model.predict_proba(X_resampled)[:, 1]  # ROC-AUC Score 계산용

# 평가
accuracy = accuracy_score(y_resampled, y_train_pred)
roc_auc = roc_auc_score(y_resampled, y_train_proba)

# 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

Accuracy: 1.0000
ROC-AUC Score: 1.0000
