In [22]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [11]:
# Data loading
# Single place to point the notebook at the CSV directory; edit this one
# constant if the files live somewhere other than /content.
DATA_DIR = '/content'
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [12]:
# Report the dimensions of both frames and the total number of missing
# cells in the training frame (the test frame is not checked here).
train_shape, test_shape = train_data.shape, test_data.shape
missing_total = train_data.isnull().sum().sum()

print(f"Train 데이터 크기: {train_shape}")
print(f"Test 데이터 크기: {test_shape}")
print(f"결측치 확인: {missing_total}")

Train 데이터 크기: (165034, 14)
Test 데이터 크기: (110023, 13)
결측치 확인: 0


In [13]:
# Basic data information: first rows, then the relative frequency of each
# value of the 'Exited' target column.
preview_rows = train_data.head()
target_ratio = train_data['Exited'].value_counts(normalize=True)

print(preview_rows)
print(f"\n타겟 분포:\n{target_ratio}")

   id  CustomerId         Surname  CreditScore Geography Gender   Age  Tenure  \
0   0    15674932  Okwudilichukwu          668    France   Male  33.0       3   
1   1    15749177   Okwudiliolisa          627    France   Male  33.0       1   
2   2    15694510           Hsueh          678    France   Male  40.0      10   
3   3    15741417             Kao          581    France   Male  34.0       2   
4   4    15766172       Chiemenam          716     Spain   Male  33.0       5   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  \
0       0.00              2        1.0             0.0        181449.97   
1       0.00              2        1.0             1.0         49503.50   
2       0.00              2        1.0             0.0        184866.69   
3  148882.54              1        1.0             1.0         84560.88   
4       0.00              2        1.0             1.0         15068.83   

   Exited  
0       0  
1       0  
2       0  
3       0  
4 

### 데이터 설명
- CustomerId: A unique identifier for each customer
- Surname: The customer's surname or last name
- CreditScore: A numerical value representing the customer's credit score
- Geography: The country where the customer resides
- Gender: The customer's gender
- Age: The customer's age
- Tenure: The number of years the customer has been with the bank
- Balance: The customer's account balance
- NumOfProducts: The number of bank products the customer uses (e.g., savings account, credit card) / 고객이 사용하는 은행 상품 수(예: 저축 계좌, 신용 카드)
- HasCrCard: Whether the customer has a credit card
- IsActiveMember: Whether the customer is an active member
- EstimatedSalary: The estimated salary of the customer
- Exited: Whether the customer has churned (Target Variable)

In [14]:
# Minimal preprocessing
# 1. Pull out the target, then build the feature frame by dropping the target
#    together with the columns treated as unnecessary (id, CustomerId, Surname).
y = train_data['Exited']
X = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])

In [16]:
# 2. One-hot encode the categorical variables
categorical_columns = ['Geography', 'Gender']

# This cell rebinds X to the encoded frame on its last line, so on a second
# execution the raw categorical columns no longer exist and both X[col] and
# get_dummies(columns=...) would raise KeyError. Restricting the work to the
# columns still present keeps the cell safe to re-run; on the first run the
# list equals categorical_columns, so the result is unchanged.
pending_columns = [col for col in categorical_columns if col in X.columns]

print("One-Hot Encoding 전:")
for col in pending_columns:
    print(f"{col} 고유값: {X[col].unique()}")

# Apply one-hot encoding, keeping every level (drop_first=False).
if pending_columns:
    X_encoded = pd.get_dummies(X, columns=pending_columns, drop_first=False)
else:
    # Already encoded by an earlier execution of this cell: nothing left to do.
    X_encoded = X

print(f"\nOne-Hot Encoding 후:")
print(f"전처리 후 X 모양: {X_encoded.shape}")
print(f"새로 생성된 컬럼들:")
# List every column whose name contains one of the categorical column names.
for col in X_encoded.columns:
    if any(cat in col for cat in categorical_columns):
        print(f"  - {col}")

# Replace the original X with the encoded frame (downstream cells read X).
X = X_encoded

One-Hot Encoding 전:
Geography 고유값: ['France' 'Spain' 'Germany']
Gender 고유값: ['Male' 'Female']

One-Hot Encoding 후:
전처리 후 X 모양: (165034, 13)
새로 생성된 컬럼들:
  - Geography_France
  - Geography_Germany
  - Geography_Spain
  - Gender_Female
  - Gender_Male


In [17]:
# 3. Train/validation split (a plain 80/20 hold-out; the only extra is
#    stratifying on the target so both parts keep the same class ratio).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

# Class proportions of each part, rounded to three decimals for display.
train_target_ratio = y_train.value_counts(normalize=True).round(3)
val_target_ratio = y_val.value_counts(normalize=True).round(3)

print(f"\nTrain 크기: {X_train.shape}, Validation 크기: {X_val.shape}")
print(f"Train 타겟 분포: {train_target_ratio}")
print(f"Validation 타겟 분포: {val_target_ratio}")


Train 크기: (132027, 13), Validation 크기: (33007, 13)
Train 타겟 분포: Exited
0    0.788
1    0.212
Name: proportion, dtype: float64
Validation 타겟 분포: Exited
0    0.788
1    0.212
Name: proportion, dtype: float64


In [19]:
# 4. Baseline logistic regression model
# Only the seed and the iteration cap are set explicitly.
# NOTE(review): the features are passed in without any scaling, and warnings
# are silenced globally in the import cell, so a convergence warning (if any)
# would not be visible here — worth confirming the solver actually converges.
lr_settings = {'random_state': 42, 'max_iter': 1000}
baseline_lr = LogisticRegression(**lr_settings)
# fit() stays the cell's final expression, as in the original cell.
baseline_lr.fit(X_train, y_train)

In [20]:
# Predictions on the validation features
y_pred = baseline_lr.predict(X_val)
# Keep only the second probability column; it is the score passed to
# roc_auc_score in the evaluation cell.
val_proba = baseline_lr.predict_proba(X_val)
y_pred_proba = val_proba[:, 1]

In [21]:
# Performance evaluation
print("=== 베이스라인 성능 ===")
accuracy = accuracy_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred_proba)
report_text = classification_report(y_val, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")
print(f"\nClassification Report:")
print(report_text)

# Confusion matrix; flattened row by row it unpacks as tn, fp, fn, tp.
cm = confusion_matrix(y_val, y_pred)
tn, fp, fn, tp = cm.ravel()

# Render the 2x2 table (rows = actual class, columns = predicted class).
matrix_lines = [
    f"\nConfusion Matrix:",
    f"              예측",
    f"실제    0      1",
    f"  0   {tn:5d}  {fp:5d}",
    f"  1   {fn:5d}  {tp:5d}",
]
print("\n".join(matrix_lines))

# Per-class hit rates
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives  # share of class 0 predicted correctly
sensitivity = tp / actual_positives  # share of class 1 predicted correctly (recall)

print(f"\n클래스별 성능:")
print(f"클래스 0 (잔류) 정확도: {specificity:.4f} ({tn}/{actual_negatives})")
print(f"클래스 1 (이탈) 정확도: {sensitivity:.4f} ({tp}/{actual_positives})")

# Precision for class 1 falls back to 0 when the model predicts no positives.
predicted_positives = tp + fp
if predicted_positives > 0:
    precision_class1 = tp / predicted_positives
else:
    precision_class1 = 0

# Start the performance log, keyed by experiment step.
performance_log = {
    'baseline_simple': {
        'accuracy': accuracy,
        'auc': auc,
        'recall_class1': sensitivity,
        'precision_class1': precision_class1,
        'specificity': specificity,
        'model': 'LogisticRegression (기본)',
        'preprocessing': '원핫 인코딩 적용'
    }
}

=== 베이스라인 성능 ===
Accuracy: 0.8340
AUC: 0.8115

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90     26023
           1       0.69      0.39      0.50      6984

    accuracy                           0.83     33007
   macro avg       0.77      0.67      0.70     33007
weighted avg       0.82      0.83      0.82     33007


Confusion Matrix:
              예측
실제    0      1
  0   24822   1201
  1    4279   2705

클래스별 성능:
클래스 0 (잔류) 정확도: 0.9538 (24822/26023)
클래스 1 (이탈) 정확도: 0.3873 (2705/6984)


In [24]:
print(f"\n=== 성능 로그 ===")

# (display label, key in the per-step metrics dict), in print order.
metric_rows = (
    ('Accuracy', 'accuracy'),
    ('AUC', 'auc'),
    ('이탈고객 탐지율 (Recall)', 'recall_class1'),
    ('이탈고객 정밀도 (Precision)', 'precision_class1'),
    ('잔류고객 정확도 (Specificity)', 'specificity'),
)

for step, metrics in performance_log.items():
    print(f"{step}:")
    for label, key in metric_rows:
        print(f"  - {label}: {metrics[key]:.4f}")

# Persist the log so it can be used later. `pickle` comes from the notebook's
# import cell rather than being imported mid-notebook.
# NOTE: only load this file back from a trusted location; unpickling
# untrusted data can execute arbitrary code.
with open('performance_log.pkl', 'wb') as f:
    pickle.dump(performance_log, f)

print("성능 로그가 저장되었습니다.")


=== 성능 로그 ===
baseline_simple:
  - Accuracy: 0.8340
  - AUC: 0.8115
  - 이탈고객 탐지율 (Recall): 0.3873
  - 이탈고객 정밀도 (Precision): 0.6925
  - 잔류고객 정확도 (Specificity): 0.9538
성능 로그가 저장되었습니다.
