# 라이브러리(필요한 도구) 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


## 데이터 로딩

In [2]:
# Load the labelled training split and the unlabelled test split from the
# local data folder.
train_path = r"C:\Users\ysj08\Documents\ML_titanic\train.csv"
test_path = r"C:\Users\ysj08\Documents\ML_titanic\test.csv"
train = pd.read_csv(filepath_or_buffer=train_path)
test = pd.read_csv(filepath_or_buffer=test_path)


## 가설 검증 (EDA)
- 성별, 등급, 나이, 요금에 따른 생존률을 확인
- 간단한 막대그래프로 직관적으로 보기


In [None]:
import matplotlib.pyplot as plt

# Mean of the 0/1 `Survived` column per group == survival rate of that group.
sex_rate = train.groupby('Sex')['Survived'].mean()
pclass_rate = train.groupby('Pclass')['Survived'].mean()

# Age in 10-year, left-closed bins: [0, 10), [10, 20), ...
# With right=False the last finite edge is exclusive, so a final open-ended
# [80, inf) bin is added; otherwise anyone aged 80 or more falls outside
# every bin and is silently dropped from the statistics.
age_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf]
age_bins = pd.cut(train['Age'], bins=age_edges, right=False)
# observed=False keeps empty bins in the result (shown as NaN) and makes the
# choice explicit instead of relying on the deprecated implicit default.
age_rate = train.groupby(age_bins, observed=False)['Survived'].mean()

# Fare split into quartiles; duplicates='drop' merges identical bin edges.
fare_bins = pd.qcut(train['Fare'], q=4, duplicates='drop')
fare_rate = train.groupby(fare_bins, observed=False)['Survived'].mean()

# Text summary
print('Sex survival rate:', sex_rate)
print('Pclass survival rate:', pclass_rate)
print('Age (bins) survival rate:', age_rate)
print('Fare (quartiles) survival rate:', fare_rate)

# One bar chart per feature on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
sex_rate.plot(kind='bar', ax=axes[0, 0], title='Survival Rate by Sex')
pclass_rate.plot(kind='bar', ax=axes[0, 1], title='Survival Rate by Pclass')
age_rate.plot(kind='bar', ax=axes[1, 0], title='Survival Rate by Age (bins)')
fare_rate.plot(kind='bar', ax=axes[1, 1], title='Survival Rate by Fare (quartiles)')
plt.tight_layout()
plt.show()


SyntaxError: unterminated string literal (detected at line 14) (2752091481.py, line 14)

## 데이터 전처리 

In [None]:
# Drop rows with a missing Embarked value. Only train is filtered here.
train = train.dropna(subset=['Embarked']).copy()

# Fill missing Age with the mean computed on train, and reuse that same value
# for test so no statistic is derived from the test split.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

# Fare is also a model feature, so impute it the same way (train median).
# NOTE(review): assumes test.csv may contain a missing Fare - confirm against
# the data. A NaN here would otherwise reach the models: estimators such as
# LogisticRegression reject NaN input, and the torch model would produce NaN
# logits that get thresholded to class 0 without any error.
fare_median = train['Fare'].median()
train['Fare'] = train['Fare'].fillna(fare_median)
test['Fare'] = test['Fare'].fillna(fare_median)

# Encode string categories as integers.
sex_map = {'male': 0, 'female': 1}
train['Sex'] = train['Sex'].map(sex_map)
test['Sex'] = test['Sex'].map(sex_map)

embarked_map = {'C': 0, 'Q': 1, 'S': 2}
train['Embarked'] = train['Embarked'].map(embarked_map)
test['Embarked'] = test['Embarked'].map(embarked_map)


In [None]:
# Columns fed to every model; `Survived` is the prediction target.
feature_cols = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# Same feature subset for both splits; only train has the target column.
train_x, train_y = train[feature_cols], train['Survived']
test_x = test[feature_cols]


## 훈련/검증 데이터 분리
- train을 8:2로 나눠서 성능을 확인


In [None]:
# Hold out 20% of train for validation. The split is stratified on the target
# so both parts keep the same class balance, and seeded for reproducibility.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': train_y}
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, **split_options)


## 모델 비교 (Validation)
- 여러 모델을 같은 조건으로 비교해서 성능을 확인


In [None]:
# Candidate classifiers, all with the same seed so runs are reproducible.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
}

# Fit each candidate on the training split and score it on the validation split.
results = {}
for name, model in models.items():
    val_pred = model.fit(X_train, y_train).predict(X_val)
    results[name] = accuracy_score(y_val, val_pred)

# Report the candidates from best to worst.
print('Validation accuracy (higher is better):')
ranking = sorted(results.items(), key=lambda item: item[1], reverse=True)
for name, acc in ranking:
    print(f'- {name}: {acc:.4f}')

# Name of the highest-scoring model (the first one wins on ties).
best_name = max(results.items(), key=lambda item: item[1])[0]
print(f'Best model: {best_name}')


## PyTorch 로지스틱 회귀 (복습용)
- w, b를 BCE 손실로 업데이트하는 가장 기본 모델
- PyTorch가 설치되어 있어야 실행 가능


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Fix the RNG seed so weight initialisation is reproducible between runs.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def train_torch(X_tr, y_tr, X_va=None, y_va=None, epochs=200, lr=0.01):
    """Train a logistic-regression model (one linear layer) with full-batch SGD.

    The model outputs raw logits and is optimised with ``BCEWithLogitsLoss``,
    so callers must apply ``torch.sigmoid`` to get probabilities.

    Args:
        X_tr: Training features, a 2-D float tensor of shape (N, n_features).
        y_tr: Training labels (0/1) as a float tensor, shape (N, 1) or (N,).
        X_va: Optional validation features with the same layout as ``X_tr``.
        y_va: Optional validation labels with the same layout as ``y_tr``.
        epochs: Number of full-batch gradient steps.
        lr: SGD learning rate.

    Returns:
        The trained ``nn.Linear`` module, on the same device as ``X_tr``.
    """
    # Place the model on the inputs' device rather than a module-level global,
    # so the function cannot end up with model and data on different devices.
    model = nn.Linear(X_tr.shape[1], 1).to(X_tr.device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Logits have shape (N, 1). Normalise labels to the same shape: 1-D
    # training labels would be rejected by the loss, and 1-D validation labels
    # would broadcast to (N, N) in eq() and yield a meaningless accuracy.
    y_tr = y_tr.reshape(-1, 1)
    # Validation is reported only when both features and labels are supplied.
    has_val = X_va is not None and y_va is not None
    if has_val:
        y_va = y_va.reshape(-1, 1)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        logits = model(X_tr)
        loss = criterion(logits, y_tr)
        loss.backward()
        optimizer.step()

        # Report validation accuracy every 50 epochs.
        if has_val and (epoch + 1) % 50 == 0:
            model.eval()
            with torch.no_grad():
                val_logits = model(X_va)
                val_probs = torch.sigmoid(val_logits)
                val_pred = (val_probs >= 0.5).float()
                val_acc = (val_pred.eq(y_va)).float().mean().item()
            print(f'Epoch {epoch+1}/{epochs} | loss={loss.item():.4f} | val_acc={val_acc:.4f}')
    return model

# Convert the pandas splits to float32 tensors on the selected device.
X_train_t, X_val_t = (
    torch.tensor(frame.values, dtype=torch.float32).to(device)
    for frame in (X_train, X_val)
)
# Labels become (N, 1) column vectors so they match the model's logits.
y_train_t, y_val_t = (
    torch.tensor(labels.values, dtype=torch.float32).view(-1, 1).to(device)
    for labels in (y_train, y_val)
)

# Train, reporting validation accuracy along the way.
torch_model = train_torch(
    X_train_t, y_train_t, X_va=X_val_t, y_va=y_val_t, epochs=200, lr=0.01
)

# Final validation accuracy: threshold the sigmoid probabilities at 0.5.
torch_model.eval()
with torch.no_grad():
    val_probs = torch.sigmoid(torch_model(X_val_t))
    val_pred = val_probs.ge(0.5).float()
    val_acc = (val_pred == y_val_t).float().mean().item()
print(f'PyTorch Validation accuracy: {val_acc:.4f}')


## PyTorch 전체 데이터 학습 후 test 예측 (선택)
- PyTorch 모델로 제출 파일을 만들고 싶을 때 사용


In [None]:
# Retrain on the full training set (no validation hold-out).
X_full_t = torch.tensor(train_x.values, dtype=torch.float32).to(device)
y_full_t = torch.tensor(train_y.values, dtype=torch.float32).view(-1, 1).to(device)
X_test_t = torch.tensor(test_x.values, dtype=torch.float32).to(device)

torch_full_model = train_torch(X_full_t, y_full_t, epochs=200, lr=0.01)

# Predict on test: probabilities thresholded at 0.5, flattened to a 1-D int array.
torch_full_model.eval()
with torch.no_grad():
    test_probs = torch.sigmoid(torch_full_model(X_test_t))
    torch_test_pred = test_probs.ge(0.5).cpu().numpy().astype(int).ravel()

# Optional submission file built from the PyTorch predictions.
torch_submission = pd.DataFrame(
    data={'PassengerId': test['PassengerId'], 'Survived': torch_test_pred}
)
torch_submission.to_csv(path_or_buf='submission_torch.csv', index=False)


## 선택된 모델로 전체 데이터 학습 후 test 예측
- 검증에서 가장 좋은 모델로 전체 train을 다시 학습


In [None]:
# Refit the validation winner on the full training set, then predict on test.
best_model = models[best_name]
test_y_pred = best_model.fit(train_x, train_y).predict(test_x)


## 제출 파일 생성

In [None]:
# Pair each test passenger id with its predicted label and write the result
# to CSV without the index column.
submission = pd.DataFrame(
    data={'PassengerId': test['PassengerId'], 'Survived': test_y_pred}
)
submission.to_csv(path_or_buf='submission.csv', index=False)
