## [요구사항 1] Titanic 딥러닝 모델 기본 훈련

In [6]:
import os
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from datetime import datetime
import wandb
import argparse
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 출력 옵션 설정 (데이터 전처리 디버깅용)
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)


# --- 1. Titanic Dataset 클래스 정의 ---
class TitanicDataset(Dataset):
    """Labelled Titanic samples exposed as a PyTorch ``Dataset``.

    ``X`` is stored as a float tensor and ``y`` as a long tensor; each item is
    a dict with the keys ``'input'`` and ``'target'``.
    """

    def __init__(self, X, y):
        # Features become float tensors, labels become integer (long) tensors.
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # The training loops look samples up by these two keys.
        return dict(input=self.X[idx], target=self.y[idx])

    def __str__(self):
        return (
            f"Data Size: {len(self.X)}, "
            f"Input Shape: {self.X.shape}, "
            f"Target Shape: {self.y.shape}"
        )


class TitanicTestDataset(Dataset):
    """Unlabelled Titanic samples; each item is a dict holding only ``'input'``."""

    def __init__(self, X):
        self.X = torch.FloatTensor(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return dict(input=self.X[idx])

    def __str__(self):
        return f"Data Size: {len(self.X)}, " + f"Input Shape: {self.X.shape}"


# --- 2. 데이터 전처리 보조 함수 (FutureWarning 해결) ---

def get_preprocessed_dataset_1(all_df):
    """Fill missing ``Fare`` values with the mean fare of the row's ``Pclass``.

    The per-class means are merged onto the frame, so the returned frame has a
    fresh 0..n-1 index. The helper column is dropped before returning.
    """
    class_fare = all_df.groupby("Pclass")["Fare"].mean().rename("Fare_mean").reset_index()
    merged = pd.merge(all_df, class_fare, on="Pclass", how="left")

    fare_missing = merged["Fare"].isna()
    merged.loc[fare_missing, "Fare"] = merged["Fare_mean"]
    return merged.drop(columns=["Fare_mean"])


def get_preprocessed_dataset_2(all_df):
    """Split ``Name`` into ``family_name``, ``title`` and ``name`` columns.

    The name is split at most twice on ``,`` or ``.``; each piece is stripped
    of surrounding whitespace and appended to the frame as a new column.
    """
    parts = all_df["Name"].str.split("[,.]", n=2, expand=True)
    parts.columns = ["family_name", "title", "name"]
    for column in ("family_name", "title", "name"):
        parts[column] = parts[column].str.strip()
    return pd.concat([all_df, parts], axis=1)


def get_preprocessed_dataset_3(all_df):
    """Impute missing ``Age`` values from the median age of the same ``title``.

    For each title the rounded median age is computed and used to fill rows
    whose ``Age`` is missing. Titles for which no age is known at all have a
    NaN median; those rows stay NaN after the first pass and are then filled
    with the overall mean age.

    The previous version replaced a NaN median with 0 before filling, which
    gave such passengers an age of 0 and made the mean fallback unreachable.

    Returns a new frame (the merge resets the index); the helper column is
    dropped before returning.
    """
    title_age_median = (
        all_df[["title", "Age"]].groupby("title").median().round().reset_index()
    )
    title_age_median.columns = ["title", "title_age_median"]
    all_df = pd.merge(all_df, title_age_median, on="title", how="left")

    # First pass: per-title median. A NaN median is deliberately kept as NaN
    # so the fallback below can handle it.
    age_missing = all_df["Age"].isnull()
    all_df.loc[age_missing, "Age"] = all_df.loc[age_missing, "title_age_median"]
    all_df = all_df.drop(columns=["title_age_median"])

    # Second pass: titles with no known age fall back to the overall mean.
    all_df["Age"] = all_df["Age"].fillna(all_df["Age"].mean())
    return all_df


def get_preprocessed_dataset_4(all_df):
    """Add family-size features and drop columns not used for training.

    ``family_num`` is ``Parch + SibSp`` and ``alone`` is 1.0 when
    ``family_num`` is 0, otherwise 0.0. Both columns are added to the passed-in
    frame in place; the returned frame is a copy without the identifier and
    free-text columns.
    """
    relatives = all_df["Parch"] + all_df["SibSp"]
    all_df["family_num"] = relatives
    all_df["alone"] = (relatives == 0).astype(float)

    unused_columns = ["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"]
    return all_df.drop(unused_columns, axis=1)


def get_preprocessed_dataset_5(all_df):
    """Collapse rare titles into ``"other"`` and fill missing ``Embarked``.

    Only "Mr", "Miss", "Mrs" and "Master" are kept as titles. Missing
    ``Embarked`` values become the literal category ``"missing"``. The frame is
    modified in place and returned.
    """
    main_titles = ["Mr", "Miss", "Mrs", "Master"]
    is_main_title = all_df["title"].isin(main_titles)
    all_df["title"] = all_df["title"].where(is_main_title, "other")
    all_df["Embarked"] = all_df["Embarked"].fillna("missing")
    return all_df


def get_preprocessed_dataset_6(all_df):
    """Label-encode every ``object``-dtype column in place and return the frame.

    A fresh ``LabelEncoder`` is fitted per column on all rows passed in (the
    caller passes the combined train + test frame).
    """
    object_columns = [name for name, dtype in all_df.dtypes.items() if dtype == "object"]
    for name in object_columns:
        all_df[name] = LabelEncoder().fit_transform(all_df[name])
    return all_df


# --- 3. 데이터 로드 및 DataLoader 생성 함수 (California Housing 스타일로 통합) ---
def get_data():
    """Load the Titanic CSVs, preprocess them and build train/validation loaders.

    ``train.csv`` and ``test.csv`` are read from the current working directory.
    Both frames are concatenated so the preprocessing steps see all rows, then
    the rows with a non-null ``Survived`` are used as the labelled dataset. The
    dataset is split 80/20 at random.

    Returns:
        ``(train_data_loader, validation_data_loader)``. The training loader
        uses ``wandb.config.batch_size`` with shuffling; the validation loader
        yields the whole validation split as a single batch.

    Raises:
        FileNotFoundError: re-raised after printing a hint when a CSV is missing.
    """
    working_dir = os.getcwd()
    train_data_path = os.path.join(working_dir, "train.csv")
    test_data_path = os.path.join(working_dir, "test.csv")

    try:
        train_df = pd.read_csv(train_data_path)
        test_df = pd.read_csv(test_data_path)
    except FileNotFoundError:
        print("에러: 'train.csv' 또는 'test.csv' 파일을 찾을 수 없습니다. 경로를 확인해 주세요.")
        raise

    # Run the preprocessing steps in order on the combined frame.
    all_df = pd.concat([train_df, test_df], sort=False)
    preprocessing_steps = (
        get_preprocessed_dataset_1,
        get_preprocessed_dataset_2,
        get_preprocessed_dataset_3,
        get_preprocessed_dataset_4,
        get_preprocessed_dataset_5,
        get_preprocessed_dataset_6,
    )
    for step in preprocessing_steps:
        all_df = step(all_df)

    # Labelled rows are those whose Survived value is present.
    has_label = all_df["Survived"].notnull()
    train_X = all_df[has_label].drop("Survived", axis=1).reset_index(drop=True)
    train_y = train_df["Survived"]

    dataset = TitanicDataset(train_X.values, train_y.values)
    print(dataset)

    # 80% training, 20% validation.
    train_size = int(len(dataset) * 0.8)
    validation_size = len(dataset) - train_size
    train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])

    print(f"Train/Validation Split: {len(train_dataset)} / {len(validation_dataset)}")

    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=wandb.config.batch_size,
        shuffle=True,
    )
    # Validation is evaluated as one batch covering the whole split.
    validation_data_loader = DataLoader(
        dataset=validation_dataset,
        batch_size=validation_size,
    )
    return train_data_loader, validation_data_loader


# --- 4. 모델 정의 (이전과 동일) ---
class MyModel(nn.Module):
    """Two-hidden-layer MLP with ReLU activations that emits one logit per sample.

    The hidden layer widths are read from ``wandb.config.n_hidden_unit_list``
    (first two entries) when the model is constructed.
    """

    N_INPUT = 10
    N_OUTPUT = 1

    def __init__(self):
        super().__init__()

        hidden_sizes = wandb.config.n_hidden_unit_list
        layers = [
            nn.Linear(self.N_INPUT, hidden_sizes[0]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.ReLU(),
            nn.Linear(hidden_sizes[1], self.N_OUTPUT),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


def get_model_and_optimizer():
    """Create a ``MyModel`` and an SGD optimizer using ``wandb.config.learning_rate``.

    Returns:
        ``(model, optimizer)``.
    """
    network = MyModel()
    learning_rate = wandb.config.learning_rate
    return network, optim.SGD(network.parameters(), lr=learning_rate)


# --- 5. 학습 루프 (이전과 동일) ---
def training_loop(model, optimizer, train_data_loader, validation_data_loader):
    """Train ``model`` for ``wandb.config.epochs`` epochs and log metrics per epoch.

    Each epoch runs one pass over the training loader (with parameter updates)
    and one pass over the validation loader (without gradients). The loss is
    ``BCEWithLogitsLoss``; accuracy thresholds ``sigmoid(logit)`` at 0.5. The
    per-epoch averages are sent to ``wandb.log`` and printed every 100 epochs.
    """
    loss_fn = nn.BCEWithLogitsLoss()
    total_epochs = wandb.config.epochs
    next_print_epoch = 100

    for epoch in range(1, total_epochs + 1):
        # ---- training pass ----
        model.train()
        train_loss_sum, train_batches = 0.0, 0
        train_hits, train_seen = 0, 0

        for batch in train_data_loader:
            features = batch['input']
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            labels = batch['target'].float().unsqueeze(1)

            logits = model(features)
            batch_loss = loss_fn(logits, labels)
            train_loss_sum += batch_loss.item()
            train_batches += 1

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            predicted = (torch.sigmoid(logits) > 0.5).long()
            train_hits += (predicted == labels.long()).sum().item()
            train_seen += labels.size(0)

        # ---- validation pass ----
        model.eval()
        val_loss_sum, val_batches = 0.0, 0
        val_hits, val_seen = 0, 0

        with torch.no_grad():
            for batch in validation_data_loader:
                features = batch['input']
                labels = batch['target'].float().unsqueeze(1)

                logits = model(features)
                val_loss_sum += loss_fn(logits, labels).item()
                val_batches += 1

                predicted = (torch.sigmoid(logits) > 0.5).long()
                val_hits += (predicted == labels.long()).sum().item()
                val_seen += labels.size(0)

        # ---- epoch metrics ----
        train_loss_avg = train_loss_sum / train_batches
        val_loss_avg = val_loss_sum / val_batches
        train_accuracy = train_hits / train_seen
        val_accuracy = val_hits / val_seen

        wandb.log({
            "Epoch": epoch,
            "Training loss": train_loss_avg,
            "Validation loss": val_loss_avg,
            "Training accuracy": train_accuracy,
            "Validation accuracy": val_accuracy,
        })

        if epoch < next_print_epoch:
            continue
        print(
            f"Epoch {epoch}, "
            f"Train Loss {train_loss_avg:.4f}, "
            f"Val Loss {val_loss_avg:.4f}, "
            f"Train Acc {train_accuracy:.4f}, "
            f"Val Acc {val_accuracy:.4f}"
        )
        next_print_epoch += 100


# --- 6. 메인 실행 함수 (이전과 동일) ---
def main(args):
    """Start a wandb run, build data/model/optimizer, train, and finish the run.

    Args:
        args: namespace providing ``epochs``, ``batch_size`` and ``wandb``.
            When ``args.wandb`` is falsy, wandb is initialised in
            ``"disabled"`` mode.
    """
    run_name = datetime.now().astimezone().strftime('%Y-%m-%d_%H-%M-%S')

    config = dict(
        epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=1e-3,  # initial learning rate
        n_hidden_unit_list=[32, 16],  # hidden layer widths
    )
    wandb_mode = "online" if args.wandb else "disabled"

    wandb.init(
        mode=wandb_mode,
        project="titanic_survival_prediction",
        notes="Titanic Classification with Preprocessing",
        tags=["titanic", "classification"],
        name=run_name,
        config=config,
    )
    print(args)
    print(wandb.config)

    # get_data and the model both read wandb.config, so they run after init.
    train_data_loader, validation_data_loader = get_data()
    linear_model, optimizer = get_model_and_optimizer()

    print("-" * 50)
    print(linear_model)
    print("#" * 50)

    training_loop(
        model=linear_model,
        optimizer=optimizer,
        train_data_loader=train_data_loader,
        validation_data_loader=validation_data_loader,
    )
    wandb.finish()


# --- 7. 명령줄 인수 처리 (WandB 활성화 수정) ---
if __name__ == "__main__":
  # Optional: switch the working directory to this script's folder so the CSV
  # files resolve relative to it (kept disabled).
  # os.chdir(os.path.dirname(os.path.abspath(__file__))) 
  
  parser = argparse.ArgumentParser()

  # --wandb / --no-wandb flag; defaults to False and is overridden below.
  parser.add_argument(
    "--wandb", action=argparse.BooleanOptionalAction, default=False, help="True or False"
  )

  parser.add_argument(
    "-b", "--batch_size", type=int, default=64, help="Batch size (int, default: 64)"
  )

  parser.add_argument(
    "-e", "--epochs", type=int, default=500, help="Number of training epochs (int, default: 500)"
  )

  # 1. Parse an empty argument list so only the declared defaults are used
  #    (the process's real command line is ignored).
  args = parser.parse_args([])

  # 2. Force WandB on by overwriting the parsed attribute; this is how logging
  #    is enabled when the cell is run from a notebook.
  args.wandb = True

  print(f"[NOTE] WandB has been manually set to: {args.wandb}")

  main(args)

[NOTE] WandB has been manually set to: True


wandb: Currently logged in as: le993939 (le993939-korea-university-of-technology-and-education) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Namespace(wandb=True, batch_size=64, epochs=500)
{'epochs': 500, 'batch_size': 64, 'learning_rate': 0.001, 'n_hidden_unit_list': [32, 16]}
Data Size: 891, Input Shape: torch.Size([891, 10]), Target Shape: torch.Size([891])
Train/Validation Split: 712 / 179
--------------------------------------------------
MyModel(
  (model): Sequential(
    (0): Linear(in_features=10, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
  )
)
##################################################
Epoch 100, Train Loss 0.6089, Val Loss 0.6021, Train Acc 0.6896, Val Acc 0.7095
Epoch 200, Train Loss 0.5857, Val Loss 0.5921, Train Acc 0.7107, Val Acc 0.7039
Epoch 300, Train Loss 0.5807, Val Loss 0.5903, Train Acc 0.7149, Val Acc 0.7151
Epoch 400, Train Loss 0.5917, Val Loss 0.5876, Train Acc 0.6966, Val Acc 0.7151
Epoch 500, Train Loss 0.5684, Val Loss 0.5749, Train Acc 0.7037, Val Acc 0.7039

0,1
Epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇████
Training accuracy,▁▁▁▂▂▁▁▃▂▂▅▅▆▆█▇▇▇▆▆▆▇▆█▆▆▆▅▅▅▅▅▅▅▅▅▅▅▅▅
Training loss,█▃▃▂▂▂▂▂▃▃▂▃▂▂▂▃▁▂▁▂▁▁▂▁▁▁▁▁▂▂▁▁▁▁▁▂▂▁▂▂
Validation accuracy,▄▄▅▄▄▄▅▄▄▂▁▁▅█▅▅▄▅▄▅▅▅▄▅▅▅▅▅▇▅▇▂▁▄▂▅▁▂▄▂
Validation loss,█▅▇▆▆▅▅▅▅▅▅▅▄▅▅▄▃▄▄▄▄▃▄▂▅▄▃▄▃▃▁▄▂▄▁▄▄▄▁▁

0,1
Epoch,500.0
Training accuracy,0.70365
Training loss,0.56842
Validation accuracy,0.70391
Validation loss,0.57495


### 해석
- all_df를 통해 학습 데이터와 테스트 데이터를 합친 통합 데이터프레임을 입력받을 수 있음
- reset_index() : groupby 결과를 일반 DataFrame 형태로 되돌림
- .loc : Pandas의 인덱서 중 하나로 데이터프레임의 특정 위치에 접근할 수 있다. [행 선택 조건, 열 선택]의 형식을 가진다.
- .round() : 소수점 이하를 반올림해서 정수 값으로 만듦
- ~ : 논리 부정(NOT) 연산자
- LabelEncoder : sklearn.preprocessing의 클래스로 문자열로 되어 있는 범주(label)를 숫자(label index)로 바꿔주는 인코더
- .fit() : 데이터 안에 있는 고유한 값(unique categories) 을 학습
- .transform() : 각 값을 고유한 정수로 바꿈
- .fit_transform() : 둘을 한 번에 수행하는 메서드
- pd.concat()을 통해 학습 데이터와 테스트 데이터를 합침 => 이는 전처리를 train/test 모두 같은 기준으로 일관되게 수행하기 위함임
- shuffle=True : 매 epoch마다 데이터 순서를 섞어 학습 안정성 향상
- nn.Sequential : 여러 개의 레이어(층)를 순차적으로 연결하여 신경망 구조를 정의
- optim.SGD : 확률적 경사 하강법 최적화 도구를 정의
- lr=wandb.config.learning_rate : 학습률 설정
- nn.BCEWithLogitsLoss() : 손실 함수. sigmoid + binary cross entropy를 한 번에 계산.
- torch.sigmoid(output_train) > 0.5를 통해 로짓 출력에 시그모이드를 적용하여 확률을 구하고, 0.5를 기준으로 이진 예측을 수행하여 정확도 측정
- with torch.no_grad()를 통해 검증 시에는 기울기 계산을 비활성화 → 메모리와 시간 절약
- wandb.log : 각 epoch의 주요 지표를 wandb 대시보드에 기록

### 취득한 기술적 사항/고찰 내용
- get_preprocessed_dataset_1(all_df) 함수는 Fare의 결측치를 같은 Pclass의 평균 요금으로 채운다. 이를 통해 데이터가 깨끗해지고, 머신러닝 모델 학습 시 결측치로 인한 오류를 방지할 수 있음
- get_preprocessed_dataset_2 함수를 통해 Name 컬럼에서 호칭(Title) 정보를 추출하여 나이(Age)의 결측치를 채울 수 있음
- get_preprocessed_dataset_3 함수에서는 title별로 Age의 중앙값(median)을 구하는데, 이는 중앙값이 평균(mean)보다 극단값(이상치)의 영향을 덜 받기 때문이다.
- get_preprocessed_dataset_4 함수는 직계 가족의 총 인원수를 계산하는데, 이는 가족 규모가 생존 확률과 강한 상관관계가 있는 것으로 알려져 있기 때문이다.
- get_preprocessed_dataset_5 함수는 데이터 불균형을 줄이고, 모델의 일반화 성능을 높이기 위해 나머지는 모두 "other"로 묶는다. 또한 Embarked의 결측치를 "missing"으로 채워 카테고리형 변수를 단순하고 안정적으로 정리한다.
- get_preprocessed_dataset_6 함수는 신경망(PyTorch 모델)이 문자열 데이터를 직접 처리할 수 없으므로, 학습이 가능하도록 모든 범주형 데이터를 수치화하는 Label Encoding을 수행한다. 다만 LabelEncoder가 정수를 할당하는 과정에서 값 사이에 순서(Ordinality)가 있는 것처럼 모델에게 잘못된 정보를 전달할 수 있다는 문제점이 존재한다. 따라서 순서가 없는 범주형 데이터(Nominal Data)의 경우, LabelEncoder보다 원-핫 인코딩(One-Hot Encoding)을 사용하는 것이 더 안전하다.

## [요구사항 2] Activation Function과 Batch Size 변경 및 선택하기

In [2]:
import os
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from datetime import datetime
import wandb
import argparse
from sklearn.preprocessing import LabelEncoder
import numpy as np
import plotly.graph_objects as go 

# 출력 옵션 설정
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)


# --- 1. Titanic Dataset 클래스 정의 ---
class TitanicDataset(Dataset):
    """Labelled Titanic samples; items are ``{'input': ..., 'target': ...}`` dicts."""

    def __init__(self, X, y):
        # Float features, integer (long) labels.
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        feature = self.X[idx]
        target = self.y[idx]
        return dict(input=feature, target=target)

    def __str__(self):
        return (
            f"Data Size: {len(self.X)}, "
            f"Input Shape: {self.X.shape}, "
            f"Target Shape: {self.y.shape}"
        )


class TitanicTestDataset(Dataset):
    """Unlabelled Titanic samples; items are ``{'input': ...}`` dicts."""

    def __init__(self, X):
        self.X = torch.FloatTensor(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        feature = self.X[idx]
        return dict(input=feature)

    def __str__(self):
        return f"Data Size: {len(self.X)}, " + f"Input Shape: {self.X.shape}"


# --- 2. 전처리 함수들 ---
def get_preprocessed_dataset_1(all_df):
    """Fill missing ``Fare`` values with the mean fare of the row's ``Pclass``.

    The per-class means are merged onto the frame, so the result has a fresh
    0..n-1 index. The helper column is removed before returning.
    """
    class_fare = all_df.groupby("Pclass")["Fare"].mean().rename("Fare_mean").reset_index()
    merged = pd.merge(all_df, class_fare, on="Pclass", how="left")

    fare_missing = merged["Fare"].isna()
    merged.loc[fare_missing, "Fare"] = merged["Fare_mean"]
    return merged.drop(columns=["Fare_mean"])


def get_preprocessed_dataset_2(all_df):
    """Split ``Name`` into stripped ``family_name``, ``title`` and ``name`` columns.

    The name is split at most twice on ``,`` or ``.`` and the pieces are
    appended to the frame as new columns.
    """
    parts = all_df["Name"].str.split("[,.]", n=2, expand=True)
    parts.columns = ["family_name", "title", "name"]
    for column in list(parts.columns):
        parts[column] = parts[column].str.strip()
    return pd.concat([all_df, parts], axis=1)


def get_preprocessed_dataset_3(all_df):
    """Impute missing ``Age`` values from the median age of the same ``title``.

    For each title the rounded median age is computed and used to fill rows
    whose ``Age`` is missing. Titles with no known age at all have a NaN
    median; those rows stay NaN after the first pass and are then filled with
    the overall mean age.

    The previous version replaced a NaN median with 0 before filling, which
    gave such passengers an age of 0 and made the mean fallback unreachable.

    Returns a new frame (the merge resets the index); the helper column is
    dropped before returning.
    """
    title_age_median = (
        all_df[["title", "Age"]].groupby("title").median().round().reset_index()
    )
    title_age_median.columns = ["title", "title_age_median"]
    all_df = pd.merge(all_df, title_age_median, on="title", how="left")

    # First pass: per-title median. A NaN median is deliberately kept as NaN.
    age_missing = all_df["Age"].isnull()
    all_df.loc[age_missing, "Age"] = all_df.loc[age_missing, "title_age_median"]
    all_df = all_df.drop(columns=["title_age_median"])

    # Second pass: titles with no known age fall back to the overall mean.
    all_df["Age"] = all_df["Age"].fillna(all_df["Age"].mean())
    return all_df


def get_preprocessed_dataset_4(all_df):
    """Add ``family_num`` / ``alone`` features and drop unused columns.

    ``family_num`` is ``Parch + SibSp``; ``alone`` is 1.0 when ``family_num``
    is 0 and 0.0 otherwise. Both are added to the passed-in frame in place; the
    returned frame is a copy without the identifier and free-text columns.
    """
    relatives = all_df["Parch"] + all_df["SibSp"]
    all_df["family_num"] = relatives
    all_df["alone"] = (relatives == 0).astype(float)

    unused_columns = ["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"]
    return all_df.drop(unused_columns, axis=1)


def get_preprocessed_dataset_5(all_df):
    """Collapse rare titles into ``"other"`` and fill missing ``Embarked``.

    Titles other than "Mr", "Miss", "Mrs" and "Master" are replaced, and
    missing ``Embarked`` values become ``"missing"``. The frame is modified in
    place and returned.
    """
    main_titles = ["Mr", "Miss", "Mrs", "Master"]
    is_rare_title = ~all_df["title"].isin(main_titles)
    all_df["title"] = all_df["title"].mask(is_rare_title, "other")
    all_df["Embarked"] = all_df["Embarked"].fillna("missing")
    return all_df


def get_preprocessed_dataset_6(all_df):
    """Label-encode every ``object``-dtype column in place and return the frame.

    A fresh ``LabelEncoder`` is fitted per column on all rows passed in.
    """
    object_columns = [name for name, dtype in all_df.dtypes.items() if dtype == "object"]
    for name in object_columns:
        all_df[name] = LabelEncoder().fit_transform(all_df[name])
    return all_df


# --- 3. 데이터 로드 ---
def get_data():
    """Load the Titanic CSVs, preprocess them and build train/validation loaders.

    ``train.csv`` and ``test.csv`` are read from the current working directory
    and concatenated so the preprocessing steps see all rows. Rows with a
    non-null ``Survived`` form the labelled dataset, which is split 80/20 at
    random.

    Returns:
        ``(train_loader, val_loader)``. The training loader uses
        ``wandb.config.batch_size`` with shuffling; the validation loader
        yields the whole validation split as one batch.
    """
    working_dir = os.getcwd()
    train_df = pd.read_csv(os.path.join(working_dir, "train.csv"))
    test_df = pd.read_csv(os.path.join(working_dir, "test.csv"))

    all_df = pd.concat([train_df, test_df], sort=False)
    preprocessing_steps = (
        get_preprocessed_dataset_1,
        get_preprocessed_dataset_2,
        get_preprocessed_dataset_3,
        get_preprocessed_dataset_4,
        get_preprocessed_dataset_5,
        get_preprocessed_dataset_6,
    )
    for step in preprocessing_steps:
        all_df = step(all_df)

    # Labelled rows are those whose Survived value is present.
    has_label = all_df["Survived"].notnull()
    train_X = all_df[has_label].drop("Survived", axis=1).reset_index(drop=True)
    train_y = train_df["Survived"]

    dataset = TitanicDataset(train_X.values, train_y.values)
    train_size = int(len(dataset) * 0.8)
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(
        train_dataset,
        batch_size=wandb.config.batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(val_dataset, batch_size=val_size)
    return train_loader, val_loader


# --- 4. 모델 정의 ---
def get_activation(name):
    """Return a new activation module for a case-insensitive ``name``.

    Supported names: ``relu``, ``elu``, ``sigmoid`` and the LeakyReLU aliases
    ``leaky_relu`` / ``leaky-relu`` / ``lrelu`` (negative slope 0.01).

    Raises:
        ValueError: if the lowercased name is not recognised.
    """
    key = name.lower()

    def make_leaky_relu():
        return nn.LeakyReLU(0.01)

    factories = {
        "relu": nn.ReLU,
        "elu": nn.ELU,
        "leaky_relu": make_leaky_relu,
        "leaky-relu": make_leaky_relu,
        "lrelu": make_leaky_relu,
        "sigmoid": nn.Sigmoid,
    }
    factory = factories.get(key)
    if factory is None:
        raise ValueError(f"Unknown activation: {key}")
    return factory()


class MyModel(nn.Module):
    """Two-hidden-layer MLP with a configurable activation; emits one logit per sample.

    Hidden widths come from the first two entries of
    ``wandb.config.n_hidden_unit_list``. ``activation_name`` is resolved with
    ``get_activation`` once per hidden layer.
    """

    N_INPUT = 10
    N_OUTPUT = 1

    def __init__(self, activation_name="relu"):
        super().__init__()
        hidden_sizes = wandb.config.n_hidden_unit_list
        layers = [
            nn.Linear(self.N_INPUT, hidden_sizes[0]),
            get_activation(activation_name),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            get_activation(activation_name),
            nn.Linear(hidden_sizes[1], self.N_OUTPUT),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)


def get_model_and_optimizer(activation_name):
    """Create a ``MyModel`` with the given activation plus an SGD optimizer.

    The learning rate is read from ``wandb.config.learning_rate``.

    Returns:
        ``(model, optimizer)``.
    """
    network = MyModel(activation_name)
    learning_rate = wandb.config.learning_rate
    return network, optim.SGD(network.parameters(), lr=learning_rate)


# --- 5. 학습 루프 ---
def training_loop(model, optimizer, train_loader, val_loader):
    """Train for ``wandb.config.epochs`` epochs, logging and collecting loss history.

    Each epoch runs a training pass with parameter updates and a validation
    pass without gradients. The loss is ``BCEWithLogitsLoss``; accuracy
    thresholds ``sigmoid(logit)`` at 0.5. Per-epoch metrics go to ``wandb.log``.

    Returns:
        dict with ``train_loss_history`` and ``val_loss_history`` (one average
        loss per epoch), plus ``best_val_loss`` and ``best_val_acc``.
    """
    loss_fn = nn.BCEWithLogitsLoss()
    total_epochs = wandb.config.epochs
    train_hist, val_hist = [], []
    best_val_acc = -1.0
    best_val_loss = float("inf")

    for epoch in range(1, total_epochs + 1):
        # ---- training pass ----
        model.train()
        train_loss_sum, train_hits, train_seen = 0.0, 0, 0
        for batch in train_loader:
            features = batch['input']
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            labels = batch['target'].float().unsqueeze(1)

            optimizer.zero_grad()
            logits = model(features)
            batch_loss = loss_fn(logits, labels)
            batch_loss.backward()
            optimizer.step()

            train_loss_sum += batch_loss.item()
            predicted = (torch.sigmoid(logits) > 0.5).long()
            train_hits += (predicted == labels.long()).sum().item()
            train_seen += labels.size(0)
        train_loss_avg = train_loss_sum / len(train_loader)
        train_hist.append(train_loss_avg)

        # ---- validation pass ----
        model.eval()
        val_loss_sum, val_hits, val_seen = 0.0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                features = batch['input']
                labels = batch['target'].float().unsqueeze(1)
                logits = model(features)
                val_loss_sum += loss_fn(logits, labels).item()
                predicted = (torch.sigmoid(logits) > 0.5).long()
                val_hits += (predicted == labels.long()).sum().item()
                val_seen += labels.size(0)
        val_loss_avg = val_loss_sum / len(val_loader)
        val_hist.append(val_loss_avg)

        train_acc = train_hits / train_seen
        val_acc = val_hits / val_seen

        # Track the best validation metrics seen so far.
        best_val_acc = max(best_val_acc, val_acc)
        best_val_loss = min(best_val_loss, val_loss_avg)

        wandb.log({
            "Epoch": epoch,
            "Training loss": train_loss_avg,
            "Validation loss": val_loss_avg,
            "Training accuracy": train_acc,
            "Validation accuracy": val_acc,
        })

    return dict(
        train_loss_history=train_hist,
        val_loss_history=val_hist,
        best_val_loss=best_val_loss,
        best_val_acc=best_val_acc,
    )


# --- 6. 메인 ---
def main(args):
    """Train one model per activation function and log comparison charts to wandb.

    A single wandb run is started; the same train/validation loaders are reused
    for "relu", "elu", "leaky_relu" and "sigmoid". The per-epoch loss histories
    are then logged twice: as Plotly figures and as ``wandb.plot.line_series``
    charts.

    NOTE(review): every activation is trained inside the same run and
    ``training_loop`` logs under the same metric keys, so the per-epoch charts
    contain the four trainings back to back - confirm this is intended.

    Args:
        args: namespace providing ``epochs``, ``batch_size`` and ``wandb``.
    """
    run_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    config = dict(
        epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=1e-3,
        n_hidden_unit_list=[32, 16],
    )

    wandb.init(
        mode="online" if args.wandb else "disabled",
        project="titanic_survival_prediction",
        notes="Activation function comparison (Plotly + backup line_series)",
        tags=["titanic", "activation-compare", "plotly"],
        name=run_name,
        config=config,
        settings=wandb.Settings(_disable_stats=True),  # disable system metrics
    )

    train_loader, val_loader = get_data()
    activations = ["relu", "elu", "leaky_relu", "sigmoid"]

    train_hist_by_act, val_hist_by_act = {}, {}
    for act in activations:
        wandb.config.update({"activation": act}, allow_val_change=True)
        model, opt = get_model_and_optimizer(act)
        print(f"\n[Activation: {act}]")
        history = training_loop(model, opt, train_loader, val_loader)
        train_hist_by_act[act] = history["train_loss_history"]
        val_hist_by_act[act] = history["val_loss_history"]

    epochs = list(range(1, wandb.config.epochs + 1))
    # One fixed colour per activation so the lines are easy to tell apart.
    color_map = {
        "relu": "#1f77b4",        # blue
        "elu": "#ff7f0e",         # orange
        "leaky_relu": "#2ca02c",  # green
        "sigmoid": "#d62728",     # red
    }

    def build_loss_figure(title, history_by_act):
        """Plotly figure with one loss curve per activation."""
        figure = go.Figure()
        for name in activations:
            figure.add_trace(go.Scatter(
                x=epochs,
                y=history_by_act[name],
                mode="lines",
                name=name,
                line=dict(color=color_map[name], width=2),
            ))
        figure.update_layout(
            title=title,
            xaxis_title="Epoch",
            yaxis_title="Loss",
            legend_title="Activation",
            template="plotly_white",
        )
        return figure

    def build_line_series(title, history_by_act):
        """wandb line_series chart with one loss curve per activation."""
        return wandb.plot.line_series(
            xs=epochs,
            ys=[history_by_act[name] for name in activations],
            keys=activations,
            title=title,
            xname="Epoch",
        )

    # Plotly figures, wrapped so wandb can log them.
    fig_val = build_loss_figure("Validation Loss by Activation", val_hist_by_act)
    fig_tr = build_loss_figure("Training Loss by Activation", train_hist_by_act)
    wandb.log({
        "val_loss_compare_plotly": wandb.Plotly(fig_val),
        "train_loss_compare_plotly": wandb.Plotly(fig_tr),
    }, commit=True)

    # The same curves again as native wandb line_series charts.
    wandb.log({
        "val_loss_compare_backup": build_line_series(
            "Validation Loss by Activation (backup)", val_hist_by_act
        ),
        "train_loss_compare_backup": build_line_series(
            "Training Loss by Activation (backup)", train_hist_by_act
        ),
    })

    wandb.finish()


# --- 7. 실행 ---
if __name__ == "__main__":
    # Declare the CLI options, then parse an empty argument list so only the
    # declared defaults are used (the process's real command line is ignored).
    parser = argparse.ArgumentParser()
    parser.add_argument("--wandb", action=argparse.BooleanOptionalAction, default=False)
    parser.add_argument("-b", "--batch_size", type=int, default=64)
    parser.add_argument("-e", "--epochs", type=int, default=500)
    args = parser.parse_args([])
    args.wandb = True  # force WandB on when run from Jupyter
    print(f"[NOTE] WandB manually set to: {args.wandb}")
    main(args)


[NOTE] WandB manually set to: True



[Activation: relu]

[Activation: elu]

[Activation: leaky_relu]

[Activation: sigmoid]


0,1
Epoch,▁▃▄▄▄▅▆▆▇█▂▂▃▃▄▆▆▇▇██▁▂▃▃▄▅▆▆▇█▁▃▃▄▅▆▆▆█
Training accuracy,▅▅▅▆▅▅▇▇▇▇▅█▇███▆▆▇▆▆▆▇▆▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Training loss,▆▅▅▃▄▅▃▃▄▃▅▂▃▂▂▂▃▂▂▁▅▄▄▄▃▄▃▃▃▃▃▂▃█▇▇▇███
Validation accuracy,▇▇▇▇▇▆▇▆▇▆██▇▆█▆▇▅▇▇▇▇▇▇▆▆▆▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation loss,▂▂▂▂▃▂▂▂▂▂▂▁▁▁▃▃▃▂▂▂▂▂▂▂▂█▇▇▇▇▇▇▇▇▇▆▆▆▆▆

0,1
Epoch,500.0
Training accuracy,0.60674
Training loss,0.66639
Validation accuracy,0.65363
Validation loss,0.64708


### 해석
- get_activation 함수를 만들어 각각 ELU, ReLU, Leaky ReLU, Sigmoid에 해당하는 PyTorch 활성화 함수 모듈을 반환
- activation_name="relu" : MyModel 클래스에서 인수로 활성화 함수 이름을 받게 하기 위해 추가
- get_activation(activation_name) : 동적으로 활성화 함수 적용
- get_model_and_optimizer(activation_name) 함수 : activation_name으로 활성화 함수 이름 받기
- train_hist_by_act[act] = metrics["train_loss_history"] / val_hist_by_act[act] = metrics["val_loss_history"]
  : 각 모델의 학습 완료 후 손실 이력 저장
- wandb.Plotly(figure) : 생성된 Plotly 그래프를 WandB 대시보드의 Media 탭에 로깅
- wandb.plot.line_series : W&B의 기본 차트용(line plot) 객체를 생성하는 함수
- ys=[val_hist_by_act[a] for a in activations] : 각 활성화 함수의 loss 변화 리스트를 y축 값에 넣기

### 취득한 기술적 사항/고찰 내용
- 한 차트 안에 ELU, ReLU, Leaky ReLU, Sigmoid를 비교하려니 선의 색이 같아 비교하기 어려운 문제가 발생했다. 이를 해결하기 위해 Plotly를 사용했다.
- notebook 환경에서는 Plotly 그래프가 표시되지 않을 수 있으므로, Charts 탭에서도 확인할 수 있도록 wandb.plot.line_series 기반의 백업 시각화를 함께 로깅해야 한다는 점을 알게 되었다.
- 현재 차트에서 ELU가 가장 빠르게 수렴하는 그래프를 보여주고 있으므로, 더 나은 성능을 산출하는 Activation Function은 ELU이다.

In [3]:
import os
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from datetime import datetime
import wandb
import argparse
from sklearn.preprocessing import LabelEncoder
import numpy as np
import plotly.graph_objects as go  # 인터랙티브 차트

# 출력 옵션
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# ---------------------------
# 1) Dataset
# ---------------------------
class TitanicDataset(Dataset):
    """Labelled Titanic samples; items are ``{'input': ..., 'target': ...}`` dicts."""

    def __init__(self, X, y):
        # Float features, integer (long) labels.
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        feature = self.X[idx]
        target = self.y[idx]
        return dict(input=feature, target=target)

class TitanicTestDataset(Dataset):
    """Unlabelled Titanic samples; items are ``{'input': ...}`` dicts."""

    def __init__(self, X):
        self.X = torch.FloatTensor(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        feature = self.X[idx]
        return dict(input=feature)

# ---------------------------
# 2) 전처리
# ---------------------------
def get_preprocessed_dataset_1(all_df):
    """Fill missing ``Fare`` values with the mean fare of the row's ``Pclass``.

    The per-class means are merged onto the frame, so the result has a fresh
    0..n-1 index. The helper column is removed before returning.
    """
    class_fare = all_df.groupby("Pclass")["Fare"].mean().rename("Fare_mean").reset_index()
    merged = pd.merge(all_df, class_fare, on="Pclass", how="left")

    fare_missing = merged["Fare"].isna()
    merged.loc[fare_missing, "Fare"] = merged["Fare_mean"]
    return merged.drop(columns=["Fare_mean"])

def get_preprocessed_dataset_2(all_df):
    """Split ``Name`` into stripped ``family_name``, ``title`` and ``name`` columns.

    The name is split at most twice on ``,`` or ``.`` and the pieces are
    appended to the frame as new columns.
    """
    parts = all_df["Name"].str.split("[,.]", n=2, expand=True)
    parts.columns = ["family_name", "title", "name"]
    for column in list(parts.columns):
        parts[column] = parts[column].str.strip()
    return pd.concat([all_df, parts], axis=1)

def get_preprocessed_dataset_3(all_df):
    """Impute missing ``Age`` values from the median age of the same ``title``.

    For each title the rounded median age is computed and used to fill rows
    whose ``Age`` is missing. Titles with no known age at all have a NaN
    median; those rows stay NaN after the first pass and are then filled with
    the overall mean age.

    The previous version replaced a NaN median with 0 before filling, which
    gave such passengers an age of 0 and made the mean fallback unreachable.

    Returns a new frame (the merge resets the index); the helper column is
    dropped before returning.
    """
    title_age_median = (
        all_df[["title", "Age"]].groupby("title").median().round().reset_index()
    )
    title_age_median.columns = ["title", "title_age_median"]
    all_df = pd.merge(all_df, title_age_median, on="title", how="left")

    # First pass: per-title median. A NaN median is deliberately kept as NaN.
    age_missing = all_df["Age"].isnull()
    all_df.loc[age_missing, "Age"] = all_df.loc[age_missing, "title_age_median"]
    all_df = all_df.drop(columns=["title_age_median"])

    # Second pass: titles with no known age fall back to the overall mean.
    all_df["Age"] = all_df["Age"].fillna(all_df["Age"].mean())
    return all_df

def get_preprocessed_dataset_4(all_df):
    """Add ``family_num`` / ``alone`` features and drop unused columns.

    ``family_num`` is ``Parch + SibSp``; ``alone`` is 1.0 when ``family_num``
    is 0 and 0.0 otherwise. Both are added to the passed-in frame in place; the
    returned frame is a copy without the identifier and free-text columns.
    """
    relatives = all_df["Parch"] + all_df["SibSp"]
    all_df["family_num"] = relatives
    all_df["alone"] = (relatives == 0).astype(float)

    unused_columns = ["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"]
    return all_df.drop(unused_columns, axis=1)

def get_preprocessed_dataset_5(all_df):
    """Collapse rare titles into ``"other"`` and fill missing ``Embarked``.

    Titles other than "Mr", "Miss", "Mrs" and "Master" are replaced, and
    missing ``Embarked`` values become ``"missing"``. The frame is modified in
    place and returned.
    """
    main_titles = ["Mr", "Miss", "Mrs", "Master"]
    is_main_title = all_df["title"].isin(main_titles)
    all_df["title"] = all_df["title"].where(is_main_title, "other")
    all_df["Embarked"] = all_df["Embarked"].fillna("missing")
    return all_df

def get_preprocessed_dataset_6(all_df):
    """Label-encode every ``object``-dtype column in place and return the frame.

    A fresh ``LabelEncoder`` is fitted per column on all rows passed in.
    """
    object_columns = [name for name, dtype in all_df.dtypes.items() if dtype == "object"]
    for name in object_columns:
        all_df[name] = LabelEncoder().fit_transform(all_df[name])
    return all_df

# ---------------------------
# 3) DataLoader (wandb.config.batch_size 사용)
# ---------------------------
def get_data():
    """Load the Titanic CSVs, preprocess them and build train/validation loaders.

    ``train.csv`` and ``test.csv`` are read from the current working directory
    and concatenated so the preprocessing steps see all rows. Rows with a
    non-null ``Survived`` form the labelled dataset, which is split 80/20 at
    random.

    Returns:
        ``(train_loader, val_loader)``. The training loader uses
        ``wandb.config.batch_size`` with shuffling; the validation loader
        yields the whole validation split as one batch.
    """
    working_dir = os.getcwd()
    train_df = pd.read_csv(os.path.join(working_dir, "train.csv"))
    test_df = pd.read_csv(os.path.join(working_dir, "test.csv"))

    all_df = pd.concat([train_df, test_df], sort=False)
    preprocessing_steps = (
        get_preprocessed_dataset_1,
        get_preprocessed_dataset_2,
        get_preprocessed_dataset_3,
        get_preprocessed_dataset_4,
        get_preprocessed_dataset_5,
        get_preprocessed_dataset_6,
    )
    for step in preprocessing_steps:
        all_df = step(all_df)

    # Labelled rows are those whose Survived value is present.
    has_label = all_df["Survived"].notnull()
    train_X = all_df[has_label].drop("Survived", axis=1).reset_index(drop=True)
    train_y = train_df["Survived"]

    dataset = TitanicDataset(train_X.values, train_y.values)
    train_size = int(len(dataset) * 0.8)
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(
        train_dataset,
        batch_size=wandb.config.batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(val_dataset, batch_size=val_size)
    return train_loader, val_loader

# ---------------------------
# 4) 모델/옵티마이저 (ELU만 사용)
# ---------------------------
class MyModel(nn.Module):
    """Two-hidden-layer MLP with ELU activations that emits one logit per sample.

    Hidden widths come from the first two entries of
    ``wandb.config.n_hidden_unit_list``.
    """

    N_INPUT = 10
    N_OUTPUT = 1

    def __init__(self):
        super().__init__()
        hidden_sizes = wandb.config.n_hidden_unit_list
        layers = [
            nn.Linear(self.N_INPUT, hidden_sizes[0]),
            nn.ELU(),
            nn.Linear(hidden_sizes[0], hidden_sizes[1]),
            nn.ELU(),
            nn.Linear(hidden_sizes[1], self.N_OUTPUT),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

def get_model_and_optimizer():
    """Create the ELU ``MyModel`` and an SGD optimizer using ``wandb.config.learning_rate``.

    Returns:
        ``(model, optimizer)``.
    """
    network = MyModel()  # activation is fixed to ELU inside the model
    learning_rate = wandb.config.learning_rate
    return network, optim.SGD(network.parameters(), lr=learning_rate)

# ---------------------------
# 5) 학습 루프 (히스토리 수집)
# ---------------------------
def training_loop(model, optimizer, train_loader, val_loader):
    """Train ``model`` for ``wandb.config.epochs`` epochs and validate after each.

    Each batch is a dict with ``'input'`` and ``'target'`` entries. The loss is
    ``BCEWithLogitsLoss``; a prediction counts as correct when the thresholded
    sigmoid (> 0.5) equals the label. Per-epoch losses (mean of the batch
    losses) and accuracies are sent to ``wandb.log``.

    Returns:
        Dict with ``"train_loss_history"`` and ``"val_loss_history"``, each a
        list holding one average loss per epoch.
    """
    criterion = nn.BCEWithLogitsLoss()
    history = {"train_loss_history": [], "val_loss_history": []}

    def unpack(batch):
        # Targets become float column vectors to match the model output shape.
        return batch['input'], batch['target'].float().unsqueeze(1)

    def count_hits(logits, labels):
        guesses = (torch.sigmoid(logits) > 0.5).long()
        return (guesses == labels.long()).sum().item()

    for epoch in range(1, wandb.config.epochs + 1):
        # ---- training pass ----
        model.train()
        running_loss, hits, seen = 0.0, 0, 0
        for batch in train_loader:
            features, labels = unpack(batch)
            optimizer.zero_grad()
            logits = model(features)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
            hits += count_hits(logits, labels)
            seen += labels.size(0)
        train_loss_avg = running_loss / len(train_loader)
        history["train_loss_history"].append(train_loss_avg)
        train_accuracy = hits / seen

        # ---- validation pass (no gradients) ----
        model.eval()
        running_loss, hits, seen = 0.0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                features, labels = unpack(batch)
                logits = model(features)
                running_loss += criterion(logits, labels).item()
                hits += count_hits(logits, labels)
                seen += labels.size(0)
        val_loss_avg = running_loss / len(val_loader)
        history["val_loss_history"].append(val_loss_avg)
        val_accuracy = hits / seen

        wandb.log({
            "Epoch": epoch,
            "Training loss": train_loss_avg,
            "Validation loss": val_loss_avg,
            "Training accuracy": train_accuracy,
            "Validation accuracy": val_accuracy,
        })

    return history

# ---------------------------
# 6) 메인: ELU 고정 + 배치사이즈 비교 (16/32/64/128)
# ---------------------------
def main(args):
    """Train one model per batch size (16/32/64/128) in a single wandb run and plot the losses.

    For every batch size the config is updated, the loaders and a fresh
    model/optimizer are rebuilt, and the per-epoch loss histories are kept.
    Afterwards the histories are logged as Plotly figures and, as a backup,
    as wandb line-series charts.

    Args:
        args: Parsed arguments providing ``wandb``, ``batch_size`` and ``epochs``.

    NOTE(review): relies on a module-level ``go`` name (presumably
    ``plotly.graph_objects``) whose import is not visible in this section - confirm.
    """
    run_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    run_config = {
        'epochs': args.epochs,
        'batch_size': args.batch_size,   # initial value; overwritten inside the loop
        'learning_rate': 1e-3,
        'n_hidden_unit_list': [32, 16],
        'compare_mode': 'batch_size',
    }

    wandb.init(
        mode="online" if args.wandb else "disabled",
        project="titanic_survival_prediction",
        notes="ELU-only | Compare batch sizes (16/32/64/128) | Plotly + backup",
        tags=["titanic", "elu", "batchsize-compare", "plotly"],
        name=run_name,
        config=run_config,
        settings=wandb.Settings(_disable_stats=True),  # disable system-stats logging
    )

    batch_sizes = [16, 32, 64, 128]
    train_hist_by_bs = {}
    val_hist_by_bs = {}

    for bs in batch_sizes:
        # Publish the new batch size first: the loaders read it from wandb.config.
        wandb.config.update({"batch_size": bs}, allow_val_change=True)
        train_loader, val_loader = get_data()

        # Fresh model and optimizer for every batch size.
        model, opt = get_model_and_optimizer()

        print(f"\n[Batch Size: {bs} | Activation: ELU]")
        result = training_loop(model, opt, train_loader, val_loader)
        train_hist_by_bs[bs] = result["train_loss_history"]
        val_hist_by_bs[bs] = result["val_loss_history"]

    epochs = list(range(1, wandb.config.epochs + 1))
    color_map = {16: "#1f77b4", 32: "#ff7f0e", 64: "#2ca02c", 128: "#d62728"}
    series_names = [f"BS={b}" for b in batch_sizes]

    def loss_figure(history_by_bs, title):
        # One line per batch size on a shared epoch axis.
        fig = go.Figure()
        for size in batch_sizes:
            fig.add_trace(go.Scatter(
                x=epochs, y=history_by_bs[size], mode="lines",
                name=f"BS={size}", line=dict(color=color_map[size], width=2)
            ))
        fig.update_layout(
            title=title,
            xaxis_title="Epoch", yaxis_title="Loss",
            legend_title="Batch Size", template="plotly_white"
        )
        return fig

    def backup_chart(history_by_bs, title):
        return wandb.plot.line_series(
            xs=epochs,
            ys=[history_by_bs[b] for b in batch_sizes],
            keys=list(series_names),
            title=title,
            xname="Epoch"
        )

    # --- Plotly figures (Media tab): validation first, then training
    fig_val = loss_figure(val_hist_by_bs, "Validation Loss by Batch Size (Activation = ELU)")
    wandb.log({"val_loss_by_batchsize_plotly": wandb.Plotly(fig_val)}, commit=True)

    fig_tr = loss_figure(train_hist_by_bs, "Training Loss by Batch Size (Activation = ELU)")
    wandb.log({"train_loss_by_batchsize_plotly": wandb.Plotly(fig_tr)})

    # --- Backup: line charts for the Charts tab
    val_plot_backup = backup_chart(val_hist_by_bs, "Validation Loss by Batch Size (backup)")
    train_plot_backup = backup_chart(train_hist_by_bs, "Training Loss by Batch Size (backup)")
    wandb.log({
        "val_loss_by_batchsize_backup": val_plot_backup,
        "train_loss_by_batchsize_backup": train_plot_backup,
    })

    wandb.finish()

# ---------------------------
# 7) 실행
# ---------------------------
if __name__ == "__main__":
    # Entry point: build the CLI, then run the batch-size comparison.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--wandb", action=argparse.BooleanOptionalAction, default=False)
    arg_parser.add_argument("-b", "--batch_size", type=int, default=64)
    arg_parser.add_argument("-e", "--epochs", type=int, default=500)

    # An empty argv is parsed so the notebook kernel's own arguments are ignored.
    args = arg_parser.parse_args(args=[])
    args.wandb = True  # force wandb on when running inside Jupyter
    print(f"[NOTE] WandB manually set to: {args.wandb}")
    main(args)


[NOTE] WandB manually set to: True



[Batch Size: 16 | Activation: ELU]

[Batch Size: 32 | Activation: ELU]

[Batch Size: 64 | Activation: ELU]

[Batch Size: 128 | Activation: ELU]


0,1
Epoch,▁▂▃▄▄▆▆▆██▂▄▅▅▅▆▆▆▆▇▁▂▃▃▄▅▅▆▆█▁▂▂▂▃▄▄▅▅▇
Training accuracy,▁▁▁▂▃▆▇▇██▄▆▆▅▆▅▅▆▆█▂▅▅▆▆▆▆▆▂▃▃▃▃▃▃▄▄▄▅▄
Training loss,▇▇▇▆▆▆▆▆▅▅▂▂▁▇▇▆▆▆▅▄▄▇█▆▇▆▆▆▅▅▆▅▆▅▅▅▄▅▅▅
Validation accuracy,▂▄▄▅▅▅▅▅▆▆▆▆▆▅▇▇▇▇▇█▇█▆▅▅▅▅▅▅▅▂▂▁▁▁▂▂▂▃▃
Validation loss,▅▄▄▅▄▄▄▂▁▁▆▅▃▃▂▁▂▂▁▂▄▄▄▄▄▃▄▄▃▄▇██████▇▇▇

0,1
Epoch,500.0
Training accuracy,0.71348
Training loss,0.56638
Validation accuracy,0.67598
Validation loss,0.62504


### 해석
- for bs in batch_sizes : train_loader, val_loader = get_data()
  : 배치사이즈를 바꿀 때마다 get_data() 다시 호출 => train_loader와 val_loader가 새 배치 크기로 형성

### 취득한 기술적 사항/고찰 내용
- batch size가 64, 128인 경우 수렴 속도는 느리고, Validation Loss도 높게 유지되므로 16, 32인 경우보다 성능이 낮다고 평가할 수 있음
- batch size가 32인 경우 validation loss가 16인 경우보다 더 낮게 내려가므로 가장 좋은 성능의 batch size로 32 채택
- 직접 보기에 batch size가 32인 경우가 불안정해 보여 분석한 결과, 전반적으로 batch size가 16인 경우의 validation loss보다 batch size가 32인 경우의 validation loss가 더 낮은 평균을 보여주며, 모델을 더 오래 학습시킨다면 불안정성이 완화될 수 있을 거라는 예상이 나와 이를 채택했다.

## [요구사항 3]	테스트	및	submission.csv 생성

In [5]:
import os
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split
from datetime import datetime
import wandb
import argparse
from sklearn.preprocessing import LabelEncoder
import numpy as np

# DataFrame display options used when inspecting the preprocessing output:
# no fixed display width and no limit on the number of printed columns.
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)

# ---------------------------
# Dataset
# ---------------------------
class TitanicDataset(Dataset):
    """Labelled samples served as ``{'input': features, 'target': label}`` dicts."""

    def __init__(self, X, y):
        # Features are stored as a float tensor, labels as a long tensor.
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return dict(input=self.X[idx], target=self.y[idx])

class TitanicTestDataset(Dataset):
    """Unlabelled samples (e.g. test.csv) served as ``{'input': features}`` dicts."""

    def __init__(self, X):
        # Features are stored as a float tensor; there is no target.
        self.X = torch.FloatTensor(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return dict(input=self.X[idx])

# ---------------------------
# 전처리 모듈
# ---------------------------
def get_preprocessed_dataset_1(all_df):
    """Fill missing ``Fare`` values with the mean fare of the row's ``Pclass``.

    Returns a new frame produced by a left merge, so the result carries a
    fresh 0..n-1 index; the helper ``Fare_mean`` column is removed again.
    """
    class_fare = (
        all_df[["Pclass", "Fare"]]
        .groupby("Pclass")
        .mean()
        .reset_index()
        .rename(columns={"Fare": "Fare_mean"})
    )
    merged = pd.merge(all_df, class_fare, on="Pclass", how="left")
    fare_missing = merged["Fare"].isnull()
    merged.loc[fare_missing, "Fare"] = merged["Fare_mean"]
    return merged.drop(columns=["Fare_mean"])

def get_preprocessed_dataset_2(all_df):
    """Split ``Name`` into ``family_name``, ``title`` and ``name`` columns.

    The name is split at most twice on ``,`` or ``.``; each piece is stripped
    of surrounding whitespace and appended to the frame as a new column.
    """
    parts = all_df["Name"].str.split("[,.]", n=2, expand=True)
    parts.columns = ["family_name", "title", "name"]
    for column in parts.columns:
        parts[column] = parts[column].str.strip()
    return pd.concat([all_df, parts], axis=1)

def get_preprocessed_dataset_3(all_df):
    """Impute missing ``Age`` values from the rounded median age of the row's ``title``.

    Rows whose title has no known age at all (or no title match) fall back to
    the overall mean age. The per-title median is deliberately NOT replaced by
    0 beforehand: doing so would write an age of 0 into those rows and make
    the mean fallback unreachable.

    Args:
        all_df: Frame with at least ``title`` and ``Age`` columns.

    Returns:
        A new frame (left-merge result, fresh 0..n-1 index) with no missing
        ``Age`` as long as at least one age is known.
    """
    title_age_median = (
        all_df[["title", "Age"]]
        .groupby("title")
        .median()
        .round()
        .reset_index()
    )
    title_age_median.columns = ["title", "title_age_median"]
    all_df = pd.merge(all_df, title_age_median, on="title", how="left")

    # First choice: the median age of passengers sharing the same title.
    age_missing = all_df["Age"].isnull()
    all_df.loc[age_missing, "Age"] = all_df.loc[age_missing, "title_age_median"]
    all_df = all_df.drop(columns=["title_age_median"])

    # Fallback for titles without any known age: the overall mean age.
    all_df["Age"] = all_df["Age"].fillna(all_df["Age"].mean())
    return all_df

def get_preprocessed_dataset_4(all_df):
    """Add ``family_num`` and ``alone`` features, then drop identifier/text columns.

    ``family_num`` is ``Parch + SibSp``; ``alone`` is 1 where ``family_num`` is
    0 and 0 otherwise. The two new columns are written onto the passed frame;
    the returned frame is the result of dropping the unused columns.
    """
    all_df["family_num"] = all_df["Parch"] + all_df["SibSp"]

    travels_alone = all_df["family_num"] == 0
    all_df.loc[travels_alone, "alone"] = 1
    all_df["alone"] = all_df["alone"].fillna(0)

    unused_columns = ["PassengerId", "Name", "family_name", "name", "Ticket", "Cabin"]
    return all_df.drop(columns=unused_columns)

def get_preprocessed_dataset_5(all_df):
    """Collapse uncommon titles into ``"other"`` and label missing ``Embarked`` values.

    Titles other than Mr / Miss / Mrs / Master become ``"other"``; missing
    ``Embarked`` entries become ``"missing"``. The passed frame is modified
    and returned.
    """
    common_titles = ["Mr", "Miss", "Mrs", "Master"]
    is_uncommon = ~all_df["title"].isin(common_titles)
    all_df.loc[is_uncommon, "title"] = "other"

    all_df["Embarked"] = all_df["Embarked"].fillna("missing")
    return all_df

def get_preprocessed_dataset_6(all_df):
    """Replace every object-dtype column with integer codes from a ``LabelEncoder``.

    A separate encoder is fitted per column. The passed frame is modified and
    returned.
    """
    is_object = all_df.dtypes == "object"
    for column in all_df.columns[is_object]:
        all_df[column] = LabelEncoder().fit_transform(all_df[column])
    return all_df

def preprocess_all(train_df, test_df):
    """Concatenate train and test frames and run all six preprocessing steps on them.

    The combined frame is returned as-is; this function does not split it
    back into train and test parts. Callers in this file do that themselves
    by checking whether ``Survived`` is null.
    """
    pipeline = (
        get_preprocessed_dataset_1,
        get_preprocessed_dataset_2,
        get_preprocessed_dataset_3,
        get_preprocessed_dataset_4,
        get_preprocessed_dataset_5,
        get_preprocessed_dataset_6,
    )
    combined = pd.concat([train_df, test_df], sort=False)
    for step in pipeline:
        combined = step(combined)
    return combined

# ---------------------------
# DataLoaders
# ---------------------------
def get_train_val_loaders():
    """Build train/validation DataLoaders from the labelled rows of the preprocessed data.

    train.csv and test.csv are read from the current working directory and
    preprocessed together; rows with a ``Survived`` value are split 80/20 at
    random.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles and uses
        ``wandb.config.batch_size``; the validation loader yields the whole
        validation split as a single batch.
    """
    cwd = os.getcwd()
    train_df = pd.read_csv(os.path.join(cwd, "train.csv"))
    test_df = pd.read_csv(os.path.join(cwd, "test.csv"))

    all_df = preprocess_all(train_df, test_df)

    # Labelled rows are the training part of the combined frame.
    has_label = all_df["Survived"].notnull()
    features = all_df[has_label].drop("Survived", axis=1).reset_index(drop=True)
    labels = train_df["Survived"]
    dataset = TitanicDataset(features.values, labels.values)

    n_train = int(len(dataset) * 0.8)
    n_val = len(dataset) - n_train
    train_subset, val_subset = random_split(dataset, [n_train, n_val])

    train_loader = DataLoader(train_subset, batch_size=wandb.config.batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=n_val)
    return train_loader, val_loader

def get_test_loader_and_ids():
    """Return a DataLoader over the unlabelled (test.csv) rows plus their PassengerIds.

    Both CSV files are read from the current working directory and
    preprocessed together; rows whose ``Survived`` is null form the test part.

    Returns:
        ``(test_loader, test_ids)``. The loader is unshuffled and uses
        ``wandb.config.batch_size``; ``test_ids`` holds the ``PassengerId``
        values of test.csv.
    """
    cwd = os.getcwd()
    train_df = pd.read_csv(os.path.join(cwd, "train.csv"))
    test_df = pd.read_csv(os.path.join(cwd, "test.csv"))

    # Capture the ids now: preprocessing drops the PassengerId column.
    test_ids = test_df["PassengerId"].values

    all_df = preprocess_all(train_df, test_df)
    unlabelled = all_df["Survived"].isnull()
    test_frame = all_df[unlabelled].drop("Survived", axis=1).reset_index(drop=True)

    loader = DataLoader(TitanicTestDataset(test_frame.values), batch_size=wandb.config.batch_size)
    return loader, test_ids

# ---------------------------
# Model / Optim
# ---------------------------
class MyModel(nn.Module):
    """Fully connected network with two ELU-activated hidden layers.

    The hidden widths are the first two entries of
    ``wandb.config.n_hidden_unit_list``; the network maps ``N_INPUT``
    features to ``N_OUTPUT`` values.
    """

    N_INPUT = 10
    N_OUTPUT = 1

    def __init__(self):
        super().__init__()
        widths = wandb.config.n_hidden_unit_list
        hidden_a = widths[0]
        hidden_b = widths[1]
        layers = [
            nn.Linear(self.N_INPUT, hidden_a),
            nn.ELU(),
            nn.Linear(hidden_a, hidden_b),
            nn.ELU(),
            nn.Linear(hidden_b, self.N_OUTPUT),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the sequential stack for input ``x``."""
        return self.model(x)

def get_model_and_optimizer():
    """Create a fresh ``MyModel`` and an SGD optimizer over its parameters.

    The learning rate is read from ``wandb.config.learning_rate``.

    Returns:
        ``(model, optimizer)``.
    """
    network = MyModel()
    sgd = optim.SGD(network.parameters(), lr=wandb.config.learning_rate)
    return network, sgd

# ---------------------------
# Train loop with best-checkpoint
# ---------------------------
def training_loop(model, optimizer, train_loader, val_loader, ckpt_path="best_model.pt"):
    """Train with best-checkpoint saving and a plateau-based learning-rate schedule.

    Runs ``wandb.config.epochs`` epochs. After each epoch the model is
    validated; whenever the validation loss reaches a new minimum the model's
    ``state_dict`` is saved to ``ckpt_path``. A ``ReduceLROnPlateau`` scheduler
    (factor 0.5, patience 20) is stepped on the validation loss, and the epoch
    metrics are sent to ``wandb.log``.

    Args:
        model: Network to train.
        optimizer: Optimizer over the model's parameters.
        train_loader: Yields dict batches with ``'input'`` and ``'target'``.
        val_loader: Same batch format, used for validation.
        ckpt_path: File the best ``state_dict`` is written to.

    Returns:
        Dict with ``train_loss_history``, ``val_loss_history``,
        ``best_val_loss``, ``best_epoch`` and ``ckpt_path``.
    """
    criterion = nn.BCEWithLogitsLoss()
    best_val_loss, best_epoch = float("inf"), -1
    train_hist = []
    val_hist = []

    # Learning-rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=20)

    def unpack(batch):
        # Targets become float column vectors to match the model output shape.
        return batch['input'], batch['target'].float().unsqueeze(1)

    def count_hits(logits, labels):
        guesses = (torch.sigmoid(logits) > 0.5).long()
        return (guesses == labels.long()).sum().item()

    for epoch in range(1, wandb.config.epochs + 1):
        # ---- training pass ----
        model.train()
        running_loss, hits, seen = 0.0, 0, 0
        for batch in train_loader:
            features, labels = unpack(batch)
            optimizer.zero_grad()
            logits = model(features)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
            hits += count_hits(logits, labels)
            seen += labels.size(0)
        train_loss_avg = running_loss / len(train_loader)
        train_hist.append(train_loss_avg)
        train_acc = hits / seen

        # ---- validation pass (no gradients) ----
        model.eval()
        running_loss, hits, seen = 0.0, 0, 0
        with torch.no_grad():
            for batch in val_loader:
                features, labels = unpack(batch)
                logits = model(features)
                running_loss += criterion(logits, labels).item()
                hits += count_hits(logits, labels)
                seen += labels.size(0)
        val_loss_avg = running_loss / len(val_loader)
        val_hist.append(val_loss_avg)
        val_acc = hits / seen

        # Save a checkpoint whenever the validation loss hits a new minimum.
        improved = val_loss_avg < best_val_loss
        if improved:
            best_val_loss, best_epoch = val_loss_avg, epoch
            torch.save(model.state_dict(), ckpt_path)

        # Step the scheduler before logging so "LR" reflects any reduction.
        scheduler.step(val_loss_avg)

        wandb.log({
            "Epoch": epoch,
            "Training loss": train_loss_avg,
            "Validation loss": val_loss_avg,
            "Training accuracy": train_acc,
            "Validation accuracy": val_acc,
            "Best val loss so far": best_val_loss,
            "Best epoch": best_epoch,
            "LR": optimizer.param_groups[0]["lr"],
        })

    return dict(
        train_loss_history=train_hist,
        val_loss_history=val_hist,
        best_val_loss=best_val_loss,
        best_epoch=best_epoch,
        ckpt_path=ckpt_path,
    )

# ---------------------------
# Test inference → submission.csv
# ---------------------------
def predict_test_and_save_csv(model, ckpt_path, out_csv="submission.csv"):
    """Load the checkpoint into ``model``, predict the test set and write a submission CSV.

    Args:
        model: Network whose architecture matches the saved ``state_dict``.
        ckpt_path: Path of the ``state_dict`` checkpoint to load.
        out_csv: Destination file for the submission.

    Returns:
        ``out_csv``, the path that was written.
    """
    # torch.load unpickles the file; weights_only=True restricts it to plain
    # tensors/containers, which is all a state_dict needs, so a tampered
    # checkpoint cannot execute arbitrary code.
    state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    test_loader, test_ids = get_test_loader_and_ids()
    preds = []
    with torch.no_grad():
        for batch in test_loader:
            logits = model(batch['input'])
            probs = torch.sigmoid(logits).squeeze(1)   # one probability per row
            preds.extend((probs > 0.5).long().tolist())

    submission = pd.DataFrame({"PassengerId": test_ids, "Survived": preds})
    submission.to_csv(out_csv, index=False)
    print(f"[OK] Saved: {out_csv} (rows={len(submission)})")
    return out_csv

# ---------------------------
# Main
# ---------------------------
def main(args):
    """Train with batch size 32, predict the test set from the best checkpoint, and log results.

    Steps: start the wandb run, build loaders/model/optimizer, train while
    checkpointing the best validation loss, write a timestamped submission
    CSV, record summary values, log the loss curves, and finish the run.

    Args:
        args: Parsed arguments providing ``wandb`` and ``epochs``.
    """
    run_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    run_config = {
        'epochs': args.epochs,
        'batch_size': 32,                 # batch size fixed at 32
        'learning_rate': 1e-3,
        'n_hidden_unit_list': [32, 16],
        'setup': 'ELU + BS=32',
    }

    wandb.init(
        mode="online" if args.wandb else "disabled",
        project="titanic_survival_prediction",
        notes="ELU + BS=32 | best val-loss checkpoint → test inference → submission",
        tags=["titanic", "elu", "bs32", "best-checkpoint", "submission"],
        name=run_name,
        config=run_config,
        settings=wandb.Settings(_disable_stats=True),
    )

    train_loader, val_loader = get_train_val_loaders()
    model, optimizer = get_model_and_optimizer()

    metrics = training_loop(model, optimizer, train_loader, val_loader, ckpt_path="best_model.pt")

    # After training: predict the test set from the best checkpoint.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    out_csv = f"submission_{stamp}.csv"
    predict_test_and_save_csv(model, metrics["ckpt_path"], out_csv=out_csv)

    for key in ("best_epoch", "best_val_loss"):
        wandb.summary[key] = metrics[key]
    wandb.summary["submission_csv"] = out_csv

    # Log the loss curves as line charts (Charts tab).
    epochs = list(range(1, wandb.config.epochs + 1))
    val_curve = wandb.plot.line_series(
        xs=epochs, ys=[metrics["val_loss_history"]],
        keys=["Validation"], title="Validation Loss", xname="Epoch",
    )
    train_curve = wandb.plot.line_series(
        xs=epochs, ys=[metrics["train_loss_history"]],
        keys=["Training"], title="Training Loss", xname="Epoch",
    )
    wandb.log({"val_loss_curve": val_curve, "train_loss_curve": train_curve})

    wandb.finish()

# ---------------------------
# Run
# ---------------------------
if __name__ == "__main__":
    # Entry point: build the CLI, then run training and submission.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--wandb", action=argparse.BooleanOptionalAction, default=False)
    arg_parser.add_argument("-e", "--epochs", type=int, default=500)

    # An empty argv is parsed so the notebook runs without CLI arguments.
    args = arg_parser.parse_args(args=[])
    args.wandb = True
    print(f"[NOTE] WandB manually set to: {args.wandb}")
    main(args)


[NOTE] WandB manually set to: True


[OK] Saved: submission_20251017_194959.csv (rows=418)


0,1
Best epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
Best val loss so far,███████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▁▁▁
Epoch,▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
LR,███████████████████████████████████████▁
Training accuracy,▁▁▁▄▄▄▅▄▄▄▅▅▄▅▅▄▅▅▆▅▅▅▆▅▆▆▅▆▆▇▆▇▇▅▆▇▇█▇▇
Training loss,████▇▇▇▇█▆▆▇▆▆▆▅▆▆▅▅▆▄▄▄▄▄▃▃▄▃▃▃▂▃▄▃▂▂▂▁
Validation accuracy,▂▃▂▂▂▅▅▄▄▁▄▄▄▄▅▄▄▅▆▅▅▄▅▅▅▃▄▅▄▅▆▅▆▅▆▆█▅▆▇
Validation loss,███████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▄▄▅▄▄▄▃▄▃▄▃▃▃▃▂▂▂▁

0,1
Best epoch,500
Best val loss so far,0.5769
Epoch,500
LR,0.0005
Training accuracy,0.7486
Training loss,0.5148
Validation accuracy,0.67039
Validation loss,0.5769
best_epoch,500
best_val_loss,0.5769


### 해석
- ReduceLROnPlateau : LR 스케줄러로 검증 손실 정체 시 학습률 감소
- if val_loss_avg < best_val_loss : 검증 손실이 사상 최저일 때 베스트 체크포인트 저장

### 취득한 기술적 사항/고찰 내용
- 검증 손실 최저가 마지막 에포크(500)에서 발생했으므로, epoch 500 시점에 저장된 체크포인트로 submission.csv 생성
- batch size를 16과 32 사이, 32와 64 사이의 값으로도 측정하면 더 나은 결과를 얻을 가능성 존재

## [요구사항 4]	submission.csv 제출 및 등수 확인

![요구사항 4 이미지](https://github.com/tongwu2021/deep-learning-homework/blob/main/hw_2_%EC%9A%94%EA%B5%AC%EC%82%AC%ED%95%AD%204.png?raw=true)

## 숙제 후기
wandb 그래프 비교분석을 한 차트 안에서 하는 걸 추천한다고 했기에 이를 처음 시도했을 때, 한 차트 안에 있는 여러 그래프가 서로 구별하기 힘들게 나와 놀랐었다.
코드를 기록하고 실행하면서 많은 오류가 발생했고 이를 고치는데 꽤 많은 시간을 사용한 것 같다. 음, 요약하자면 중간고사 공부보다 이 과제를 더 열심히 한 것 같다.