# 필요한 모듈 설치

In [1]:
!pip install tqdm
!pip install koreanize-matplotlib



# 필요한 모듈들 import

In [2]:
# 시스템 및 환경 관련 라이브러리
import os  # 운영체제 관련 기능을 제공하는 모듈 (파일 경로 처리 등)
import random  # 무작위 수를 생성하거나 랜덤화할 때 사용하는 모듈
import tqdm  # 반복문 진행 상황을 시각적으로 보여주는 모듈 (진행 바 표시)
import pytz

# 데이터 분석 및 처리 라이브러리
import pandas as pd  # 데이터 처리 및 분석을 위한 라이브러리 (특히 DataFrame 사용)
import numpy as np  # 수학적 계산 및 배열 작업을 위한 라이브러리 (벡터화 연산)

# 데이터 시각화 관련 라이브러리
import seaborn as sns
import matplotlib.pyplot as plt
import koreanize_matplotlib

# 머신러닝 관련 라이브러리
from sklearn.model_selection import train_test_split  # 데이터를 훈련 세트와 테스트 세트로 나누는 기능
from sklearn.preprocessing import OneHotEncoder  # 범주형 데이터를 원-핫 인코딩하는 클래스
from sklearn.tree import DecisionTreeClassifier  # 결정 트리 분류 모델을 위한 클래스
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트 분류 모델
from sklearn.metrics import roc_curve, auc  # ROC 곡선 계산 및 AUC 값 평가를 위한 함수

# 딥러닝 및 GPU 연산 관련 라이브러리
import torch  # 딥러닝 모델 및 GPU 연산을 위한 주요 라이브러리 (PyTorch)

from datetime import datetime  # 날짜와 시간 처리 모듈

# 추가적인 sklearn 모듈
from sklearn.model_selection import train_test_split  # (중복된 임포트, 이미 위에서 임포트됨)
from sklearn.preprocessing import OneHotEncoder  # (중복된 임포트, 이미 위에서 임포트됨)

# 현재 날짜와 시간 정보를 가져옵니다.
today = datetime.today().strftime('%m%d')  # 오늘 날짜 (예: 0207)

# seed 값을 고정하는 함수
def reset_seeds(seed=42):
    random.seed(seed)  # random 모듈의 시드 고정
    os.environ['PYTHONHASHSEED'] = str(seed)  # 파이썬 해시 시드 고정
    np.random.seed(seed)  # numpy의 시드 고정
    torch.manual_seed(seed)  # PyTorch CPU 연산의 시드 고정
    torch.cuda.manual_seed(seed)  # PyTorch GPU 연산의 시드 고정
    torch.backends.cudnn.deterministic = True  # CUDA 라이브러리에서 결정론적 예측 설정

# 제출 파일 이름을 생성하는 함수
def get_submission_filename(base_path, base_filename):
    counter = 1  # 파일 이름 중복 방지를 위한 카운터 초기화
    submission_path = os.path.join(base_path, f"{base_filename}_{today}_{counter}.csv")

    # 해당 경로에 파일이 존재하는지 확인하고, 존재하면 counter를 증가시켜 반복
    while os.path.exists(submission_path):
        counter += 1
        submission_path = os.path.join(base_path, f"{base_filename}_{today}_{counter}.csv")

    return submission_path


# Connect to googld drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Global Variables

In [4]:
import easydict
args = easydict.EasyDict()

# path info
args.default_path = "/content/drive/MyDrive/Colab Notebooks/AI 기초_KDT/강의/kaggle/data/"
args.train_csv = args.default_path + "train.csv"
args.test_csv = args.default_path + "test.csv"
args.default_submission = args.default_path + "submission.csv"
args.submit_submission = "/content/drive/MyDrive/Colab Notebooks/AI 기초_KDT/강의/kaggle/submission_csv/"

# 결과 저장 경로 설정
history_results_dir = "/content/drive/MyDrive/Colab Notebooks/AI 기초_KDT/강의/kaggle/"
history_results_path = os.path.join(history_results_dir, "history_results.csv")

# 제출용
args.submission_csv = get_submission_filename(args.submit_submission, "submission")
args.submission_csv

# 기타 설정
args.random_seed = 42

# 추후의 확인을 위한 리스트
args.results = []

# Load Titanic Dataset
- Surived:0=사망, 1=생존
- Pclass: 1=1등석, 2=2등석, 3=3등석
- gender:male=남성, female=여성
- Age: 나이
- SibSp: 타이타닉 호에 동승한 자매/배우자의 수
- Parch: 타이타닉 호에 동승한 부모/자식의 수
- Ticket: 티켓 번호
- Fare: 승객 요금
- Cabin: 방 호수
- Embarked: 탑승지; C=셰르부르, Q=퀴즈타운, S=사우샘프턴

In [5]:
ori_train = pd.read_csv(args.train_csv)
ori_test = pd.read_csv(args.test_csv)
default_submission = pd.read_csv(args.default_submission)

ori_train.shape, ori_test.shape, default_submission.shape

((916, 12), (393, 11), (393, 2))

In [6]:
default_submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [7]:
ori_train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [8]:
ori_train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [9]:
ori_test.columns

Index(['passengerid', 'pclass', 'name', 'gender', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

- test 데이터셋에는 survived가 없음을 알 수 있음
- train 데이터셋에서 훈련을 위한 데이터프레임과 검증을 위한 데이터프레임을 나눠야 함.

# model_rf_V0

## Train & Test Split

In [None]:
y = ori_train['survived']
X = ori_train.drop(['survived'], axis=1)

In [None]:
reset_seeds()
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=ori_train['survived'])

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((732, 11), (184, 11), (732,), (184,))

## Data Preprocessing

In [None]:
train = X_tr.copy()
test = X_te.copy()
ori_te = ori_test.copy()

train.shape, test.shape, ori_te.shape

((732, 11), (184, 11), (393, 11))

### Data Cleaning

In [None]:
train.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cabin,580
age,135
passengerid,0
pclass,0
name,0
gender,0
sibsp,0
parch,0
ticket,0
fare,0


In [None]:
test.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cabin,138
age,45
embarked,1
passengerid,0
pclass,0
name,0
gender,0
sibsp,0
parch,0
ticket,0


In [None]:
ori_te.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cabin,296
age,83
fare,1
embarked,1
passengerid,0
pclass,0
name,0
gender,0
sibsp,0
parch,0


In [None]:
age_median = train['age'].median()
fare_median = train['fare'].median()

# embarked와 cabin에 대해 mode()를 사용하는 이유는 두 열이 범주형 데이터이기 때문
embarked_mode = train['embarked'].mode().values[0]
cabin_mode = train['cabin'].mode().values[0]

age_median, fare_median, embarked_mode, cabin_mode

(28.0, 14.4583, 'S', 'F2')

In [None]:
train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)
ori_te['age'].fillna(age_median, inplace=True)

train['fare'].fillna(fare_median, inplace=True)
test['fare'].fillna(fare_median, inplace=True)
ori_te['fare'].fillna(fare_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)
test['embarked'].fillna(embarked_mode, inplace=True)
ori_te['embarked'].fillna(embarked_mode, inplace=True)

train['cabin'].fillna(cabin_mode, inplace=True)
test['cabin'].fillna(cabin_mode, inplace=True)
ori_te['cabin'].fillna(cabin_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

(0, 0, 0)

In [None]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True) # 모델이 학습하는데 사용하는 데이터
test.drop(drop_cols, axis=1, inplace=True) # 모델의 학습을 평가(잘했는지?? 못했는지??)하기 위한 데이터
ori_te.drop(drop_cols, axis=1, inplace=True) # 학습이 잘된 모델을 이용해서 내가 알고 싶은(ori_te) 데이터를 예측하게 하는 것

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (732, 11) / (184, 11)
after: (732, 8) / (184, 8)
<class 'pandas.core.frame.DataFrame'>
Index: 732 entries, 914 to 636
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  732 non-null    int64  
 1   pclass       732 non-null    int64  
 2   gender       732 non-null    object 
 3   age          732 non-null    float64
 4   sibsp        732 non-null    int64  
 5   parch        732 non-null    int64  
 6   fare         732 non-null    float64
 7   embarked     732 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 51.5+ KB


### EDA

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 732 entries, 914 to 636
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  732 non-null    int64  
 1   pclass       732 non-null    int64  
 2   gender       732 non-null    object 
 3   age          732 non-null    float64
 4   sibsp        732 non-null    int64  
 5   parch        732 non-null    int64  
 6   fare         732 non-null    float64
 7   embarked     732 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 51.5+ KB


### Data Encoding

In [None]:
enc_cols = ['gender', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols))
normal_cols

['sibsp', 'age', 'pclass', 'passengerid', 'parch', 'fare']

In [None]:
reset_seeds()
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()
# train
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(),
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)
# test
tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(),
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)
# ori_test
tmp_te = pd.DataFrame(
    enc.transform(ori_te[enc_cols]).toarray(),
    columns = enc.get_feature_names_out()
)
enc_ori_te = pd.concat(
    [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (732, 8) / (184, 8)
after: (732, 11) / (184, 11)


Unnamed: 0,sibsp,age,pclass,passengerid,parch,fare,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,0,28.0,3,914,0,7.7333,1.0,0.0,0.0,1.0,0.0
1,4,6.0,3,805,2,31.275,1.0,0.0,0.0,0.0,1.0
2,1,1.0,3,255,1,12.1833,1.0,0.0,0.0,0.0,1.0
3,0,70.5,3,769,0,7.75,0.0,1.0,0.0,1.0,0.0
4,0,35.0,3,527,0,7.125,0.0,1.0,0.0,0.0,1.0


## Training

In [None]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum(), enc_ori_te.isnull().sum().sum()

(0, 0, 0)

In [None]:
# row는 다르더라도 column은 같아야 함
enc_tr.shape, enc_te.shape, enc_ori_te.shape

((732, 11), (184, 11), (393, 11))

In [None]:
reset_seeds()

# Random Forest 하이퍼파라미터 설정
parameters = {
    'n_estimators': 500,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 3,
    'max_features': 'sqrt',
    'random_state': args.random_seed
}

# Random Forest 모델 정의
model_rf_V0 = RandomForestClassifier(**parameters)

print(f'{model_rf_V0} : {enc_tr.shape} / {y_tr.shape}')
model_rf_V0.fit(enc_tr, y_tr)

RandomForestClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=5,
                       n_estimators=500, random_state=42) : (732, 11) / (732,)


## Evaluation (평가)

In [None]:
reset_seeds()

# Random Forest 모델
# - Train data
score_tr_rf = model_rf_V0.score(enc_tr, y_tr)
# - Test data
score_te_rf = model_rf_V0.score(enc_te, y_te)

print(f'{model_rf_V0} : {score_tr_rf}, {score_te_rf}')


RandomForestClassifier(max_depth=30, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=500, random_state=42) : 0.8838797814207651, 0.8586956521739131


### AUC 점수

In [None]:
y_pred = model_rf_V0.predict_proba(enc_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te, y_pred)

auc_te = auc(fpr, tpr)
print(f'{model_rf_V0}: {auc_te}')

RandomForestClassifier(max_depth=30, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=500, random_state=42): 0.9175438596491229


In [None]:
ori_te_pred = model_rf_V0.predict_proba(ori_te)[:,1]
ori_te_pred.shape

(393,)

### 특성 중요도

In [None]:
df_feature_importances = pd.DataFrame(model_rf_V0.feature_importances_, X_tr.columns).sort_values(by=[0], ascending=False).reset_index()

print(f'{df_feature_importances.shape}')
df_feature_importances

### 결과 정리

In [None]:
args.results.append(
    {
        'model': 'model_rf_V0',
        'score_tr': score_tr_rf,
        'score_te': score_te_rf,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': today
    }
)

In [None]:
pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)

# model_rf_V1

## Train & Test Split

In [10]:
y = ori_train['survived']
X = ori_train.drop(['survived'], axis=1)

In [11]:
reset_seeds()
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=ori_train['survived'])

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((732, 11), (184, 11), (732,), (184,))

## Data Preprocessing

In [12]:
train = X_tr.copy()
test = X_te.copy()
ori_te = ori_test.copy()

train.shape, test.shape, ori_te.shape

((732, 11), (184, 11), (393, 11))

### Data Cleaning

In [13]:
train.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cabin,580
age,135
passengerid,0
pclass,0
name,0
gender,0
sibsp,0
parch,0
ticket,0
fare,0


In [14]:
test.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cabin,138
age,45
embarked,1
passengerid,0
pclass,0
name,0
gender,0
sibsp,0
parch,0
ticket,0


In [15]:
ori_te.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cabin,296
age,83
fare,1
embarked,1
passengerid,0
pclass,0
name,0
gender,0
sibsp,0
parch,0


In [16]:
# age와 fare의 결측치 확인
print(train[['pclass', 'gender', 'age', 'fare']].isnull().sum(), '\n')
print(test[['pclass', 'gender', 'age', 'fare']].isnull().sum(), '\n')
print(ori_te[['pclass', 'gender', 'age', 'fare']].isnull().sum(), '\n')

pclass      0
gender      0
age       135
fare        0
dtype: int64 

pclass     0
gender     0
age       45
fare       0
dtype: int64 

pclass     0
gender     0
age       83
fare       1
dtype: int64 



In [17]:
# pclass와 gender로 그룹화하여 그룹별 평균값으로 결측치 대체
age_mean = train.groupby(['pclass', 'gender'])['age'].transform(lambda x: x.fillna(x.mean()))
age_overall_mean = train['age'].mean()
fare_mean = train.groupby(['pclass', 'gender'])['fare'].transform(lambda x: x.fillna(x.mean()))
fare_overall_mean = train['fare'].mean()

# embarked와 cabin에 대해 mode()를 사용하는 이유는 두 열이 범주형 데이터이기 때문
embarked_mode = train['embarked'].mode().values[0]

age_mean, fare_mean, embarked_mode

(914    21.414891
 805     6.000000
 255     1.000000
 769    70.500000
 527    35.000000
          ...    
 403     1.000000
 824    34.000000
 879    10.000000
 214    24.000000
 636    70.000000
 Name: age, Length: 732, dtype: float64,
 914     7.7333
 805    31.2750
 255    12.1833
 769     7.7500
 527     7.1250
         ...   
 403    15.7417
 824     8.0500
 879    24.1500
 214    79.2000
 636    71.0000
 Name: fare, Length: 732, dtype: float64,
 'S')

In [18]:
# train과 test, ori_te 데이터마다 그룹화 결과가 달라서 결측치가 완전히 없어지지 않는 것으로 생각해서
# 그 경우에는 그냥 train 데이터의 전체 평균으로 대체
train['age'].fillna(age_mean, inplace=True)
train['age'].fillna(age_overall_mean, inplace=True)
test['age'].fillna(age_mean, inplace=True)
test['age'].fillna(age_overall_mean, inplace=True)
ori_te['age'].fillna(age_mean, inplace=True)
ori_te['age'].fillna(age_overall_mean, inplace=True)

train['fare'].fillna(fare_mean, inplace=True)
train['fare'].fillna(fare_overall_mean, inplace=True)
test['fare'].fillna(fare_mean, inplace=True)
test['fare'].fillna(fare_overall_mean, inplace=True)
ori_te['fare'].fillna(fare_mean, inplace=True)
ori_te['fare'].fillna(fare_overall_mean, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)
test['embarked'].fillna(embarked_mode, inplace=True)
ori_te['embarked'].fillna(embarked_mode, inplace=True)

# cabin의 경우 사용하지 않을 것이고, cabin 컬럼의 값이 있으면 1 없으면 0이라는 데이터가 들어가는 컬럼을 추가
# Cabin 존재 여부를 나타내는 변수 생성
train['has_cabin'] = train['cabin'].notnull().astype(int)
test['has_cabin'] = test['cabin'].notnull().astype(int)
ori_te['has_cabin'] = ori_te['cabin'].notnull().astype(int)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['age'].fillna(age_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['age'].fillna(age_overall_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

(580, 138, 296)

In [19]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True) # 모델이 학습하는데 사용하는 데이터
test.drop(drop_cols, axis=1, inplace=True) # 모델의 학습을 평가(잘했는지?? 못했는지??)하기 위한 데이터
ori_te.drop(drop_cols, axis=1, inplace=True) # 학습이 잘된 모델을 이용해서 내가 알고 싶은(ori_te) 데이터를 예측하게 하는 것

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (732, 12) / (184, 12)
after: (732, 9) / (184, 9)
<class 'pandas.core.frame.DataFrame'>
Index: 732 entries, 914 to 636
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  732 non-null    int64  
 1   pclass       732 non-null    int64  
 2   gender       732 non-null    object 
 3   age          732 non-null    float64
 4   sibsp        732 non-null    int64  
 5   parch        732 non-null    int64  
 6   fare         732 non-null    float64
 7   embarked     732 non-null    object 
 8   has_cabin    732 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 73.4+ KB


In [20]:
# 컬럼 삭제 이후에 결측치 있는지 확인
train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

### EDA

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 732 entries, 914 to 636
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  732 non-null    int64  
 1   pclass       732 non-null    int64  
 2   gender       732 non-null    object 
 3   age          732 non-null    float64
 4   sibsp        732 non-null    int64  
 5   parch        732 non-null    int64  
 6   fare         732 non-null    float64
 7   embarked     732 non-null    object 
 8   has_cabin    732 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 73.4+ KB


### Data Encoding

In [22]:
train.columns, len(train.columns)

(Index(['passengerid', 'pclass', 'gender', 'age', 'sibsp', 'parch', 'fare',
        'embarked', 'has_cabin'],
       dtype='object'),
 9)

In [23]:
# 범주형 컬럼들
# has_cabin 컬럼도 상태값을 가지고 있는 것이기 때문에 범주형으로 봐야 함.
enc_cols = ['gender', 'embarked', 'has_cabin']

# 수치형 컬럼들
# 집합은 중복을 제거하고, 차집합 연산을 수행할 수 있는 자료구조이므로 이를 활용하여 범주형 컬럼들과 분리
normal_cols = list(set(train.columns) - set(enc_cols))

print(f"범주형 컬럼들 : {enc_cols} / 수치형 컬럼들 : {normal_cols}")
print(f"범주형 컬럼들 개수: {len(enc_cols)} / 수치형 컬럼들 개수 : {len(normal_cols)}")

범주형 컬럼들 : ['gender', 'embarked', 'has_cabin'] / 수치형 컬럼들 : ['parch', 'sibsp', 'pclass', 'age', 'fare', 'passengerid']
범주형 컬럼들 개수: 3 / 수치형 컬럼들 개수 : 6


In [24]:
reset_seeds()
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()

# train
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(),
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)

# test
tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(),
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

# ori_test
tmp_te = pd.DataFrame(
    enc.transform(ori_te[enc_cols]).toarray(),
    columns = enc.get_feature_names_out()
)
enc_ori_te = pd.concat(
    [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (732, 9) / (184, 9)
after: (732, 13) / (184, 13)


Unnamed: 0,parch,sibsp,pclass,age,fare,passengerid,gender_female,gender_male,embarked_C,embarked_Q,embarked_S,has_cabin_0,has_cabin_1
0,0,0,3,21.414891,7.7333,914,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,2,4,3,6.0,31.275,805,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1,1,3,1.0,12.1833,255,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0,0,3,70.5,7.75,769,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0,0,3,35.0,7.125,527,0.0,1.0,0.0,0.0,1.0,1.0,0.0


## Training

In [25]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum(), enc_ori_te.isnull().sum().sum()

(0, 0, 0)

In [26]:
# row는 다르더라도 column은 같아야 함
enc_tr.shape, enc_te.shape, enc_ori_te.shape

((732, 13), (184, 13), (393, 13))

In [27]:
reset_seeds()

# Random Forest 하이퍼파라미터 설정
parameters = {
    'n_estimators': 500,
    'max_depth': 30,
    'min_samples_split': 5,
    'min_samples_leaf': 5,
    'max_features': 'sqrt',
    'random_state': args.random_seed
}

# Random Forest 모델 정의
model_rf_V1 = RandomForestClassifier(**parameters)

print(f'{model_rf_V1} : {enc_tr.shape} / {y_tr.shape}')
model_rf_V1.fit(enc_tr, y_tr)

RandomForestClassifier(max_depth=30, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=500, random_state=42) : (732, 13) / (732,)


## Evaluation (평가)

In [28]:
reset_seeds()

# Random Forest 모델
# - Train data
score_tr_rf = model_rf_V1.score(enc_tr, y_tr)
# - Test data
score_te_rf = model_rf_V1.score(enc_te, y_te)

print(f'{model_rf_V1} : {score_tr_rf}, {score_te_rf}')


RandomForestClassifier(max_depth=30, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=500, random_state=42) : 0.8770491803278688, 0.8586956521739131


### AUC 점수

In [29]:
y_pred = model_rf_V1.predict_proba(enc_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te, y_pred)

auc_te = auc(fpr, tpr)
print(f'{model_rf_V1}: {auc_te}')

RandomForestClassifier(max_depth=30, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=500, random_state=42): 0.9259398496240601


In [30]:
ori_te_pred = model_rf_V1.predict_proba(enc_ori_te)[:,1]
ori_te_pred.shape

(393,)

### 특성 중요도

In [31]:
df_feature_importances = pd.DataFrame(model_rf_V1.feature_importances_, enc_tr.columns).sort_values(by=[0], ascending=False).reset_index()

print(f'{df_feature_importances.shape}')
df_feature_importances

(13, 2)


Unnamed: 0,index,0
0,gender_female,0.341754
1,gender_male,0.337929
2,fare,0.073868
3,age,0.068291
4,passengerid,0.056572
5,pclass,0.029996
6,sibsp,0.027168
7,has_cabin_0,0.015208
8,parch,0.015198
9,has_cabin_1,0.013687


### 결과 정리

In [32]:
args.results.append(
    {
        'model': 'model_rf_V1',
        'score_tr': score_tr_rf,
        'score_te': score_te_rf,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': today
    }
)

In [33]:
pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)

Unnamed: 0,model,score_tr,score_te,auc_te,ori_te_pred,len_features,feaute_importances,create_dt
0,model_rf_V1,0.877049,0.858696,0.92594,"[0.8324515682544862, 0.8033673247193746, 0.831...",11,"[gender_female, gender_male, fare, age, passen...",210


# Submission

In [34]:
default_submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [35]:
default_submission['survived'] = ori_te_pred
print(f"{default_submission.isnull().sum().sum()}")
default_submission.head()

0


Unnamed: 0,passengerid,survived
0,916,0.832452
1,917,0.803367
2,918,0.831495
3,919,0.118582
4,920,0.88541


In [36]:
default_submission.to_csv(args.submission_csv, header=True, index=False)

# args.results 리스트에 저장하여 csv 형태로 저장
- 추후에 쉽게 모델, 파라미터 등을 위해서

In [37]:
# 폴더가 존재하지 않으면 생성
os.makedirs(history_results_dir, exist_ok=True)

# 방금 제출한 csv 파일의 파일명만 추출
submission_filename = os.path.basename(args.submission_csv)

# 한국 타임존 설정
kst = pytz.timezone('Asia/Seoul')

# 현재 한국 시간 가져오기
now_kst = datetime.now(kst)

args.results.append(
    {
        'model': 'model_rf_V1',
        'model_params': parameters,
        'score_tr': score_tr_rf,
        'score_te': score_te_rf,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': str(now_kst.strftime("%Y-%m-%d %H:%M:%S")),
        'submission_filename': submission_filename
    }
)
args.results

[{'model': 'model_rf_V1',
  'score_tr': 0.8770491803278688,
  'score_te': 0.8586956521739131,
  'auc_te': 0.9259398496240601,
  'ori_te_pred': array([0.83245157, 0.80336732, 0.83149451, 0.11858213, 0.88541024,
         0.88142453, 0.17276089, 0.16693497, 0.55821211, 0.13462167,
         0.16978171, 0.15921247, 0.53784947, 0.74987209, 0.17801022,
         0.26655029, 0.24256255, 0.12198167, 0.15374447, 0.12291235,
         0.1908287 , 0.15067842, 0.12508416, 0.12752809, 0.19309056,
         0.16519172, 0.22990846, 0.6680579 , 0.36061845, 0.18751829,
         0.70783993, 0.69881523, 0.28115407, 0.24418988, 0.28109273,
         0.14084707, 0.83635825, 0.12787603, 0.31467453, 0.20589292,
         0.11922555, 0.9068171 , 0.17900651, 0.91901902, 0.12004617,
         0.12666997, 0.19243579, 0.19328382, 0.15916191, 0.17406176,
         0.4390872 , 0.20856451, 0.94791557, 0.18969633, 0.58361214,
         0.82800716, 0.20659435, 0.14904112, 0.12908881, 0.53329075,
         0.87686353, 0.53245856

In [38]:
# args.results를 DataFrame으로 변환
df_results = pd.DataFrame(args.results)

# 이전 실험 결과가 저장된 파일이 있으면 불러오기
if os.path.exists(history_results_path):
    df_prev_results = pd.read_csv(history_results_path)
    df_results = pd.concat([df_prev_results, df_results], ignore_index=True)


# 데이터프레임을 CSV 파일로 저장
df_results.to_csv(history_results_path, index=False)

In [39]:
# 모든 실험에 대한 결과가 저장된 csv에서 상위 5개를 조회
history = pd.read_csv(history_results_path)
history.sort_values(by=['auc_te'], ascending=False).head()

Unnamed: 0,model,model_params,score_tr,score_te,auc_te,ori_te_pred,len_features,feaute_importances,create_dt,submission_filename
9,model_rf_V0,,1.0,0.875,0.929261,[0.87 0.81 0.93 0.07 0.94 0.96 0.22 0.14 0.49 ...,11,"['gender_female', 'gender_male', 'passengerid'...",210,
20,random_forest,"{'n_estimators': 500, 'max_depth': 30, 'min_sa...",0.877049,0.858696,0.92594,[0.83245157 0.80336732 0.83149451 0.11858213 0...,11,"['gender_female', 'gender_male', 'fare', 'age'...",2025-02-10 14:15:09,submission_0210_6.csv
19,model_rf_V1,,0.877049,0.858696,0.92594,[0.83245157 0.80336732 0.83149451 0.11858213 0...,11,"['gender_female', 'gender_male', 'fare', 'age'...",0210,
7,random_forest,"{'n_estimators': 500, 'max_depth': 10, 'min_sa...",0.894809,0.86413,0.924687,[0.85586648 0.77351232 0.86423613 0.1335128 0...,11,"['gender_female', 'gender_male', 'fare', 'age'...",209,submission_0209_2.csv
8,random_forest,"{'n_estimators': 500, 'max_depth': 30, 'min_sa...",0.877049,0.858696,0.924687,[0.84749016 0.80134122 0.85024549 0.12336316 0...,11,"['gender_female', 'gender_male', 'fare', 'age'...",209,submission_0209_2.csv
