Q. 타이타닉 생존자 예측 데이터 세트 train.csv에 대하여 다음 사항을 수행하세요.
- 일괄 전처리 사용자 함수 transform_features(df) 작성
- dt, lr, rf 모델링 및 평가(정확도)
- dt_clf , folds=5 적용하여 KFold 교차검증 수행
- dt_clf , cv=5 적용, cross_val_score를 이용하여 교차검증 수행
- GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.
  - parameters = {'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}
  - dt_clf, scoring='accuracy', cv=5 적용

In [14]:
# 구글 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# 데이터 확인
import pandas as pd
titanic_df = pd.read_csv('/content/drive/MyDrive/kdt_jyg/workspace/m5_머신러닝/train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# 일괄 전처리 사용자 함수 transform_features(df) 작성

from sklearn.preprocessing import LabelEncoder

# 1. Null 처리 함수
def fill_null_values(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 2. 머신러닝 불필요 속성 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 3. 레이블 인코딩
def encode_labels(df):
    le = LabelEncoder()
    df['Cabin'] = df['Cabin'].apply(lambda x: str(x)[0])
    df['Sex'] = le.fit_transform(df['Sex'])
    df['Embarked'] = le.fit_transform(df['Embarked'])
    df['Cabin'] = le.fit_transform(df['Cabin'])
    return df

# 종합하여 실행
def transform_data(df):
  df = fill_null_values(df)
  df = drop_features(df)
  df = encode_labels(df)

  return df

In [17]:
# 학습, 검증용 데이터 분할
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

titanic_df = pd.read_csv('/content/drive/MyDrive/kdt_jyg/workspace/m5_머신러닝/train.csv')
y_titanic_df = titanic_df['Survived']
x_titanic_df = titanic_df.drop('Survived', axis=1)
x_titanic_df = transform_data(x_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(x_titanic_df, y_titanic_df, test_size=0.2, random_state=120, stratify=y_titanic_df)

In [18]:
# dt 모델링 및 평가(정확도)
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'의사결정트리 정확도: {accuracy}')

의사결정트리 정확도: 0.770949720670391


In [19]:
# lr 모델링 및 평가(정확도)
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'로지스틱레그레션 정확도: {accuracy}')

로지스틱레그레션 정확도: 0.8100558659217877


In [20]:
# rf 모델링 및 평가(정확도)
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'랜덤포레스트 정확도: {accuracy}')

랜덤포레스트 정확도: 0.7988826815642458


In [22]:
# dt_clf , folds=5 적용하여 KFold 교차검증 수행
from sklearn.model_selection import KFold
import numpy as np
dt_clf = DecisionTreeClassifier()
kfold = KFold(n_splits=5, shuffle=True)
accuracy_list = []
chk_num = 0
for train_index, test_index in kfold.split(x_titanic_df):
  chk_num += 1
  X_train, X_test = x_titanic_df.iloc[train_index], x_titanic_df.iloc[test_index]
  y_train, y_test = y_titanic_df.iloc[train_index], y_titanic_df.iloc[test_index]
  dt_clf.fit(X_train, y_train)
  y_pred = dt_clf.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  print(f'{chk_num}회 정확도값: ', accuracy)
  print("="*70)
  accuracy_list.append(accuracy)

print(f'의사결정트리에 대한 KFold 정확도 평균: {np.mean(accuracy_list).round(2)}')

1회 정확도값:  0.7486033519553073
2회 정확도값:  0.7921348314606742
3회 정확도값:  0.797752808988764
4회 정확도값:  0.7752808988764045
5회 정확도값:  0.8033707865168539
의사결정트리에 대한 KFold 정확도 평균: 0.78


In [23]:
# dt_clf , cv=5 적용, cross_val_score를 이용하여 교차검증 수행
from sklearn.model_selection import cross_val_score
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_titanic_df, y_titanic_df)
scores = cross_val_score(dt_clf, x_titanic_df, y_titanic_df, cv=5, scoring='accuracy')
print(f'의사결정트리 KFold 교차검증 점수: {scores}')
print(f'의사결정트리 KFold 평균 점수: {np.mean(scores).round(2)}')

의사결정트리 KFold 교차검증 점수: [0.75418994 0.7752809  0.79775281 0.76966292 0.83707865]
의사결정트리 KFold 평균 점수: 0.79


In [27]:
# GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행.
# 전체 데이터에 대해 최적의 파라미터 찾기

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

dtree = DecisionTreeClassifier(random_state=1)

parameters = {
    'max_depth': list(range(1, 15)),
    'min_samples_leaf': list(range(1, 15)),
    'min_samples_split': list(range(1, 15))
}

grid_search = GridSearchCV(dtree, parameters, scoring='accuracy', cv=5, refit=True)
grid_search.fit(x_titanic_df, y_titanic_df)

best_mv = grid_search.best_estimator_
y_pred = best_mv.predict(x_titanic_df)

print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 교차 검증 정확도:", grid_search.best_score_)
print("최적 모델의 정확도:", accuracy_score(y_titanic_df, y_pred))

최적의 하이퍼파라미터: {'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 2}
최고 교차 검증 정확도: 0.8283221392254095
최적 모델의 정확도: 0.8540965207631874


In [31]:
# test 데이터가 주어지지 않았다고 가정하고 X_train, y_train에 대해서 최적의 파라미터값 찾기 수행

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

dtree = DecisionTreeClassifier(random_state=1)

parameters = {
    'max_depth': list(range(1, 11)),
    'min_samples_leaf': list(range(1, 11)),
    'min_samples_split': list(range(1, 11))
}

grid_search = GridSearchCV(dtree, parameters, scoring='accuracy', cv=5, refit=True)
grid_search.fit(X_train, y_train)

best_mv = grid_search.best_estimator_
y_pred = best_mv.predict(X_train)

print("최적의 하이퍼파라미터:", grid_search.best_params_)
print("최고 교차 검증 정확도:", grid_search.best_score_)
print("최적 모델의 정확도:", accuracy_score(y_train, y_pred))

최적의 하이퍼파라미터: {'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 2}
최고 교차 검증 정확도: 0.8204865556978232
최적 모델의 정확도: 0.847124824684432


In [33]:
# 그리드 서치 정확도 확인
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth= 4, min_samples_leaf= 2, min_samples_split= 2)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'의사결정트리 정확도(test와 비교): {accuracy}')

의사결정트리 정확도(test와 비교): 0.8089887640449438
