In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the Titanic training CSV, then separate the feature columns from
# the 'Survived' target column.
titanic_df = pd.read_csv('./titanic_train.csv')
X_titanic_df = titanic_df.drop(columns=['Survived'])
y_titanic_df = titanic_df['Survived']

# Step 1
# Split features and target into training and test sets (test_size=0.2,
# fixed random_state so the split is reproducible).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)

# Step 2
# Module-level values. NOTE(review): none of these are read by the code
# visible in this cell -- confirm no other cell uses them before removing.
mean_Age = titanic_df['Age'].mean()
le_Embarked = [0]
le_Sex = [0]
le_Cabin = [0]
# Null 처리 함수
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age is filled with the mean of ``df['Age']``, Cabin and Embarked with
    the placeholder ``'N'``, and Fare with ``0``. The frame is modified in
    place and is also returned.

    Args:
        df: DataFrame containing 'Age', 'Cabin', 'Embarked' and 'Fare'
            columns.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that chained in-place form is
    # deprecated in pandas 2.x and leaves ``df`` unchanged under
    # Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    """Remove the PassengerId, Name and Ticket columns from ``df`` in place.

    Args:
        df: DataFrame containing the three columns listed above.

    Returns:
        The same DataFrame object that was passed in.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# 레이블 인코딩 수행.
def format_features(df1,df2):
    """Label-encode Cabin, Sex and Embarked, fitting on ``df1`` only.

    Cabin is first reduced to its first character in both frames. For each
    of the three columns a ``LabelEncoder`` is fitted on ``df1``, its
    classes are printed, and both frames are encoded with it. Both frames
    are modified in place.

    Args:
        df1: Training DataFrame; the encoders are fitted on its values.
        df2: Test DataFrame; encoded with the encoders fitted on ``df1``.
            Values that do not occur in ``df1`` are encoded as ``-1``.

    Returns:
        Tuple ``(df1, df2)`` of the same two DataFrame objects.
    """
    # Keep only the first character of each Cabin value.
    df1['Cabin'] = df1['Cabin'].str[:1]
    df2['Cabin'] = df2['Cabin'].str[:1]

    for feature in ('Cabin', 'Sex', 'Embarked'):
        # Fit on the training data only.
        le = LabelEncoder().fit(df1[feature])
        print(le.classes_)
        code_by_label = {label: code for code, label in enumerate(le.classes_)}

        # Encode the training data.
        df1[feature] = le.transform(df1[feature])

        # Encode the test data through the fitted mapping rather than
        # le.transform, which raises ValueError for a label that was not
        # present in the training data; such labels become -1 here.
        df2[feature] = (
            df2[feature].map(code_by_label).fillna(-1).astype('int64')
        )

    return df1, df2



# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df1,df2):
    """Run the full preprocessing pipeline on a train/test pair.

    Steps, in order: fill missing values, drop unused columns, then
    label-encode the categorical columns. Both frames are modified in
    place by the helper functions.

    Args:
        df1: Training feature DataFrame.
        df2: Test feature DataFrame.

    Returns:
        Tuple ``(df1, df2)`` of the processed frames.
    """
    # Missing test-set ages are filled with the training-set mean so that
    # every statistic applied to the test data is learned from the training
    # data only (the encoders in format_features are likewise fitted on
    # df1). This runs before fillna(df2), which would otherwise fill them
    # with df2's own mean.
    train_age_mean = df1['Age'].mean()
    df2['Age'] = df2['Age'].fillna(train_age_mean)

    df1 = fillna(df1)
    df2 = fillna(df2)
    df1 = drop_features(df1)
    df2 = drop_features(df2)
    df1, df2 = format_features(df1, df2)
    return df1, df2



# Preprocessing must be applied after the data has been split into
# training and test sets.
X_train, X_test = transform_features(df1=X_train, df2=X_test)





# Training: grid-search DecisionTreeClassifier hyperparameters with
# 5-fold cross-validation, scored by accuracy.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

dt_clf = DecisionTreeClassifier(random_state=11)
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

grid_dclf = GridSearchCV(
    estimator=dt_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_dclf.fit(X_train, y_train)
best_dclf = grid_dclf.best_estimator_
print('GridSearchCV 최적 하이퍼 파라미터 :', grid_dclf.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_))

# Test: predict on the held-out set with the best estimator and report
# its accuracy.
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=dpredictions)
print('테스트 세트에서의 DecisionTreeClassifier 정확도 : {0:.4f}'.format(accuracy))

['A' 'B' 'C' 'D' 'E' 'F' 'G' 'N' 'T']
['female' 'male']
['C' 'N' 'Q' 'S']
GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.7992
테스트 세트에서의 DecisionTreeClassifier 정확도 : 0.8715
