In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
# Import from the public package path: the private `sklearn.tree.export`
# module was removed in scikit-learn 0.24.
from sklearn.tree import export_text

# 데이터 준비

In [3]:
# Load the passenger table and drop every row that has a missing value in any
# column. The result is assigned rather than modified with `inplace=True`, so
# `df` is rebuilt from the file each time this cell runs.
df = pd.read_excel('titanic.xls').dropna(axis=0)

# Features: every column except the target. The explicit copy makes X an
# independent frame, so assigning new columns to X cannot trigger pandas'
# SettingWithCopyWarning.
X = df.loc[:, df.columns != "survived"].copy()

# Target: the "survived" column.
Y = df["survived"]

In [4]:
def one_hot(frame, column, names):
    """Return indicator columns for ``frame[column]``, labelled with ``names``.

    ``pd.get_dummies`` produces one column per distinct value; ``names`` are
    applied to those columns positionally, in the order pandas returns them.

    Args:
        frame: DataFrame holding the categorical column.
        column: Name of the column to encode.
        names: New column labels, one per distinct value.

    Returns:
        DataFrame of indicator columns sharing ``frame``'s index.

    Raises:
        ValueError: If the number of distinct values differs from ``len(names)``.
    """
    encoded = pd.get_dummies(frame[column])
    if encoded.shape[1] != len(names):
        raise ValueError(
            "column %r has %d distinct values, expected %d (%s)"
            % (column, encoded.shape[1], len(names), ", ".join(names))
        )
    encoded.columns = list(names)
    return encoded


# Build the model's feature matrix from `df` instead of adding columns to X
# and then overwriting X. The cell can be re-run safely because its input is
# never modified.
X = pd.concat(
    [
        one_hot(df, "pclass", ["1st_class", "2nd_class", "3rd_class"]),
        # Only the "Male" indicator is kept as a feature.
        one_hot(df, "sex", ["Female", "Male"])[["Male"]],
        one_hot(df, "embarked", ["C", "Q", "S"]),
        df[["age", "sibsp", "parch", "fare"]],
    ],
    axis=1,
)

In [5]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=77,
)

# 트리 만들기

파라미터 설명

criterion : {"gini", "entropy"}, default="gini" <br> max_depth : int, default=None <br> min_samples_split : int or float, default=2 <br> min_samples_leaf : int or float, default=1

기타 파라미터들 및 설명: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

### 깊이 3

In [6]:
# Fit a decision tree limited to depth 3 on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
titanic_tree = DecisionTreeClassifier(max_depth=3, random_state=77).fit(X_train, Y_train)

In [7]:
# Render the fitted tree's decision rules as indented plain text, labelling
# each split with the corresponding column name of X.
r = export_text(
    titanic_tree,
    feature_names=X.columns.tolist(),
)
print(r)

|--- Male <= 0.50
|   |--- 3rd_class <= 0.50
|   |   |--- fare <= 26.12
|   |   |   |--- class: 1
|   |   |--- fare >  26.12
|   |   |   |--- class: 1
|   |--- 3rd_class >  0.50
|   |   |--- fare <= 23.09
|   |   |   |--- class: 1
|   |   |--- fare >  23.09
|   |   |   |--- class: 0
|--- Male >  0.50
|   |--- age <= 14.25
|   |   |--- sibsp <= 2.50
|   |   |   |--- class: 1
|   |   |--- sibsp >  2.50
|   |   |   |--- class: 0
|   |--- age >  14.25
|   |   |--- 1st_class <= 0.50
|   |   |   |--- class: 0
|   |   |--- 1st_class >  0.50
|   |   |   |--- class: 0



In [8]:
# Write the fitted tree to a Graphviz .dot file in the working directory.
# class_names are matched to the classes in ascending order, so class 0 is
# labelled "dead" and class 1 "survived".
export_graphviz(
    titanic_tree,
    out_file="tree_model.dot",
    feature_names=X.columns.tolist(),
    class_names=["dead", "survived"],
)

생성된 tree_model.dot 파일의 내용을 http://www.webgraphviz.com/ 에 붙여넣으면 트리를 그림으로 확인할 수 있습니다.

![a](depth3.png "depth3")

In [9]:
# Accuracy of the current tree on the rows it was trained on.
Y_pred_tr = titanic_tree.predict(X=X_train)
print('Accuracy: %.4f' % accuracy_score(y_true=Y_train, y_pred=Y_pred_tr))

Accuracy: 0.8178


In [10]:
# Accuracy of the current tree on the held-out test rows.
Y_pred_test = titanic_tree.predict(X=X_test)
print('Accuracy: %.4f' % accuracy_score(y_true=Y_test, y_pred=Y_pred_test))

Accuracy: 0.8083


### 깊이 10

In [11]:
# Refit with the depth limit raised to 10; this replaces the previous
# `titanic_tree` object.
# fit() returns the estimator itself, so construction and fitting are chained.
titanic_tree = DecisionTreeClassifier(max_depth=10, random_state=77).fit(X_train, Y_train)

In [12]:
# Write the current tree to a Graphviz .dot file in the working directory.
# An existing tree_model.dot is overwritten.
export_graphviz(
    titanic_tree,
    out_file="tree_model.dot",
    feature_names=X.columns.tolist(),
    class_names=["dead", "survived"],
)

![a](depth10.png "depth10")

In [13]:
# Accuracy of the current tree on the rows it was trained on.
Y_pred_tr = titanic_tree.predict(X=X_train)
print('Accuracy: %.4f' % accuracy_score(y_true=Y_train, y_pred=Y_pred_tr))

Accuracy: 0.9466


In [14]:
# Accuracy of the current tree on the held-out test rows.
Y_pred_test = titanic_tree.predict(X=X_test)
print('Accuracy: %.4f' % accuracy_score(y_true=Y_test, y_pred=Y_pred_test))

Accuracy: 0.7732
