# 결정 트리

### 결정 트리 모델의 시각화

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import graphviz
import warnings
warnings.filterwarnings('ignore')

In [2]:
dt_clf = DecisionTreeClassifier(random_state = 156)
iris_data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size = 0.2, random_state = 11)
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=156)

In [3]:
export_graphviz(dt_clf, out_file = "tree.dot", class_names = iris_data.target_names
                , feature_names = iris_data.feature_names, impurity = True, filled = True)

In [4]:
with open("tree.dot") as f:
    dot_graph = f.read()
    src = graphviz.Source(dot_graph)
    src.view()

### 결정 트리 실습 - 사용자 행동 인식 데이터 세트

In [5]:
feature_name_df = pd.read_csv('C:/Users/creade/data_science/파이썬 머신러닝 완벽 가이드/human_activity/human_activity/features.txt'
                              , sep = '\s+', header = None, names = ['column_index', 'column_name'])

feature_name = feature_name_df.iloc[:, 1].values.tolist()

In [6]:
feature_dup_df = feature_name_df.groupby('column_name').count()
print(feature_dup_df[feature_dup_df['column_index'] > 1].count())

column_index    42
dtype: int64


In [7]:
#원본 피처명에 _1 또는 _2를 추가로 부여해 새로운 피처명을 가지는 DataFrame을 반환
def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data = old_feature_name_df.groupby('column_name').cumcount(), columns = ['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how = 'outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0] + '_' + str(x[1]) if x[1] > 0 else x[0], axis = 1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis = 1)
    return new_feature_name_df

In [8]:
def get_human_dataset():
    
    feature_name_df = pd.read_csv('C:/Users/creade/data_science/파이썬 머신러닝 완벽 가이드/human_activity/human_activity/features.txt'
                                 , sep = '\s+', header = None, names = ['column_index', 'column_name'])
    
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
    
    X_train = pd.read_csv('C:/Users/creade/data_science/파이썬 머신러닝 완벽 가이드/human_activity/human_activity/train/X_train.txt'
                         , sep = "\s+", names = feature_name)
    X_test = pd.read_csv('C:/Users/creade/data_science/파이썬 머신러닝 완벽 가이드/human_activity/human_activity/test/X_test.txt'
                         , sep = "\s+", names = feature_name)
    y_train = pd.read_csv('C:/Users/creade/data_science/파이썬 머신러닝 완벽 가이드/human_activity/human_activity/train/y_train.txt'
                         , sep = "\s+", header = None, names = ['action'])
    y_test = pd.read_csv('C:/Users/creade/data_science/파이썬 머신러닝 완벽 가이드/human_activity/human_activity/test/y_test.txt'
                         , sep = "\s+", header = None, names = ['action'])
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_human_dataset()

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, tBodyAcc-mean()-X to angle(Z,gravityMean)
dtypes: float64(561)
memory usage: 31.5 MB


In [10]:
y_train['action'].value_counts()

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64

In [11]:
dt_clf = DecisionTreeClassifier(random_state = 156)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('의사 결정 트리 정확도: {0:.4f}'.format(accuracy))

의사 결정 트리 정확도: 0.8548


In [12]:
params = {'max_depth' : [6, 8, 10, 12, 16, 20, 24]}

grid_cv = GridSearchCV(dt_clf, param_grid = params, scoring = 'accuracy', cv = 5, verbose = 1)
grid_cv.fit(X_train, y_train)
print('GridSearchCV 최고 평균 정확도 수치 : {0:.4f}'.format(grid_cv.best_score_))
print('GridSearchCV 최적 하이퍼 파라미터 :',grid_cv.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
GridSearchCV 최고 평균 정확도 수치 : 0.8513
GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 16}


In [13]:
cv_result_df = pd.DataFrame(grid_cv.cv_results_)
cv_result_df[['param_max_depth', 'mean_test_score']]

Unnamed: 0,param_max_depth,mean_test_score
0,6,0.850791
1,8,0.851069
2,10,0.851209
3,12,0.844135
4,16,0.851344
5,20,0.8508
6,24,0.84944


In [14]:
max_depths = [6, 8, 10, 12, 16, 20, 24]

for depth in max_depths:
    dt_clf = DecisionTreeClassifier(max_depth = depth, random_state = 156)
    dt_clf.fit(X_train, y_train)
    dt_clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print('max_depth = {0}, 정확도 : {1:.4f}'.format(depth, accuracy))

max_depth = 6, 정확도 : 0.8548
max_depth = 8, 정확도 : 0.8548
max_depth = 10, 정확도 : 0.8548
max_depth = 12, 정확도 : 0.8548
max_depth = 16, 정확도 : 0.8548
max_depth = 20, 정확도 : 0.8548
max_depth = 24, 정확도 : 0.8548
