# 결정 트리 실습

사용자 행동 인식 데이터 세트

[원본 데이터셋]
https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones

데이터 세트 정보:

실험은 19-48세 사이의 30명의 지원자 그룹으로 수행되었습니다. 각자 스마트폰(Samsung Galaxy S II)을 허리에 차고 6가지 활동(
1:WALKING, 
2:WALKING_UPSTAIRS, 
3:WALKING_DOWNSTAIRS, 4:SITTING, 5:STANDING, 6:LAYING)을 수행했습니다. 내장된 가속도계와 자이로스코프를 사용하여 50Hz의 일정한 속도로 3축 선형 가속도와 3축 각속도를 캡처했습니다. 실험은 데이터에 수동으로 레이블을 지정하기 위해 비디오로 녹화되었습니다. 얻은 데이터 세트는 훈련 데이터 생성을 위해 70%, 테스트 데이터 생성을 위해 30%가 선택된 두 세트로 무작위로 분할되었습니다.

센서 신호(가속도계 및 자이로스코프)는 노이즈 필터를 적용하여 사전 처리된 다음 2.56초 및 50% 중첩(128 판독/창)의 고정 너비 슬라이딩 창에서 샘플링되었습니다. 중력 및 신체 운동 성분을 갖는 센서 가속도 신호는 Butterworth 저역 통과 필터를 사용하여 신체 가속도와 중력으로 분리되었습니다. 중력은 저주파 성분만 있다고 가정하므로 차단 주파수가 0.3Hz인 필터를 사용했습니다. 각 창에서 시간 및 주파수 영역에서 변수를 계산하여 특징 벡터를 얻었습니다.

이 데이터 세트에 대한 자세한 내용은 README.txt 파일을 확인하십시오.

참가자 중 한 명과 함께 녹화된 6가지 활동의 예를 포함하는 실험 비디오는 다음 링크에서 볼 수 있습니다.

https://www.youtube.com/watch?v=XOEN9W05_4A]


**Mission**

수집된 데이터를 기반으로 결정트리를 이용해 어떤 행동을 하고 있는지 예측해 봅시다.

In [None]:
# ## 코랩을 사용할 때
# #드라이브 마운트
# from google.colab import drive
# drive.mount('/content/drive')

# #현재 작업 위치 이동
# #띄어쓰기에 \붙일 것
# %cd /content/drive/Othercomputers/내\ 노트북_before/Devpy/13.머신러닝

In [134]:
import pandas as pd
import matplotlib.pyplot as plt

## 데이터 로드

1. 피처만 있는 파일을 로드
2. 훈련데이터와 피처를 합쳐 훈련 데이터 프레임 만듬
3. 데스트데이터와 피처를 합쳐 테스트 데이터 프레임 만듬

In [135]:
# 피처이름 로드 
feature_name_df = pd.read_csv('./dataset/human_activity/features_new.txt',sep='\s+', header=None,names=['column_name'])


In [136]:
feature_name_df # 이름리스트만 있다.

Unnamed: 0,column_name
0,tBodyAcc-mean()-X
1,tBodyAcc-mean()-Y
2,tBodyAcc-mean()-Z
3,tBodyAcc-std()-X
4,tBodyAcc-std()-Y
...,...
556,"angle(tBodyGyroMean,gravityMean)"
557,"angle(tBodyGyroJerkMean,gravityMean)"
558,"angle(X,gravityMean)"
559,"angle(Y,gravityMean)"


In [137]:
# 피처이름을 리스트로
feature_name = feature_name_df['column_name'].values.tolist()

In [138]:
feature_name # 파이썬의 리스트형태로 .

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X',
 'tBodyAcc-max()-Y',
 'tBodyAcc-max()-Z',
 'tBodyAcc-min()-X',
 'tBodyAcc-min()-Y',
 'tBodyAcc-min()-Z',
 'tBodyAcc-sma()',
 'tBodyAcc-energy()-X',
 'tBodyAcc-energy()-Y',
 'tBodyAcc-energy()-Z',
 'tBodyAcc-iqr()-X',
 'tBodyAcc-iqr()-Y',
 'tBodyAcc-iqr()-Z',
 'tBodyAcc-entropy()-X',
 'tBodyAcc-entropy()-Y',
 'tBodyAcc-entropy()-Z',
 'tBodyAcc-arCoeff()-X,1',
 'tBodyAcc-arCoeff()-X,2',
 'tBodyAcc-arCoeff()-X,3',
 'tBodyAcc-arCoeff()-X,4',
 'tBodyAcc-arCoeff()-Y,1',
 'tBodyAcc-arCoeff()-Y,2',
 'tBodyAcc-arCoeff()-Y,3',
 'tBodyAcc-arCoeff()-Y,4',
 'tBodyAcc-arCoeff()-Z,1',
 'tBodyAcc-arCoeff()-Z,2',
 'tBodyAcc-arCoeff()-Z,3',
 'tBodyAcc-arCoeff()-Z,4',
 'tBodyAcc-correlation()-X,Y',
 'tBodyAcc-correlation()-X,Z',
 'tBodyAcc-correlation()-Y,Z',
 'tGravityAcc-mean()-X',
 'tGravityA

In [139]:
# 학습 피처 데이터 셋과 테스트 피처 데이터을 DataFrame으로 로딩. 컬럼명은 feature_name 적용
# 학습용 x트레인의 값을 로드하면서 컬럼이름을 리스트에서 지정
X_test = pd.read_csv('./dataset/human_activity/test/X_test.txt',sep='\s+',header=None,names=feature_name)
X_train = pd.read_csv('./dataset/human_activity/train/X_train.txt',sep='\s+',header=None,names=feature_name)


In [140]:
# 학습 레이블과 테스트 레이블 데이터을 DataFrame으로 로딩하고 컬럼명은 action으로 부여
y_train = pd.read_csv('./dataset/human_activity/train/y_train.txt',sep='\s+',header=None,names=['action'])
y_test = pd.read_csv('./dataset/human_activity/test/y_test.txt',sep='\s+',header=None,names=['action'])

In [141]:
#action 피처의 class별 갯수
y_train['action'].value_counts()

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64

In [154]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
# DesisionTree분류 클래스 사용(randomstate 156)
dt_clf = DecisionTreeClassifier(random_state=156)

# 훈련
dt_clf.fit(X_train , y_train)

# 예측

pred1 = dt_clf.predict(X_test)

#평가
accuracy = accuracy_score(y_test,pred1)

print('결정트리 예측 정확도: ', np.round(accuracy, 4))

결정트리 예측 정확도:  0.8548


In [155]:
# DecisionTreeClassifier의 하이퍼 파라미터 추출
print('DecisionTreeClassifier 기본 하이퍼 파라미터:\n', dt_clf.get_params())

DecisionTreeClassifier 기본 하이퍼 파라미터:
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 156, 'splitter': 'best'}


## 하이퍼파라미터 튜닝1

In [156]:
from sklearn.model_selection import GridSearchCV

#max_depth를 바꿔가며 훈련해보자
#나무의 깊이를 얼마만큼 성장시킬것인가

params = {
    'max_depth' : [ 6, 8 ,10, 12, 16 ,20, 24]
}

dt_clf = DecisionTreeClassifier(random_state=156,min_samples_leaf=4,max_depth=6)




In [157]:
# GridSearchCV 사용

grid_clf  = GridSearchCV( dt_clf , param_grid=params, cv=3, refit=True)
# 훈련
grid_clf.fit(X_train,y_train)

In [158]:
grid_clf.best_params_

{'max_depth': 6}

In [159]:
grid_clf.best_score_

0.847794569480179

In [160]:
# GridSearchCV객체의 cv_results_ 속성을 DataFrame으로 생성. 
grid_df = pd.DataFrame(grid_clf.cv_results_)
grid_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1.51585,0.039595,0.008686,0.000966,6,{'max_depth': 6},0.845777,0.864137,0.833469,0.847795,0.012601,1
1,1.800787,0.074806,0.005335,0.003773,8,{'max_depth': 8},0.838433,0.821705,0.835918,0.832019,0.007365,2
2,2.066274,0.103346,0.008028,4.8e-05,10,{'max_depth': 10},0.824153,0.844961,0.807755,0.825623,0.015225,3
3,2.199202,0.118835,0.006627,0.001891,12,{'max_depth': 12},0.824153,0.835577,0.814694,0.824808,0.008538,4
4,2.242754,0.139656,0.004202,0.003278,16,{'max_depth': 16},0.824561,0.835577,0.813061,0.8244,0.009193,5
5,2.239369,0.13997,0.008005,4e-06,20,{'max_depth': 20},0.824561,0.835577,0.813061,0.8244,0.009193,5
6,2.241991,0.134092,0.007962,5e-05,24,{'max_depth': 24},0.824561,0.835577,0.813061,0.8244,0.009193,5


In [162]:

#베스트 분류기 선택
estimator = grid_clf.best_estimator_

 #예측

pred2 = estimator.predict(X_test)

#평가
print('결정트리 예측 정확도: ', np.round(accuracy, 4))
print('결정트리 예측 정확도: ', np.round(accuracy_score(y_test,pred2), 4))


결정트리 예측 정확도:  0.8548
결정트리 예측 정확도:  0.8558


```
    하이퍼파라미터 튜닝으로 성능이 향상 되었나요? 조금달라졌어요,,
```

## 하이퍼파라미터 튜닝2

In [163]:
#하이퍼파라미터
params = {
    'max_depth' : [ 8 , 12, 16 ,20], 
    'min_samples_split' : [16,24],
}

dt_clf = DecisionTreeClassifier(random_state=156,min_samples_leaf=4,max_depth=6)

In [164]:
# GridSearchCV 사용

grid_clf  = GridSearchCV( dt_clf , param_grid=params, cv=3, refit=True)
# 훈련
grid_clf.fit(X_train,y_train)


In [165]:
#베스트 분류기 선택

estimator = grid_clf.best_estimator_

#예측
pred3 = estimator.predict(X_test)

#평가
print('결정트리 예측 정확도: ', np.round(accuracy_score(y_test,pred1), 4))
print('결정트리 예측 정확도: ', np.round(accuracy_score(y_test,pred2), 4))
print('결정트리 예측 정확도: ', np.round(accuracy_score(y_test,pred3), 4))

결정트리 예측 정확도:  0.8548
결정트리 예측 정확도:  0.8558
결정트리 예측 정확도:  0.8714


```
    하이퍼파라미터 튜닝으로 성능이 향상 되었나요?
```

---

**[생활탐구]**


```
방문을 꼭 닫은채 공부하고 있다는 보검이...
정말 보검이는 공부하고 있을까요?
생체 신호 측정기를 착용한 보검이로 부터 아래와 같은 신호를 가져왔습니다. 
지금 현재 보검이는 무엇을 하고 있나요?

[보검이 생체 신호 파일]X_quiz.txt 

```

In [442]:
X_quiz = pd.read_csv('./dataset/human_activity/test/X_quiz.txt' ,header=None,names=['action'],sep=' ')

In [443]:
X_quiz

Unnamed: 0,action
0,",0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,1..."
1,"0,0.25717778,-0.02328523,-0.014653762,-0.93840..."


In [444]:
X_quiz = pd.DataFrame(X_quiz['action'].str.split(',',561).tolist())

In [445]:
X_quiz.drop(0,axis=1,inplace=True)
X_quiz.drop(0,axis=0,inplace=True)
X_quiz.columns = range(561)

In [446]:
X_quiz

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
1,0.25717778,-0.02328523,-0.014653762,-0.938404,-0.92009078,-0.66768331,-0.95250112,-0.92524867,-0.67430222,-0.89408755,...,0.071645446,-0.33037044,-0.70597388,0.0064624029,0.16291982,-0.82588562,0.27115145,-0.72000927,0.27680104,-0.057978304


In [447]:
grid_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1.51585,0.039595,0.008686,0.000966,6,{'max_depth': 6},0.845777,0.864137,0.833469,0.847795,0.012601,1
1,1.800787,0.074806,0.005335,0.003773,8,{'max_depth': 8},0.838433,0.821705,0.835918,0.832019,0.007365,2
2,2.066274,0.103346,0.008028,4.8e-05,10,{'max_depth': 10},0.824153,0.844961,0.807755,0.825623,0.015225,3
3,2.199202,0.118835,0.006627,0.001891,12,{'max_depth': 12},0.824153,0.835577,0.814694,0.824808,0.008538,4
4,2.242754,0.139656,0.004202,0.003278,16,{'max_depth': 16},0.824561,0.835577,0.813061,0.8244,0.009193,5
5,2.239369,0.13997,0.008005,4e-06,20,{'max_depth': 20},0.824561,0.835577,0.813061,0.8244,0.009193,5
6,2.241991,0.134092,0.007962,5e-05,24,{'max_depth': 24},0.824561,0.835577,0.813061,0.8244,0.009193,5


In [448]:
grid_df.describe()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,2.043747,0.107186,0.006978,0.00143,0.829457,0.839016,0.818717,0.829064,0.010187,3.571429
std,0.28271,0.037905,0.001662,0.001591,0.008898,0.012996,0.011149,0.008706,0.002734,1.618347
min,1.51585,0.039595,0.004202,4e-06,0.824153,0.821705,0.807755,0.8244,0.007365,1.0
25%,1.933531,0.089076,0.005981,4.9e-05,0.824357,0.835577,0.813061,0.8244,0.008866,2.5
50%,2.199202,0.118835,0.007962,0.000966,0.824561,0.835577,0.813061,0.824808,0.009193,4.0
75%,2.24068,0.136874,0.008017,0.002584,0.831497,0.840269,0.824082,0.828821,0.010897,5.0
max,2.242754,0.13997,0.008686,0.003773,0.845777,0.864137,0.835918,0.847795,0.015225,5.0


In [449]:
X_quiz.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
unique,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
top,0.25717778,-0.02328523,-0.014653762,-0.938404,-0.92009078,-0.66768331,-0.95250112,-0.92524867,-0.67430222,-0.89408755,...,0.071645446,-0.33037044,-0.70597388,0.0064624029,0.16291982,-0.82588562,0.27115145,-0.72000927,0.27680104,-0.057978304
freq,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [450]:
pd.read_csv('./dataset/human_activity/activity_labels.txt')

Unnamed: 0,1 WALKING
0,2 WALKING_UPSTAIRS
1,3 WALKING_DOWNSTAIRS
2,4 SITTING
3,5 STANDING
4,6 LAYING


In [451]:
estimator = grid_clf.best_estimator_
pred4 = estimator.predict(X_quiz)
pred4



array([5], dtype=int64)

### *STANDING ?* 
> 아이고 파일을 잘못 리드했네요..!

----

> 다시..

In [458]:
X_quiz55 = pd.read_csv('./dataset/human_activity/test/X_quiz55.txt',header=None)
X_quiz55

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,0.173769,-0.024869,-0.094733,-0.893862,-0.944204,-0.960824,-0.904192,-0.950321,-0.967834,-0.923737,...,-0.267475,0.199002,-0.324917,-0.089103,0.10531,0.183368,-0.169434,0.751465,-0.484928,-0.505


In [459]:
pred5 = estimator.predict(X_quiz55)
pred5



array([6], dtype=int64)

### LAYING!