# UCI HAR

- 교차 검증
- 하이퍼 파라미터 튜닝
    - 최적의 하이퍼 파라미터(max_depth, min_samples_split)와 정확도를 구하시오.
    - 사람이 움직이는데 있어서 어떤 Feature가 중요한지 결정트리를 이용해서 20개만 골라내세요.

```
DATA_PATH = ../00.data/archive.ics.uci.edu/UCI_HAR_Dataset
```

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# features.txt is whitespace-separated with no header row; the preview shows an
# index in column 0 and the feature name in column 1.
features_file = '../00.data/archive.ics.uci.edu/UCI_HAR_Dataset/features.txt'
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
features_data = pd.read_csv(features_file, header=None, sep=r'\s+')
features_data.head()

Unnamed: 0,0,1
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [4]:
# Load the predefined UCI HAR train/test feature matrices and label vectors.
# All four files are whitespace-separated without a header row; sep=r'\s+' is
# the supported replacement for the deprecated delim_whitespace=True.
X_train_file = '../00.data/archive.ics.uci.edu/UCI_HAR_Dataset/train/X_train.txt'
X_train_data = pd.read_csv(X_train_file, header=None, sep=r'\s+')

y_train_file = '../00.data/archive.ics.uci.edu/UCI_HAR_Dataset/train/y_train.txt'
y_train_data = pd.read_csv(y_train_file, header=None, sep=r'\s+')

X_test_file = '../00.data/archive.ics.uci.edu/UCI_HAR_Dataset/test/X_test.txt'
X_test_data = pd.read_csv(X_test_file, header=None, sep=r'\s+')

y_test_file = '../00.data/archive.ics.uci.edu/UCI_HAR_Dataset/test/y_test.txt'
y_test_data = pd.read_csv(y_test_file, header=None, sep=r'\s+')

In [5]:
# Sanity check: row/column counts of the four loaded frames, in load order.
tuple(frame.shape for frame in (X_train_data, y_train_data, X_test_data, y_test_data))

((7352, 561), (7352, 1), (2947, 561), (2947, 1))

In [6]:
from sklearn.model_selection import train_test_split

# Carve a 20% hold-out set out of the *training* files only; the X_test_data /
# y_test_data frames loaded earlier are not touched here.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_data,
    y_train_data,
    test_size=0.2,
    random_state=2021,  # fixed seed so the split is repeatable
)

In [8]:
# Reference: Data Science School - model optimisation
# https://datascienceschool.net/03%20machine%20learning/14.01%20모형%20최적화.html

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Seed the tree (same seed as the train/validation split): scikit-learn permutes
# the candidate features at every split, so an unseeded tree can yield different
# cross-validation scores on every run.
estimator = DecisionTreeClassifier(random_state=2021)

In [8]:
# Coarse, power-of-ten search grid for the decision tree.
# min_samples_split starts at 2: scikit-learn rejects the integer 1 (a split
# needs at least two samples), so a candidate of 1 can never be fitted.
param_grid = {
    'max_depth': [1, 10, 100],
    'min_samples_split': [2, 10, 100],
}

# 10-fold cross-validated exhaustive search, scored by accuracy, on all cores.
gs = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2
)
gs = gs.fit(X_train, y_train)

print('accuracy: {:.4f}'.format(gs.best_score_))
print('parameters: {}'.format(gs.best_params_))

Fitting 10 folds for each of 9 candidates, totalling 90 fits
accuracy: 0.9347
parameters: {'max_depth': 10, 'min_samples_split': 10}


In [12]:
def reclusive_best_params_(param_grid=None, best_params_=None, best_score_=None, **kwargs):
    """Pin grid entries to the previous best values that can still be refined.

    Work in progress: only the bookkeeping step is implemented; the recursive
    grid search sketched at the bottom is still disabled.

    For every ``name: value`` pair in ``best_params_`` the pair is printed and
    the number of ``'0'`` characters in ``str(value)`` is recorded. Values that
    contain at least one zero overwrite ``param_grid[name]`` in place; values
    without a zero (including non-numeric ones such as ``None`` or ``'gini'``)
    leave ``param_grid`` untouched.

    Parameters
    ----------
    param_grid : dict, optional
        Grid to update in place. A fresh dict is used when omitted, so no state
        is shared between calls.
    best_params_ : dict, optional
        Best parameters of a previous search. When omitted, a hard-coded
        placeholder is used so the function can be exercised without a search.
    best_score_ : float, optional
        Currently unused.
    **kwargs
        Currently unused; intended to be forwarded to ``GridSearchCV``.

    Returns
    -------
    None
    """
    if param_grid is None:
        param_grid = {}

    if best_params_ is None:
        # Placeholder values for experimenting with the digit logic.
        best_params_ = {
            'max_depth': 1,
            'min_samples_split': 100,
            'min_samples_leaf': 220,
        }

    # Zero count per parameter; only read by the disabled recursion below.
    completed = []

    for k, v in best_params_.items():
        print(k, v)

        zero_count = str(v).count('0')
        completed.append(zero_count)

        if zero_count < 1:
            # No zero digit left: nothing to refine for this parameter.
            continue

        param_grid.update({
            k: v
        })

    # TODO: planned recursion, not enabled yet.
    # NOTE(review): param_grid values are scalars at this point; they presumably
    # need to be expanded into candidate lists before GridSearchCV can use them.

    # return completed

    # gs = GridSearchCV(
    #     param_grid=param_grid,
    #     **kwargs
    # )
    # gs = gs.fit(X_train, y_train)

    # if sum(completed) > 0:
    #     return reclusive_best_params_(
    #         param_grid=param_grid,
    #         best_params_=gs.best_params_,
    #         best_score_=gs.best_score_,
    #         **kwargs
    #     )
    # return {
    #     'accuracy: {}'.format(gs.best_score_),
    #     'parameters: {}'.format(gs.best_params_),
    # }

# Start from a coarse power-of-ten grid; the remaining keyword arguments are
# the GridSearchCV settings the helper is meant to forward.
reclusive_best_params_(
    param_grid={
        'max_depth': [1, 10, 100],
        # 2 is the smallest integer min_samples_split scikit-learn trees accept.
        'min_samples_split': [2, 10, 100],
        'min_samples_leaf': [1, 10, 100],
    },
    estimator=estimator,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=2
)

max_depth 1
min_samples_split 100
min_samples_leaf 220


In [None]:
# Design notes (not executable): start from a coarse power-of-ten grid, then
# refine the best value one decimal digit at a time.
# Each line reads: best value so far -> next candidate list.
#
# [1, 10, 100, 1000]
#
# 1000 [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]
# 2000 [2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900]
# 2100 [2100, 2110, 2120, 2130, 2140, 2150, 2160, 2170, 2180, 2190]
# 2110 [2110, 2111, 2112, 2113, 2114, 2115, 2116, 2117, 2118, 2119]
#
# 100 [100, 200, 300, 400, 500, 600, 700, 800, 900]
# 200 [200, 210, 220, 230, 240, 250, 260, 270, 280, 290]
# 210 [210, 211, 212, 213, 214, 215, 216, 217, 218, 219]
#
# 10 [10, 20, 30, 40, 50, 60, 70, 80, 90]
# 20 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
#
# 1 [1, 2, 3, 4, 5, 6, 7, 8, 9]

In [82]:
# Probe: which characters survive once the non-zero digits are stripped?
n = 12300
word = ''.join(ch for ch in str(n) if ch not in '123456789')
nums = list(map(int, word))
nums

[0, 0]

In [None]:

# step = int('1' + '0' * (len(nums)-1))
# value = list(np.arange(v, v*10, step))
# param_grid.update({k: value})

# if sum(nums) == 1:
# elif sum(nums) > 1:
#     if len(nums) == 1:
#         param_grid.update({k: v})
#     if len(nums) > 1:
#         step = int('1' + '0' * (len(nums)-1))
#         value = list(np.arange(v, v+10, step))
#         param_grid.update({k: value})

# if sum(nums) == 1:
#     step = int('1' + '0' * (len(nums)-1))
#     param_grid.update({
#         k: list(np.arange(v, v*10, step))
#     })
# elif sum(nums) > 1:


In [42]:
# Probe: build the step size "1 followed by (2-1) zeros" as an integer.
int(''.join(['1'] + ['0'] * (2 - 1)))

10

In [20]:
# Probe: split the decimal representation of 10 into its digits.
word = str(10)
nums = list(map(int, word))
nums

[1, 0]

In [21]:
# Digit sum and digit count of the value split in the previous cell.
tuple(measure(nums) for measure in (sum, len))

(1, 2)

In [47]:
import re

# Probe: keep only the '1' characters of each candidate and read them back as an int.
max_depth = [1, 10, 100, 1000]
max_depth = [int(re.sub(r'[^1]', '', str(candidate))) for candidate in max_depth]
max_depth

[1, 1, 1, 1]

In [22]:

    # Re-run of the 10-fold accuracy grid search.
    # NOTE(review): relies on whatever `param_grid` currently holds in the kernel.
    gs = GridSearchCV(estimator, param_grid, scoring='accuracy', cv=10, n_jobs=-1, verbose=2)
    gs = gs.fit(X_train, y_train)

    # Report the best mean cross-validated score and the parameters behind it.
    print('accuracy: {}'.format(gs.best_score_))
    print('parameters: {}'.format(gs.best_params_))

TypeError: unsupported operand type(s) for *: 'NoneType' and 'int'

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score
import graphviz

# Fit a shallow tree and score it on the hold-out split.
dt_clf = DecisionTreeClassifier(max_depth=3)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
print('정확도: {}'.format(accuracy_score(y_test, pred)))

# Label the graph with this dataset's own names; the previous `iris` object is
# never defined in this notebook.
# Column 1 of features.txt holds the feature names.
# Assumes one row per X column; export_graphviz raises if the lengths differ.
feature_names = features_data[1].tolist()
# export_graphviz expects string class names in ascending label order, which is
# the order of classes_ on the fitted tree.
class_names = [str(label) for label in dt_clf.classes_]

# export_graphviz() writes the tree to the file given as out_file (tree.dot)
export_graphviz(dt_clf,
    out_file='tree.dot',
    class_names=class_names,
    feature_names=feature_names,
    impurity=True,
    filled=True
)

# Read the generated tree.dot back and render it inline with graphviz
with open('tree.dot') as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)