#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

* 나만의 앙상블 모델 만들기.
    * 1. 다양한 종류의 모델을 생성(미션3에서 생성한 모델 활용)
    * 2. 각 모델을 통합하여 예측하는 앙상블 모델링
        * Voting, Stacking
        * 앙상블을 위한 다양한 시도를 해 봅시다.
            - 예① : 미션3에서 생성한 모델을 이용한 앙상블
            - 예② : Random Jungle!(Random Forest 여러 개로 예측하는 모델)

* 파이프라인 구축
    * 테스트 데이터를 입력하면, 전처리 + 예측 예측결과가 나오도록 파이프라인을 구성합니다.

* 성능 가이드
    * Accuracy : 0.97 ~


## 1.환경설정

* 세부 요구사항
    - 경로 설정 : 로컬 수행(Ananconda)
        * 제공된 압축파일을 다운받아 압축을 풀고
        * anaconda의 root directory(보통 C:/Users/< ID > 에 project3_1 폴더를 만들고, 복사해 넣습니다.
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
        * 필요하다고 판단되는 라이브러리를 추가하세요.


### (1) 라이브러리 로딩

In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

import joblib
from sklearn.ensemble import StackingClassifier, VotingClassifier

# 필요한 라이브러리, 함수 로딩 ------------------
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import *


* 제공 함수 생성
    * 변수 중요도를 시각화할 수 있는 함수를 제공합니다.
    * 입력 :
        * importance : 트리모델의 변수 중요도(예: model.feature_importances_)
        * names : 변수 이름 목록(예 : x_train.columns
        * result_only  : 변수 중요도 순으로 데이터프레임만 return할지, 그래프도 포함할지 결정. False이면 결과 데이터프레임 + 그래프
        * topn : 중요도 상위 n개만 표시. all 이면 전체.
    * 출력 :
        * 중요도 그래프 : 중요도 내림차순으로 정렬
        * 중요도 데이터프레임 : 중요도 내림차순으로 정렬

In [None]:
# 변수의 특성 중요도 계산하기
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    feature_importance = np.array(importance)
    feature_name = np.array(names)

    data={'feature_name':feature_name,'feature_importance':feature_importance}
    fi_temp = pd.DataFrame(data)

    #변수의 특성 중요도 순으로 정렬하기
    fi_temp.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    fi_temp.reset_index(drop=True, inplace = True)

    if topn == 'all' :
        fi_df = fi_temp.copy()
    else :
        fi_df = fi_temp.iloc[:topn]

    #변수의 특성 중요도 그래프로 그리기
    if result_only == False :
        plt.figure(figsize=(10,20))
        sns.barplot(x='feature_importance', y='feature_name', data = fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
    * data01_test.csv : 테스트용
    
* 세부 요구사항
    * 칼럼 삭제 : data01_train.csv와 data01_test.csv 에서 'subject' 칼럼은 불필요하므로 삭제합니다.

#### 1) 데이터로딩

In [114]:
file1 = 'data01_train.csv'
file2 = 'data01_test.csv'

In [116]:
data01 = pd.read_csv(file1)
test = pd.read_csv(file2)

In [117]:
# 불필요한 칼럼 삭제
data01.drop('subject', axis=1, inplace=True)
test.drop('subject', axis=1, inplace=True)

#### 2) 기본 정보 조회

In [120]:
#전체 데이터의 행,열 개수 확인
data.shape

(5881, 562)

In [122]:
#전체 데이터의 상위 5개 행 확인
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


In [124]:
#전체 데이터의 수치형 변수 분포 확인
data.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
count,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,...,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0
mean,0.274811,-0.017799,-0.109396,-0.603138,-0.509815,-0.604058,-0.628151,-0.525944,-0.605374,-0.46549,...,0.126955,-0.305883,-0.623548,0.008524,-0.001185,0.00934,-0.007099,-0.491501,0.059299,-0.054594
std,0.067614,0.039422,0.058373,0.448807,0.501815,0.417319,0.424345,0.485115,0.413043,0.544995,...,0.249176,0.322808,0.310371,0.33973,0.447197,0.60819,0.476738,0.509069,0.29734,0.278479
min,-0.503823,-0.684893,-1.0,-1.0,-0.999844,-0.999667,-1.0,-0.999419,-1.0,-1.0,...,-0.965725,-0.979261,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-0.980143
25%,0.262919,-0.024877,-0.121051,-0.992774,-0.97768,-0.980127,-0.993602,-0.977865,-0.980112,-0.936067,...,-0.02161,-0.541969,-0.845985,-0.122361,-0.294369,-0.481718,-0.373345,-0.811397,-0.018203,-0.141555
50%,0.277154,-0.017221,-0.108781,-0.943933,-0.844575,-0.856352,-0.948501,-0.849266,-0.849896,-0.878729,...,0.133887,-0.342923,-0.712677,0.010278,0.005146,0.011448,-0.000847,-0.709441,0.182893,0.003951
75%,0.288526,-0.01092,-0.098163,-0.24213,-0.034499,-0.26269,-0.291138,-0.068857,-0.268539,-0.01369,...,0.288944,-0.127371,-0.501158,0.154985,0.28503,0.499857,0.356236,-0.51133,0.248435,0.111932
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.9467,0.989538,0.956845,1.0,1.0,0.998702,0.996078,0.977344,0.478157,1.0


In [126]:
#전체 데이터의 모든 변수 확인
data.columns

Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-skewness()', 'fBodyBodyGyroJerkMag-kurtosis()',
       'angle(tBodyAccMean,gravity)', 'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'Activity'],
      dtype='object', length=562)

## **2. 데이터 전처리**

* 가변수화, 데이터 분할, NaN 확인 및 조치, 스케일링 등 필요한 전처리를 수행한다.


### (1) 데이터 분할1 : x, y

* 세부 요구사항
    - x, y로 분할합니다.

In [128]:
target = 'Activity'
x = data01.drop(target, axis = 1)
y = data01.loc[:, target]

x1 = test.drop(target, axis = 1)
y1 = test.loc[:, target]

### (2) 데이터분할2 : train, validation

* 세부 요구사항
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [130]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2,random_state = 1)
x_test, x_val2, y_test, y_val2 = train_test_split(x1, y1, test_size = 0.2,random_state = 1)

### (3) 스케일링


* 세부 요구사항
    - 스케일링을 필요로 하는 알고리즘 사용을 위해서 코드 수행
    - min-max 방식 혹은 standard 방식 중 한가지 사용.

In [132]:
# 모듈 불러오기
from sklearn.preprocessing import MinMaxScaler,StandardScaler
# 정규화
scaler = MinMaxScaler()
col_names = list(x_train)
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

### (4) (옵션)KNN을 위한 전처리
* KNN은 대량의 데이터에 대한 거리 계산을 할때, 데이터를 연속적인 메모리 블록에 저장하는 것이 중요합니다.
* c_contiguous 속성을 요구

In [134]:
x_train = np.ascontiguousarray(x_train)
x_val = np.ascontiguousarray(x_val)

## **3.앙상블 모델링**
* Voting 혹은 Stacking 방식의 앙상블 모델을 구성하고 성능을 평가해 봅시다.


### (1) 모델 로딩하기

* 세부 요구사항
    * 2.기본 모델링 파일에서 저장한 모델을 로딩합니다.(joblib.load)


### (2) 앙상블 모델 생성


In [222]:
#knn
param1 = {'n_neighbors':[4],  'weights': ['distance'],'metric':['euclidean', 'manhattan']}   #홀수를 자주 사용하는 이유는 다수결 투표에서 동률을 피하기 위해서
knn = KNeighborsClassifier()                                       # 가중치 방식  # 거리 계산 방식
model_knn = GridSearchCV(knn, param1, cv=5, scoring='accuracy')
model_knn.fit(x_train, y_train)
y_pred = model_knn.predict(x_val)
print(f'가장 적합한 파라미터: {model_knn.best_params_}')
best_knn = model_knn.best_estimator_
print(accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print('Classification Report\n', classification_report(y_val, y_pred))

가장 적합한 파라미터: {'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'distance'}
0.9821580288870009
[[222   0   0   0   0   0]
 [  0 183  15   0   0   0]
 [  0   6 229   0   0   0]
 [  0   0   0 192   0   0]
 [  0   0   0   0 156   0]
 [  0   0   0   0   0 174]]
Classification Report
                     precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       222
           SITTING       0.97      0.92      0.95       198
          STANDING       0.94      0.97      0.96       235
           WALKING       1.00      1.00      1.00       192
WALKING_DOWNSTAIRS       1.00      1.00      1.00       156
  WALKING_UPSTAIRS       1.00      1.00      1.00       174

          accuracy                           0.98      1177
         macro avg       0.98      0.98      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



In [234]:
joblib.dump(best_knn,'model_knn(grid).pkl')

['model_knn(grid).pkl']

In [215]:
#logistic
param2 = {'C': [10], 'penalty': ['l2'], 'solver': ['newton-cg'] }
log = LogisticRegression(max_iter = 100)                
model_log = GridSearchCV(log, param2, cv=5, scoring='accuracy')
model_log.fit(x_train, y_train) # scaling 된 데이터로 학습해도 되지만 성능 비교를 위해 scaling되지 않은 데이터 부터 학습 
print(f'가장 적합한 파라미터: {model_log.best_params_}')
best_log = model_log.best_estimator_
y_pred = best_log.predict(x_val)
print(accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print('Classification Report\n' , classification_report(y_val, y_pred ))

가장 적합한 파라미터: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.983857264231096
[[222   0   0   0   0   0]
 [  0 192   6   0   0   0]
 [  0  11 224   0   0   0]
 [  0   0   0 192   0   0]
 [  0   0   0   2 154   0]
 [  0   0   0   0   0 174]]
Classification Report
                     precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       222
           SITTING       0.95      0.97      0.96       198
          STANDING       0.97      0.95      0.96       235
           WALKING       0.99      1.00      0.99       192
WALKING_DOWNSTAIRS       1.00      0.99      0.99       156
  WALKING_UPSTAIRS       1.00      1.00      1.00       174

          accuracy                           0.98      1177
         macro avg       0.98      0.99      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



In [236]:
joblib.dump(best_log,'model_log(grid).pkl')

['model_log(grid).pkl']

In [225]:
#SvM
param3 = {'C': [3],'kernel' : ['linear']}
svm = SVC()
model_SVM = GridSearchCV(svm, param3, cv=5, scoring='accuracy')
model_SVM.fit(x_train, y_train)
print(f'가장 적합한 파라미터: {model_SVM.best_params_}')
best_svm  = model_SVM.best_estimator_
y_pred = model_SVM.predict(x_val)
print(accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print('Classification Report  \n' , classification_report(y_val, y_pred ))

가장 적합한 파라미터: {'C': 3, 'kernel': 'linear'}
0.9830076465590484
[[222   0   0   0   0   0]
 [  0 193   5   0   0   0]
 [  0  15 220   0   0   0]
 [  0   0   0 192   0   0]
 [  0   0   0   0 156   0]
 [  0   0   0   0   0 174]]
Classification Report  
                     precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       222
           SITTING       0.93      0.97      0.95       198
          STANDING       0.98      0.94      0.96       235
           WALKING       1.00      1.00      1.00       192
WALKING_DOWNSTAIRS       1.00      1.00      1.00       156
  WALKING_UPSTAIRS       1.00      1.00      1.00       174

          accuracy                           0.98      1177
         macro avg       0.98      0.99      0.98      1177
      weighted avg       0.98      0.98      0.98      1177



In [245]:
joblib.dump(best_svm,'model_SVM(grid).pkl')

['model_SVM(grid).pkl']

In [267]:
m1 =joblib.load('minip_LGBM_chan_real111.pkl')
m1

In [269]:
m2 =joblib.load('model_SVM(grid).pkl')
m2

In [271]:
m3 = joblib.load('model_log(grid).pkl')
m3

In [273]:
m4 = joblib.load('model_knn(grid).pkl')
m4

In [292]:
estimators = [('lgbm', m1), ('SVM', m2), ('log', m3), ('knn', m4)]

In [294]:
hv_mode = VotingClassifier(estimators=estimators, voting='hard')
hv_mode.fit(x_train, y_train)

In [286]:
pred1 = hv_mode.predict(x_val)

In [301]:
print(accuracy_score(y_val, pred1))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, pred1))

0.9872557349192863
[[222   0   0   0   0   0]
 [  0 193   5   0   0   0]
 [  0  15 220   0   0   0]
 [  0   0   0 192   0   0]
 [  0   0   0   0 156   0]
 [  0   0   0   0   0 174]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       222
           SITTING       0.94      0.99      0.97       198
          STANDING       0.99      0.95      0.97       235
           WALKING       0.99      1.00      1.00       192
WALKING_DOWNSTAIRS       1.00      0.99      1.00       156
  WALKING_UPSTAIRS       1.00      1.00      1.00       174

          accuracy                           0.99      1177
         macro avg       0.99      0.99      0.99      1177
      weighted avg       0.99      0.99      0.99      1177



In [325]:
model_stack = StackingClassifier(estimators=estimators
                                    , final_estimator= KNeighborsClassifier())
model_stack.fit(x_train, y_train)

In [328]:
pred1 = hv_mode.predict(x_val)
print(accuracy_score(y_val, pred1))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, pred1))

0.9872557349192863
[[222   0   0   0   0   0]
 [  0 193   5   0   0   0]
 [  0  15 220   0   0   0]
 [  0   0   0 192   0   0]
 [  0   0   0   0 156   0]
 [  0   0   0   0   0 174]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       222
           SITTING       0.94      0.99      0.97       198
          STANDING       0.99      0.95      0.97       235
           WALKING       0.99      1.00      1.00       192
WALKING_DOWNSTAIRS       1.00      0.99      1.00       156
  WALKING_UPSTAIRS       1.00      1.00      1.00       174

          accuracy                           0.99      1177
         macro avg       0.99      0.99      0.99      1177
      weighted avg       0.99      0.99      0.99      1177



## 4.파이프라인 구성

### (1) 함수 만들기

### (2) test 셋으로 예측하고 평가하기