#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

* 나만의 앙상블 모델 만들기.
    * 1. 다양한 종류의 모델을 생성(미션3에서 생성한 모델 활용)
    * 2. 각 모델을 통합하여 예측하는 앙상블 모델링
        * Voting, Stacking
        * 앙상블을 위한 다양한 시도를 해 봅시다.
            - 예① : 미션3에서 생성한 모델을 이용한 앙상블
            - 예② : Random Jungle!(Random Forest 여러 개로 예측하는 모델)

* 파이프라인 구축
    * 테스트 데이터를 입력하면, 전처리 + 예측을 거쳐 예측결과가 나오도록 파이프라인을 구성합니다.

* 성능 가이드
    * Accuracy : 0.97 ~


## 1.환경설정

* 세부 요구사항
    - 경로 설정 : 로컬 수행(Anaconda)
        * 제공된 압축파일을 다운받아 압축을 풀고
        * anaconda의 root directory(보통 C:/Users/< ID >)에 project3_1 폴더를 만들고, 복사해 넣습니다.
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
        * 필요하다고 판단되는 라이브러리를 추가하세요.


### (1) 라이브러리 로딩

In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

import joblib
from sklearn.ensemble import StackingClassifier, VotingClassifier

# 필요한 라이브러리, 함수 로딩 ------------------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import *

* 제공 함수 생성
    * 변수 중요도를 시각화할 수 있는 함수를 제공합니다.
    * 입력 :
        * importance : 트리모델의 변수 중요도(예: model.feature_importances_)
        * names : 변수 이름 목록(예 : x_train.columns)
        * result_only  : 변수 중요도 순으로 데이터프레임만 return할지, 그래프도 포함할지 결정. False이면 결과 데이터프레임 + 그래프
        * topn : 중요도 상위 n개만 표시. all 이면 전체.
    * 출력 :
        * 중요도 그래프 : 중요도 내림차순으로 정렬
        * 중요도 데이터프레임 : 중요도 내림차순으로 정렬

In [4]:
# Rank features by importance and optionally plot the ranking
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    """Return features ranked by importance, optionally drawing a bar chart.

    Args:
        importance: Importance score per feature
            (e.g. ``model.feature_importances_``).
        names: Feature names, in the same order as ``importance``
            (e.g. ``x_train.columns``).
        result_only: When this compares equal to ``False`` a horizontal bar
            chart is drawn on a new figure in addition to returning the
            table; otherwise only the table is returned.
        topn: ``'all'`` keeps every feature; any other value is used as the
            number of top-ranked rows to keep.

    Returns:
        DataFrame with columns ``feature_name`` and ``feature_importance``,
        sorted by importance in descending order with a fresh 0-based index.
    """
    scores = np.array(importance)
    labels = np.array(names)

    # Build the table, order it from most to least important, and renumber.
    ranked = (
        pd.DataFrame({'feature_name': labels, 'feature_importance': scores})
        .sort_values(by=['feature_importance'], ascending=False)
        .reset_index(drop=True)
    )

    # 'all' keeps the whole ranking; anything else is treated as a row count.
    fi_df = ranked.copy() if topn == 'all' else ranked.iloc[:topn]

    # Explicit comparison with False: only values equal to False trigger plotting.
    if result_only == False:
        plt.figure(figsize=(10, 20))
        sns.barplot(x='feature_importance', y='feature_name', data=fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
    * data01_test.csv : 테스트용
    
* 세부 요구사항
    * 칼럼 삭제 : data01_train.csv와 data01_test.csv 에서 'subject' 칼럼은 불필요하므로 삭제합니다.

#### 1) 데이터로딩

In [6]:
# Input CSVs: file1 holds the train/validation data, file2 the test data.
file1 = 'data01_train.csv'
file2 = 'data01_test.csv'

In [8]:
# Load both CSVs into DataFrames.
data = pd.read_csv(file1)
test = pd.read_csv(file2)

In [10]:
# Drop the unneeded 'subject' column from both frames (in place)
data.drop('subject', axis=1, inplace=True)
test.drop('subject', axis=1, inplace=True)

In [43]:
# Strip special characters from column names; they trigger a JSON error
# when the data is used with LGBMClassifier.
import re


def _clean_column_name(col):
    """Return `col` with every run of characters outside A-Z, a-z, 0-9, _ removed."""
    return re.sub('[^A-Za-z0-9_]+', '', col)


# Apply the same cleaning to both frames so their column names stay aligned.
data = data.rename(columns=_clean_column_name)
test = test.rename(columns=_clean_column_name)

#### 2) 기본 정보 조회

In [45]:
# Check the number of rows and columns of the full dataset
data.shape

(5881, 562)

In [47]:
# Show the first 5 rows of the full dataset
data.head()

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyAccstdX,tBodyAccstdY,tBodyAccstdZ,tBodyAccmadX,tBodyAccmadY,tBodyAccmadZ,tBodyAccmaxX,...,fBodyBodyGyroJerkMagskewness,fBodyBodyGyroJerkMagkurtosis,angletBodyAccMeangravity,angletBodyAccJerkMeangravityMean,angletBodyGyroMeangravityMean,angletBodyGyroJerkMeangravityMean,angleXgravityMean,angleYgravityMean,angleZgravityMean,Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


In [49]:
# Summarise the distribution of the numeric columns of the full dataset
data.describe()

Unnamed: 0,tBodyAccmeanX,tBodyAccmeanY,tBodyAccmeanZ,tBodyAccstdX,tBodyAccstdY,tBodyAccstdZ,tBodyAccmadX,tBodyAccmadY,tBodyAccmadZ,tBodyAccmaxX,...,fBodyBodyGyroJerkMagmeanFreq,fBodyBodyGyroJerkMagskewness,fBodyBodyGyroJerkMagkurtosis,angletBodyAccMeangravity,angletBodyAccJerkMeangravityMean,angletBodyGyroMeangravityMean,angletBodyGyroJerkMeangravityMean,angleXgravityMean,angleYgravityMean,angleZgravityMean
count,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,...,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0
mean,0.274811,-0.017799,-0.109396,-0.603138,-0.509815,-0.604058,-0.628151,-0.525944,-0.605374,-0.46549,...,0.126955,-0.305883,-0.623548,0.008524,-0.001185,0.00934,-0.007099,-0.491501,0.059299,-0.054594
std,0.067614,0.039422,0.058373,0.448807,0.501815,0.417319,0.424345,0.485115,0.413043,0.544995,...,0.249176,0.322808,0.310371,0.33973,0.447197,0.60819,0.476738,0.509069,0.29734,0.278479
min,-0.503823,-0.684893,-1.0,-1.0,-0.999844,-0.999667,-1.0,-0.999419,-1.0,-1.0,...,-0.965725,-0.979261,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-0.980143
25%,0.262919,-0.024877,-0.121051,-0.992774,-0.97768,-0.980127,-0.993602,-0.977865,-0.980112,-0.936067,...,-0.02161,-0.541969,-0.845985,-0.122361,-0.294369,-0.481718,-0.373345,-0.811397,-0.018203,-0.141555
50%,0.277154,-0.017221,-0.108781,-0.943933,-0.844575,-0.856352,-0.948501,-0.849266,-0.849896,-0.878729,...,0.133887,-0.342923,-0.712677,0.010278,0.005146,0.011448,-0.000847,-0.709441,0.182893,0.003951
75%,0.288526,-0.01092,-0.098163,-0.24213,-0.034499,-0.26269,-0.291138,-0.068857,-0.268539,-0.01369,...,0.288944,-0.127371,-0.501158,0.154985,0.28503,0.499857,0.356236,-0.51133,0.248435,0.111932
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.9467,0.989538,0.956845,1.0,1.0,0.998702,0.996078,0.977344,0.478157,1.0


In [51]:
# List all columns of the full dataset
data.columns

Index(['tBodyAccmeanX', 'tBodyAccmeanY', 'tBodyAccmeanZ', 'tBodyAccstdX',
       'tBodyAccstdY', 'tBodyAccstdZ', 'tBodyAccmadX', 'tBodyAccmadY',
       'tBodyAccmadZ', 'tBodyAccmaxX',
       ...
       'fBodyBodyGyroJerkMagskewness', 'fBodyBodyGyroJerkMagkurtosis',
       'angletBodyAccMeangravity', 'angletBodyAccJerkMeangravityMean',
       'angletBodyGyroMeangravityMean', 'angletBodyGyroJerkMeangravityMean',
       'angleXgravityMean', 'angleYgravityMean', 'angleZgravityMean',
       'Activity'],
      dtype='object', length=562)

## **2. 데이터 전처리**

* 가변수화, 데이터 분할, NaN 확인 및 조치, 스케일링 등 필요한 전처리를 수행한다.


### (1) 데이터 분할1 : x, y

* 세부 요구사항
    - x, y로 분할합니다.

In [53]:
# Name of the label column.
target = 'Activity'

# Train/validation pool: every column except the label vs. the label itself.
x, y = data.drop(columns=target), data.loc[:, target]

# Test set, separated into features and label the same way.
x_test, y_test = test.drop(columns=target), test.loc[:, target]

### (2) 데이터분할2 : train, validation

* 세부 요구사항
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [55]:
# Hold out 30% of the rows for validation; random_state is fixed so the
# split (and therefore the reported scores) can be reproduced.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=1)

### (3) 스케일링


* 세부 요구사항
    - 스케일링을 필요로 하는 알고리즘 사용을 위해서 코드 수행
    - min-max 방식 혹은 standard 방식 중 한가지 사용.

In [57]:
# Fit the min-max scaler on the training split only, then reuse the fitted
# scaler to transform the validation and test features.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

### (4) (옵션)KNN을 위한 전처리
* KNN은 대량의 데이터에 대한 거리 계산을 할때, 데이터를 연속적인 메모리 블록에 저장하는 것이 중요합니다.
* c_contiguous 속성을 요구

In [59]:
# KNN needs C-contiguous input for its distance computations, so convert
# every feature matrix that reaches a KNN-based model. x_test is included
# because both ensembles (which contain the KNN model) also predict on it.
x_train = np.ascontiguousarray(x_train)
x_val = np.ascontiguousarray(x_val)
x_test = np.ascontiguousarray(x_test)

## **3.앙상블 모델링**
* Voting 혹은 Stacking 방식의 앙상블 모델을 구성하고 성능을 평가해 봅시다.


### (1) 모델 로딩하기

* 세부 요구사항
    * 2.기본 모델링 파일에서 저장한 모델을 로딩합니다.(joblib.load)


In [61]:
# Load the four base models saved earlier with joblib.
# NOTE(review): joblib.load unpickles the file contents, so these .pkl files
# must come from a trusted source.
model_knn = joblib.load('model_knn.pkl')
model_svm = joblib.load('model_svm.pkl')
model_lr = joblib.load('model_lr.pkl')
model_lgbm = joblib.load('model_lgbm.pkl')

In [67]:
# (name, estimator) pairs passed as `estimators` to the voting and stacking ensembles.
base_model = [('knn', model_knn), ('svm', model_svm), ('lr', model_lr), ('lgbm', model_lgbm)]

### (2)-1 앙상블 모델 생성


In [71]:
# Declare the hard-voting ensemble over the base models
hv_mode = VotingClassifier(estimators=base_model, voting='hard')

In [74]:
# Train
# NOTE(review): presumably this refits the base estimators on x_train instead of
# reusing the fitted state of the loaded models — confirm against the scikit-learn docs.
hv_mode.fit(x_train, y_train)

In [79]:
# Predict on the validation split
y_pred = hv_mode.predict(x_val)

In [86]:
# Evaluate on the validation split: accuracy, confusion matrix, per-class report
print(accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

0.9886685552407932
[[331   0   0   0   0   0]
 [  0 287   5   0   0   0]
 [  0  14 317   0   0   0]
 [  0   0   0 297   0   0]
 [  0   0   0   0 239   0]
 [  0   0   0   1   0 274]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       331
           SITTING       0.95      0.98      0.97       292
          STANDING       0.98      0.96      0.97       331
           WALKING       1.00      1.00      1.00       297
WALKING_DOWNSTAIRS       1.00      1.00      1.00       239
  WALKING_UPSTAIRS       1.00      1.00      1.00       275

          accuracy                           0.99      1765
         macro avg       0.99      0.99      0.99      1765
      weighted avg       0.99      0.99      0.99      1765



In [133]:
# Predict on the test data
y_pred2 = hv_mode.predict(x_test)

In [135]:
# Evaluate on the test data: accuracy, confusion matrix, per-class report
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

0.9857239972807614
[[292   0   0   0   0   0]
 [  0 247   7   0   0   0]
 [  0  13 274   0   0   0]
 [  0   0   0 228   0   0]
 [  0   0   0   0 195   0]
 [  0   0   0   1   0 214]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       292
           SITTING       0.95      0.97      0.96       254
          STANDING       0.98      0.95      0.96       287
           WALKING       1.00      1.00      1.00       228
WALKING_DOWNSTAIRS       1.00      1.00      1.00       195
  WALKING_UPSTAIRS       1.00      1.00      1.00       215

          accuracy                           0.99      1471
         macro avg       0.99      0.99      0.99      1471
      weighted avg       0.99      0.99      0.99      1471



### (2)-2 앙상블2 : Stacking

In [114]:
# Declare the stacking ensemble over the base models, with LogisticRegression as the final estimator
model_stack = StackingClassifier(estimators=base_model, final_estimator= LogisticRegression())

In [116]:
# Train
model_stack.fit(x_train, y_train)

In [137]:
# Predict on the validation split
y_pred3 = model_stack.predict(x_val)

In [141]:
# Evaluate on the validation split: accuracy, confusion matrix, per-class report
print(accuracy_score(y_val, y_pred3))
print(confusion_matrix(y_val, y_pred3))
print(classification_report(y_val, y_pred3))

0.9943342776203966
[[331   0   0   0   0   0]
 [  0 288   4   0   0   0]
 [  0   6 325   0   0   0]
 [  0   0   0 297   0   0]
 [  0   0   0   0 239   0]
 [  0   0   0   0   0 275]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       331
           SITTING       0.98      0.99      0.98       292
          STANDING       0.99      0.98      0.98       331
           WALKING       1.00      1.00      1.00       297
WALKING_DOWNSTAIRS       1.00      1.00      1.00       239
  WALKING_UPSTAIRS       1.00      1.00      1.00       275

          accuracy                           0.99      1765
         macro avg       0.99      0.99      0.99      1765
      weighted avg       0.99      0.99      0.99      1765



In [143]:
# Predict on the test data
y_pred4 = model_stack.predict(x_test)

In [145]:
# Evaluate on the test data: accuracy, confusion matrix, per-class report
print(accuracy_score(y_test, y_pred4))
print(confusion_matrix(y_test, y_pred4))
print(classification_report(y_test, y_pred4))

0.9925220938137321
[[292   0   0   0   0   0]
 [  0 246   8   0   0   0]
 [  0   3 284   0   0   0]
 [  0   0   0 228   0   0]
 [  0   0   0   0 195   0]
 [  0   0   0   0   0 215]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       292
           SITTING       0.99      0.97      0.98       254
          STANDING       0.97      0.99      0.98       287
           WALKING       1.00      1.00      1.00       228
WALKING_DOWNSTAIRS       1.00      1.00      1.00       195
  WALKING_UPSTAIRS       1.00      1.00      1.00       215

          accuracy                           0.99      1471
         macro avg       0.99      0.99      0.99      1471
      weighted avg       0.99      0.99      0.99      1471



## 앙상블 결과

In [156]:
# Summary of both ensembles: accuracy and confusion matrix on the validation
# split (the 'data ...' rows, scored against y_val) and on the test set.
_summary = [
    ('data hard:', y_val, y_pred),
    ('test hard:', y_test, y_pred2),
    ('data stacking:', y_val, y_pred3),
    ('test stacking:', y_test, y_pred4),
]

for _i, (_label, _truth, _pred) in enumerate(_summary):
    # Separator line between consecutive reports (not before the first one).
    if _i > 0:
        print('='*30)
    print(_label, accuracy_score(_truth, _pred))
    print(confusion_matrix(_truth, _pred))

data hard: 0.9886685552407932
[[331   0   0   0   0   0]
 [  0 287   5   0   0   0]
 [  0  14 317   0   0   0]
 [  0   0   0 297   0   0]
 [  0   0   0   0 239   0]
 [  0   0   0   1   0 274]]
test hard: 0.9857239972807614
[[292   0   0   0   0   0]
 [  0 247   7   0   0   0]
 [  0  13 274   0   0   0]
 [  0   0   0 228   0   0]
 [  0   0   0   0 195   0]
 [  0   0   0   1   0 214]]
data stacking: 0.9943342776203966
[[331   0   0   0   0   0]
 [  0 288   4   0   0   0]
 [  0   6 325   0   0   0]
 [  0   0   0 297   0   0]
 [  0   0   0   0 239   0]
 [  0   0   0   0   0 275]]
test stacking: 0.9925220938137321
[[292   0   0   0   0   0]
 [  0 246   8   0   0   0]
 [  0   3 284   0   0   0]
 [  0   0   0 228   0   0]
 [  0   0   0   0 195   0]
 [  0   0   0   0   0 215]]


## 4.파이프라인 구성

### (1) 함수 만들기

### (2) test 셋으로 예측하고 평가하기