# Mini Project #2
---

스마트폰 수집 신호를 이용한 인간 행위 인식

---

## 패키지 불러오기

In [73]:
# 패키지 임포트
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import cross_val_score
import datetime

## 데이터셋 입력 및 라벨 분리

In [74]:
# 데이터 읽기 (5s 소요)
input_file_name = "data/train.csv"
df_train = pd.read_csv(input_file_name, header=0)

## 데이터셋 확인

In [75]:
df_train

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.030400,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.123520,-0.998245,-0.975300,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.995380,-0.967187,-0.978944,-0.996520,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.982750,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.016570,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.123320,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,0.299665,-0.057193,-0.181233,-0.195387,0.039905,0.077078,-0.282301,0.043616,0.060410,0.210795,...,-0.880324,-0.190437,0.829718,0.206972,-0.425619,-0.791883,0.238604,0.049819,30,WALKING_UPSTAIRS
7348,0.273853,-0.007749,-0.147468,-0.235309,0.004816,0.059280,-0.322552,-0.029456,0.080585,0.117440,...,-0.680744,0.064907,0.875679,-0.879033,0.400219,-0.771840,0.252676,0.050053,30,WALKING_UPSTAIRS
7349,0.273387,-0.017011,-0.045022,-0.218218,-0.103822,0.274533,-0.304515,-0.098913,0.332584,0.043999,...,-0.304029,0.052806,-0.266724,0.864404,0.701169,-0.779133,0.249145,0.040811,30,WALKING_UPSTAIRS
7350,0.289654,-0.018843,-0.158281,-0.219139,-0.111412,0.268893,-0.310487,-0.068200,0.319473,0.101702,...,-0.344314,-0.101360,0.700740,0.936674,-0.589479,-0.785181,0.246432,0.025339,30,WALKING_UPSTAIRS


In [76]:
df_train.duplicated().sum()

0

In [77]:
df_train.isna().sum()

tBodyAcc-mean()-X       0
tBodyAcc-mean()-Y       0
tBodyAcc-mean()-Z       0
tBodyAcc-std()-X        0
tBodyAcc-std()-Y        0
                       ..
angle(X,gravityMean)    0
angle(Y,gravityMean)    0
angle(Z,gravityMean)    0
subject                 0
Activity                0
Length: 563, dtype: int64

In [78]:
df_train['Activity'].unique()

array(['STANDING', 'SITTING', 'LAYING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS'], dtype=object)

In [79]:
# DataFrame 객체를 numpy 배열로 변환
np_data = df_train.to_numpy()

In [80]:
# 학습 셋 분리
x_train = np_data[:, :-1]

In [81]:
# 레이블 분리
y_train = np_data[:, -1]

## 데이터 전처리 함수 정의

In [82]:
# 필요없는 특성 제거
x_train = x_train[:, :-1]

In [83]:
# Z-score 정규화
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

In [84]:
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

## 데이터 학습

In [85]:
# 하이퍼파라미터 튜닝 함수 정의
from sklearn.model_selection import GridSearchCV

def tune_model(model, data, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=3, verbose=2, scoring='accuracy', n_jobs=-1)
    grid_search.fit(data, y_train)
    print(f"Best params for", model.__class__.__name__, ":", grid_search.best_params_)
    print("Best Cross-Validation Score:", grid_search.best_score_)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_


## 로그 및 모델 저장

In [86]:
def save_log(desc, con):
    with open('log/ML-P2.log', 'a') as f:
        f.write("\n\n" + desc + "\n")
        f.writelines('\n'.join(con))

## SVC

In [69]:
from sklearn.svm import SVC

In [87]:
# SVC 모델 학습
model = SVC(kernel='rbf')
model.fit(x_train, y_train)

In [88]:
# SVC 모델 교차 검증
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("SVC Cross-Validation Accuracy:", score.mean())

SVC Cross-Validation Accuracy: 0.9377087030464359


In [89]:
# SVC GridSearch : 7m 23s 소요
model = SVC()
param_grid = [
    {
        'kernel': ['rbf', 'linear', 'poly'],
        'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    }
]
model, params, score = tune_model(model, x_train, param_grid)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
Best params for SVC : {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-Validation Score: 0.9408342561830935


In [90]:
# SVC 모델 학습
model = SVC(C=0.1, gamma='scale', kernel='linear')
model.fit(x_train, y_train)

In [91]:
# SVC 모델 교차 검증
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("SVC Cross-Validation Accuracy:", score.mean())

SVC Cross-Validation Accuracy: 0.9483205782312923


## Random Forest

In [92]:
from sklearn.ensemble import RandomForestClassifier

In [93]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [94]:
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("Random Forest Cross-Validation Accuracy:", score.mean())

Random Forest Cross-Validation Accuracy: 0.9318616903283052


In [102]:
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [10, 20, 50]
}
model, params, score = tune_model(model, x_train, param_grid)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params for RandomForestClassifier : {'max_depth': 10, 'n_estimators': 200}
Best Cross-Validation Score: 0.9117264368007505


## KNN

In [100]:
from sklearn.neighbors import KNeighborsClassifier

In [103]:
# KNN GridSearch (3m 소요)
model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
model, params, score = tune_model(model, x_train, param_grid)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params for KNeighborsClassifier : {'n_neighbors': 10}
Best Cross-Validation Score: 0.879626197276136


## XGBoost

In [96]:
from xgboost import XGBClassifier

In [97]:
# XGB 모델 학습 (3m 14s 소요)
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [98]:
# XGB 모델 교차 검증 (14m 54s 소요)
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("XGB Cross-Validation Accuracy:", score.mean())

XGB Cross-Validation Accuracy: 0.933220940550133


## CatBoost

In [105]:
from catboost import CatBoostClassifier

In [111]:
# CatBoost
model = CatBoostClassifier(verbose=0)
param_grid = {
    'iterations': [100, 200], 
    'learning_rate': [0.1, 0.01]
}
catboost_best = tune_model(model, x_train, param_grid)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best params for CatBoostClassifier : {'iterations': 200, 'learning_rate': 0.1}
Best Cross-Validation Score: 0.9212477483853599


In [112]:
# CatBoost 모델 학습
model = CatBoostClassifier(iterations=200, learning_rate=0.1, verbose=0)
model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x16b35f260>

In [113]:
# CatBoost 모델 교차 검증
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("CatBoost Cross-Validation Accuracy:", score.mean())

CatBoost Cross-Validation Accuracy: 0.9344448757763975


## SDG Classifier

In [117]:
from sklearn.linear_model import SGDClassifier

In [118]:
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(x_train, y_train)

In [123]:
# SGD 모델 교차 검증 (원본 데이터 사용) (27s 소요)
score = cross_val_score(sgd_clf, x_train, y_train, cv=10, scoring="accuracy").mean()
print("SGD Cross-Validation Accuracy:", score.mean())

SGD Cross-Validation Accuracy: 0.9288694173321502


## Logistic Regression

In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

In [127]:
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("Logistic Regression Cross-Validation Accuracy:", score.mean())

Logistic Regression Cross-Validation Accuracy: 0.9450547175391895


In [129]:
model = LogisticRegression(max_iter=1000)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
model, params, score = tune_model(model, x_train, param_grid)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
Best params for LogisticRegression : {'C': 1}
Best Cross-Validation Score: 0.939202546787789
