# Mini Project #2
---

스마트폰 수집 신호를 이용한 인간 행위 인식

---

## 데이터셋 입력 및 라벨 분리

In [1]:
# 패키지 임포트
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
import datetime

In [3]:
# 데이터 읽기 (5s 소요)
input_file_name = "data/train.csv"
df_train = pd.read_csv(input_file_name, header=0)

In [4]:
df_train

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.030400,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.123520,-0.998245,-0.975300,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.995380,-0.967187,-0.978944,-0.996520,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.982750,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.016570,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.123320,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,0.299665,-0.057193,-0.181233,-0.195387,0.039905,0.077078,-0.282301,0.043616,0.060410,0.210795,...,-0.880324,-0.190437,0.829718,0.206972,-0.425619,-0.791883,0.238604,0.049819,30,WALKING_UPSTAIRS
7348,0.273853,-0.007749,-0.147468,-0.235309,0.004816,0.059280,-0.322552,-0.029456,0.080585,0.117440,...,-0.680744,0.064907,0.875679,-0.879033,0.400219,-0.771840,0.252676,0.050053,30,WALKING_UPSTAIRS
7349,0.273387,-0.017011,-0.045022,-0.218218,-0.103822,0.274533,-0.304515,-0.098913,0.332584,0.043999,...,-0.304029,0.052806,-0.266724,0.864404,0.701169,-0.779133,0.249145,0.040811,30,WALKING_UPSTAIRS
7350,0.289654,-0.018843,-0.158281,-0.219139,-0.111412,0.268893,-0.310487,-0.068200,0.319473,0.101702,...,-0.344314,-0.101360,0.700740,0.936674,-0.589479,-0.785181,0.246432,0.025339,30,WALKING_UPSTAIRS


In [3]:
# DataFrame 객체를 numpy 배열로 변환
np_data = df_train.to_numpy()

In [4]:
# 학습 셋 분리
x_train = np_data[:, 1:]

In [5]:
# 타겟 분리
y_train = np_data[:, 0]

## 데이터 전처리 함수 정의

In [6]:
from sklearn.preprocessing import FunctionTransformer

In [7]:
# 정규화
def normalize_func(data):
    return data / 255.0

normalize = FunctionTransformer(normalize_func)

In [8]:
# Zero-centering
def zeroCentering_func(data):
    mean = np.mean(data, axis=0)
    return data - mean

zeroCentering = FunctionTransformer(zeroCentering_func)

In [9]:
# HOG 특징 추출 (36s 소요)
from skimage.feature import hog

def hog_extract_func(data):
    hog_features = []
    for img in data:
        img_reshaped = np.reshape(img, (28, 28)).T
        features = hog(img_reshaped, orientations=8, pixels_per_cell=(7, 7), cells_per_block=(1, 1))
        hog_features.append(features)
    return np.array(hog_features)

hog_extract = FunctionTransformer(hog_extract_func)

In [10]:
# PCA 적용 (9s 소요)
from sklearn.decomposition import PCA

def pca_process_func(data):
    pca = PCA(n_components=50)
    return pca.fit_transform(data)

pca_process = FunctionTransformer(pca_process_func)

In [11]:
# 데이터 전처리 (Zero Centering)
Z_data = make_pipeline(
    zeroCentering
).fit_transform(x_train)

In [12]:
# 데이터 전처리 (Zero Centering, HOG) (39s 소요)
ZH_data = make_pipeline(
    zeroCentering, 
    hog_extract
).fit_transform(x_train)

In [13]:
# 데이터 전처리 (Zero Centering, PCA) (15s 소요)
ZP_data = make_pipeline(
    zeroCentering,
    pca_process
).fit_transform(x_train)

## 데이터 학습

In [14]:
# 하이퍼파라미터 튜닝 함수 정의
from sklearn.model_selection import GridSearchCV

def tune_model(model, data, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=3, verbose=2, scoring='accuracy', n_jobs=-1)
    grid_search.fit(data, y_train)
    print(f"Best params for", model.__class__.__name__, ":", grid_search.best_params_)
    print("Best Cross-Validation Score:", grid_search.best_score_)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_


## 로그 및 모델 저장

In [15]:
def save_log(desc, con):
    with open('log/ML-P1.log', 'a') as f:
        f.write("\n\n" + desc + "\n")
        f.writelines('\n'.join(con))

In [16]:
def save_model(model, filename):
    joblib.dump(model, filename)

## SVC

In [20]:
from sklearn.svm import SVC

#### Zero Centering, HOG 데이터 사용
---

In [None]:
# SVC GridSearch (23h 3m 소요)

# 결과
# 최적 파라미터 : c = 10 gamma = 0.1
# 최고 스코어 : 0.9846708333333332

model = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}
model, params, score = tune_model(model, ZH_data, param_grid)

save_model(model, 'models/svc_ZH_gs.joblib')
save_log(f'SVC ZH GridSearch {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : " + str(params), "score : " + str(score)])

In [38]:
# SVM 모델 학습 (10m 소요)
model = SVC(kernel='rbf', C=10, gamma=0.1)
model.fit(ZH_data, y_train)

joblib.dump(model, 'models/svc_ZH_c#10_gamma#0.1.joblib')

['models/svc_ZH_c#10_gamma#0.1.joblib']

In [39]:
# SVM 모델 교차 검증 (13m 36s 소요)
# 스코어 : 0.9851374999999999

model = joblib.load("models/svc_ZH_c#10_gamma#0.1.joblib")
score = cross_val_score(model, ZH_data, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("SVC Cross-Validation Accuracy:", score.mean())
save_log(f'SVC ZH Cross-Validation {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : c = 10, gamma = 0.1", "score : " + str(score.mean())])

SVC Cross-Validation Accuracy: 0.9785208333333333


## KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

#### Zero Centering, PCA 데이터 사용
---

In [23]:
# KNN GridSearch (3m 소요)
# 최적 파라미터 : n_neighbors = 5
# 최고 스코어 : 0.9842583333333333

model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
model, params, score = tune_model(model, ZP_data, param_grid)

save_model(model, 'models/knn_ZP_gs.joblib')
save_log(f'KNN ZP GridSearch {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : " + str(params), "score : " + str(score)])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best params for KNeighborsClassifier : {'n_neighbors': 3}
Best Cross-Validation Score: 0.9846083333333334


In [20]:
# KNN 모델 학습 
model = KNeighborsClassifier(n_neighbors=3)
model.fit(ZP_data, y_train)

joblib.dump(model, 'models/knn_ZP_n#3.joblib')

['models/knn_ZP_n#3.joblib']

In [35]:
# KNN 모델 교차 검증 (31s 소요) - 0.9863
model = joblib.load("models/knn_ZP_n#3.joblib")
score = cross_val_score(model, ZP_data, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("KNN Cross-Validation Accuracy:", score.mean())
save_log(f'KNN ZP Cross-Validation {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : n = 3", "score : " + str(score.mean())])

KNN Cross-Validation Accuracy: 0.9863


## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

#### Zero Centering 데이터 사용
---

In [None]:
# Random Forest GridSearch (1h 30m 소요)

model = RandomForestClassifier()
param_grid = {
    'n_estimators': [500, 700, 1000],
    'max_depth': [50, 100, 500]
}

model, params, score = tune_model(model, Z_data, param_grid)

joblib.dump(model, 'models/randomForest_Z_gs.joblib')
save_log(f'Random Forest Z GridSearch {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : " + str(params), "score : " + str(score)])

Fitting 3 folds for each of 9 candidates, totalling 27 fits


#### Zero Centering, PCA 데이터 사용
---

In [27]:
# Random Forest GridSearch (PCA 데이터 사용) (30m 소요)

# 결과
# 최적 파라미터 : ??
# 최고 스코어 : ??

model = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [10, 20, 50]
}

model, params, score = tune_model(model, ZP_data, param_grid)

joblib.dump(model, 'models/randomForest_ZP_gs.joblib')
save_log(f'Random Forest ZP GridSearch {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : " + str(params), "score : " + str(score)])

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END .....................max_depth=10, n_estimators=200; total time= 3.9min
[CV] END .....................max_depth=20, n_estimators=100; total time= 3.3min




[CV] END .....................max_depth=10, n_estimators=200; total time= 3.9min
[CV] END .....................max_depth=20, n_estimators=100; total time= 3.3min
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.9min
[CV] END ......................max_depth=20, n_estimators=50; total time= 1.5min
[CV] END ......................max_depth=20, n_estimators=50; total time= 1.6min
[CV] END .....................max_depth=20, n_estimators=100; total time= 3.1min
[CV] END .....................max_depth=10, n_estimators=200; total time= 3.9min
[CV] END ......................max_depth=20, n_estimators=50; total time= 1.6min
[CV] END .....................max_depth=20, n_estimators=200; total time= 5.8min
[CV] END .....................max_depth=10, n_estimators=300; total time= 6.1min
[CV] END .....................max_depth=20, n_estimators=200; total time= 5.6min
[CV] END ......................max_depth=10, n_estimators=50; total time= 1.0min
[CV] END ...................

In [23]:
# Random Forest 모델 학습 (PCA 데이터 사용) (1m 30s 소요)
model = RandomForestClassifier(n_estimators=100, max_depth=10)
model.fit(ZP_data, y_train)

joblib.dump(model, 'models/randomForest_ZP_n#100_d#10.joblib')

['models/randomForest_ZP_n#100_d#10.joblib']

In [24]:
# Random Forest 모델 교차 검증 (PCA 데이터 사용) (1m 30s 소요) - 0.9863
model = joblib.load("models/randomForest_ZP_n#100_d#10.joblib")
score = cross_val_score(model, ZP_data, y_train, cv=10, scoring='accuracy', verbose=0, n_jobs=-1)
print("Random Forest Cross-Validation Accuracy:", score.mean())
save_log(f'Random Forest ZP Cross-Validation {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : n = 100, d = 10", "score : " + str(score.mean())])

Random Forest Cross-Validation Accuracy: 0.9253041666666666


## XGBoost

In [17]:
from xgboost import XGBClassifier

#### Zero Centering 데이터 사용
---

In [None]:
# XGB GridSearch (1d+a 소요)
# 최적 파라미터 : ???
# 최고 스코어 : ???

model = XGBClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

model, params, score = tune_model(model, Z_data, param_grid)

joblib.dump(model, 'models/xgb_Z_gs.joblib')
save_log(f'XGB Z GridSearch {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : " + str(params), "score : " + str(score)])

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [None]:
# XGB 모델 교차 검증 (29m 2s 소요) - 0.9877916666666667
xgb_model_test = joblib.load("models/xgb_Z_gs.joblib")
xgb_cv_scores, xgb_best_params, xgb_best_score = cross_val_score(xgb_model_test, x_train, y_train, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
print("XGB Z Cross-Validation Accuracy:", xgb_cv_scores.mean())

In [42]:
# XGB 모델 학습 (3m 14s 소요)
xgb = XGBClassifier()
xgb.fit(Z_data, y_train)

joblib.dump(xgb, 'models/xgb_Z_default.joblib')

['models/xgb_Z_default.joblib']

In [45]:
# XGB 모델 교차 검증 (14m 54s 소요)
# 스코어 : 0.9863083333333335

model = joblib.load("models/xgb_Z_default.joblib")
score = cross_val_score(model, Z_data, y_train, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
print("XGB Cross-Validation Accuracy:", score.mean())
save_log(f'XGB Z Cross-Validation {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : default", "score : " + str(score.mean())])

XGB Cross-Validation Accuracy: 0.9863083333333335


## CatBoost

In [47]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost
model = CatBoostClassifier()
param_grid = {
    'iterations': [100, 200], 
    'learning_rate': [0.1, 0.01]
}
catboost_best = tune_model(model, Z_data, param_grid)

In [48]:
# CatBoost 모델 학습 (PCA 데이터 사용) (8m 소요)
model = CatBoostClassifier(iterations=200, learning_rate=0.1)
model.fit(x_train, y_train)

joblib.dump(model, 'models/catBoost_Z_i#200_lr#0.1.joblib')

0:	learn: 1.9288147	total: 2.75s	remaining: 9m 7s
1:	learn: 1.6956580	total: 5.31s	remaining: 8m 45s
2:	learn: 1.5339664	total: 7.42s	remaining: 8m 7s
3:	learn: 1.4112326	total: 9.25s	remaining: 7m 33s
4:	learn: 1.3030438	total: 11.1s	remaining: 7m 11s
5:	learn: 1.2183052	total: 12.8s	remaining: 6m 55s
6:	learn: 1.1365113	total: 14.7s	remaining: 6m 44s
7:	learn: 1.0671103	total: 16.5s	remaining: 6m 36s
8:	learn: 1.0045513	total: 18.3s	remaining: 6m 29s
9:	learn: 0.9533074	total: 20.4s	remaining: 6m 26s
10:	learn: 0.9026773	total: 22.2s	remaining: 6m 21s
11:	learn: 0.8565848	total: 24.1s	remaining: 6m 17s
12:	learn: 0.8215959	total: 26.4s	remaining: 6m 19s
13:	learn: 0.7827144	total: 28.3s	remaining: 6m 15s
14:	learn: 0.7502030	total: 30.3s	remaining: 6m 14s
15:	learn: 0.7170709	total: 32.1s	remaining: 6m 9s
16:	learn: 0.6875135	total: 33.9s	remaining: 6m 5s
17:	learn: 0.6638215	total: 36.1s	remaining: 6m 5s
18:	learn: 0.6372687	total: 38s	remaining: 6m 1s
19:	learn: 0.6173822	total: 40

['models/catBoost_Z_i#200_lr#0.1.joblib']

In [None]:
# CatBoost 모델 교차 검증 (PCA 데이터 사용) (?? 소요) - ??
model = joblib.load("models/catBoost_Z_i#200_lr#0.1.joblib")
score = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
print("CatBoost Cross-Validation Accuracy:", score.mean())
save_log(f'CatBoost Z Cross-Validation {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}', ["params : iterations = 200, learning_rate = 0.1", "score : " + str(score.mean())])

## SDG Classifier

In [1]:
# SGD 모델 학습 (원본 데이터 사용) (12s 소요)
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(x_train, y_train)

NameError: name 'x_train' is not defined

In [19]:
# SGD 모델 교차 검증 (원본 데이터 사용) (27s 소요)
sgd_cv_scores = cross_val_score(sgd_clf, x_train, y_train, cv=10, scoring="accuracy").mean()
print("SGD Cross-Validation Accuracy:", sgd_cv_scores.mean())

0.8844583333333335

In [None]:
from sklearn.model_selection import cross_val_score

# SVM 모델 교차 검증
svm_cv_scores = cross_val_score(SVC(kernel='rbf', C=10, gamma=0.01), X_train_hog, y_train, cv=5, scoring='accuracy')
print("SVM Cross-Validation Accuracy:", svm_cv_scores.mean())

# KNN 모델 교차 검증
knn_cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train_pca, y_train, cv=5, scoring='accuracy')
print("KNN Cross-Validation Accuracy:", knn_cv_scores.mean())

# 랜덤 포레스트 모델 교차 검증
rf_cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, max_depth=10), X_train_rf, y_train, cv=5, scoring='accuracy')
print("Random Forest Cross-Validation Accuracy:", rf_cv_scores.mean())

# 로지스틱 회귀 모델 교차 검증
lr_cv_scores = cross_val_score(LogisticRegression(max_iter=1000), X_train_scaled, y_train, cv=5, scoring='accuracy')
print("Logistic Regression Cross-Validation Accuracy:", lr_cv_scores.mean())

# 스태킹 모델 교차 검증
stacking_cv_scores = cross_val_score(stacking_clf, X_train_scaled, y_train, cv=5, scoring='accuracy')
print("Stacking Model Cross-Validation Accuracy:", stacking_cv_scores.mean())

In [None]:
# 개별 모델 정확도 평가
y_pred_svm = svm.predict(X_test_hog)
y_pred_knn = knn.predict(X_test_pca)
y_pred_rf = rf.predict(X_test_rf)
y_pred_lr = lr.predict(X_test_scaled)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

# 스태킹 모델 정확도 평가
y_pred_stack = stacking_clf.predict(X_test_scaled)
print("Stacking Model Accuracy:", accuracy_score(y_test, y_pred_stack))

In [None]:
# 개별 모델을 StackingClassifier에 넣음
estimators = [
    ('svm', SVC(kernel='rbf', C=10, gamma=0.01, probability=True)),  # SVM 모델
    ('knn', KNeighborsClassifier(n_neighbors=5)),                    # KNN 모델
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10)),   # 랜덤 포레스트
    ('lr', LogisticRegression(max_iter=1000))                         # 로지스틱 회귀
]

# 스태킹 모델 구성 (메타 모델: 로지스틱 회귀)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_clf.fit(X_train_scaled, y_train)

In [None]:
# 로지스틱 회귀 모델 학습
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)

In [None]:
# 랜덤 포레스트 학습에 맞춘 이미지 구역 분할
def split_image(data, num_splits=4):
    split_features = []
    for img in data:
        img_reshaped = img.reshape((28, 28))
        splits = []
        for i in range(0, 28, 14):  # 두 개의 행 블록으로 나눔
            for j in range(0, 28, 14):  # 두 개의 열 블록으로 나눔
                splits.append(np.mean(img_reshaped[i:i+14, j:j+14]))  # 각 블록의 평균값
        split_features.append(splits)
    return np.array(split_features)

X_train_rf = split_image(X_train_scaled)
X_test_rf = split_image(X_test_scaled)

# 랜덤 포레스트 모델 학습
rf = RandomForestClassifier(n_estimators=100, max_depth=10)
rf.fit(X_train_rf, y_train)