# Mini Project #1
---

EMNIST 데이터셋의 손글씨 숫자 이미지 분류기 개발

---

## 데이터셋 입력 및 라벨 분리

In [18]:
# 패키지 임포트
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import cross_val_score

In [2]:
# Load the EMNIST digits training set from CSV.
# header=None: the file has no header row, so the first line is read as data.
input_file_name = "data/emnist-digits-train.csv"
df_train = pd.read_csv(input_file_name, header=None)

In [3]:
# Convert the DataFrame to a NumPy array so columns can be sliced positionally.
np_data = df_train.to_numpy()

In [4]:
# Training features: every column except column 0.
# Each row is later reshaped to 28x28 in extract_hog_features, i.e. one flattened image per row.
x_train = np_data[:, 1:]

In [5]:
# Target labels: column 0 of the array.
y_train = np_data[:, 0]

## 데이터 전처리

In [None]:
# 픽셀 스케일링
# x_train = x_train / 255.0

In [None]:
# 라벨 One Hot 인코딩
# from sklearn.preprocessing import OneHotEncoder
# 
# OH_encoder = OneHotEncoder()
# y_train_res = y_train.reshape(-1, 1)
# y_train_encoded = OH_encoder.fit_transform(y_train_res).toarray()

In [15]:
# 데이터 확인
# import matplotlib.pyplot as plt
# 
# def show_dataset(data, num_images=25):
#     fig, axes = plt.subplots(5, 5, figsize=(10, 10))
#     axes = axes.ravel()
#     for i in range(num_images):
#         axes[i].imshow(np.reshape(data[i], (28, 28)).T, cmap='gray')
#         axes[i].axis('off')
#     plt.tight_layout()
#     plt.show()
# 
# show_dataset(x_train)

## 스태킹 기법 사용
### 모델 1 : SVM
### 모델 2 : KNN
### 모델 3 : Random Forest
### 모델 4 : Logistic Regression

---

## SVM

In [10]:
# SVM용 전처리 - HOG 특징 추출 (36s 소요)
from skimage.feature import hog

def extract_hog_features(data):
    """Return an array holding one HOG descriptor per row of *data*.

    Every row is reshaped to 28x28 and transposed before the descriptor is
    computed with 8 orientation bins, 7x7-pixel cells and 1x1-cell blocks.
    """
    def _describe(flat_pixels):
        # NOTE(review): the transpose presumably undoes the transposed storage
        # of EMNIST images -- confirm against the dataset description.
        image = np.reshape(flat_pixels, (28, 28)).T
        return hog(
            image,
            orientations=8,
            pixels_per_cell=(7, 7),
            cells_per_block=(1, 1),
        )

    return np.array([_describe(row) for row in data])

# HOG descriptors for every training image; these are the SVM input features.
x_train_hog = extract_hog_features(x_train)

In [11]:
# SVM hyper-parameter search with GridSearchCV.
# (Took 23 h 3 m; best parameters found: C=10, gamma=0.1.)
from pathlib import Path

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Create the output directory up front: a missing folder would otherwise only
# surface at joblib.dump, after the whole search has already been run.
Path('models').mkdir(parents=True, exist_ok=True)

svm = SVC()
# 4 values of C x 4 values of gamma with the RBF kernel = 16 candidates.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}
# cv=5 -> 80 fits in total; refit=True retrains the best candidate on all the
# training data; n_jobs=-1 runs the fits on every available core.
grid_search = GridSearchCV(svm, param_grid, refit=True, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(x_train_hog, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Persist the whole search object (results and refitted best estimator).
joblib.dump(grid_search, 'models/hog_svm_model_gridsearch.joblib')

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ........................C=0.1, gamma=1, kernel=rbf; total time=323.7min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=10.4min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=16.8min
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=42.2min
[CV] END .........................C=10, gamma=1, kernel=rbf; total time=623.9min
[CV] END .......................C=100, gamma=0.1, kernel=rbf; total time=11.7min




[CV] END ........................C=0.1, gamma=1, kernel=rbf; total time=323.0min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=10.3min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=16.8min
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=42.1min
[CV] END .........................C=10, gamma=1, kernel=rbf; total time=623.0min
[CV] END .......................C=100, gamma=0.1, kernel=rbf; total time=11.5min
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 7.3min
[CV] END .....................C=100, gamma=0.001, kernel=rbf; total time= 8.7min
[CV] END ........................C=0.1, gamma=1, kernel=rbf; total time=323.0min
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=10.4min
[CV] END ........................C=1, gamma=0.01, kernel=rbf; total time=16.8min
[CV] END .......................C=1, gamma=0.001, kernel=rbf; total time=42.1min
[CV] END ...................

['models/svm_model_multi.joblib']

In [31]:
# Train the final SVM on the HOG features (takes about 10 minutes).
# C=10 and gamma=0.1 are the best parameters reported by the grid search.
from pathlib import Path

from sklearn.svm import SVC

# Create the output directory before fitting so that a missing folder fails
# fast instead of losing the trained model at joblib.dump.
Path('models').mkdir(parents=True, exist_ok=True)

svm_model = SVC(kernel='rbf', C=10, gamma=0.1)
svm_model.fit(x_train_hog, y_train)

joblib.dump(svm_model, 'models/hog_svm_model.joblib')

['models/hog_svm_model.joblib']

In [None]:
# 5-fold cross-validation of the SVM on the HOG features.
# The estimator is loaded from the file written by the SVM training cell.
# NOTE(review): per the scikit-learn docs, cross_val_score fits a clone of the
# estimator on each fold, so presumably only the hyper-parameters of the loaded
# model are reused here -- confirm.
svm_model_test = joblib.load("models/hog_svm_model.joblib")
svm_cv_scores = cross_val_score(svm_model_test, x_train_hog, y_train, cv=5, scoring='accuracy')
print("SVM Cross-Validation Accuracy:", svm_cv_scores.mean())

## KNN

In [16]:
# Preprocessing for KNN: reduce the pixel features to 50 principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=50)
# fit_transform fits the projection on x_train and returns the projected data.
x_train_pca = pca.fit_transform(x_train)

In [27]:
# Train KNN (k=7) on the PCA-reduced features and save the model.
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier

# Create the output directory so joblib.dump cannot fail on a missing folder.
Path('models').mkdir(parents=True, exist_ok=True)

knn = KNeighborsClassifier(n_neighbors=7)
# Fit on x_train_pca rather than the raw pixels: the model is saved as
# "pca_knn_model" and is cross-validated on x_train_pca, so training on
# x_train would produce a model that expects a different feature space.
knn.fit(x_train_pca, y_train)

# NOTE: inputs must be projected with the same fitted `pca` before calling
# predict on this model.
joblib.dump(knn, 'models/pca_knn_model.joblib')

['models/pca_knn_model.joblib']

In [29]:
# 5-fold cross-validation of a fresh KNN (k=7) on the PCA-reduced features.
# NOTE(review): x_train_pca comes from a PCA fitted on all of x_train, so every
# validation fold has already influenced the projection; wrapping PCA + KNN in
# a Pipeline would give a leakage-free estimate -- confirm whether that matters here.
knn_cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=7), x_train_pca, y_train, cv=5, scoring='accuracy')
print("KNN Cross-Validation Accuracy:", knn_cv_scores.mean())

KNN Cross-Validation Accuracy: 0.9845333333333333


## SGD Classifier

In [None]:
# Linear classifier trained with stochastic gradient descent on x_train.
from sklearn.linear_model import SGDClassifier

# max_iter=5 with tol=None: per the scikit-learn docs the tolerance-based
# stopping criterion is disabled, so training runs for the full 5 epochs.
# random_state=42 fixes the seed so repeated runs are reproducible.
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(x_train, y_train)

In [None]:
# 10-fold cross-validation of the SGD classifier on x_train.
from sklearn.model_selection import cross_val_score

# Keep the fold scores in a variable, as the SVM and KNN cells do, so the mean
# is computed from this run instead of from values copied by hand.
sgd_cv_scores = cross_val_score(sgd_clf, x_train, y_train, cv=10, scoring="accuracy")
print("SGD Cross-Validation Accuracy:", sgd_cv_scores.mean())

# Last expression of the cell: the notebook still displays the per-fold scores.
sgd_cv_scores

In [None]:
# Mean of ten hard-coded accuracy values.
# NOTE(review): these look like per-fold scores pasted from a 10-fold
# cross_val_score run -- they will not update if the model is re-run; confirm
# they are current.
np.mean([0.92066667, 0.9195    , 0.91804167, 0.91754167, 0.9195    ,
       0.91554167, 0.92033333, 0.9245    , 0.92229167, 0.92108333]
)