# 데이터 로드

In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive. The CSV paths read later in this
# notebook are relative ("./drive/..."), so they resolve under this mount only
# while the working directory is /content.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np

from  sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [0]:
# Locations of the three CSV files, relative to the working directory.
train_csv = "./drive/My Drive/data/kannada/train.csv"
test_csv = "./drive/My Drive/data/kannada/test.csv"
submission_csv = "./drive/My Drive/data/kannada/sample_submission.csv"

# Load each file into its own DataFrame.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
submission = pd.read_csv(submission_csv)

# 데이터 전처리

In [0]:
# Separate the target column from the feature columns
# (the features are every column after the first one).
train_labels = train["label"]
train_images = train.iloc[:, 1:]

In [0]:
# Hold out 10% of the labelled rows for validation; the fixed seed makes the
# split repeatable between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_images,
    train_labels,
    random_state=42,
    test_size=0.1,
)

# 차원 축소(PCA, Principal Component Analysis)

In [0]:
# Show the (rows, columns) of the training split before any PCA is applied.
print(x_train.shape)

(54000, 784)


In [0]:
# Inspect the cumulative explained-variance ratio to help choose n_components.
# Only the fitted spectrum is needed here, so call fit() rather than
# fit_transform(): the latter also builds a projected copy of x_train that
# would be thrown away immediately.
pca = PCA(n_components=x_train.shape[1])  # keep every component (784 here)
pca.fit(x_train)
#print(pca.explained_variance_)
#print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))

[0.07971241 0.13549834 0.18012867 0.2166521  0.25103953 0.27886661
 0.30327927 0.32426716 0.34465252 0.36194484 0.37809741 0.39250628
 0.40622903 0.41827402 0.4300728  0.44134467 0.45211229 0.46271131
 0.47291927 0.4830014  0.49244062 0.50162587 0.51018968 0.51858754
 0.52674151 0.53473595 0.54252885 0.54991928 0.55709825 0.56408637
 0.57080436 0.57748358 0.58407465 0.59045425 0.59661299 0.60260015
 0.60838321 0.61405861 0.61955563 0.62486656 0.630023   0.63505254
 0.63995473 0.64483959 0.64963026 0.65428394 0.65883857 0.66337297
 0.66781929 0.67214721 0.67639197 0.68057598 0.68473977 0.68884531
 0.69291798 0.6967848  0.70058976 0.70427714 0.70791451 0.71151448
 0.71503437 0.71848472 0.72190617 0.7252589  0.72856005 0.73180883
 0.73500057 0.73812474 0.74116659 0.744161   0.74712251 0.75004114
 0.75291525 0.75571808 0.75849293 0.7612411  0.76392205 0.76659113
 0.76919959 0.77178398 0.77433842 0.77685287 0.77928524 0.78169973
 0.78407764 0.78641858 0.78872653 0.79099863 0.79326375 0.7954

In [0]:
# Keep just enough principal components to explain about 65% of the cumulative
# variance (784 -> 46 columns). The shape is printed before and after so the
# reduction is visible.
print(x_train.shape)
pca = PCA(whiten=True, n_components=0.65)
x_train = pca.fit_transform(x_train)
print(x_train.shape)

(54000, 784)
(54000, 46)


In [0]:
# Project the validation split with the PCA that is already fitted (transform
# only, no refit), printing the shape before and after.
print(x_valid.shape)
x_valid = pca.transform(x_valid)
print(x_valid.shape)

(6000, 784)
(6000, 46)


# SVM(Support Vector Machine)으로 학습

In [0]:
# Train with SVC (Support Vector Classifier), the class scikit-learn's SVM
# module provides for classification, using the commonly used RBF (Radial
# Basis Function, a.k.a. Gaussian) kernel.
# C=10 was picked arbitrarily; it has not been tuned.
# gamma is pinned explicitly because its default differs between scikit-learn
# releases ('auto' before 0.22, 'scale' from 0.22 on). The recorded run used
# the old default, so 'auto' keeps the model the same on any version and
# avoids the deprecation warning on old ones.
model = svm.SVC(kernel='rbf', C=10, gamma='auto')
model.fit(x_train, y_train)



SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# 예측

In [0]:
# Predict labels for the validation split using the trained model.
pred = model.predict(x_valid)

# 평가

In [0]:
# Accuracy of the validation predictions against the held-out labels.
valid_accuracy = accuracy_score(y_valid, pred)
print(valid_accuracy)

0.9905


# 제출

In [0]:
# Submission step, currently disabled (every line below is commented out).
# If re-enabled it would: take every column of `test` after the first, project
# it with the fitted `pca`, predict labels with `model`, store them in
# test['label'], and write the `id` and `label` columns to submission.csv.
#x_test = test.iloc[:,1:]
#x_test = pca.transform(x_test)
#y_test = model.predict(x_test)
#test['label'] = y_test
#sub = test[['id', 'label']]
#sub.to_csv('submission.csv', index=False)