In [1]:
# encoding: utf-8


import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn import metrics
import pickle

In [2]:
# Input files. The label file is read as a single column and the vector file
# as tab-separated columns by the loading cells below.
VECS_TXT = 'digits4000_digits_vec.txt'
LABELS_TXT = 'digits4000_digits_labels.txt'
# Not used in the visible cells; kept for reference.
# TRAINSET_TXT = 'digits4000_trainset.txt'
# TESTSET_TXT = 'digits4000_testset.txt'

# Number of feature columns expected in each row of VECS_TXT.
VEC_LENGTH = 784
# NOTE(review): SET_NUM is not referenced in the visible cells — confirm its purpose.
SET_NUM = 2

In [3]:
# Read the header-less label file into a one-column frame named 'label'.
labels_df = pd.read_csv(LABELS_TXT, names=['label'], header=None)
# Keep the column as a Series as well, for use as a target vector.
labels = labels_df.loc[:, 'label']

# Sanity check: expect an (n, 1) frame and an (n,) series.
print('labels_df.shape:', labels_df.shape)
print('labels.shape:', labels.shape)

labels_df.shape: (4000, 1)
labels.shape: (4000,)


In [4]:
# Read the tab-separated, header-less feature file; columns are named
# f_0 .. f_{VEC_LENGTH - 1}. Every value is then divided by 255 in the same
# expression (normalisation) — presumably the raw values are 8-bit pixel
# intensities, so the result lies in [0, 1]; TODO confirm against the data.
vecs_df = pd.read_csv(
    VECS_TXT,
    sep='\t',
    header=None,
    names=list(map('f_{}'.format, range(VEC_LENGTH))),
).div(255.)

# Sanity check: expect (n_samples, VEC_LENGTH).
print('vecs_df.shape:', vecs_df.shape)

vecs_df.shape: (4000, 784)


In [5]:
# Fixed positional split: the first N_TRAIN rows form the training set and the
# remaining rows form the test set.
N_TRAIN = 2000
# Seed for the training-set shuffle so that Restart & Run All reproduces the
# same row order (the original shuffle was unseeded).
SPLIT_SEED = 42

# .iloc makes the row-position slicing explicit instead of relying on the
# implicit behaviour of [] with integer slices.
X_train = vecs_df.iloc[:N_TRAIN]
y_train = labels_df['label'].iloc[:N_TRAIN]
X_test = vecs_df.iloc[N_TRAIN:]
y_test = labels_df['label'].iloc[N_TRAIN:]

# Shuffle features and labels together so rows stay aligned.
X_train, y_train = shuffle(X_train, y_train, random_state=SPLIT_SEED)

In [6]:
def knn_func(X_train, X_test, y_train, y_test, n_neighbors_list=(1, 3, 5, 7, 9, 11)):
    """
    Fit a k-nearest-neighbours classifier for each k and print its accuracy.

    X_train, y_train: features and labels the classifier is fitted on.
    X_test, y_test: features and labels the predictions are scored against.
    n_neighbors_list: iterable of k values to try. Defaults to the values
        that were previously hard-coded, so existing calls behave the same.

    Prints one ``n_neighbors=<k>: <accuracy>`` line per k and returns None.

    NOTE(review): the scores are computed on X_test, so using them to choose k
    would tune on the test set — use a validation split for model selection.
    """
    for n_neighbors in n_neighbors_list:
        clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('n_neighbors={}: {}'.format(n_neighbors, metrics.accuracy_score(y_test, y_pred)))

In [7]:
def svm_func(X_train, X_test, y_train, y_test):
    """
    Fit an SVC with default parameters (no parameter search) and print its accuracy.

    X_train, y_train: features and labels the classifier is fitted on.
    X_test, y_test: features and labels the predictions are scored against.

    Prints ``accuracy_score: <value>`` with four decimal places and returns None.
    """
    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    # accuracy_score expects (y_true, y_pred); the value is the same either way
    # for accuracy, but this order matches knn_func and the documented signature.
    # '{:.4f}' gives four decimals; the former '{:4f}' only set a minimum width.
    print('accuracy_score: {:.4f}'.format(metrics.accuracy_score(y_test, y_pred)))

In [8]:
def pca_func(X_train, X_test, n_components=20, random_state=None):
    """
    Project the train and test features onto principal components.

    The PCA is fitted on X_train only; the same fitted projection is then
    applied to both X_train and X_test.

    X_train: features used to fit the PCA.
    X_test: features that are only transformed.
    n_components: number of dimensions to reduce to.
    random_state: forwarded to PCA. The default None keeps the previous
        behaviour; pass an int to get repeatable results when scikit-learn
        selects a randomized SVD solver (this depends on the scikit-learn
        version and the data shape).

    Returns (X_train_pca, X_test_pca) and prints both shapes as a check.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Sanity check: both outputs should have n_components columns.
    print('X_train_pca shape:', X_train_pca.shape)
    print('X_test_pca shape:', X_test_pca.shape)

    return X_train_pca, X_test_pca

In [9]:
# Run each classifier on the raw features, then again on the PCA-reduced
# features, printing a separator line and a label before each section.
SEPARATOR = '-' * 60

print(SEPARATOR)
print('knn:')
knn_func(X_train, X_test, y_train, y_test)

print(SEPARATOR)
print('svm:')
svm_func(X_train, X_test, y_train, y_test)

print(SEPARATOR)
print('pca:')
X_train_pca, X_test_pca = pca_func(X_train, X_test)

print(SEPARATOR)
# Label previously ended in ';' — now ':' like the other section labels.
print('pca + knn:')
knn_func(X_train_pca, X_test_pca, y_train, y_test)

print(SEPARATOR)
print('pca + svm:')
svm_func(X_train_pca, X_test_pca, y_train, y_test)

------------------------------------------------------------
knn:
n_neighbors=1: 0.9135
n_neighbors=3: 0.918
n_neighbors=5: 0.917
n_neighbors=7: 0.9185
n_neighbors=9: 0.9085
n_neighbors=11: 0.9055
------------------------------------------------------------
svm:
accuracy_score: 0.894000
------------------------------------------------------------
pca:
X_train_pca shape: (2000, 20)
X_test_pca shape: (2000, 20)
------------------------------------------------------------
pca + knn;
n_neighbors=1: 0.9215
n_neighbors=3: 0.928
n_neighbors=5: 0.9305
n_neighbors=7: 0.9345
n_neighbors=9: 0.9325
n_neighbors=11: 0.9315
------------------------------------------------------------
pca + svm:
accuracy_score: 0.951500
