In [None]:
import struct
import math
from array import array
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

import random

from skimage import feature as ft
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Label reading
def read_MNIST_lable(location):
    """Read an MNIST label file in IDX format.

    Parameters
    ----------
    location : str
        Path of the label file.

    Returns
    -------
    tuple
        ``(magic, size, labels)``: the two big-endian unsigned 32-bit header
        fields, followed by every remaining byte of the file as a 1-D
        ``int32`` NumPy array.
    """
    with open(location, "rb") as label_file:
        header = label_file.read(8)
        payload = label_file.read()
    # The header holds two big-endian unsigned 32-bit integers.
    magic, size = struct.unpack(">II", header)
    # One unsigned byte per label, widened to int32.
    labels = np.frombuffer(payload, dtype=np.uint8).astype('int32')
    return magic, size, labels

# Image reading
def read_MNIST_Pics(location):
    """Read an MNIST image file in IDX format into a DataFrame.

    Parameters
    ----------
    location : str
        Path of the image file.

    Returns
    -------
    tuple
        ``(magic, shape, image_frame)`` where ``magic`` is the first header
        field, ``shape`` is ``(rows, cols)`` taken from the header, and
        ``image_frame`` has one row per image and ``rows * cols`` integer
        columns named ``Pixel0`` .. ``Pixel<rows*cols-1>``.

    Raises
    ------
    ValueError
        If the file holds fewer pixel bytes than the header announces.
    """
    with open(location, "rb") as f:
        # Header: four big-endian unsigned 32-bit integers.
        magic, size, rows, cols = struct.unpack(">IIII", f.read(16))
        raw = f.read()

    pixels_per_image = rows * cols
    expected = size * pixels_per_image
    if len(raw) < expected:
        raise ValueError(
            "image file is truncated: expected %d pixel bytes, found %d"
            % (expected, len(raw))
        )

    # Trailing bytes beyond the announced payload are ignored.
    pixels = np.frombuffer(raw[:expected], dtype=np.uint8)
    # int64 matches the dtype pandas infers from lists of Python ints.
    images = pixels.reshape(size, pixels_per_image).astype(np.int64)

    shape = (rows, cols)
    # Column count follows the header instead of assuming 28x28 images.
    image_frame = pd.DataFrame(
        images,
        columns=['Pixel' + str(i) for i in range(pixels_per_image)],
    )
    return magic, shape, image_frame

# Image display
def show_MNIST_pics(image_df,sele_index,shape,num):
    """Plot ``num`` images from ``image_df`` on a grid with five rows.

    Parameters
    ----------
    image_df : pandas.DataFrame
        One flattened image per row.
    sele_index : sequence
        Index labels of the rows to draw; the first ``num`` entries are used.
    shape : tuple
        ``(rows, cols)`` used to reshape each flattened image.
    num : int
        Number of images to draw.
    """
    n_rows = 5
    # Integer division: subplot grid sizes must be integers, and ``num/5+1``
    # is a float under Python 3.
    n_cols = num // n_rows + 1
    for i in range(num):
        plt.subplot(n_rows, n_cols, i + 1)
        row = image_df.loc[sele_index[i], :]
        # A label that occurs several times in the index yields a DataFrame;
        # the first matching row is used so the reshape below still works.
        if isinstance(row, pd.DataFrame):
            row = row.iloc[0]
        img = np.array(row, dtype='int32').reshape(shape[0], shape[1])
        img = Image.fromarray(img)
        plt.imshow(img, cmap='gray')
        plt.axis("off")

In [None]:
# Load the MNIST training labels and images from the local IDX files.
# NOTE(review): both paths are absolute and machine-specific; adjust them to
# wherever the dataset lives before running.
magic,size,labels = read_MNIST_lable('Y:/Kaggle/统计学习方法代码/train-labels.idx1-ubyte')
magic1,shape,image_frame = read_MNIST_Pics('Y:/Kaggle/统计学习方法代码/train-images.idx3-ubyte')

In [None]:
# Preview the first rows of the pixel DataFrame.
image_frame.head()

In [None]:
# Display the label array returned by read_MNIST_lable.
labels

In [None]:
# Draw `num` distinct random rows and display them.
# The range follows the size of the loaded frame instead of a hard-coded
# 60000, row 0 is eligible, and sampling without replacement avoids duplicate
# index labels (which make `.loc` return several rows for one label).
num = 23
sele_index = np.random.choice(len(image_frame), size=num, replace=False)
show_MNIST_pics(image_frame.loc[sele_index,:],sele_index,shape,num)

In [None]:
# Extract HOG features and convert them to a DataFrame
def get_hog(img_df, image_shape=(28, 28)):
    """Compute a HOG descriptor for every flattened image in ``img_df``.

    Parameters
    ----------
    img_df : pandas.DataFrame
        One flattened image per row.
    image_shape : tuple, optional
        ``(rows, cols)`` used to reshape each row; defaults to ``(28, 28)``.

    Returns
    -------
    pandas.DataFrame
        One row per image, with columns ``Fture_0`` .. ``Fture_<n-1>`` where
        ``n`` is the descriptor length (324 for the default 28x28 input).
    """
    orientations = 9            # number of orientation bins
    pixels_per_cell = (7, 7)    # pixels per cell
    cells_per_block = (2, 2)    # cells per block

    # ``.values`` replaces ``get_values()``, which newer pandas no longer has.
    image_value = img_df.values
    # The ``visualise`` keyword is omitted: it was renamed in scikit-image and
    # its default (no HOG image returned) is what was requested anyway.
    features = [
        ft.hog(row.reshape(image_shape),           # input image
               orientations=orientations,
               pixels_per_cell=pixels_per_cell,
               cells_per_block=cells_per_block,
               block_norm='L2-Hys',                # one of 'L1', 'L1-sqrt', 'L2', 'L2-Hys'
               transform_sqrt=True,                # power-law (gamma) compression
               feature_vector=True)                # flatten the descriptor
        for row in image_value
    ]

    if features:
        n_features = len(features[0])
    else:
        # No image to measure: derive the descriptor length from the HOG
        # layout (blocks slide one cell at a time over the cell grid).
        cells_r = image_shape[0] // pixels_per_cell[0]
        cells_c = image_shape[1] // pixels_per_cell[1]
        blocks_r = max(cells_r - cells_per_block[0] + 1, 0)
        blocks_c = max(cells_c - cells_per_block[1] + 1, 0)
        n_features = (blocks_r * blocks_c
                      * cells_per_block[0] * cells_per_block[1] * orientations)

    return pd.DataFrame(features, columns=['Fture_' + str(i) for i in range(n_features)])

In [None]:
# Logistic regression model definition
class LogisticRegression(object):
    """Binary logistic-regression classifier with mistake-driven stochastic updates.

    The weight vector ``w`` holds one entry per feature followed by a bias
    term. During training a random sample is drawn at each step and the
    weights are updated only when that sample is currently misclassified.
    """

    def __init__(self, learning_step=0.00001, max_iteration=8000):
        """Store the learning rate and the maximum number of weight updates."""
        self.learning_step = learning_step
        self.max_iteration = max_iteration

    @staticmethod
    def _sigmoid(z):
        """Return ``1 / (1 + exp(-z))`` without overflowing for large ``|z|``."""
        if z >= 0:
            return 1.0 / (1.0 + math.exp(-z))
        exp_z = math.exp(z)
        return exp_z / (1.0 + exp_z)

    def predict_y(self,x):
        """Return the predicted class (0 or 1) for one bias-augmented sample.

        ``x`` must already include the trailing constant 1 for the bias.
        Class 1 is chosen when P(y=1) > P(y=0), which is equivalent to
        ``w . x > 0``; testing the sign directly avoids evaluating ``exp``,
        which overflows for large scores.
        """
        return 1 if np.dot(self.w, x) > 0 else 0

    def train(self,features, labels):
        """Fit the weights and return them.

        Parameters
        ----------
        features : array-like of shape (M, N)
            Training samples, one per row, without the bias column.
        labels : sequence of length M
            Binary targets (0 or 1).

        Returns
        -------
        numpy.ndarray
            The weight vector of length ``N + 1`` (bias last); also stored
            as ``self.w``.

        Raises
        ------
        ValueError
            If ``labels`` is empty.
        """
        features = np.asarray(features, dtype=float)
        n_samples = len(labels)
        if n_samples == 0:
            raise ValueError("cannot train on an empty training set")

        self.w = np.zeros(features.shape[1] + 1)
        # Report progress about ten times instead of once per update.
        report_every = max(1, self.max_iteration // 10)

        correct_count = 0
        learnig_times = 0

        while learnig_times < self.max_iteration:
            index = random.randint(0, n_samples - 1)
            x = features[index]
            y = labels[index]
            # Score with the bias handled separately, so no per-sample copy
            # of the row is needed to append the constant 1.
            wx = np.dot(self.w[:-1], x) + self.w[-1]

            if y == (1 if wx > 0 else 0):
                correct_count += 1
                # Stop once a long run of consecutive samples is classified correctly.
                if correct_count > self.max_iteration:
                    break
                continue

            learnig_times += 1
            correct_count = 0

            # Gradient step of the log-likelihood: x * (y - sigmoid(w . x)).
            step = self.learning_step * (y - self._sigmoid(wx))
            self.w[:-1] += step * x
            self.w[-1] += step
            if learnig_times % report_every == 0 or learnig_times == self.max_iteration:
                print('训练完成度：',learnig_times/self.max_iteration*100)
        return self.w

    def predict(self,features):
        """Return a list with the predicted class (0 or 1) of every row.

        ``features`` has shape (M, N) and must not include the bias column.
        """
        features = np.asarray(features, dtype=float)
        scores = np.dot(features, self.w[:-1]) + self.w[-1]
        return (scores > 0).astype(int).tolist()

In [None]:
# Compute the HOG feature frame for every loaded image.
hog_df = get_hog(image_frame)

In [None]:
# Split the HOG features and labels into a training set and a validation set
# (40% of the samples are held out; the random state is fixed).
train_features_df, test_features_df, train_labels, test_labels = train_test_split(
    hog_df,
    labels,
    test_size=0.4,
    random_state=0,
)

In [None]:
# Train the one-vs-all models
def train_one_vs_all(train_features_df,train_labels, num_classes=10):
    """Train one binary classifier per class and stack their weight vectors.

    Parameters
    ----------
    train_features_df : pandas.DataFrame
        Training features, one sample per row.
    train_labels : array-like
        Integer class label of every training sample.
    num_classes : int, optional
        Number of classes; labels are expected in ``0 .. num_classes - 1``.
        Defaults to 10.

    Returns
    -------
    numpy.ndarray
        One row per class, each row being the vector returned by
        ``LogisticRegression.train`` for that class.
    """
    # ``.values`` replaces ``get_values()``, which newer pandas no longer has;
    # the conversion is done once rather than on every loop pass.
    features = train_features_df.values
    module_container = []
    for num in range(num_classes):
        # Current class becomes the positive label, every other class negative.
        labels_trans = np.where(train_labels==num,1,0)
        logistic_reg = LogisticRegression()
        module_container.append(logistic_reg.train(features,labels_trans))
        print('第%d个模型训练完成'%(num+1))
    return np.array(module_container)

def predict_one_vs_all(module_container,test_features_df,test_labels):
    """Predict a class for every row with the stacked one-vs-all weights.

    Parameters
    ----------
    module_container : numpy.ndarray
        One weight vector per class (bias last), one class per row.
    test_features_df : pandas.DataFrame
        Features to classify, one sample per row. The frame is not modified.
    test_labels : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        For every row, the index of the class with the highest linear score.
    """
    # ``assign`` returns a new frame carrying the constant bias column, so the
    # caller's DataFrame is left untouched.
    augmented = test_features_df.assign(plusone=1)
    result_matrix = np.dot(augmented.values, module_container.transpose())
    return np.argmax(result_matrix, axis=1)

In [None]:
# Train the one-vs-all classifiers on the training split.
module_container = train_one_vs_all(train_features_df,train_labels)

In [None]:
# Classify the validation split and display the resulting accuracy.
predict = predict_one_vs_all(module_container,test_features_df,test_labels)
score = accuracy_score(predict, test_labels)
score