In [2]:
"""
Prepared for JIZHI course: machine learning algorithms
Author: Zhengxia Zou
Date: 20180605
Description:
A simple implementation of two basic models:
1.logistic reg. and 2. linear svm.
Tested on two groups of dataset:
1. simple mnist (0 vs 1), 2. 2d gaussian toy-data
"""


import numpy as np
import matplotlib.pyplot as plt
import os

  'Matplotlib is building the font cache using fc-list. '


#### Load the gaussian data (.npy) from the disk and visualize the gaussian data.

In [3]:
# load the gaussian data (.npy) from the disk
def load_gaussian_data(in_path):
    data = np.load(in_path + '/points2d.npy')
    target = np.load(in_path + '/label.npy')
    return data, target


# visualize the gaussian data
def show_gaussian_data(data, target):
    plt.scatter(data[target == 1, 0], data[target == 1, 1], color='r')
    plt.scatter(data[target == -1, 0], data[target == -1, 1], color='b')
    plt.show()

#### Load the simple-mnist (.npy) from the disk and visualize the simple-mnist data.

In [4]:
# load the simple-mnist image data from disk
def load_mnist_data(in_path):
    data = np.load(in_path+'/data.npy')
    target = np.load(in_path+'/target.npy')

    return data, target


# visualize the mnist data
def show_mnist_data(data, target):

    nSamples = data.shape[0]
    if nSamples < 20:
        return

    D = (data.shape[1])
    imgSize = int(D**0.5)

    for i in range(0, 10):
        thisImg = data[i, :].reshape((imgSize, imgSize))
        plt.subplot(2, 10, i+1)
        plt.imshow(thisImg, cmap='gray')
        plt.title('%d' % target[i])
        thisImg = data[-i-1, :].reshape((imgSize, imgSize))
        plt.subplot(2, 10, i+11)
        plt.imshow(thisImg, cmap='gray')
        plt.title('%d' % target[-i-1])
    plt.show()

#### Split the data and label into training and test parts.

In [5]:
# split the data and label into training and test parts
def data_split(data, target, tr_te_ratio = 0.5):

    rnd_idx = np.random.permutation(len(target))
    data = data[rnd_idx, :]
    target = target[rnd_idx]

    split_idx = int(len(target) * tr_te_ratio)
    data_tr = data[:split_idx, :]
    data_te = data[split_idx:, :]
    target_tr = target[:split_idx]
    target_te = target[split_idx:]

    sorted_idx = np.argsort(target_tr)
    data_tr = data_tr[sorted_idx, :]
    target_tr = target_tr[sorted_idx]
    sorted_idx = np.argsort(target_te)
    data_te = data_te[sorted_idx, :]
    target_te = target_te[sorted_idx]

    return data_tr, data_te, target_tr, target_te

#### Define the Logistic Regression Loss Function.

In [6]:
def logistic_loss_func(w, m_b_X, m_b_y): 
    
    # solve: min 1/N*sum[ln(1+exp(y*<x, w>))] + reglr_beta*|w|_2^2
    # X: each col is a sample, y: is a vector
    # grad: 1/N*X*(-1*Y.*(1-logistic(Y.*(w'*X)'))) + 2*lambda*w;
    
    # define an element-wise sigmoid func
    def my_sigmoid(x):
        return 1/(1+np.exp(-1*x))
    
    
    reglr_beta = 0.0001  # regularization coefficient

    mini_batch = len(m_b_y)
    
    f_val = np.mean(np.log(1 + np.exp(-m_b_y*(w.dot(m_b_X)).T))) + reglr_beta*np.linalg.norm(w)
    g = 1/float(mini_batch)* m_b_X.dot(-1*m_b_y*(1-my_sigmoid(m_b_y*(w.dot(m_b_X)).T)))+ 2*reglr_beta*w
    
    return f_val, g

#### Define the Linear SVM Loss Function.

In [7]:
def linear_svm_loss_func(w, m_b_X, m_b_y): 
    # solve: min 0.5*|w|_2^2 + C*sum[max(0, 1 - y*<x, w>)]
    # X: each col is a sample, y: is a vector
    # grad: w + C*X*(y.*(sign(y.*(w'*X)'-1)-1))
    
    mini_batch = len(m_b_y)
    
    C = 1.0  # penalty coefficient
    
    f_val =  0.5*np.linalg.norm(w) + C*np.sum(np.maximum(0, 1 - m_b_y*(w.dot(m_b_X)).T))
    g = w + C * m_b_X.dot(m_b_y*(np.sign(m_b_y*(w.dot(m_b_X)).T - 1)-1))
    return f_val, g

#### Train a linear model w.

In [8]:
# train a linear model w: 
# method='lr': logistic regression;
# method='svm': linear svm

def my_model_train(X, y, method, learn_rate=0.001, mini_batch = 50):

    X = X.T

    # initialize params
    w = np.random.randn(X.shape[0])  # model
    max_iter_num = 1e4  # max iteration steps

    T = 0
    obj_curve = [0.0]  # loss curve
    while T < max_iter_num:
        mini_batch_idx = np.random.permutation(len(y))[:mini_batch]
        m_b_X = X[:, mini_batch_idx]
        m_b_y = y[mini_batch_idx]
        
        if method is 'lr':
            f_val, g = logistic_loss_func(w, m_b_X, m_b_y)
        if method is 'svm':
            f_val, g = linear_svm_loss_func(w, m_b_X, m_b_y)

        w = w - learn_rate*g

        if np.mod(T, 500) == 1:
            print('Iter: %5d, f_val: %.5f, ||g||: %.5f'
                  % (T, f_val, np.linalg.norm(g)))

        obj_curve.append(f_val)
        T = T + 1

        if np.linalg.norm(g) < 1e-3:
            break

    plt.plot(obj_curve[1:])
    plt.show()

    return w


#### Test a linear model w.

In [12]:
# test a linear model w
def my_model_test(X, y, w):
    X = X.T
    # X: each col is a sample, y: is a vector
    pred_score = w.T.dot(X)
    pred_class = np.sign(pred_score)
    err_flag = np.zeros(len(y))
    err_flag[pred_class != y] = 1.0
    acc = 1. - sum(err_flag)/len(y)
    return acc

#### Load the data...

In [13]:
""" data1: simple mnist classification """
# data, target = load_mnist_data('/home/kesci/input/mnist_simple6464/')
# show_mnist_data(data, target)
# data_tr, data_te, target_tr, target_te = data_split(data, target, tr_te_ratio = 0.5)

""" data2: gaussian-2d data classification """
data, target = load_gaussian_data('/home/kesci/input/gaussian_simple8268/')
show_gaussian_data(data, target)

data_tr, data_te, target_tr, target_te = data_split(data, target, tr_te_ratio = 0.5)

In [15]:
#data[target == 1, 0]

#### Demo 1: logistic regression model.

In [14]:
""" demo 1: logistic regression model """
model = my_model_train(data_tr, target_tr, method='lr', learn_rate=0.1, mini_batch =  50)
acc_tr = my_model_test(data_tr, target_tr, model)
acc_te = my_model_test(data_te, target_te, model)
print('Logistic-Regress Training Accuracy: %s %%' % (acc_tr*100))
print('Logistic-Regress Testing Accuracy: %s %%' % (acc_te*100))

Iter:     1, f_val: 0.37899, ||g||: 0.37942
Iter:   501, f_val: 0.04512, ||g||: 0.03023
Iter:  1001, f_val: 0.03467, ||g||: 0.03343
Iter:  1501, f_val: 0.02484, ||g||: 0.01641
Iter:  2001, f_val: 0.08527, ||g||: 0.00549
Iter:  2501, f_val: 0.08858, ||g||: 0.01544
Iter:  3001, f_val: 0.11544, ||g||: 0.03071
Iter:  3501, f_val: 0.06336, ||g||: 0.01081
Iter:  4001, f_val: 0.11390, ||g||: 0.02660
Iter:  4501, f_val: 0.01844, ||g||: 0.00991
Iter:  5001, f_val: 0.00866, ||g||: 0.00660
Iter:  5501, f_val: 0.02939, ||g||: 0.01192
Iter:  6001, f_val: 0.04710, ||g||: 0.01850
Iter:  6501, f_val: 0.01087, ||g||: 0.00607
Iter:  7001, f_val: 0.01016, ||g||: 0.00851
Iter:  7501, f_val: 0.01473, ||g||: 0.00598
Iter:  8001, f_val: 0.00912, ||g||: 0.00758
Iter:  8501, f_val: 0.11953, ||g||: 0.03177
Iter:  9001, f_val: 0.02523, ||g||: 0.00895
Iter:  9501, f_val: 0.00936, ||g||: 0.00378


Logistic-Regress Training Accuracy: 99.3 %
Logistic-Regress Testing Accuracy: 99.5 %


#### Demo 2: linear SVM model.

In [16]:
""" demo 2: linear SVM model """
model = my_model_train(data_tr, target_tr, method='svm', learn_rate=0.001, mini_batch = 50)
acc_tr = my_model_test(data_tr, target_tr, model)
acc_te = my_model_test(data_te, target_te, model)
print('Linear-SVM Training Accuracy: %s %%' % (acc_tr*100))
print('Linear-SVM Testing Accuracy: %s %%' % (acc_te*100))

Iter:     1, f_val: 63.05014, ||g||: 163.91080
Iter:   501, f_val: 1.44601, ||g||: 2.06279
Iter:  1001, f_val: 1.28042, ||g||: 1.10342
Iter:  1501, f_val: 1.26956, ||g||: 1.12171
Iter:  2001, f_val: 1.32190, ||g||: 3.37211
Iter:  2501, f_val: 0.95282, ||g||: 1.90836
Iter:  3001, f_val: 0.95821, ||g||: 1.91642
Iter:  3501, f_val: 3.15007, ||g||: 7.29011
Iter:  4001, f_val: 3.23882, ||g||: 2.37266
Iter:  4501, f_val: 2.29080, ||g||: 1.75084
Iter:  5001, f_val: 1.23106, ||g||: 4.37240
Iter:  5501, f_val: 2.45222, ||g||: 1.42891
Iter:  6001, f_val: 2.00989, ||g||: 3.34719
Iter:  6501, f_val: 1.41197, ||g||: 3.24067
Iter:  7001, f_val: 1.30273, ||g||: 4.17758
Iter:  7501, f_val: 0.96414, ||g||: 2.08859
Iter:  8001, f_val: 0.97448, ||g||: 7.19849
Iter:  8501, f_val: 0.96409, ||g||: 1.92818
Iter:  9001, f_val: 0.96902, ||g||: 3.72892
Iter:  9501, f_val: 2.75550, ||g||: 5.22020


Linear-SVM Training Accuracy: 99.2 %
Linear-SVM Testing Accuracy: 99.5 %
