# This notebook implements basic ML building blocks such as loss functions, optimization routines, etc.

### SVM Loss 

Required:
1. Weight matrix
2. Sample input data
3. bias vector

Methods:
1. Loss calculation:

    a. Non-vectorized
    b. Half-vectorized
    c. Fully vectorized

In [80]:
import numpy as np

In [81]:
def Debug(desc, value):
    """Print a labelled value for ad-hoc inspection.

    Writes a blank line, ``desc``, another blank line and then ``value`` to
    standard output.

    Args:
        desc: Label text; it is concatenated with newline characters, so it
            must be a string.
        value: Any printable object.
    """
    # A single-argument print(...) produces the same output on Python 2
    # (parenthesised expression) and Python 3 (function call), whereas the
    # bare print statement is a SyntaxError on Python 3.
    print("\n" + desc + "\n")
    print(value)

In [125]:
def L2_regularize(W):
    """Return the L2 penalty of ``W``: the sum of the squares of its entries."""
    return np.square(W).sum()

In [210]:
# Inputs: 
# @y : 1 x N (ground truth labels)
# @wts : C x D (class x feature dim)
# @datapts : => N x D (data points x features)
# @bias : C x 1

# output:
# loss : SVM loss

# TODO: 
# 1. The bias can be folded into wts by appending one extra column to wts and a matching constant-1 feature to datapts.

def svm_loss(wts, datapts, y, bias, delta=1.0, reg_strength=1e-3):
    """Compute the L2-regularised multiclass SVM (hinge) loss, fully vectorised.

    With scores ``s = wts . datapts^T + bias`` (C x N), the loss of sample
    ``i`` whose true class is ``y[i]`` is::

        sum over j != y[i] of max(0, s[j, i] - s[y[i], i] + delta)

    The per-sample losses are averaged and
    ``0.5 * reg_strength * sum(wts ** 2)`` is added.

    Args:
        wts: Weight matrix, C x D (classes x features).
        datapts: Data matrix, N x D (data points x features).
        y: Integer class labels, one per data point (N entries).
        bias: Per-class bias with C entries; it is reshaped to a C x 1 column.
        delta: Margin by which the true-class score must exceed the others.
        reg_strength: Multiplier of the L2 penalty on ``wts``.

    Returns:
        The scalar regularised loss.
    """
    # TODO: check dim of matrices
    labels = np.ravel(y)

    # Force the bias into a C x 1 column so it is added per class; a 1-D bias
    # of length C would otherwise be broadcast along the sample axis.
    bias_col = np.reshape(bias, (-1, 1))
    scores = np.dot(wts, datapts.T) + bias_col  # C x N
    Debug("Scores: ", scores)

    sample_idx = np.arange(scores.shape[1])
    correct_scores = scores[labels, sample_idx]  # score of the true class, N
    Debug("Correct Label Scores:", correct_scores)

    loss_margin = np.maximum(scores - correct_scores + delta, 0)
    loss_margin[labels, sample_idx] = 0  # the true class contributes no loss
    Debug("Loss Margin: ", loss_margin)

    loss_vec = np.sum(loss_margin, axis=0)  # per-sample hinge loss, N
    full_loss = np.sum(loss_vec) / loss_vec.size

    # Regularize
    return full_loss + 0.5 * reg_strength * L2_regularize(wts)

In [211]:
# INPUTS:
# N = number of points per class
# D = dimensionality
# K = number of classes

# OUTPUTS:
# X = data points
# y = labels
def get_spiral_data(N, D, K):
    """Generate a toy 2-D "spiral" classification data set.

    Each of the ``K`` classes forms one spiral arm of ``N`` points: the radius
    grows linearly from 0 to 1 while the angle sweeps ``[4*j, 4*(j+1)]`` for
    class ``j`` with Gaussian noise (standard deviation 0.2) added.

    Args:
        N: Number of points per class.
        D: Dimensionality of each point. Must be 2, because every point is
            built from a (sin, cos) pair.
        K: Number of classes. Labels are stored as ``uint8``, so class ids
            above 255 do not fit.

    Returns:
        Tuple ``(X, y)`` where ``X`` is an ``(N*K, D)`` float array with one
        example per row and ``y`` is an ``(N*K,)`` ``uint8`` array holding the
        class id (``0 .. K-1``) of each row.

    Raises:
        ValueError: If ``D`` is not 2.
    """
    if D != 2:
        # Fail with a clear message instead of an opaque broadcasting error
        # from the row assignment below.
        raise ValueError(
            "get_spiral_data only generates 2-D points, got D=%r" % (D,))

    X = np.zeros((N * K, D))  # data matrix (each row = single example)
    y = np.zeros(N * K, dtype='uint8')  # class labels
    radius = np.linspace(0.0, 1, N)  # identical for every arm
    # range() instead of xrange() keeps this runnable on Python 2 and 3.
    for j in range(K):
        rows = slice(N * j, N * (j + 1))
        theta = np.linspace(j * 4, (j + 1) * 4, N) + np.random.randn(N) * 0.2
        X[rows] = np.c_[radius * np.sin(theta), radius * np.cos(theta)]
        y[rows] = j

    return X, y

In [212]:
def get_sample_data():
    """Build the spiral toy problem plus freshly initialised linear parameters.

    Returns:
        Tuple ``(wts, datapts, bias, y)``:
        ``wts`` is a 3 x 2 matrix of small random weights, ``datapts`` and
        ``y`` are the points and labels produced by ``get_spiral_data`` for
        3 classes of 100 two-dimensional points, and ``bias`` is a 3 x 1
        column of zeros.
    """
    points_per_class, feature_dim, num_classes = 100, 2, 3

    # TODO: Center the data
    datapts, y = get_spiral_data(points_per_class, feature_dim, num_classes)  # N x D

    # Parameters start as small random weights and a zero bias column.
    wts = np.random.randn(num_classes, feature_dim) * 0.01
    bias = np.zeros((num_classes, 1))

    # Alternative hand-written fixture (disabled), kept for reference:
    #   datapts = np.array([[2, 1, 4, 3],
    #                       [5, 8, 2, 9],
    #                       [11, 4, 1, 6],
    #                       [7, 3, 6, 5]], dtype='f')
    #   y = np.array([1, 0, 2, 1])
    #   wts = np.array([[1, 5, 9, 9],
    #                   [6, 3, 3, 6],
    #                   [7, 5, 8, 9]], dtype='f')    # C x D
    #   bias = np.random.random(wts.shape[0])

    return wts, datapts, bias, y

In [213]:
def test_svm_loss():
    """Smoke-test ``svm_loss`` on the sample data, printing weights and loss."""
    weights, points, bias_vec, labels = get_sample_data()
    Debug("Weights:", weights)

    # The data matrix itself is not printed here.
    svm_value = svm_loss(weights, points, labels, bias_vec)
    Debug("Loss:", svm_value)


In [214]:
# Run the SVM loss smoke test; it prints the weights, intermediate scores and the loss.
test_svm_loss()


Weights:

[[ 0.00840518 -0.01224155]
 [-0.01177045 -0.00184941]
 [ 0.00317353 -0.0145194 ]]

Scores: 

[[  0.00000000e+00  -1.20129457e-04  -1.13405658e-04  -4.24196923e-04
   -5.22564439e-04  -2.50664137e-04  -4.40229445e-04  -7.32676521e-04
   -4.83463106e-04  -7.07554966e-04  -5.05838660e-04  -1.21181319e-03
   -8.66139752e-04  -8.82304489e-04  -1.43247919e-03  -1.40621816e-04
   -1.06365379e-03  -1.02395476e-03  -1.05325857e-03  -3.60015409e-04
   -7.35673283e-04  -3.37834412e-04   8.48237935e-04  -2.05496161e-03
    1.48973524e-03   9.98446094e-04  -1.22665638e-03   9.24446098e-04
   -9.93734807e-04  -8.68911533e-04   1.18691842e-03  -5.95045523e-04
    9.53641081e-04   7.12541943e-04   2.31755822e-03   2.98077049e-03
    2.33308233e-03   2.70989447e-03   4.35209235e-03   4.17160175e-03
    5.30867032e-03   4.04102477e-03   4.81696911e-03   3.19813359e-03
    6.48090826e-03   5.46524185e-03   4.87136193e-03   6.37824942e-03
    6.11045396e-03   7.20775732e-03   5.76631230e-03   7

In [224]:
def softmax_loss(wts, datapts, y, bias, reg_strength=1e-3):
    """Compute the L2-regularised softmax (cross-entropy) loss and its score gradient.

    Scores are ``s = wts . datapts^T + bias`` (C x N). The data loss is the
    mean over samples of ``-log softmax(s[:, i])[y[i]]``, to which
    ``0.5 * reg_strength * sum(wts ** 2)`` is added.

    Args:
        wts: Weight matrix, C x D (classes x features).
        datapts: Data matrix, N x D (data points x features).
        y: Integer ground-truth labels, one per data point (N entries).
        bias: Per-class bias with C entries; it is reshaped to a C x 1 column.
        reg_strength: Multiplier of the L2 penalty on ``wts``.

    Returns:
        Tuple ``(loss, dscores)``: the scalar regularised loss and the C x N
        gradient of the mean data loss (without the regularisation term) with
        respect to the scores.
    """
    # TODO: check dim of matrices
    labels = np.ravel(y)
    num_points = datapts.shape[0]
    sample_idx = np.arange(num_points)

    # Force the bias into a C x 1 column so it is added per class.
    bias_col = np.reshape(bias, (-1, 1))
    scores = np.dot(wts, datapts.T) + bias_col  # C x N

    # Softmax is unchanged when a constant is subtracted from every score of a
    # sample, so shift each column to have maximum 0: exp() can then never
    # overflow, however large the raw scores are.
    shifted = scores - np.max(scores, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    exp_sum = np.sum(exp_shifted, axis=0, keepdims=True)  # 1 x N, each >= 1

    # Work with log-probabilities directly instead of log(prob), which would
    # hit log(0) when a probability underflows.
    log_probs = shifted - np.log(exp_sum)
    loss = -np.sum(log_probs[labels, sample_idx]) / labels.size

    # Regularize
    loss_reg = loss + 0.5 * reg_strength * L2_regularize(wts)

    # Gradient of the mean cross-entropy w.r.t. the scores:
    # (softmax probability - 1 for the true class) / N.
    dscores = exp_shifted / exp_sum
    dscores[labels, sample_idx] -= 1
    dscores /= num_points

    return loss_reg, dscores

In [225]:
def test_softmax_loss():
    """Smoke-test ``softmax_loss`` and print the loss and parameter gradients."""
    # Same 1e-3 strength that softmax_loss applies to the loss itself.
    reg_strength = 1e-3

    weights, points, bias_vec, labels = get_sample_data()
    Debug("Weights:", weights)

    Debug("Data:", points)

    loss_value, score_grad = softmax_loss(weights, points, labels, bias_vec)
    Debug("Loss:", loss_value)

    # Back-propagate the score gradient to the weights, then add the gradient
    # of the L2 penalty.
    weight_grad = np.dot(points.T, score_grad.T).T
    weight_grad += reg_strength * weights
    Debug("Weight Gradient: ", weight_grad)

    # Each class bias receives the sum of its score gradients over all samples.
    bias_grad = np.sum(score_grad, axis=1, keepdims=True)
    Debug("Bias Gradient: ", bias_grad)

In [226]:
# Run the softmax loss smoke test; it prints the weights, data, loss and both gradients.
test_softmax_loss()


Weights:

[[ 0.00204381 -0.00669139]
 [ 0.00140105 -0.00211174]
 [ 0.00079769  0.0098836 ]]

Data:

[[ -0.00000000e+00   0.00000000e+00]
 [ -3.58925258e-04   1.00946311e-02]
 [ -4.94479282e-03   1.95875125e-02]
 [ -2.50128445e-03   3.01996229e-02]
 [  3.95655219e-03   4.02098517e-02]
 [  1.51724491e-02   4.81721591e-02]
 [  3.96337268e-02   4.58504339e-02]
 [  4.37615667e-02   5.55375110e-02]
 [  1.16498273e-02   7.99639134e-02]
 [  3.08835291e-02   8.55024587e-02]
 [  6.29500679e-02   7.89957559e-02]
 [  6.60109679e-02   8.93769049e-02]
 [  6.30896986e-02   1.03499122e-01]
 [  4.65796393e-02   1.22774084e-01]
 [  8.84651311e-02   1.10326243e-01]
 [  6.61566683e-02   1.36308974e-01]
 [  8.61157027e-02   1.36762091e-01]
 [  1.52678842e-01   7.85872654e-02]
 [  9.75379672e-02   1.53441182e-01]
 [  1.09484655e-01   1.57626415e-01]
 [  8.23999366e-02   1.84451653e-01]
 [  1.36724761e-01   1.62178137e-01]
 [  2.05848642e-01   8.37200848e-02]
 [  1.47760951e-01   1.79278514e-01]
 [  2.27962

In [241]:
def softmax_classifier(data, labels, step_size=1e-0, num_iters=200):
    """Train a linear softmax classifier with full-batch gradient descent.

    Args:
        data: Data matrix, N x D (data points x features).
        labels: Integer class labels in ``0 .. C-1``, one per data point.
        step_size: Gradient-descent learning rate.
        num_iters: Number of gradient-descent iterations.

    Returns:
        Tuple ``(W, bias)``: the trained C x D weight matrix and C x 1 bias
        column, where C is ``max(labels) + 1``.
    """
    # Same 1e-3 strength that softmax_loss applies to the loss itself.
    reg_strength = 1e-3

    # Size the parameters from the actual inputs instead of borrowing the
    # fixed 3 x 2 initialisation of get_sample_data().
    num_classes = int(np.max(labels)) + 1
    W = 0.01 * np.random.randn(num_classes, data.shape[1])
    bias = np.zeros((num_classes, 1))

    # range() and print(...) keep this runnable on Python 2 and Python 3.
    for i in range(num_iters):
        loss, dlossscores = softmax_loss(W, data, labels, bias)

        # Back-propagate the score gradient to the weights (C x N . N x D),
        # then add the gradient of the L2 penalty.
        dW = np.dot(dlossscores, data) + reg_strength * W

        # Each class bias receives the sum of its score gradients.
        dB = np.sum(dlossscores, axis=1, keepdims=True)

        W -= step_size * dW
        bias -= step_size * dB

        if i % 10 == 0:
            print("iteration %d: loss %f" % (i, loss))
    return W, bias

In [259]:
def test_softmax_classifier():
    """Train the softmax classifier on the sample data and print its training accuracy."""
    # Only the data points and labels are needed; the sample weights and bias
    # returned alongside them are discarded.
    _, data, _, labels = get_sample_data()

    W, bias = softmax_classifier(data, labels)

    # evaluate training set accuracy
    scores = np.dot(data, W.T) + bias.T  # N x C

    predicted_class = np.argmax(scores, axis=1)
    # print(...) with one argument behaves identically on Python 2 and 3.
    print('training accuracy: %.2f' % (np.mean(predicted_class == labels)))
    

In [257]:
# Train on the spiral data and report the loss every 10 iterations plus the final training accuracy.
test_softmax_classifier()

iteration 0: loss 1.098253
iteration 10: loss 0.900055
iteration 20: loss 0.825946
iteration 30: loss 0.791960
iteration 40: loss 0.774068
iteration 50: loss 0.763763
iteration 60: loss 0.757453
iteration 70: loss 0.753417
iteration 80: loss 0.750751
iteration 90: loss 0.748946
iteration 100: loss 0.747700
iteration 110: loss 0.746826
iteration 120: loss 0.746205
iteration 130: loss 0.745760
iteration 140: loss 0.745438
iteration 150: loss 0.745203
iteration 160: loss 0.745031
iteration 170: loss 0.744904
iteration 180: loss 0.744810
iteration 190: loss 0.744741
(3, 2)
(3, 1)
(300, 3)
training accuracy: 0.52
