## logstic

### y = {0,1}

In [1]:
from __future__ import division, print_function

import numpy as np


#######################################################################
# Replace TODO with your code
#######################################################################
def softmax(x):
    # avoid overflow

    
    if len(x.shape) ==1:
        c = np.max(x,0)
        exp_x = np.exp(x - c)
        sum_exp_x = np.sum(exp_x,0)
        y = exp_x / sum_exp_x
    else:
        c = np.transpose(np.tile(np.max(x,1),(x.shape[1],1)))
        exp_x = np.exp(x - c)
        sum_exp_x = np.sum(exp_x,1)
        y = np.transpose(np.divide(exp_x.T,sum_exp_x))

    return y 

def underlog(x):
    x = x - np.max(x)
    y = x - np.log(np.sum(np.exp(x)))
    
    return y

def binary_train(X, y, w0=None, b0=None, step_size=0.5, max_iterations=1000):
    """
    Inputs:
    - X: training features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    - y: binary training labels, a N dimensional numpy array where 
    N is the number of training points, indicating the labels of 
    training data
    - step_size: step size (learning rate)
    - max_iterations: number of iterations to perform gradient descent

    Returns:
    - w: D-dimensional vector, a numpy array which is the weight 
    vector of logistic regression
    - b: scalar, which is the bias of logistic regression

    Find the optimal parameters w and b for inputs X and y.
    Use the *average* of the gradients for all training examples
    multiplied by the step_size to update parameters.
    """
    N, D = X.shape
    assert len(np.unique(y)) == 2


    w = np.zeros(D)
    if w0 is not None:
        w = w0
    
    b = 0
    if b0 is not None:
        b = b0


    """
    TODO: add your code here
    """
    ## w = w - step * sum(σ(wxn+b)-yn)xn
    ## b = b - step * sum(σ(wxn+b)-yn)
    ## wtx + b = np.sum(X_train * w.T,1) +b
    # x * np.tile(dfn, (D,1)).T
    for i in range(max_iterations):
        z = (np.sum(X * w.T,1) +b)
        sgm = sigmoid(z) - y 
        b = b - step_size * np.sum(sgm,0)
        w = w - step_size * np.sum(X * np.tile(sgm, (D,1)).T, 0)
        
    assert w.shape == (D,)
    return w, b


def binary_predict(X, w, b):
    """
    Inputs:
    - X: testing features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    
    Returns:
    - preds: N dimensional vector of binary predictions: {0, 1}
    """
    N, D = X.shape
    preds = np.zeros(N) 


    """
    TODO: add your code here
    """
    y = np.sum(X * w, 1) + b
    preds = np.round(sigmoid(y))
    
    assert preds.shape == (N,) 
    return preds


def multinomial_train(X, y, C, 
                     w0=None, 
                     b0=None, 
                     step_size=0.5, 
                     max_iterations=1000):
    """
    Inputs:
    - X: training features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    - y: multiclass training labels, a N dimensional numpy array where
    N is the number of training points, indicating the labels of 
    training data
    - C: number of classes in the data
    - step_size: step size (learning rate)
    - max_iterations: number of iterations to perform gradient descent

    Returns:
    - w: C-by-D weight matrix of multinomial logistic regression, where 
    C is the number of classes and D is the dimensionality of features.
    - b: bias vector of length C, where C is the number of classes

    Implement multinomial logistic regression for multiclass 
    classification. Again use the *average* of the gradients for all training 
    examples multiplied by the step_size to update parameters.
    
    You may find it useful to use a special (one-hot) representation of the labels, 
    where each label y_i is represented as a row of zeros with a single 1 in
    the column, that corresponds to the class y_i.
    """

    N, D = X.shape

    w = np.zeros((C, D))
    if w0 is not None:
        w = w0
    
    b = np.zeros(C)
    if b0 is not None:
        b = b0


    """
    TODO: add your code here
    """
    ## w = w - step * sum(σ(wxn+b)-yn)xn
    ## b = b - step * sum(σ(wxn+b)-yn)
    ## wtx + b = np.sum(X_train * w.T,1) +b
    # x * np.tile(dfn, (D,1)).T
    for i in range(max_iterations):
        z = (np.sum(X * w.T,1) +b)
        sgm = sigmoid(z) - y 
        b = b - step_size * np.sum(sgm,0)
        w = w - step_size * np.sum(X * np.tile(sgm, (D,1)).T, 0)
        
    assert w.shape == (C, D)
    assert b.shape == (C,)
    return w, b


def multinomial_predict(X, w, b):
    """
    Inputs:
    - X: testing features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    - w: weights of the trained multinomial classifier
    - b: bias terms of the trained multinomial classifier
    
    Returns:
    - preds: N dimensional vector of multiclass predictions.
    Outputted predictions should be from {0, C - 1}, where
    C is the number of classes

    Make predictions for multinomial classifier.
    """
    N, D = X.shape
    C = w.shape[0]
    preds = np.zeros(N) 

    """
    TODO: add your code here
    """   

    assert preds.shape == (N,)
    return preds


def OVR_train(X, y, C, w0=None, b0=None, step_size=0.5, max_iterations=1000):
    """
    Inputs:
    - X: training features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    - y: multiclass training labels, a N dimensional numpy array, 
    indicating the labels of each training point
    - C: number of classes in the data
    - w0: initial value of weight matrix
    - b0: initial value of bias term
    - step_size: step size (learning rate)
    - max_iterations: number of iterations to perform gradient descent

    Returns:
    - w: a C-by-D weight matrix of OVR logistic regression
    - b: bias vector of length C

    Implement multiclass classification using one-versus-rest with binary logistic 
    regression as the black-box. Recall that the one-versus-rest classifier is 
    trained by training C different classifiers. 
    """
    N, D = X.shape
    
    w = np.zeros((C, D))
    if w0 is not None:
        w = w0
    
    b = np.zeros(C)
    if b0 is not None:
        b = b0

    """
    TODO: add your code here
    """
    for c in range(C):
        yc = (y==c).astype(int)
        w[c,:], b[c] = binary_train(X, yc, w0=w[c,:], b0=b[c], step_size=0.5, max_iterations=1000)
    
    
    assert w.shape == (C, D), 'wrong shape of weights matrix'
    assert b.shape == (C,), 'wrong shape of bias terms vector'
    return w, b


def OVR_predict(X, w, b):
    """
    Inputs:
    - X: testing features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    - w: weights of the trained OVR model
    - b: bias terms of the trained OVR model
    
    Returns:
    - preds: vector of class label predictions.
    Outputted predictions should be from {0, C - 1}, where
    C is the number of classes.

    Make predictions using OVR strategy and probability predictions from binary
    classifiers. 
    """
    N, D = X.shape
    C = w.shape[0]
    preds = np.zeros(N) 
    
    """
    TODO: add your code here
    """
    yc = np.dot(X, w.T) +  np.tile(b,(N,1))
    preds = np.argmax(yc, axis = 1)
    
    assert preds.shape == (N,)
    return preds


#######################################################################
# DO NOT MODIFY THE CODE BELOW 
#######################################################################

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def accuracy_score(true, preds):
    return np.sum(true == preds).astype(float) / len(true)

def run_binary():
    from data_loader import toy_data_binary, \
                            data_loader_mnist 

    print('Performing binary classification on synthetic data')
    X_train, X_test, y_train, y_test = toy_data_binary()
        
    w, b = binary_train(X_train, y_train)
    
    train_preds = binary_predict(X_train, w, b)
    preds = binary_predict(X_test, w, b)
    print('train acc: %f, test acc: %f' % 
            (accuracy_score(y_train, train_preds),
             accuracy_score(y_test, preds)))
    
    print('Performing binary classification on binarized MNIST')
    X_train, X_test, y_train, y_test = data_loader_mnist()

    binarized_y_train = [0 if yi < 5 else 1 for yi in y_train] 
    binarized_y_test = [0 if yi < 5 else 1 for yi in y_test] 
    
    w, b = binary_train(X_train, binarized_y_train)
    
    train_preds = binary_predict(X_train, w, b)
    preds = binary_predict(X_test, w, b)
    print('train acc: %f, test acc: %f' % 
            (accuracy_score(binarized_y_train, train_preds),
             accuracy_score(binarized_y_test, preds)))

def run_multiclass():
    from data_loader import toy_data_multiclass_3_classes_non_separable, \
                            toy_data_multiclass_5_classes, \
                            data_loader_mnist 
    
    datasets = [(toy_data_multiclass_3_classes_non_separable(), 
                        'Synthetic data', 3), 
                (toy_data_multiclass_5_classes(), 'Synthetic data', 5), 
                (data_loader_mnist(), 'MNIST', 10)]

    for data, name, num_classes in datasets:
        print('%s: %d class classification' % (name, num_classes))
        X_train, X_test, y_train, y_test = data
        
        print('One-versus-rest:')
        w, b = OVR_train(X_train, y_train, C=num_classes)
        train_preds = OVR_predict(X_train, w=w, b=b)
        preds = OVR_predict(X_test, w=w, b=b)
        print('train acc: %f, test acc: %f' % 
            (accuracy_score(y_train, train_preds),
             accuracy_score(y_test, preds)))
    
        print('Multinomial:')
        w, b = multinomial_train(X_train, y_train, C=num_classes)
        train_preds = multinomial_predict(X_train, w=w, b=b)
        preds = multinomial_predict(X_test, w=w, b=b)
        print('train acc: %f, test acc: %f' % 
            (accuracy_score(y_train, train_preds),
             accuracy_score(y_test, preds)))

"""
if __name__ == '__main__':
    
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--type", )
    parser.add_argument("--output")
    args = parser.parse_args()

    if args.output:
            sys.stdout = open(args.output, 'w')

    if not args.type or args.type == 'binary':
        run_binary()

    if not args.type or args.type == 'multiclass':
        run_multiclass()
"""

'\nif __name__ == \'__main__\':\n    \n    import argparse\n    import sys\n\n    parser = argparse.ArgumentParser()\n    parser.add_argument("--type", )\n    parser.add_argument("--output")\n    args = parser.parse_args()\n\n    if args.output:\n            sys.stdout = open(args.output, \'w\')\n\n    if not args.type or args.type == \'binary\':\n        run_binary()\n\n    if not args.type or args.type == \'multiclass\':\n        run_multiclass()\n'

In [2]:
from __future__ import division, print_function
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def accuracy_score(true, preds):
    return np.sum(true == preds).astype(float) / len(true)

def run_binary():
    from data_loader import toy_data_binary, \
                            data_loader_mnist 

    print('Performing binary classification on synthetic data')
    X_train, X_test, y_train, y_test = toy_data_binary()
        
    w, b = binary_train(X_train, y_train)
    
    train_preds = binary_predict(X_train, w, b)
    preds = binary_predict(X_test, w, b)
    print('train acc: %f, test acc: %f' % 
            (accuracy_score(y_train, train_preds),
             accuracy_score(y_test, preds)))
    
    print('Performing binary classification on binarized MNIST')
    X_train, X_test, y_train, y_test = data_loader_mnist()

    binarized_y_train = [0 if yi < 5 else 1 for yi in y_train] 
    binarized_y_test = [0 if yi < 5 else 1 for yi in y_test] 
    
    w, b = binary_train(X_train, binarized_y_train)
    
    train_preds = binary_predict(X_train, w, b)
    preds = binary_predict(X_test, w, b)
    print('train acc: %f, test acc: %f' % 
            (accuracy_score(binarized_y_train, train_preds),
             accuracy_score(binarized_y_test, preds)))

def run_multiclass():
    from data_loader import toy_data_multiclass_3_classes_non_separable, \
                            toy_data_multiclass_5_classes, \
                            data_loader_mnist 
    
    datasets = [(toy_data_multiclass_3_classes_non_separable(), 
                        'Synthetic data', 3), 
                (toy_data_multiclass_5_classes(), 'Synthetic data', 5), 
                (data_loader_mnist(), 'MNIST', 10)]

    for data, name, num_classes in datasets:
        print('%s: %d class classification' % (name, num_classes))
        X_train, X_test, y_train, y_test = data
        
        print('One-versus-rest:')
        w, b = OVR_train(X_train, y_train, C=num_classes)
        train_preds = OVR_predict(X_train, w=w, b=b)
        preds = OVR_predict(X_test, w=w, b=b)
        print('train acc: %f, test acc: %f' % 
            (accuracy_score(y_train, train_preds),
             accuracy_score(y_test, preds)))
    
        print('Multinomial:')
        w, b = multinomial_train(X_train, y_train, C=num_classes)
        train_preds = multinomial_predict(X_train, w=w, b=b)
        preds = multinomial_predict(X_test, w=w, b=b)
        print('train acc: %f, test acc: %f' % 
            (accuracy_score(y_train, train_preds),
             accuracy_score(y_test, preds)))

In [3]:
from data_loader import toy_data_binary, \
                            data_loader_mnist 

print('Performing binary classification on synthetic data')
X_train, X_test, y_train, y_test = toy_data_binary()
N, D = X_train.shape

Performing binary classification on synthetic data




In [4]:
y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1])

In [5]:
-1 < 0

True

In [6]:
a= False
1 if a else 0

0

In [7]:
y = -1 <= 0
0 if y else 1

0

In [8]:
def y0ne(a):
    return -1 if a == 0 else 1
y0ne(1)

1

In [9]:
y = np.array([1, 0, 1])
y1 = 2*y - np.ones(len(y)).astype(int)

In [10]:
y1

array([ 1, -1,  1])

In [11]:
(y1 +1)/2

array([1., 0., 1.])

In [12]:
y = np.array([1, 0, 1])
def tobinary(y):
    return (2*y - 1).astype(int)
y1 = tobinary(y)

In [13]:
y1

array([ 1, -1,  1])

In [14]:
type(y_test[1])

numpy.int64

In [15]:
type(y1[0])

numpy.int64

In [16]:
X_train

array([[ 1.09135723, -0.55892185],
       [-0.9425491 , -1.12970685],
       [ 1.0654138 , -0.24751864],
       [ 0.54117719,  1.14927333],
       [ 0.91515201,  1.04416088],
       [-0.94611747, -0.27272357],
       [ 1.05845318, -0.04946371],
       [ 0.88265028,  0.09612078],
       [ 0.75749448,  1.20650897],
       [ 1.054367  ,  0.65020118],
       [-0.48477958, -0.92323325],
       [-0.93222036,  0.05921843],
       [-1.19845466,  2.5733598 ],
       [ 0.80463752, -0.56407863],
       [ 0.88558004, -0.53086877],
       [ 0.88289999, -1.51574411],
       [-0.91447135,  0.01392929],
       [ 0.1243599 , -0.09671311],
       [ 0.78492314,  0.4933179 ],
       [ 1.09711512, -0.06575026],
       [ 1.23054129,  0.24380071],
       [ 1.03449554,  0.38240975],
       [-0.89035836, -0.90431663],
       [ 1.30998108,  2.52693243],
       [-0.80990157,  1.10330188],
       [-1.28907266,  0.05572491],
       [ 1.12140739,  1.42050425],
       [-1.06521415, -0.2403254 ],
       [ 0.89253975,

In [17]:
w = np.array([1, 0])
b = 1
np.sum(X_train * w.T,1) +1

array([ 2.09135723e+00,  5.74508969e-02,  2.06541380e+00,  1.54117719e+00,
        1.91515201e+00,  5.38825266e-02,  2.05845318e+00,  1.88265028e+00,
        1.75749448e+00,  2.05436700e+00,  5.15220425e-01,  6.77796376e-02,
       -1.98454663e-01,  1.80463752e+00,  1.88558004e+00,  1.88289999e+00,
        8.55286536e-02,  1.12435990e+00,  1.78492314e+00,  2.09711512e+00,
        2.23054129e+00,  2.03449554e+00,  1.09641637e-01,  2.30998108e+00,
        1.90098429e-01, -2.89072656e-01,  2.12140739e+00, -6.52141464e-02,
        1.89253975e+00, -3.41060551e-01,  6.25676814e-01,  2.19439988e-01,
        1.84887127e+00,  8.80019126e-02, -4.42084656e-01,  2.12480193e+00,
        2.05053790e+00,  1.50305051e-01,  2.08161545e+00,  2.23107861e+00,
        2.01048861e+00, -2.59590983e-01,  1.67414784e+00,  2.00144856e+00,
       -3.22866695e-01, -5.91006439e-01, -1.38605419e-01,  2.19164958e+00,
        2.39905502e-01,  1.97070587e-02,  1.90094923e+00,  2.30479721e+00,
        2.00955769e+00,  

In [18]:
sigmoid(y1)

array([0.73105858, 0.26894142, 0.73105858])

In [19]:
a = np.array([[1, 2],[3,4], [-1,-2]])

In [20]:
a

array([[ 1,  2],
       [ 3,  4],
       [-1, -2]])

In [21]:
b = np.array([2,1,0])

In [22]:
b1 = np.tile(b, (D,1))

In [23]:
a * b1.T

array([[2, 4],
       [3, 4],
       [0, 0]])

In [24]:
a * np.tile(b,(D,1)).T

array([[2, 4],
       [3, 4],
       [0, 0]])

In [25]:
X=X_train
y=y_train

step_size=0.5
max_iterations=1000

y = tobinary(y)
N, D = X.shape


w = np.zeros(D)
#w = np.ones(D)

    
b = 0


    ## w = w + step * sum(σ(-yn(wtx + b)) ynxn)
    ## b = b + step * sum(σ(-yn(wtx + b)) yn)
    ## wtx + b = np.sum(X_train * w.T,1) +b
    # x * np.tile(dfn, (D,1)).T
F = 0.0
for i in range(max_iterations):
    z = y * (np.sum(X * w.T,1) +b)
    sgm = sigmoid(-z) * y
    b1 = b + step_size * np.sum(sgm,0) /N
    w = w + step_size * np.sum(X * np.tile(sgm, (D,1)).T, 0)/N
    b = b1
    fz = np.multiply((np.sum(X * w, 1) + b),y)
    F = np.sum(np.log(1 + np.exp(-z)))
    if i%100 ==0:
        print(F)

242.60151319598086
8.10702163578771
7.917556825309926
7.882984841067894
7.873871215143685
7.871115948987946
7.870226378119814
7.86992898218911
7.869827610634751
7.869792671524879
7.869780551624985
7.869776331567634
7.869774858927504
7.869774344362835
7.869774164427351
7.86977410147808
7.869774079449774
7.869774071740022
7.869774069041412
7.869774068096775
7.869774067766097
7.869774067650339
7.869774067609819
7.869774067595633
7.869774067590669
7.869774067588928
7.869774067588321
7.869774067588107
7.869774067588033
7.869774067588007
7.869774067588001
7.8697740675879935
7.869774067587992
7.869774067587995
7.869774067587993
7.869774067587992
7.869774067587995
7.869774067587994
7.869774067587993
7.86977406758799
7.869774067587991
7.86977406758799
7.869774067587992
7.869774067587992
7.869774067587989
7.869774067587994
7.869774067587993
7.869774067587992
7.869774067587992
7.8697740675879935
7.86977406758799
7.869774067587992
7.8697740675879935
7.869774067587993
7.869774067587993
7.8697740675

In [26]:
X=X_train
y=y_train

step_size=0.5
max_iterations=1000

#y = tobinary(y)
N, D = X.shape
assert len(np.unique(y)) == 2


w = np.zeros(D)
#w = np.ones(D)

    
b = 0


    ## w = w - step * sum(σ(wxn+b)-yn)xn
    ## b = b - step * sum(σ(wxn+b)-yn)
    ## wtx + b = np.sum(X_train * w.T,1) +b
    # x * np.tile(dfn, (D,1)).T
for i in range(max_iterations):
    z = (np.sum(X * w.T,1) +b)
    sgm = sigmoid(z) - y 
    b = b - step_size * np.sum(sgm,0)
    w = w - step_size * np.sum(X * np.tile(sgm, (D,1)).T, 0)
    print(b)

2.5
2.5000021392477048
2.500004504155641
2.5000071185186927
2.5000100086380384
2.500013203583367
2.5000167354811595
2.5000206398303564
2.500024955845572
2.5000297268254825
2.5000350005388587
2.5000408296103975
2.500047271868196
2.500054390574977
2.500062254387789
2.500070936740553
2.5000805140521734
2.500091061597886
2.5001026447883934
2.500115301490823
2.5001290069647046
2.5001436052194936
2.5001586759085663
2.5001732786526274
2.5001854682626146
2.5001913954921253
2.50018370707216
2.500148942814878
2.5000642169894483
2.499896752914183
2.4996235364166863
2.499339598202104
2.4997007230836172
2.5034786947861125
2.5199147624987575
2.570223858070088
2.672352691198323
2.8080527701342
2.948487989256663
3.08026034125395
3.1987197288274936
3.3021382233694467
3.3900124263208924
3.4626358153634564
3.5209040820203965
3.5661275858164014
3.599841803554935
3.6236438363967585
3.639072800746972
3.647535969632872
3.6502727086186373
3.648345136582887
3.642645353318086
3.6339115143250185
3.62274755486907

-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117836
-0.19508823411178372
-0.1950882341117

In [27]:
w, b = binary_train(X_train, y_train, w0=None, b0=None, step_size=0.5, max_iterations=1000)

In [28]:
y_p =binary_predict(X_test, w, b)

In [29]:
np.sum(X_test * w,1)

array([ -8.16336951,  -2.43560776,  -9.00648278,  -7.19670806,
        -7.34618021,  -6.34837233,  -8.35634149,  -9.07129849,
         6.37162733,  -3.94962175,  -8.38959261,   7.27266234,
         8.57298293,   6.33071848,  -6.2881268 ,  -6.78585889,
        -9.90584665,   6.65743033,  -6.56358337,  -5.76214388,
         7.14766421,  -6.33360276,   8.11429372,  -4.04207364,
        -6.73138658,  10.47913348,   7.87770012,   8.62491101,
        -5.25908636,  -3.37177141,   7.13369534,  -5.66236834,
         7.72645908,   6.81120575,  -9.00898401,  -9.21085196,
         7.40017326,   5.34387566,  -8.20556839,   6.25309791,
         7.23351916,  -8.9771956 ,   6.05891788,   8.30907799,
         5.38920126,   9.38889139,  -7.12630952,   7.72269236,
         5.07023082,  -5.92488581,   5.7861725 ,   8.84634521,
        -6.61700086,  -8.7377394 , -11.36714803,   6.76365358,
         9.47589489,  -6.36526793,  -4.12426397,   8.01524905,
        -5.62659769,   9.05046899,   9.03548859,   9.60

In [30]:
y_p

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1.,
       1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1.,
       0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1.,
       0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1.])

In [31]:
y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1])

In [32]:
y_p == y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [33]:
sgm

array([-1.13593744e-03,  3.74052234e-04, -9.84326236e-04, -8.63961910e-03,
       -7.29656427e-04,  8.87155902e-04, -8.41227927e-04, -2.43841983e-03,
       -1.83413579e-03, -4.19010682e-04,  1.08993606e-02,  1.37746561e-03,
        2.95496794e-03, -8.25005760e-03, -4.56870008e-03, -1.28191414e-02,
        1.48589290e-03,  6.37695103e-01, -3.17427763e-03, -6.54805578e-04,
       -1.88712378e-04, -6.34583271e-04,  6.77970110e-04, -1.02017099e-05,
        9.40935760e-03,  1.16238958e-04, -1.18538090e-04,  4.02529236e-04,
       -1.37441180e-03,  9.65670497e-05,  7.44362626e-02,  6.00687953e-03,
       -1.74380222e-03,  3.15742203e-03,  1.26618028e-04, -4.18222036e-03,
       -8.79482720e-04,  2.64962322e-04, -3.50989919e-03, -4.63758644e-04,
       -5.34415194e-04,  9.43026174e-05, -2.20054059e-02, -6.01207747e-03,
        3.01756548e-04,  7.63558985e-05,  1.64849052e-04, -7.41761749e-04,
        2.73025965e-04,  4.21225742e-04, -2.60462897e-03, -2.48615560e-04,
       -3.24301953e-03,  

In [34]:
x = np.array([[2,1,0],[1,1,1],[3,0,1]])

In [35]:
w = np.array([1,-1,2])

In [36]:
np.dot(x,w)

array([1, 2, 5])

In [37]:
C = 3
for c in range(C):
    print(c)

0
1
2


### confrim OVA

In [38]:
from data_loader import toy_data_multiclass_3_classes_non_separable, \
                        toy_data_multiclass_5_classes, \
                        data_loader_mnist 
    
datasets = [(toy_data_multiclass_3_classes_non_separable(), 
                    'Synthetic data', 3), 
            (toy_data_multiclass_5_classes(), 'Synthetic data', 5)]#, 
            #(data_loader_mnist(), 'MNIST', 10)]

for data, name, num_classes in datasets:
    print('%s: %d class classification' % (name, num_classes))
    X_train, X_test, y_train, y_test = data
        
    print('One-versus-rest:')
    w, b = OVR_train(X_train, y_train, C=num_classes)
    train_preds = OVR_predict(X_train, w=w, b=b)
    preds = OVR_predict(X_test, w=w, b=b)
    print('train acc: %f, test acc: %f' % 
          (accuracy_score(y_train, train_preds),
         accuracy_score(y_test, preds)))

Synthetic data: 3 class classification
One-versus-rest:
train acc: 0.871429, test acc: 0.846667
Synthetic data: 5 class classification
One-versus-rest:




train acc: 0.697143, test acc: 0.666667


In [39]:
w

array([[ -36.0448073 ,  179.81691404],
       [  97.46514605,  -35.74131472],
       [ -19.25540391,  -70.9456114 ],
       [-107.57775957,   17.83314932],
       [ 158.69822292,   15.87422578]])

In [40]:
yc = np.dot(X_test, w.T) +  np.tile(b,(X_test.shape[0],1))

In [41]:
yc.shape

(150, 5)

In [42]:
X_test.shape

(150, 2)

In [43]:
yc

array([[-1667.90678351,  -207.55495101,    89.02624715,  -905.48424371,
         -111.56142864],
       [-1232.81846316,   328.71558557,  -313.50386617, -1579.58280689,
         1043.24118639],
       [-1358.27079172,   -67.88168684,  -107.73505264, -1106.52761372,
          276.49282957],
       [ -758.63883269,    -2.36568691,  -412.78944643, -1259.78854615,
          660.5396405 ],
       [-1322.11613411,   269.67705799,  -249.80495537, -1499.99913883,
          897.70207295],
       [  395.66096259, -1252.60414813,  -489.77430929,    30.39016806,
        -1067.53380166],
       [-1719.00060712,  -394.90505141,   182.40514927,  -683.07584862,
         -470.13662363],
       [-1531.63908901,  -378.56923612,    88.62067255,  -726.2010141 ,
         -357.55042235],
       [ -857.45787604,  -192.81779731,  -295.91414619, -1027.61499527,
          275.18480448],
       [ -486.22290055, -1112.05478262,  -128.95435026,   -17.06016775,
        -1207.66112486],
       [-2133.02607273,  -607.

In [44]:
 np.argmax(yc, axis = 1)

array([2, 4, 4, 4, 4, 0, 2, 2, 4, 3, 2, 3, 3, 2, 4, 4, 4, 4, 2, 0, 0, 3,
       0, 4, 4, 0, 3, 2, 4, 4, 4, 4, 4, 0, 2, 4, 2, 3, 0, 0, 4, 4, 2, 0,
       0, 4, 3, 0, 2, 4, 4, 0, 4, 4, 4, 2, 2, 2, 4, 0, 3, 4, 4, 0, 4, 0,
       2, 2, 2, 2, 4, 0, 4, 4, 4, 4, 3, 4, 0, 3, 2, 0, 3, 4, 0, 2, 3, 3,
       0, 3, 2, 3, 3, 4, 4, 0, 4, 2, 4, 0, 2, 4, 4, 4, 0, 4, 0, 0, 3, 2,
       0, 2, 2, 2, 0, 4, 0, 4, 2, 4, 3, 0, 4, 2, 0, 4, 3, 4, 4, 2, 2, 2,
       4, 4, 3, 3, 4, 0, 2, 0, 4, 4, 4, 2, 2, 4, 4, 3, 4, 0])

## multiclass

In [131]:
def softmax(x):
    # avoid overflow

    
    if len(x.shape) ==1:
        c = np.max(x,0)
        exp_x = np.exp(x - c)
        sum_exp_x = np.sum(exp_x,0)
        y = exp_x / sum_exp_x
    else:
        c = np.transpose(np.tile(np.max(x,1),(x.shape[1],1)))
        exp_x = np.exp(x - c)
        sum_exp_x = np.sum(exp_x,1)
        y = np.transpose(np.divide(exp_x.T,sum_exp_x))
    return y 

In [132]:
len(a.shape)

2

In [133]:
a = np.array([2,4,8])
softmax(a)

array([0.00242826, 0.01794253, 0.97962921])

In [134]:
a = np.array([[2,4,8],[1,1,1]])
softmax(a)

array([[0.00242826, 0.01794253, 0.97962921],
       [0.33333333, 0.33333333, 0.33333333]])

In [90]:
b = np.array([[2,4,8],[1,1,1]])
c = np.array([2,2])
np.transpose(np.divide(b.T,c))

array([[1. , 2. , 4. ],
       [0.5, 0.5, 0.5]])

In [129]:
        x = a
        c = np.transpose(np.tile(np.max(x,1),(x.shape[1],1)))
        exp_x = np.exp(x - c)
        sum_exp_x = np.sum(exp_x,1)
        y = np.transpose(np.divide(exp_x.T,sum_exp_x))

In [130]:
y

array([[0.00242826, 0.01794253, 0.97962921],
       [0.33333333, 0.33333333, 0.33333333]])

In [118]:
np.subtract(a,np.max(x,1).T)

ValueError: operands could not be broadcast together with shapes (2,3) (2,) 

In [46]:
underlog(a)

array([-1.55144471, -1.55144471, -0.55144471])

In [47]:
x = a
x = x - np.max(x)
y = x - np.log(np.sum(np.exp(x)))

In [48]:
y

array([-1.55144471, -1.55144471, -0.55144471])

In [55]:
np.log(softmax(a))

array([-1.55144471, -1.55144471, -0.55144471])

In [56]:
from data_loader import toy_data_multiclass_3_classes_non_separable, \
                        toy_data_multiclass_5_classes, \
                        data_loader_mnist 


X_train, X_test, y_train, y_test = toy_data_multiclass_5_classes()
    




In [57]:
y_train

array([3, 1, 2, 0, 4, 3, 0, 3, 3, 1, 4, 4, 3, 1, 2, 0, 4, 4, 4, 2, 2, 0,
       2, 2, 3, 4, 4, 0, 0, 0, 4, 3, 0, 2, 0, 4, 4, 0, 0, 0, 4, 0, 0, 0,
       2, 2, 2, 2, 1, 3, 3, 1, 3, 2, 3, 2, 0, 0, 3, 4, 4, 0, 3, 1, 1, 0,
       1, 4, 3, 0, 0, 1, 3, 2, 2, 3, 1, 3, 2, 1, 2, 1, 3, 4, 4, 2, 4, 4,
       1, 3, 3, 2, 2, 3, 0, 4, 1, 1, 3, 0, 4, 4, 1, 0, 2, 3, 2, 4, 4, 1,
       1, 4, 4, 4, 4, 3, 3, 4, 2, 1, 3, 4, 0, 4, 2, 2, 1, 2, 3, 2, 2, 0,
       3, 1, 3, 0, 1, 3, 0, 0, 3, 2, 3, 4, 1, 4, 1, 2, 3, 2, 4, 0, 2, 2,
       0, 0, 3, 1, 1, 2, 1, 1, 0, 1, 3, 2, 0, 4, 3, 4, 3, 3, 1, 1, 2, 1,
       0, 1, 0, 2, 4, 3, 2, 0, 4, 0, 2, 2, 3, 4, 0, 0, 0, 3, 4, 2, 4, 0,
       4, 3, 0, 1, 2, 0, 1, 3, 3, 3, 4, 1, 1, 1, 2, 4, 4, 3, 4, 1, 0, 0,
       3, 4, 3, 0, 4, 2, 1, 0, 3, 2, 4, 0, 2, 1, 4, 3, 3, 0, 0, 1, 2, 2,
       1, 3, 0, 0, 0, 2, 1, 1, 1, 1, 3, 0, 0, 2, 1, 2, 0, 4, 0, 4, 0, 1,
       3, 4, 4, 1, 3, 3, 3, 2, 2, 3, 2, 1, 0, 2, 0, 3, 4, 4, 2, 2, 0, 0,
       0, 2, 4, 4, 2, 2, 3, 1, 1, 1, 4, 2, 3, 1, 4,

In [60]:
    X = X_train
    y = y_train
    C = 5
    w0=None, 
    b0=None, 
    step_size=0.5, 
    max_iterations=1000
"""
    Inputs:
    - X: training features, a N-by-D numpy array, where N is the 
    number of training points and D is the dimensionality of features
    - y: multiclass training labels, a N dimensional numpy array where
    N is the number of training points, indicating the labels of 
    training data
    - C: number of classes in the data
    - step_size: step size (learning rate)
    - max_iterations: number of iterations to perform gradient descent

    Returns:
    - w: C-by-D weight matrix of multinomial logistic regression, where 
    C is the number of classes and D is the dimensionality of features.
    - b: bias vector of length C, where C is the number of classes

    Implement multinomial logistic regression for multiclass 
    classification. Again use the *average* of the gradients for all training 
    examples multiplied by the step_size to update parameters.
    
    You may find it useful to use a special (one-hot) representation of the labels, 
    where each label y_i is represented as a row of zeros with a single 1 in
    the column, that corresponds to the class y_i.
"""

    N, D = X.shape

    w = np.zeros((C, D))
    if w0 is not None:
        w = w0
    
    b = np.zeros(C)
    if b0 is not None:
        b = b0


    """
    TODO: add your code here
    """
    ## w = w - step * sum(σ(wxn+b)-yn)xn
    ## b = b - step * sum(σ(wxn+b)-yn)
    ## wtx + b = np.sum(X_train * w.T,1) +b
    # x * np.tile(dfn, (D,1)).T
    
    for c in range(C):
        yc[:,c] = (y==c)
    
    for i in range(max_iterations):
        
        b = b - step_size * np.sum(sgm,0)
        

AttributeError: 'tuple' object has no attribute 'T'

In [61]:
y

array([3, 1, 2, 0, 4, 3, 0, 3, 3, 1, 4, 4, 3, 1, 2, 0, 4, 4, 4, 2, 2, 0,
       2, 2, 3, 4, 4, 0, 0, 0, 4, 3, 0, 2, 0, 4, 4, 0, 0, 0, 4, 0, 0, 0,
       2, 2, 2, 2, 1, 3, 3, 1, 3, 2, 3, 2, 0, 0, 3, 4, 4, 0, 3, 1, 1, 0,
       1, 4, 3, 0, 0, 1, 3, 2, 2, 3, 1, 3, 2, 1, 2, 1, 3, 4, 4, 2, 4, 4,
       1, 3, 3, 2, 2, 3, 0, 4, 1, 1, 3, 0, 4, 4, 1, 0, 2, 3, 2, 4, 4, 1,
       1, 4, 4, 4, 4, 3, 3, 4, 2, 1, 3, 4, 0, 4, 2, 2, 1, 2, 3, 2, 2, 0,
       3, 1, 3, 0, 1, 3, 0, 0, 3, 2, 3, 4, 1, 4, 1, 2, 3, 2, 4, 0, 2, 2,
       0, 0, 3, 1, 1, 2, 1, 1, 0, 1, 3, 2, 0, 4, 3, 4, 3, 3, 1, 1, 2, 1,
       0, 1, 0, 2, 4, 3, 2, 0, 4, 0, 2, 2, 3, 4, 0, 0, 0, 3, 4, 2, 4, 0,
       4, 3, 0, 1, 2, 0, 1, 3, 3, 3, 4, 1, 1, 1, 2, 4, 4, 3, 4, 1, 0, 0,
       3, 4, 3, 0, 4, 2, 1, 0, 3, 2, 4, 0, 2, 1, 4, 3, 3, 0, 0, 1, 2, 2,
       1, 3, 0, 0, 0, 2, 1, 1, 1, 1, 3, 0, 0, 2, 1, 2, 0, 4, 0, 4, 0, 1,
       3, 4, 4, 1, 3, 3, 3, 2, 2, 3, 2, 1, 0, 2, 0, 3, 4, 4, 2, 2, 0, 0,
       0, 2, 4, 4, 2, 2, 3, 1, 1, 1, 4, 2, 3, 1, 4,

In [65]:
yc = np.zeros((N,C)).astype(int)

In [66]:
yc

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [69]:
yc[:,0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [70]:
for c in range(C):
    yc[:,c] = (y==c)

In [74]:
yc

array([[0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       ...,
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1]])

In [None]:
def softmax(x):
    # avoid overflow
    c = np.max(x,0)
    exp_x = np.exp(x - c)
    sum_exp_x = np.sum(exp_x,0)
    y = exp_x / sum_exp_x

    return y 