# Importing Libraries

In [1]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
np.random.seed(42)

In [2]:
print(load_wine().DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [3]:
wine_data = load_wine()
X = wine_data.data
Y = wine_data.target.reshape((-1,1))

In [4]:
y = np.zeros((X.shape[0],3))
for i in range(len(Y)):
    y[i][Y[i]] = 1


In [5]:
X = X.T
for i in range(len(X)):
    X[i] = X[i]/np.max(X[i])
X = X.T
X.shape

(178, 13)

In [6]:
(X,y) = shuffle(X,y,random_state = 40)

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.20)

In [8]:
def relu(x):
    x[x<0]=0
    return x

def softmax(x):
    return np.exp(x)/(np.sum(np.exp(x),axis=0))

def diff_relu(x):
    y = np.zeros(x.shape)
    y[x<=0] = 0
    y[x>0] = 1
    return y

def weight_init(x,y):
    return np.sqrt(2.0/(x+y))*np.random.normal(0,1,(x,y))

In [9]:
inp_dim = 13
hl1_units = 15
hl2_units = 15
out_dim = 3

W1 = weight_init(hl1_units,inp_dim)
b1 = weight_init(hl1_units,1)
W2 = weight_init(hl2_units,hl1_units)
b2 = weight_init(hl2_units,1)
W3 = weight_init(out_dim,hl2_units)
b3 = weight_init(out_dim,1)

# Fitting training data using Nesterov Momentum approach

In [10]:
epochs = 500
alpha = 0.5
epsilon = 0.01

for i in range(epochs):
    (x_train,y_train) = shuffle(X_train,Y_train,random_state = 40)
    loss = 0
    v = {"W1":np.zeros(W1.shape), "W2": np.zeros(W2.shape), "W3": np.zeros(W3.shape), "b1":np.zeros(b1.shape), "b2":np.zeros(b2.shape), "b3":np.zeros(b3.shape)}
    for j in range(0,len(x_train)):
        W1 += alpha*v["W1"]
        b1 += alpha*v["b1"]
        W2 += alpha*v["W2"]
        b2 += alpha*v["b2"]
        W3 += alpha*v["W3"]
        b3 += alpha*v["b3"]
        
        a1 = relu(np.matmul(W1,x_train[j]).reshape((-1,1)) + b1)
        a2 = relu(np.matmul(W2,a1).reshape((-1,1)) + b2)
        x = np.matmul(W3,a2) + b3
        a3 = softmax(x)
        loss += -np.log(a3[np.argmax(y_train[j])]) 
        
        delta3 = a3 - y_train[j].reshape(-1,1)
        delta2 = np.matmul(W3.T,delta3)*diff_relu(a2)
        delta1 = np.matmul(W2.T,delta2)*diff_relu(a1)
        
        grd_b3 = delta3
        grd_W3 = np.matmul(delta3,a2.T)
        grd_b2 = delta2
        grd_W2 = np.matmul(delta2,a1.T)
        grd_b1 = delta1
        grd_W1 = np.matmul(delta1,x_train[j].reshape((-1,1)).T)
        
        v["W1"] = alpha*v["W1"] - epsilon*grd_W1
        v["b1"] = alpha*v["b1"] - epsilon*grd_b1
        v["W2"] = alpha*v["W2"] - epsilon*grd_W2
        v["b2"] = alpha*v["b2"] - epsilon*grd_b2
        v["W3"] = alpha*v["W3"] - epsilon*grd_W3
        v["b3"] = alpha*v["b3"] - epsilon*grd_b3
        W1 += v["W1"]
        b1 += v["b1"]
        W2 += v["W2"]
        b2 += v["b2"]
        W3 += v["W3"]
        b3 += v["b3"]
        
    if i%10 == 0:
        print (str(i) + ":" + " loss =" + str(loss/len(x_train)))

0: loss =[1.01007249]
10: loss =[0.23823526]
20: loss =[0.14447832]
30: loss =[0.06722767]
40: loss =[0.05404278]
50: loss =[0.05291757]
60: loss =[0.0487949]
70: loss =[0.05589437]
80: loss =[0.00826877]
90: loss =[0.00526162]
100: loss =[0.00335676]
110: loss =[0.0029054]
120: loss =[0.00258988]
130: loss =[0.00223517]
140: loss =[0.00169243]
150: loss =[0.00136697]
160: loss =[0.00074901]
170: loss =[0.00056993]
180: loss =[0.00048276]
190: loss =[0.00042538]
200: loss =[0.000386]
210: loss =[0.00034996]
220: loss =[0.0003236]
230: loss =[0.00029771]
240: loss =[0.00027999]
250: loss =[0.00026095]
260: loss =[0.00024284]
270: loss =[0.00023131]
280: loss =[0.00021978]
290: loss =[0.00020752]
300: loss =[0.0001981]
310: loss =[0.00018808]
320: loss =[0.0001815]
330: loss =[0.00017393]
340: loss =[0.00016613]
350: loss =[0.00015925]
360: loss =[0.00015504]
370: loss =[0.00014792]
380: loss =[0.0001443]
390: loss =[0.00013844]
400: loss =[0.00013359]
410: loss =[0.00013009]
420: loss =

# Measuring accuracy of model

In [11]:
y_predicted = list()
y_actual = list()
for j in range(0,len(X_test)):
    a1 = relu(np.matmul(W1,X_test[j]).reshape((-1,1)) + b1)
    a2 = relu(np.matmul(W2,a1).reshape((-1,1)) + b2)
    x = np.matmul(W3,a2) + b3
    out = softmax(x)
    y_predicted.append(np.argmax(out))
    y_actual.append(np.argwhere(Y_test[j] == 1)[0][0])
print ("actual out:"+str(y_actual))
print ("predic out:"+str(y_predicted))

actual out:[0, 1, 1, 0, 0, 2, 1, 0, 1, 1, 2, 0, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 1, 1, 1, 0, 0, 2]
predic out:[0, 1, 1, 0, 0, 2, 1, 0, 1, 1, 2, 0, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 0, 1, 0, 2, 0, 1, 1, 1, 0, 0, 2]


# Comparision with SGD and Momentum approach

### Loss obtained with 1000 epochs with SGD is nearly 0.00024098, in Momentum approach loss of 0.00020955 is achieved in 500 epochs where as in Nesterov case loss of 0.00010372 is achieved in 300 epochs which clearly shows that model converges at a faster rate using Nesterov momentum approach than that of using Momentum and SGD approach