In [10]:
# Import libraries
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import itertools
import argparse
import sys
import time
from sklearn import preprocessing
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'


In [11]:
## Preprocessing of data
# Function to load data

def get_power_data(path='data/household_power_consumption.txt',
                   test_size=0.25, random_state=None):
    """
    Read the Individual household electric power consumption dataset and
    turn it into a binary classification problem.

    Parameters
    ----------
    path : str
        Location of the ';'-separated data file. Defaults to the file in
        the "data" folder.
    test_size : float
        Fraction of the rows held out for the test split.
    random_state : int or None
        Seed forwarded to ``train_test_split``. ``None`` keeps the split
        random on every call; pass an integer for a reproducible split.

    Returns
    -------
    X_train, X_test : float32 arrays with the standardised features.
    y_train, y_test : float32 arrays holding the labels -1 / +1.
    no_class : int, always 2 (binary classification).
    """
    data = pd.read_csv(path, sep=';', low_memory=False)

    # Drop some non-predictive variables
    data = data.drop(columns=['Date', 'Time'])

    # Missing readings are marked with '?': turn them into NaN and discard
    # every row that contains one.
    data = data.replace('?', np.nan).dropna(axis=0)

    # Normalize every remaining column (target included)
    np_scaled = preprocessing.StandardScaler().fit_transform(data)

    # Goal variable assumed to be the first remaining column
    X = np_scaled[:, 1:].astype('float32')
    y = np_scaled[:, 0].astype('float32')

    # Create categorical y for binary classification with balanced classes.
    # np.sign() would map an exact zero to a third label 0, so threshold
    # explicitly: anything at or above the cut-off is +1, the rest is -1.
    y = np.where(y + 0.46 >= 0, 1, -1).astype('float32')

    # Split train and test data here: (X_train, X_test, y_train, y_test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    no_class = 2                 # binary classification

    return X_train, X_test, y_train, y_test, no_class


In [12]:
# Load the train/test split and report the type and size of the training part.
X_train, X_test, y_train, y_test, no_class = get_power_data()
print("X,y types: {} {}".format(type(X_train), type(y_train)))
print("X size {}".format(X_train.shape))
print("Y size {}".format(y_train.shape))

# Optional step: force the training target to be exactly +1 / -1.
# Both masks are built before anything is written, then applied in place:
# non-negative entries become +1, negative entries become -1.
idx, notidx = (y_train >= 0), (y_train < 0)
y_train[idx], y_train[notidx] = 1, -1


# X_test = X_test/np.linalg.norm(X_test)
# X_train = X_train/np.linalg.norm(X_train)


X,y types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
X size (1536960, 6)
Y size (1536960,)


In [13]:
# print(y_train.shape)
# #print(X_train.min())
# print(X_test.max(), X_test.min())


# print(X_test.max(), X_test.min())


In [120]:
# Sigmoid function
def sigmoid(x, derivative=False):
    """Logistic function 1 / (1 + exp(-x)), evaluated element-wise.

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation value(s).
    derivative : bool
        When True, return sigmoid(x) * (1 - sigmoid(x)), the derivative of
        the sigmoid with respect to x, instead of the sigmoid itself.

    Returns
    -------
    Same shape as ``x`` (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow the way
    # exp(-x) does for large negative x. The two branches below are the same
    # function written so that only this bounded quantity is needed.
    decay = np.exp(-np.abs(x))
    sigm = np.where(x >= 0, 1. / (1. + decay), decay / (1. + decay))[()]
    if derivative:
        return sigm * (1. - sigm)
    return sigm

# Define weights initialization
def initialize_w(N, d):
    """Return an (N, d) weight matrix with entries drawn uniformly from [-1, 1)."""
    unit_samples = np.random.random(size=(N, d))  # uniform on [0, 1)
    # Stretch [0, 1) to [0, 2) and shift it down by one.
    return 2 * unit_samples - 1

# Fill in feed forward propagation
def feed_forward_propagation(X, y, w_1, w_2, w_3, lmbda):
    """Forward pass through two sigmoid hidden layers and a linear output.

    Shapes: X is (N, d), w_1 is (d, h), w_2 is (h, g) and w_3 is (g, 1).
    ``y`` and ``lmbda`` are accepted for a uniform call signature but are
    not used here.

    Returns the tuple (layer_0, layer_1, layer_2, layer_3) with shapes
    (N, d), (N, h), (N, g) and (N, 1); layer_0 is X itself.
    """
    _rows, _cols = X.shape  # unpacking fails early unless X is 2-D

    hidden_1 = sigmoid(np.dot(X, w_1))         # (N, h)
    hidden_2 = sigmoid(np.dot(hidden_1, w_2))  # (N, g)
    output = np.dot(hidden_2, w_3)             # (N, 1), no activation

    return X, hidden_1, hidden_2, output
def back_propagation(y, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3, lmbda):
    """Gradient of the mean squared error with respect to the three weights.

    The data term differentiated here is ``sum((layer_3 - y)**2) / N`` with
    ``N = y.shape[0]``. The regularisation term of the cost is NOT included;
    ``lmbda`` (and ``w_1``) are accepted for a uniform signature only.

    Parameters
    ----------
    y : array of N targets, shape (N,) or (N, 1).
    w_1, w_2, w_3 : weights of shape (d, h), (h, g), (g, 1).
    layer_0 : network input, shape (N, d).
    layer_1, layer_2 : sigmoid activations produced by the forward pass with
        these same weights, shapes (N, h) and (N, g).
    layer_3 : linear network output, shape (N, 1).

    Returns
    -------
    (layer_1_delta, layer_2_delta, layer_3_delta) with the shapes of
    w_1, w_2 and w_3 respectively.
    """
    N = y.shape[0]
    residual = layer_3 - y.reshape((-1, 1))  # (N, 1)

    # Output layer is linear: dJ/dw_3 = 2 * layer_2^T (layer_3 - y).
    layer_3_delta = 2 * np.dot(layer_2.T, residual)  # (g, 1)

    # layer_k is already sigmoid(pre-activation), so the sigmoid derivative
    # is layer_k * (1 - layer_k); no need to redo the matrix products that
    # the forward pass has already carried out.
    delta_2 = 2 * np.dot(residual, w_3.T) * (layer_2 * (1. - layer_2))  # (N, g)
    layer_2_delta = np.dot(layer_1.T, delta_2)  # (h, g)

    delta_1 = np.dot(delta_2, w_2.T) * (layer_1 * (1. - layer_1))  # (N, h)
    layer_1_delta = np.dot(layer_0.T, delta_1)  # (d, h)

    return layer_1_delta / N, layer_2_delta / N, layer_3_delta / N

# Cost function
def cost(X, y, w_1, w_2, w_3, lmbda):
    """Regularised mean squared error of the network on (X, y).

    Computes ``sum((prediction - y)**2) / N`` plus ``lmbda`` times the sum
    of the squared entries of w_1, w_2 and w_3, where N is the number of
    rows of X and the prediction is the last layer of the forward pass.

    ``y`` may be given as shape (N,) or (N, 1).
    """
    N = X.shape[0]
    prediction = feed_forward_propagation(X, y, w_1, w_2, w_3, lmbda)[3]

    # Flatten y first: subtracting an (N, 1) column from the (N,) prediction
    # would broadcast to an (N, N) matrix and give a meaningless value.
    residual = prediction[:, 0] - np.ravel(y)
    data_term = np.dot(residual, residual) / N

    penalty = lmbda * sum(np.sum(np.square(w)) for w in (w_1, w_2, w_3))
    return data_term + penalty

# Define SGD
def SGD(X, y, w_1, w_2, w_3, lmbda, learning_rate, batch_size):
    """Perform one mini-batch stochastic gradient step and return new weights.

    Parameters
    ----------
    X : (N, d) inputs.
    y : N targets, shape (N,) or (N, 1).
    w_1, w_2, w_3 : current weights; they are not modified in place.
    lmbda : weight of the squared-norm penalty in the cost.
    learning_rate : step size.
    batch_size : number of rows drawn (uniformly, with replacement).

    Returns
    -------
    (w_1, w_2, w_3) after the step.
    """
    N = X.shape[0]
    # Work with a column vector so that row indexing with [batch, :] is valid
    # even when the caller passes a 1-D target array.
    y = np.reshape(y, (-1, 1))

    # Draw a fresh random batch on every call; always taking the first
    # batch_size rows would train on the same few samples forever.
    batch = np.random.randint(0, N, size=batch_size)
    X_batch, y_batch = X[batch, :], y[batch, :]

    layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(
        X_batch, y_batch, w_1, w_2, w_3, lmbda)
    layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(
        y_batch, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3, lmbda)

    # back_propagation returns the gradient of the data term only; the
    # penalty lmbda * ||w||^2 in the cost contributes 2 * lmbda * w.
    w_1 = w_1 - learning_rate * (layer_1_delta + 2 * lmbda * w_1)
    w_2 = w_2 - learning_rate * (layer_2_delta + 2 * lmbda * w_2)
    w_3 = w_3 - learning_rate * (layer_3_delta + 2 * lmbda * w_3)

    return w_1, w_2, w_3

# Define SVRG here:
def SVRG(X, y, w_1, w_2, w_3, lmbda, learning_rate, T, batch_size, iterations):
    """Stochastic variance-reduced gradient descent.

    Every outer iteration takes the current weights as a snapshot, computes
    the full gradient there once, and then runs ``T // batch_size`` inner
    mini-batch steps whose gradient estimate is

        grad_batch(iterate) - grad_batch(snapshot) + grad_full(snapshot).

    After each outer iteration the iteration index and the cost on (X, y)
    are printed.

    Parameters
    ----------
    X : (N, d) inputs.
    y : N targets, shape (N,) or (N, 1).
    w_1, w_2, w_3 : initial weights; they are not modified in place.
    lmbda : weight of the squared-norm penalty in the cost.
    learning_rate : step size of the inner steps.
    T : sample budget per outer iteration.
    batch_size : rows per inner step (drawn uniformly, with replacement).
    iterations : number of outer iterations.

    Returns
    -------
    (w_1, w_2, w_3) after the last outer iteration.
    """
    y_flat = np.ravel(y)            # 1-D view used for the cost
    y = y_flat.reshape((-1, 1))     # column vector used for row indexing
    N = X.shape[0]

    def gradients(X_part, y_part, a, b, c):
        # Data-term gradient of the weights (a, b, c) on the given rows.
        layers = feed_forward_propagation(X_part, y_part, a, b, c, lmbda)
        return back_propagation(y_part, a, b, c, *layers, lmbda)

    for i in range(iterations):
        # Full gradient at the snapshot (w_1, w_2, w_3).
        layer_1_delta, layer_2_delta, layer_3_delta = gradients(X, y, w_1, w_2, w_3)

        # The inner iterate starts from the snapshot.
        w_1_previous, w_2_previous, w_3_previous = w_1.copy(), w_2.copy(), w_3.copy()
        for t in range(T // batch_size):
            # Index sampling costs O(batch_size); shuffling all N indices for
            # every inner step would dominate the work of the step itself.
            batch = np.random.randint(0, N, size=batch_size)
            X_batch, y_batch = X[batch, :], y[batch, :]

            # Same batch evaluated at the iterate and at the snapshot.
            cur_1, cur_2, cur_3 = gradients(
                X_batch, y_batch, w_1_previous, w_2_previous, w_3_previous)
            snap_1, snap_2, snap_3 = gradients(X_batch, y_batch, w_1, w_2, w_3)

            # The penalty gradient 2 * lmbda * w is known exactly; its three
            # variance-reduction terms collapse to the value at the iterate.
            w_1_previous = w_1_previous - learning_rate * (
                cur_1 - snap_1 + layer_1_delta + 2 * lmbda * w_1_previous)
            w_2_previous = w_2_previous - learning_rate * (
                cur_2 - snap_2 + layer_2_delta + 2 * lmbda * w_2_previous)
            w_3_previous = w_3_previous - learning_rate * (
                cur_3 - snap_3 + layer_3_delta + 2 * lmbda * w_3_previous)

        # The last inner iterate becomes the next snapshot.
        w_1, w_2, w_3 = w_1_previous, w_2_previous, w_3_previous

        # cost() is given the 1-D targets on purpose.
        loss = cost(X, y_flat, w_1, w_2, w_3, lmbda)
        print(i, loss)

    return w_1, w_2, w_3

# Define GD here:
def GD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations):
    """Full-batch gradient descent on the regularised cost.

    Runs ``iterations`` steps on all of (X, y) and prints the iteration
    index and the cost on (X, y) after every step.

    Parameters
    ----------
    X : (N, d) inputs.
    y : N targets.
    w_1, w_2, w_3 : initial weights; they are not modified in place.
    learning_rate : step size.
    lmbda : weight of the squared-norm penalty in the cost.
    iterations : number of gradient steps.

    Returns
    -------
    (w_1, w_2, w_3) after the last step.
    """
    for i in range(iterations):
        layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(
            X, y, w_1, w_2, w_3, lmbda)
        layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(
            y, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3, lmbda)

        # back_propagation covers the data term only. The penalty
        # lmbda * ||w||^2 adds 2 * lmbda * w to the gradient, and it has to
        # be stepped against like the rest; adding a multiple of w to the
        # weights would grow them instead of shrinking them.
        w_1 = w_1 - learning_rate * (layer_1_delta + 2 * lmbda * w_1)
        w_2 = w_2 - learning_rate * (layer_2_delta + 2 * lmbda * w_2)
        w_3 = w_3 - learning_rate * (layer_3_delta + 2 * lmbda * w_3)

        # Report the loss on the data this function was given, not on
        # notebook-level globals.
        loss = cost(X, y, w_1, w_2, w_3, lmbda)
        print(i, loss)
    return w_1, w_2, w_3

# Define projected GD here:
def PGD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations, noise):
    """Placeholder for projected gradient descent (not implemented yet).

    No optimisation is performed: every argument other than the weights is
    ignored and the three weight objects are handed back unchanged.
    """
    # TODO: Complete here
    weights = (w_1, w_2, w_3)
    return weights

# Define BCD here:
def BCD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations):
    """Placeholder for the BCD optimiser (not implemented yet).

    No optimisation is performed: every argument other than the weights is
    ignored and the three weight objects are handed back unchanged.
    """
    # TODO: Complete here
    weights = (w_1, w_2, w_3)
    return weights

In [121]:
# Debug cell: run one forward and one backward pass on the test split with
# small random weights and print every shape, so that a gradient whose shape
# does not match its weight matrix is spotted immediately.
w_size = 10
lmbda = 0

# Initialize weights for debug. The second hidden layer is one unit wider
# than the first, so the two hidden widths differ in the printed shapes.
w_1 = initialize_w(X_train.shape[1], w_size)
w_2 = initialize_w(w_size, w_size + 1)
w_3 = initialize_w(w_size + 1, 1)

layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(
    X_test, y_test, w_1, w_2, w_3, lmbda)

for _label, _arr in (
        ("layer_0 shape ", layer_0),
        ("layer_1 shape ", layer_1),
        ("layer_2 shape ", layer_2),
        ("layer_3 shape ", layer_3)):
    print(_label, _arr.shape)

layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(
    y_test, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3, lmbda)

# Weights first, then their gradients, both listed from the output layer back.
for _label, _arr in (
        ("w_3 shape ", w_3),
        ("w_2 shape ", w_2),
        ("w_1 shape ", w_1),
        ("layer_3_delta shape", layer_3_delta),
        ("layer_2_delta shape", layer_2_delta),
        ("layer_1_delta shape", layer_1_delta)):
    print(_label, _arr.shape)





layer_0 shape  (512320, 6)
layer_1 shape  (512320, 10)
layer_2 shape  (512320, 11)
layer_3 shape  (512320, 1)
w_3 shape  (11, 1)
w_2 shape  (10, 11)
w_1 shape  (6, 10)
layer_3_delta shape (11, 1)
layer_2_delta shape (10, 11)
layer_1_delta shape (6, 10)


In [127]:
# Hyperparameters of the GD / SVRG comparison; tune them here by hand.
lmbda, w_size, lr = 0.001, 50, 0.02
iterations, T, batch_size = 100, 2000, 100

# Initialize weights: two hidden layers of w_size units, one output unit.
w_1 = initialize_w(X_train.shape[1], w_size)
w_2 = initialize_w(w_size, w_size)
w_3 = initialize_w(w_size, 1)

# Both optimisers start from the same (w_1, w_2, w_3); their results are kept
# under separate *_star names so the starting point is not overwritten.
print("GD\tinitial loss is :", cost(X_train, y_train, w_1, w_2, w_3, lmbda))
w_1_star, w_2_star, w_3_star = GD(
    X_train, y_train, w_1, w_2, w_3,
    learning_rate=lr, lmbda=lmbda, iterations=iterations)

print("SVRG\tinitial loss is :", cost(X_train, y_train, w_1, w_2, w_3, lmbda))
w_1_star1, w_2_star1, w_3_star1 = SVRG(
    X_train, y_train, w_1, w_2, w_3,
    lmbda=lmbda, learning_rate=lr, T=T, batch_size=batch_size,
    iterations=iterations)


GD	initial loss is : 2.176887181318325
0 1.558352053446679
1 1.5362190903458717
2 1.521202612974463
3 1.5073730339443716
4 1.4945735354266714
5 1.4827116508612987
6 1.4717039930196636
7 1.4614749291501168
8 1.4519558476769736
9 1.4430845001507904
10 1.4348044060208522
11 1.427064313716575
12 1.4198177123841176
13 1.4130223892939604
14 1.4066400285012008
15 1.4006358468240268
16 1.3949782636222436
17 1.3896386012188429
18 1.3845908131229379
19 1.3798112374895346
20 1.3752783734966798
21 1.3709726785385166
22 1.3668763843275131
23 1.362973330174002
24 1.359248811868643
25 1.355689444735821
26 1.3522830395549483
27 1.349018490163795
28 1.3458856716645078
29 1.3428753482499727
30 1.339979089756624
31 1.3371891961304403
32 1.3344986290664504
33 1.331900950149212
34 1.3293902648830014
35 1.3269611720563361
36 1.3246087179364587
37 1.3223283548358864
38 1.3201159036355128
39 1.3179675198873597
40 1.3158796631552248
41 1.3138490692834814
42 1.3118727253133773
43 1.3099478467926686
44 1.3080718

In [None]:
# Should be a hyperparameter that you tune, not an argument - Fill in the values
#
# Configuration cell for the optimiser comparison: parses the hyperparameters,
# draws fresh initial weights, creates the two-panel figure and builds the
# table of optimisers that the loop in the next cell iterates over.
# NOTE(review): this cell is still an unfinished template. Every "inner"
# entry and the `noise=` argument of PGD have no value yet, so the cell does
# not parse until they are filled in.
parser = argparse.ArgumentParser()
parser.add_argument('--lambda', type=float, default=0., dest='lmbda') 
parser.add_argument('--w_size', type=int, default=10, dest='w_size')
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--iterations', type=int, default=10)

# NOTE(review): parse_args() reads sys.argv. When this runs inside a notebook
# kernel rather than as a script, sys.argv presumably holds the kernel's own
# launch flags and parsing would fail on them — confirm, and consider
# parse_known_args() or parse_args([]).
args = parser.parse_args()

batch_size = 100

# Initialize weights
w_1 = initialize_w(X_train.shape[1], args.w_size)

w_2 = initialize_w(args.w_size,args.w_size)

w_3 = initialize_w(args.w_size, 1)

# Get iterations
iterations = args.iterations
# Define plotting variables
fig, ax = plt.subplots(2, 1, figsize=(16, 8))

# Define the optimizers for the loop
# NOTE(review): every "opt" value below is a call expression, so each
# optimiser runs while this list is being built and the entry stores the
# returned weights rather than something the loop can call — confirm that
# this is intended.
# NOTE(review): SVRG is defined with the additional parameters T, batch_size
# and iterations; the call below passes only seven arguments.
optimizers = [
        {# Fill in the hyperparameters
            "opt": SGD(X_train, y_train, w_1, w_2, w_3, args.lmbda, args.lr, batch_size),
            "name": "SGD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": SVRG(X_train, y_train, w_1, w_2, w_3, args.lmbda, args.lr),
            "name": "SVRG",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": GD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations),
            "name": "GD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": PGD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations, noise=),
            "name": "PGD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": BCD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations),
            "name": "BCD",
            "inner": # Fill in
        }
    ]

In [None]:
# Run the iterates over the algorithms above
# NOTE(review): the loop body is still a placeholder (comments only), so this
# cell does not run yet. The plotting cell that follows adds legends to
# ax[0] and ax[1], so this loop is presumably where the per-optimiser loss
# curves should be drawn with labels — confirm.

for opt in optimizers:
    #
    # Fill in



In [None]:
# Plot results
# Both panels of the figure created with plt.subplots(2, 1) get the same
# styling; only the x-axis label differs (iteration count on top, wall-clock
# time below).
panel_xlabels = (r"Iteration", r"Time [s]")
for panel, xlabel in zip(ax, panel_xlabels):
    panel.legend(loc="upper right")
    panel.set_xlabel(xlabel, fontsize=16)
    panel.set_ylabel("Loss", fontsize=16)
    panel.set_ylim(ymin=0)

# Only the top panel carries the figure title.
ax[0].set_title("CA3 - Training a deep neural network for the power consumption Dataset")

plt.savefig("power.png")