In [1]:
# Import libraries
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import itertools
import argparse
import sys
import time
from sklearn import preprocessing
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'


In [2]:
## Preprocessing of data
# Function to load data

def get_power_data():
    """
    Read the Individual household electric power consumption dataset
    """
    
    # Assume that the dataset is located on folder "data"
    data = pd.read_csv('data/household_power_consumption.txt',
                       sep=';', low_memory=False)

    # Drop some non-predictive variables
    data = data.drop(columns=['Date', 'Time'], axis=1)

    #print(data.head())

    # Replace missing values
    data = data.replace('?', np.nan)

    # Drop NA
    data = data.dropna(axis=0)

    # Normalize
    standard_scaler = preprocessing.StandardScaler()
    np_scaled = standard_scaler.fit_transform(data)
    data = pd.DataFrame(np_scaled)

    # Goal variable assumed to be the first
    X = data.values[:, 1:].astype('float32')
    y = data.values[:, 0].astype('float32')

    # Create categorical y for binary classification with balanced classes
    y = np.sign(y+0.46)

    # Split train and test data here: (X_train, Y_train, X_test, Y_test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    no_class = 2                 #binary classification

    return X_train, X_test, y_train, y_test, no_class


In [3]:
X_train, X_test, y_train, y_test, no_class = get_power_data()
print("X,y types: {} {}".format(type(X_train), type(y_train)))
print("X size {}".format(X_train.shape))
print("Y size {}".format(y_train.shape))

# Create a binary variable from one of the columns.
# You can use this OR not

idx = y_train >= 0
notidx = y_train < 0
y_train[idx] = 1
y_train[notidx] = -1


# X_test = X_test/np.linalg.norm(X_test)
# X_train = X_train/np.linalg.norm(X_train)


X,y types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
X size (1536960, 6)
Y size (1536960,)


In [4]:
# print(y_train.shape)
# #print(X_train.min())
# print(X_test.max(), X_test.min())


# print(X_test.max(), X_test.min())


In [5]:
# Sigmoid function
def sigmoid(x, derivative=False):
    sigm = 1. / (1. + np.exp(-x)) 
    if derivative:
        return sigm * (1. - sigm)
    return sigm

# Define weights initialization
def initialize_w(N, d):
    return 2*np.random.random((N,d)) - 1

# Fill in feed forward propagation
def feed_forward_propagation(X, y, w_1, w_2, w_3, lmbda):
    # Fill in
    # X (N,d)
    # w_1 (d,h)
    # w_2 (h,h)
    # w_3 (h,1)
    # y = y.reshape((-1,1))

    layer_0 = X
    layer_1 = sigmoid(np.dot(X, w_1)) # (N,h)
    layer_2 = sigmoid(np.dot(layer_1, w_2)) # (N,h)
    layer_3 = np.dot(layer_2, w_3) # (N,1)

    # np.linalg.norm(np.dot(np.dot(sigmoid(np.dot(X, w_1)),w_2),w_3) - y.reshape((-1,1)))**2/X.shape[0]
    
    return layer_0, layer_1, layer_2, layer_3
    
# Fill in backpropagation    
def back_propagation(y, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3):
    y = y.reshape((-1,1))
    N = layer_0.shape[0]
    # Calculate the gradient here
    # print("w_1 shape: ",w_1.shape)
    # print("layer_0 shape: ",layer_0.shape)
    dl_dl3 = 2 * (layer_3-y) # (N,1)
    dl3_dw3 = layer_2 # (N, h)
    dl3_dl2 = w_3 # (h,1)
    dl2_dw2 = -sigmoid(np.dot(layer_1, w_2), derivative=True) * layer_1 # (N,h)
    # print("dl2_dw2.shape: ", dl2_dw2.shape)

    dl2_dl1 = -np.dot(sigmoid(np.dot(layer_1, w_2), derivative=True), w_2) # (N,h)
    # print("dl2_dl1.shape: ", dl2_dl1.shape)
    
    # (N,h) (N,d)
    dl1_dw1 = -np.dot(sigmoid(np.dot(layer_0, w_1), derivative=True).T, layer_0) # (h,d)
    # print("dl1_dw1.shape: ", dl1_dw1.shape)

    # TODO
    # gradient w.r.t w_3 # (h,1)
    layer_3_delta = np.dot(dl3_dw3.T, dl_dl3)/N

    # gradient w.r.t w_2 # (h,h)
    layer_2_delta = np.dot(np.dot(dl3_dl2,dl_dl3.T), dl2_dw2)/N

    # gradient w.r.t w_1

    # (d,h) = (N,1) (h,1) (N,h) (h,d)
    # layer_1_delta = dl_dl3 * dl3_dl2 * dl2_dl1 *dl1_dw1
    layer_1_delta = np.dot(dl1_dw1.T, np.dot(dl2_dl1.T, np.dot(dl_dl3, dl3_dl2.T)))/N
    return layer_1_delta, layer_2_delta, layer_3_delta

# Cost function
def cost(X, y, w_1, w_2, w_3, lmbda):
    N, d = X.shape
    a1,a2,a3,a4 = feed_forward_propagation(X,y,w_1,w_2,w_3,lmbda)

    return np.linalg.norm(a4[:,0] - y,2) ** 2 / N

# Define SGD
def SGD(X, y, w_1, w_2, w_3, lmbda, learning_rate, batch_size):
    # Complete here:
    # y = y.reshape((-1,1))
    randomInd = np.arange(X.shape[0])[:batch_size]
    layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(X[randomInd,:],y[randomInd,:],w_1,w_2,w_3,lmbda)
    layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(y[randomInd,:], w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3)
    w_1 = w_1 - learning_rate*layer_1_delta
    w_2 = w_2 - learning_rate*layer_2_delta
    w_3 = w_3 - learning_rate*layer_3_delta

    return w_1, w_2, w_3

# Define SVRG here:
def SVRG(X, y, w_1, w_2, w_3, lmbda, learning_rate, T, batch_size):
    # Complete here:
    y = y.reshape((-1,1))
    N = X.shape[0]

    # compute all gradient and store
    layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(X,y,w_1,w_2,w_3,lmbda)
    layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(y, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3)
        
    # g = function_gradient_vectorization(x, y, w, lambda_, gradclip)
    
    # initialize the w_previous
    # w_previous = w.copy()
    w_1_previous, w_2_previous, w_3_previous = w_1.copy(), w_2.copy(), w_3.copy()
    for t in range(T//batch_size):
        # random sample
        # randomInd = int(np.random.rand() * N)
        randomInd = np.arange(N)
        np.random.shuffle(randomInd)
        randomInd = randomInd[:batch_size]
        # randomInd = np.random.randint(0,N)
        layer_0_p1, layer_1_p1, layer_2_p1, layer_3_p1 = feed_forward_propagation(X[randomInd,:],y[randomInd,:],w_1_previous,w_2_previous,w_3_previous,lmbda)
        layer_1_delta_p1, layer_2_delta_p1, layer_3_delta_p1 = back_propagation(y[randomInd,:], w_1_previous, w_2_previous, w_3_previous, layer_0_p1, layer_1_p1, layer_2_p1, layer_3_p1)

        layer_0_p2, layer_1_p2, layer_2_p2, layer_3_p2 = feed_forward_propagation(X[randomInd,:],y[randomInd,:],w_1,w_2,w_3,lmbda)
        layer_1_delta_p2, layer_2_delta_p2, layer_3_delta_p2 = back_propagation(y[randomInd,:], w_1, w_2, w_3, layer_0_p2, layer_1_p2, layer_2_p2, layer_3_p2)
        
        # calculate the update term
        # part1 = function_gradient_vectorization(x[:,randomInd], y[:,randomInd], w_previous, lambda_, gradclip = gradclip)
        # part2 = function_gradient_vectorization(x[:,randomInd], y[:,randomInd], w, lambda_, gradclip = gradclip)
        # part3 = g

        w_1_previous = w_1_previous - learning_rate * (layer_1_delta_p1 - layer_1_delta_p2 + layer_1_delta)
        w_2_previous = w_2_previous - learning_rate * (layer_2_delta_p1 - layer_2_delta_p2 + layer_2_delta)
        w_3_previous = w_3_previous - learning_rate * (layer_3_delta_p1 - layer_3_delta_p2 + layer_3_delta)

        # w_previous = w_previous - alpha * (part1 - part2 + part3)
        
    # w = w_previous
    w_1, w_2, w_3 = w_1_previous, w_2_previous, w_3_previous

    
    return w_1, w_2, w_3

# Define GD here:
def GD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations):
    # Complete here:
    layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(X,y,w_1,w_2,w_3,lmbda)
    layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(y, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3)
    w_1 = w_1 - learning_rate*layer_1_delta
    w_2 = w_2 - learning_rate*layer_2_delta
    w_3 = w_3 - learning_rate*layer_3_delta
    
    return w_1, w_2, w_3

# Define projected GD here:
def PGD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations, noise):
    # Complete here:
    
    return w_1, w_2, w_3

# Define BCD here:
def BCD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations):
    # Complete here:
    
    return w_1, w_2, w_3

In [6]:
# w_size = 10

# # Initialize weights
# w_1 = initialize_w(X_train.shape[1], w_size)

# w_2 = initialize_w(w_size,w_size)

# w_3 = initialize_w(w_size, 1)

# lmbda = 0

# layer_0, layer_1, layer_2, layer_3 = feed_forward_propagation(X_test, y_test, w_1, w_2, w_3, lmbda)

# print(layer_0.shape)
# print(layer_1.shape)
# print(layer_2.shape)
# print(layer_3.shape)

# layer_1_delta, layer_2_delta, layer_3_delta = back_propagation(y_test, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3)
# # print(layer_1_delta.shape)
# print("layer_3_delta shape", layer_3_delta.shape)
# print("layer_2_delta shape", layer_2_delta.shape)
# print("layer_1_delta shape", layer_1_delta.shape)


In [8]:
# Should be a hyperparameter that you tune, not an argument - Fill in the values
lmbda =0.
w_size = 40
lr = 0.02
iterations = 5
T = 2000
batch_size = 100

# Initialize weights
w_1 = initialize_w(X_train.shape[1], w_size)

w_2 = initialize_w(w_size,w_size)

w_3 = initialize_w(w_size, 1)

print("SVRG\tinitial loss is :", cost(X_train, y_train, w_1, w_2, w_3, lmbda))
# print("SVRG")
for i in range(iterations):
    # w_1,w_2,w_3 = SGD(X_train, y_train, w_1, w_2, w_3, lmbda, lr, batch_size)
    w_1,w_2,w_3 = SVRG(X_train, y_train, w_1, w_2, w_3, lmbda, lr, T, batch_size)

    loss = cost(X_train, y_train, w_1, w_2, w_3, lmbda)
    # if i%10==0:
    print(i,loss)

SVRG	initial loss is : 2.110085126794523
  sigm = 1. / (1. + np.exp(-x))
0 0.40845452413559913
1 0.4051852536128354
2 0.4046117444497941
3 0.4044774968702398
4 0.4044507897522062


In [None]:
# Should be a hyperparameter that you tune, not an argument - Fill in the values
parser = argparse.ArgumentParser()
parser.add_argument('--lambda', type=float, default=0., dest='lmbda') 
parser.add_argument('--w_size', type=int, default=10, dest='w_size')
parser.add_argument('--lr', type=float, default=0.01)
parser.add_argument('--iterations', type=int, default=10)

args = parser.parse_args()

batch_size = 100

# Initialize weights
w_1 = initialize_w(X_train.shape[1], args.w_size)

w_2 = initialize_w(args.w_size,args.w_size)

w_3 = initialize_w(args.w_size, 1)

# Get iterations
iterations = args.iterations
# Define plotting variables
fig, ax = plt.subplots(2, 1, figsize=(16, 8))

# Define the optimizers for the loop
optimizers = [
        {# Fill in the hyperparameters
            "opt": SGD(X_train, y_train, w_1, w_2, w_3, args.lmbda, args.lr, batch_size),
            "name": "SGD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": SVRG(X_train, y_train, w_1, w_2, w_3, args.lmbda, args.lr),
            "name": "SVRG",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": GD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations),
            "name": "GD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": PGD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations, noise=),
            "name": "PGD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": BCD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations),
            "name": "BCD",
            "inner": # Fill in
        }
    ]

In [None]:
# Run the iterates over the algorithms above

for opt in optimizers:
    #
    # Fill in



In [None]:
# Plot results
ax[0].legend(loc="upper right")
ax[0].set_xlabel(r"Iteration", fontsize=16)
ax[0].set_ylabel("Loss", fontsize=16)
ax[0].set_title("CA3 - Training a deep neural network for the power consumption Dataset")
ax[0].set_ylim(ymin=0)

ax[1].legend(loc="upper right")
ax[1].set_xlabel(r"Time [s]", fontsize=16)
ax[1].set_ylabel("Loss", fontsize=16)
ax[1].set_ylim(ymin=0)

plt.savefig("power.png")