In [None]:
##imports from libraries
import pandas as pd
import numpy as np
import time
import math
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import resource
import os

In [None]:
## Preprocessing of data
# Load data here:
# get data function
def get_data(data_folder = './ghg_data/'):
    """Load every regular file in ``data_folder`` as a float array.

    Parameters
    ----------
    data_folder : str
        Directory whose files are each parsed with ``np.genfromtxt``.

    Returns
    -------
    list of numpy.ndarray
        One array per file, ordered by sorted file name.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded shuffle applied later) reproducible.
    for file in sorted(os.listdir(data_folder)):
        path = os.path.join(data_folder, file)
        if not os.path.isfile(path):
            # Sub-directories cannot be parsed by genfromtxt; skip them.
            continue
        # np.float was removed in NumPy 1.24; the builtin float is the same dtype.
        data_single = np.genfromtxt(path, dtype=float)
        data.append(data_single)
    return data

# Load every file from get_data's default folder and combine the per-file
# arrays into a single ndarray.
data = np.array(get_data())

In [None]:
# Split train and test data here: (X_train, Y_train, X_test, Y_test)
def _unit_norm_columns(mat):
    """Scale each column of ``mat`` to unit Euclidean norm; all-zero columns stay zero."""
    norms = np.linalg.norm(mat, axis=0)
    # Dividing by a zero norm would turn the whole column into NaN.
    norms[norms == 0] = 1.0
    return mat / norms


def splitDataset_v2(totaldata, train = 0.8, seed = 123, normalize = True):
    """Shuffle the samples, split them into train/test sets and separate features from target.

    Each sample (one slice along the first axis of ``totaldata``) is flattened;
    its last flattened value becomes the target and all preceding values the
    features.

    Parameters
    ----------
    totaldata : numpy.ndarray
        3-D array indexed as ``totaldata[sample, :, :]``.
    train : float
        Fraction of samples placed in the training set (the count is truncated
        to an integer).
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state.
    normalize : bool
        When truthy, every sample's feature vector is scaled to unit Euclidean
        norm (train and test samples alike, each by its own norm).

    Returns
    -------
    X_train, Y_train, X_test, Y_test : numpy.ndarray
        Features have shape (n_features, n_samples); targets have shape
        (1, n_samples).
    """
    np.random.seed(seed)

    numdata = totaldata.shape[0]
    numtrain = int(numdata*train)

    # Shuffle the sample order.
    index = np.arange(numdata)
    np.random.shuffle(index)

    # Flatten once before slicing so that an empty split (e.g. train=1.0) does
    # not hit reshape(0, -1), which NumPy rejects.
    flat = totaldata[index,:,:].reshape(numdata, -1)
    traindata = flat[:numtrain]
    testdata = flat[numtrain:]

    # Last flattened value is the target, everything before it is a feature.
    X_train = traindata[:,:-1].T
    Y_train = traindata[:,-1].reshape(-1,1).T

    X_test = testdata[:,:-1].T
    Y_test = testdata[:,-1].reshape(-1,1).T

    if normalize:
        X_train = _unit_norm_columns(X_train)
        X_test = _unit_norm_columns(X_test)

    return X_train, Y_train, X_test, Y_test
# Build the train/test split from the loaded data, then report each array's shape.
X_train, Y_train, X_test, Y_test = splitDataset_v2(data)
for split_array in (X_train, Y_train, X_test, Y_test):
    print(split_array.shape)

In [None]:
## Logistic ridge regression with different optimizers
# cost function and gradient calculation
def cost_vectorization(x,y,w,lambda_=0.01):
    """Mean logistic loss plus an L2 penalty, computed without Python loops.

    Evaluates ``sum(log(1 + exp(-y * (w.T @ x)))) / N + lambda_ * ||w||**2``.

    Parameters
    ----------
    x : numpy.ndarray, shape (D, N)
        One sample per column.
    y : numpy.ndarray, shape (d, N)
        Labels, multiplied element-wise with ``w.T @ x``.
    w : numpy.ndarray, shape (D, d)
        Weights.
    lambda_ : float
        Weight of the squared-norm penalty.

    Returns
    -------
    float
        The regularised cost.
    """
    _, N = x.shape
    Z = -1 * y * np.dot(w.T , x)
    # logaddexp(0, Z) equals log(1 + exp(Z)) but stays finite when Z is large,
    # where exp(Z) would overflow to inf.
    value = np.sum(np.logaddexp(0, Z))
    c = lambda_ * np.linalg.norm(w) ** 2
    return value/N + c


def cost(x,y,w,lambda_ = 0.01):
    """Mean logistic loss plus an L2 penalty, accumulated one sample at a time.

    Loop-based counterpart of the vectorised formula
    ``sum(log(1 + exp(-y * (w.T @ x)))) / N + lambda_ * ||w||**2``.

    Parameters
    ----------
    x : numpy.ndarray, shape (D, N)
        One sample per column.
    y : numpy.ndarray, shape (d, N)
        Labels.
    w : numpy.ndarray, shape (D, d)
        Weights.
    lambda_ : float
        Weight of the squared-norm penalty.

    Returns
    -------
    float
        The regularised cost.
    """
    _, N = x.shape
    value = 0.0
    for i in range(N):
        # Both factors are 1-D of length d, so the product is element-wise per
        # output. Multiplying a (d,) label by a (d, 1) product would instead
        # broadcast to (d, d) and over-count whenever d > 1.
        margin = -1 * y[:,i] * np.dot(w.T , x[:,i])
        # logaddexp(0, m) equals log(1 + exp(m)) without overflowing for large m.
        value += np.sum(np.logaddexp(0, margin))
    c = lambda_ * np.linalg.norm(w) ** 2
    return value/N + c



def function_gradient_vectorization(X, Y, w, lambda_, gradclip = None):
    """Gradient of the regularised logistic cost with respect to ``w``, vectorised.

    Computes ``2 * lambda_ * w - X @ (Y / (1 + exp(Y * (w.T @ X)))).T / N``.

    Parameters
    ----------
    X : numpy.ndarray, shape (D, N)
        One sample per column.
    Y : numpy.ndarray, shape (d, N)
        Labels.
    w : numpy.ndarray, shape (D, d)
        Weights.
    lambda_ : float
        Weight of the squared-norm penalty.
    gradclip : float or None
        When given, entries of the regularisation term ``2 * lambda_ * w`` that
        are >= ``gradclip`` are set to ``gradclip``. The data term is not
        clipped and there is no lower bound.
        NOTE(review): this looks narrower than what the name suggests; confirm
        whether clipping the full gradient was intended.

    Returns
    -------
    numpy.ndarray, shape (D, d)
        The gradient.
    """
    _, N = X.shape

    Z = Y * np.dot(w.T , X)
    # 1 / (1 + exp(Z)) is written as exp(-log(1 + exp(Z))) so that exp is never
    # called on a large positive argument (which overflows).
    inv_one_plus_exp = np.exp(-np.logaddexp(0, Z))
    dw = np.dot(X , (Y * inv_one_plus_exp).T)

    c = lambda_ * w *2

    if gradclip is not None:
        c[c>=gradclip] = gradclip

    return c-dw/N



def function_gradient(X, Y, w, lambda_, gradclip = None):
    """Gradient of the regularised logistic cost with respect to ``w``, one sample at a time.

    Loop-based counterpart of
    ``2 * lambda_ * w - X @ (Y / (1 + exp(Y * (w.T @ X)))).T / N``.

    Parameters
    ----------
    X : numpy.ndarray, shape (D, N)
        One sample per column.
    Y : numpy.ndarray, shape (d, N)
        Labels.
    w : numpy.ndarray, shape (D, d)
        Weights.
    lambda_ : float
        Weight of the squared-norm penalty.
    gradclip : float or None
        When given, entries of the regularisation term ``2 * lambda_ * w`` that
        are >= ``gradclip`` are set to ``gradclip``. The data term is not
        clipped and there is no lower bound.
        NOTE(review): this looks narrower than what the name suggests; confirm
        whether clipping the full gradient was intended.

    Returns
    -------
    numpy.ndarray, shape (D, d)
        The gradient.
    """
    D, N = X.shape
    d, _ = Y.shape

    dw = np.zeros((D, d))

    for i in range(N):
        x_i = X[:,i]
        y_i = Y[:,i]
        # Per-output margin, shape (d,). Keeping both factors 1-D avoids the
        # (d,) * (d, 1) broadcast that mixes outputs when d > 1.
        margin = y_i * np.dot(w.T , x_i)
        # exp(-logaddexp(0, m)) == 1 / (1 + exp(m)) without overflow for large m.
        dw += np.outer(x_i, y_i * np.exp(-np.logaddexp(0, margin)))
    c = lambda_ * w *2

    if gradclip is not None:
        c[c>=gradclip] = gradclip

    return c-dw/N

In [None]:
## Define solvers: GD, SGD, SVRG and SAG. 
# Setting the values here:

# Default hyper-parameters; change the values as needed.
alpha, num_iters = 0.1, 5        # step size used in the weight update, iteration count
lambda_, epsilon = 1e-5, 1e-3    # L2 penalty weight, gradient-norm stopping threshold

# ---------------------- Complete the blank definitions: --------------------------------------

def _random_batch_indices(n_samples, batchsize):
    """Return column indices of a mini-batch drawn at random without replacement.

    When ``batchsize`` is None the batch size itself is random:
    ``int(U * n_samples)`` with U uniform on [0, 1), redrawn until non-zero.
    """
    if batchsize is not None:
        n_pick = batchsize
    elif n_samples <= 1:
        # int(U * 1) is always 0, so the redraw loop below would never end.
        n_pick = n_samples
    else:
        n_pick = 0
        while n_pick == 0:
            n_pick = int(np.random.rand() * n_samples)
    return np.random.permutation(n_samples)[:n_pick]


def solver(x,y, w, alpha, num_iters , lambda_ , epsilon , optimizer = "GD", batchsize = None, gradclip = None, mem=False,
           svrg_outer_iters = 1000, svrg_inner_iters = 2000):
    """Minimise the regularised logistic cost with one of four first-order methods.

    Parameters
    ----------
    x : numpy.ndarray, shape (D, N)
        One sample per column.
    y : numpy.ndarray, shape (d, N)
        Labels.
    w : numpy.ndarray, shape (D, d)
        Initial weights (not modified in place).
    alpha : float
        Step size.
    num_iters : int
        Maximum number of iterations for GD, SGD and SAG. SVRG ignores it and
        uses ``svrg_outer_iters`` / ``svrg_inner_iters`` instead.
    lambda_ : float
        L2 penalty weight, forwarded to the gradient and cost functions.
    epsilon : float
        Stop as soon as the norm of the current update direction is <= epsilon.
    optimizer : {"GD", "SGD", "SVRG", "SAG"}
        Which method to run.
    batchsize : int or None
        Mini-batch size. For SGD and SVRG, None draws a random batch size on
        every step; for SAG, None uses all samples.
    gradclip : float or None
        Forwarded unchanged to ``function_gradient_vectorization``.
    mem : bool
        When True, print the step counter and the full cost whenever the
        counter is a multiple of 100.
    svrg_outer_iters, svrg_inner_iters : int
        SVRG loop counts: number of full-gradient snapshots and number of
        stochastic steps per snapshot. The defaults are the values that were
        previously hard-coded.

    Returns
    -------
    w : numpy.ndarray
        Final weights.
    i : int
        For GD, SGD and SAG the zero-based index of the last executed
        iteration; for SVRG the number of completed outer iterations. 0 when
        no iteration ran.

    Raises
    ------
    ValueError
        If ``optimizer`` is not one of the supported names.
    """
    n_samples = x.shape[1]
    # Initialised up front so the return value is defined even when a loop
    # body never executes (e.g. num_iters == 0).
    i = 0

    def log_progress(step, weights):
        # Periodic progress report, only when requested through ``mem``.
        if mem and step % 100 == 0:
            print(step, cost_vectorization(x, y, weights, lambda_))

    if optimizer == "GD":
        for i in range(num_iters):
            g = function_gradient_vectorization(x, y, w, lambda_, gradclip)
            w = w - alpha*g
            log_progress(i, w)
            if np.linalg.norm(g) <= epsilon:
                break

    elif optimizer == "SGD":
        for i in range(num_iters):
            # Estimate the gradient on a random subset of the samples. The
            # subset must be re-drawn every step; always taking the first
            # columns would make the method deterministic and biased.
            batch = _random_batch_indices(n_samples, batchsize)
            g = function_gradient_vectorization(x[:,batch], y[:,batch], w, lambda_, gradclip = gradclip)
            w = w - alpha*g
            log_progress(i, w)
            if np.linalg.norm(g) <= epsilon:
                break

    elif optimizer == "SVRG":
        for k in range(svrg_outer_iters):
            # Full gradient at the snapshot point w.
            full_grad = function_gradient_vectorization(x, y, w, lambda_, gradclip)
            # Inner iterate starts at the snapshot.
            w_inner = w.copy()
            # Defined before the inner loop so the stopping test below is valid
            # even if svrg_inner_iters == 0.
            vr_grad = full_grad

            for t in range(svrg_inner_iters):
                batch = _random_batch_indices(n_samples, batchsize)
                grad_inner = function_gradient_vectorization(x[:,batch], y[:,batch], w_inner, lambda_, gradclip = gradclip)
                grad_snapshot = function_gradient_vectorization(x[:,batch], y[:,batch], w, lambda_, gradclip = gradclip)
                # Variance-reduced direction: mini-batch gradient corrected by
                # the difference between snapshot mini-batch and full gradient.
                vr_grad = grad_inner - grad_snapshot + full_grad
                w_inner = w_inner - alpha * vr_grad

            w = w_inner
            i = i+1
            log_progress(i, w)
            if np.linalg.norm(vr_grad) <= epsilon:
                break

    elif optimizer == "SAG":
        D, N = x.shape
        d = y.shape[0]
        # One stored (D, d) gradient slot per sample. The builtin float is used
        # because the np.float alias was removed in NumPy 1.24.
        gk = np.zeros((N,D,d), dtype=float)

        for k in range(num_iters):
            i = k
            # batchsize=None makes the slice keep every sample.
            index_all = np.random.permutation(N)[:batchsize]

            g_i = function_gradient_vectorization(x[:,index_all], y[:,index_all], w, lambda_, gradclip = gradclip)

            # Every sample in the mini-batch stores the same mini-batch gradient.
            gk[index_all,:,:] = g_i

            dw = np.sum(gk, axis=0)
            w = w - alpha * dw/N
            if np.linalg.norm(dw/N) <= epsilon:
                break

            log_progress(k, w)

    else:
        raise ValueError("unknown optimizer: {!r}".format(optimizer))

    return w,i

In [None]:
# ## Solving the optimization problem:

alpha = 0.01 # change the value
num_iters = 100000 # change the value
lambda_ = 0.1 # change the value
epsilon = 0.01 # change the value

# The code below reads x, y, D and d, which no earlier cell defines, so the
# cell fails with NameError on a fresh kernel. Bind them to the training split.
x, y = X_train, Y_train
D, d = x.shape[0], y.shape[0]

w = np.random.normal(0,0.1, D*d).reshape(D,d)
# Cost at a random starting point, for reference.
loss = cost_vectorization(x,y,w,lambda_)
print("initial loss {}".format(loss))

# #-------------------- GD Solver -----------------------
print("\nGD optimization")
opt = "GD"
w = np.random.normal(0,0.1, D*d).reshape(D,d)

start_gd = time.time()
gde, iters_total = solver(x,y, w, alpha, num_iters  , lambda_ , epsilon , optimizer = opt, mem=True)
end_gd = time.time()
cost_value = cost_vectorization(x,y,gde,lambda_)  # Calculate the cost value
print("Cost of GD after convergence: ",cost_value)
print("Training time for GD: ", end_gd-start_gd)
print("iters_total: ", iters_total)

# #-------------------- SGD Solver -----------------------
opt = 'SGD'
print("\nSGD optimization")
w = np.random.normal(0,0.1, D*d).reshape(D,d)

start_sgd = time.time()
gde, iters_total = solver(x,y, w, alpha, num_iters  , lambda_ , epsilon , optimizer = opt, mem=True)
end_sgd = time.time()
cost_value = cost_vectorization(x,y,gde,lambda_)  # Calculate the cost value
print("Cost of SGD after convergence: ",cost_value)
print("Training time for SGD: ", end_sgd-start_sgd)
print("iters_total: ", iters_total)

# #-------------------- SVRG Solver -----------------------
opt = 'SVRG'
print("\nSVRG optimization")
w = np.random.normal(0,0.1, D*d).reshape(D,d)

# start_srvg is the original (misspelled) name, kept as an alias in case other
# cells read it.
start_svrg = start_srvg = time.time()
gde, iters_total = solver(x,y, w, alpha, num_iters  , lambda_ , epsilon , optimizer = opt, mem=True, batchsize = 50)
end_svrg = time.time()
cost_value = cost_vectorization(x,y,gde,lambda_)  # Calculate the cost value
print("Cost of SVRG after convergence: ",cost_value)
print("Training time for SVRG: ", end_svrg-start_svrg)
print("iters_total: ", iters_total)

# #-------------------- SAG Solver -----------------------
opt = 'SAG'
print("\nSAG optimization")
w = np.random.normal(0,0.1, D*d).reshape(D,d)

start_sag = time.time()
gde, iters_total = solver(x,y, w, alpha, num_iters, lambda_ , epsilon , optimizer = opt, mem=True, batchsize = 50)
end_sag = time.time()
cost_value = cost_vectorization(x,y,gde,lambda_)  # Calculate the cost value
print("Cost of SAG after convergence: ",cost_value)
print("Training time for SAG: ", end_sag-start_sag)
print("iters_total: ", iters_total)