In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import warnings

In [22]:
#Generate Data

f = lambda x: 1/(1+25*x**2)
np.random.seed(123)
n_samples = 1000
x_=np.random.uniform(-1,1,n_samples)
noise = np.random.normal(loc=0, scale=0.1, size=n_samples)
y_with_noise = f(x_) + noise
degree = 5
X = PolynomialFeatures(degree).fit_transform(x_.reshape(-1,1))
X_scaled = StandardScaler().fit_transform(X)
y_centered = y_with_noise - y_with_noise.mean()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_centered, test_size = 0.3)

In [23]:
# Define makers for the objectives of OLS, Ridge and Lasso
#It returns the corresponding objective: a one dimensional function
#which only depends on theta and is minimized in regression
def maker_objective_ols(X,y):
    def objective_ols(theta):
        return mean_squared_error(X @ theta, y)
    return objective_ols

def maker_objective_ridge(X, y, lmbda):
    def objective_ridge(theta):
        return mean_squared_error(X @ theta, y) + lmbda * np.sum(theta**2)
    return objective_ridge

def maker_objective_lasso(X, y, lmbda):
    def objective_lasso(theta):
        return mean_squared_error(X @ theta, y) + lmbda *np.sum(np.abs(theta))
    return objective_lasso

In [24]:
# Define the gradients for OLS, Ridge and Lasso
#For Ridge and Lasso a maker is used which takes lambda as input and returns
#the corresponding gradient
#definitions a
#Parameters of the gradients:
#xtx: np.array matrix product of X.T @ X
#xty: np.array matrix product of X.T @ y
#theta: np.array 
#n_samples: int number of samples i.e. the number of rows of X

def gradient_ols(xtx, xty, theta, n_samples):
    return 2/n_samples* (xtx @theta - xty)

def maker_gradient_ridge(lmbda):
    def gradient_ridge(xtx, xty, theta, n_samples):
        return 2/n_samples*(xtx @theta - xty) + lmbda*theta
    return gradient_ridge

def maker_gradient_lasso(lmbda):
    def gradient_lasso(xtx, xty, theta, n_samples):
        return 2/n_samples*(xtx @theta - xty) + lmbda*np.sign(theta)
    return gradient_lasso

In [25]:
#calculates the effective gradient, i.e. for non stochastic gradient the normal
#gradient with precalculated xtx and xty and for stochastic gradient random batches are created,
#the gradient for every batch is calculated and effective gradient is the average gradient of all batches
#is used in the gradient descent implementations
#parameters:
#X np.array
#y np.array
#gradient: function with parameters xtx, xty, theta and number_of samples which returns the gradient at theta as np.array
#possible functions are gradient_ols, or the returned functions of maker_gradient_ridge or maker_gradient_lasso
#stochastic: boolean if stochastic gradient descent should be used
#batch_size: int used for gradient descent
def gradient_eff_calculator(X, y, gradient, stochastic, batch_size):
    n_samples = X.shape[0]
    n_features = X.shape[1]
    if stochastic:
        def gradient_eff(theta):
            shuffled_indices = np.random.choice(range(n_samples), n_samples, replace = False)
            X_shuffled = X[shuffled_indices]
            y_shuffled = y[shuffled_indices]
            m = int(n_samples/batch_size) # number of batches
            array_batch_gradients = np.zeros((m, n_features))
            for i in range(m):
                xi = X_shuffled[i*batch_size:(i+1)*batch_size]
                yi = y_shuffled[i*batch_size: (i+1)*batch_size] #exclude the last samples?
                batch_gradient_i = gradient(xi.T @ xi, xi.T @ yi, theta, n_samples)
                array_batch_gradients[i] = batch_gradient_i
            return np.mean(array_batch_gradients, axis = 0)
    else:
        xtx = X.T @ X
        xty = X.T @ y
        def gradient_eff(theta):
            return gradient(xtx, xty, theta, n_samples)    
    return gradient_eff



In [26]:
#generates a random initial value for the gradient decents methods
#multivariate uniform distribution where the bounds for every feature
#are the corresponding minima and maxima in X
#used in the gradient descents methods
def initial_value_generator(X):
    n_features = X.shape[1]
    return np.min(X, axis =0) + np.random.uniform(size =n_features) * (np.max(X, axis =0) - np.min(X, axis =0))

In [27]:
#calculates a normal gradient descent

#parameters:
#X: np.array features matrix
#y: np.arry targets
#gradient: function with parameters xtx, xty, theta and number_of samples which returns the gradient at theta as np.array
#possible functions are gradient_ols, or the returned functions of maker_gradient_ridge or maker_gradient_lasso
#learning rate: float used for stochastic gradient descent
#max_iter: int maximum iteration number in gradient descent
#precision: float stopping criterion before iteration limit is defined as ||theta_old - theta_new||_2 <= precision
#stochastic: boolean if stochastic gradient descent should be used
#batch size: int used for stochastic gradient descent
#initial value: np.array for gradient descent, if None a random value in range of X is chosen

#returns a list containing the calculated theta and the number of used iterations
def gradient_descent_normal(X, y, gradient, learning_rate, max_iter, precision, stochastic, batch_size, initial_value = None):
    X = X.astype(np.float64)
    y = y.astype(np.float64)

    gradient_eff = gradient_eff_calculator(X, y, gradient, stochastic, batch_size)

    if initial_value is None:
        initial_value = initial_value_generator(X)
    
    theta_new = initial_value 
    theta_old = initial_value + 1
    count = 0
    while (np.linalg.norm(theta_old - theta_new, ord = None) > precision) and count < max_iter:
        theta_old = theta_new
        theta_new = theta_old - learning_rate * gradient_eff(theta_old)
        count += 1
    if(count == max_iter):
        print('calculation limit exceeded')
    return [theta_new, count]

In [28]:
gradient_descent_normal(X_train, y_train, gradient_ols, learning_rate= 0.1, max_iter= 100000, precision=0.000001, stochastic=False, batch_size=100)

[array([ 0.        ,  0.00282054, -0.66959885, -0.00840943,  0.47905071,
         0.00443768]),
 6441]

In [29]:
#calculates a momentum gradient descent

#parameters:
#X: np.array features matrix
#y: np.arry targets
#gradient: function with parameters xtx, xty, theta and number_of samples which returns the gradient at theta as np.array
#possible functions are gradient_ols, or the returned functions of maker_gradient_ridge or maker_gradient_lasso
#learning rate: float used for stochastic gradient descent
#max_iter: int maximum iteration number in gradient descent
#precision: float stopping criterion before iteration limit is defined as ||theta_old - theta_new||_2 <= precision
#stochastic: boolean if stochastic gradient descent should be used
#batch size: int used for stochastic gradient descent
#initial value: np.array for gradient descent, if None a random value in range of X is chosen
#momentum: float used for stochastic gradient descent

#returns a list containing the calculated theta and the number of used iterations
def gradient_descent_momentum(X, y, gradient, learning_rate, max_iter, precision, stochastic, batch_size, initial_value = None, momentum = 0.9):
    X = X.astype(np.float64)
    y = y.astype(np.float64)

    gradient_eff = gradient_eff_calculator(X, y, gradient, stochastic, batch_size)

    if initial_value is None:
        initial_value = initial_value_generator(X)
    
    theta_new = initial_value 
    theta_old = initial_value + 1
    change = 0
    count = 0
    while (np.linalg.norm(theta_old - theta_new, ord = None) > precision) and count < max_iter: 
        theta_old = theta_new
        change = learning_rate * gradient_eff(theta_old) + momentum * change
        theta_new = theta_old - change 
        count += 1
    if(count == max_iter):
        print('calculation limit exceeded')
    return [theta_new, count]

In [30]:
#calculates a Adagrad gradient descent

#parameters:
#X: np.array features matrix
#y: np.arry targets
#gradient: function with parameters xtx, xty, theta and number_of samples which returns the gradient at theta as np.array
#possible functions are gradient_ols, or the returned functions of maker_gradient_ridge or maker_gradient_lasso
#learning rate: float used for stochastic gradient descent
#max_iter: int maximum iteration number in gradient descent
#precision: float stopping criterion before iteration limit is defined as ||theta_old - theta_new||_2 <= precision
#stochastic: boolean if stochastic gradient descent should be used
#batch size: int used for stochastic gradient descent
#initial value: np.array for gradient descent, if None a random value in range of X is chosen
#epsilon: float used for gradient descent

#returns a list containing the calculated theta and the number of used iterations
def gradient_descent_adagrad(X, y, gradient, learning_rate, max_iter, precision, stochastic, batch_size, initial_value = None, epsilon = 1e-7):
    X = X.astype(np.float64)
    y = y.astype(np.float64)
    n_features = X.shape[1]
    gradient_eff = gradient_eff_calculator(X, y, gradient, stochastic, batch_size)

    if initial_value is None:
        initial_value = initial_value_generator(X)
    
    theta_new = initial_value 
    theta_old = initial_value + 1
    G = np.zeros(n_features)
    gradient1 = np.zeros(n_features)
    count = 0
    while (np.linalg.norm(theta_old - theta_new, ord = None) > precision) and count < max_iter:
        theta_old = theta_new
        gradient1 = gradient_eff(theta_old)
        G = G + np.square(gradient1)
        theta_new = theta_old - learning_rate * gradient1 / np.sqrt(epsilon + G)
        count += 1
    if(count == max_iter):
        print('calculation limit exceeded')
    return [theta_new, count]

In [31]:
#calculates a RMS Prop gradient descent

#parameters:
#X: np.array features matrix
#y: np.arry targets
#gradient: function with parameters xtx, xty, theta and number_of samples which returns the gradient at theta as np.array
#possible functions are gradient_ols, or the returned functions of maker_gradient_ridge or maker_gradient_lasso
#learning rate: float used for stochastic gradient descent
#max_iter: int maximum iteration number in gradient descent
#precision: float stopping criterion before iteration limit is defined as ||theta_old - theta_new||_2 <= precision
#stochastic: boolean if stochastic gradient descent should be used
#batch size: int used for stochastic gradient descent
#initial value: np.array for gradient descent, if None a random value in range of X is chosen
#epsilon: float used for gradient descent
#rho: float used for gradient descent

#returns a list containing the calculated theta and the number of used iterations
def gradient_descent_rmsprop(X, y, gradient, learning_rate, max_iter, precision, stochastic, batch_size, initial_value = None, epsilon = 1e-7 , rho= 0.9):
    X = X.astype(np.float64)
    y = y.astype(np.float64)
    n_features = X.shape[1]
    gradient_eff = gradient_eff_calculator(X, y, gradient, stochastic, batch_size)

    if initial_value is None:
        initial_value = initial_value_generator(X)
        
    theta_new = initial_value 
    theta_old = initial_value + np.ones(n_features)
    v = np.zeros(n_features)
    gradient1 = np.zeros(n_features)
    count = 0
    while (np.linalg.norm(theta_old - theta_new, ord = None) > precision) and count < max_iter: 
        theta_old = theta_new
        gradient1 = gradient_eff(theta_old)
        v= rho * v + (1-rho) * gradient1**2
        theta_new = theta_old - learning_rate / np.sqrt(v + epsilon) * gradient1
        count += 1    
    
    if(count == max_iter):
        print('calculation limit exceeded')
    return [theta_new, count]

In [32]:
#calculates a Adam gradient descent

#parameters:
#X: np.array features matrix
#y: np.arry targets
#gradient: function with parameters xtx, xty, theta and number_of samples which returns the gradient at theta as np.array
#possible functions are gradient_ols, or the returned functions of maker_gradient_ridge or maker_gradient_lasso
#learning rate: float used for stochastic gradient descent
#max_iter: int maximum iteration number in gradient descent
#precision: float stopping criterion before iteration limit is defined as ||theta_old - theta_new||_2 <= precision
#stochastic: boolean if stochastic gradient descent should be used
#batch size: int used for stochastic gradient descent
#initial value: np.array for gradient descent, if None a random value in range of X is chosen
#epsilon: float used for gradient descent
#beta_1: float used for gradient descent
#beta_2: float used for gradient descent

#returns a list containing the calculated theta and the number of used iterations
def gradient_descent_adam(X, y, gradient, learning_rate, max_iter, precision, stochastic, batch_size, initial_value = None,  epsilon = 1e-7, beta_1 = 0.9, beta_2 = 0.999):
    X = X.astype(np.float64)
    y = y.astype(np.float64)
    n_features = X.shape[1]
    gradient_eff = gradient_eff_calculator(X, y, gradient, stochastic, batch_size)

    if initial_value is None:
        initial_value = initial_value_generator(X)
    
    theta_new = initial_value 
    theta_old = initial_value + np.ones(n_features)
    count = 0
    m = 0
    v = 0
    while (np.linalg.norm(theta_old - theta_new, ord = None) > precision) and count < max_iter: 
        count+=1
        theta_old = theta_new
        gradient1 = gradient_eff(theta_old)
        m = beta_1 * m + (1-beta_1)*gradient1
        v = beta_2 * v + (1-beta_2)*gradient1**2
        m_tilde = m/(1-beta_1**count)
        v_tilde = v/(1-beta_2**count)
        theta_new = theta_old - learning_rate * m_tilde / (np.sqrt(v_tilde) + epsilon)

    if(count == max_iter):
        print('calculation limit exceeded')
    return [theta_new, count]

Finding the optimal learning rates via Tuning

In [None]:
warnings.filterwarnings("error", category=RuntimeWarning)
# tuning to find the best hyperparameters
# parameters:
# function: optimizer function which depends on one parameter and returns a list with the calculated optimium
# and the number of iterations, particularly the gradient descent methods are such functions if all input parameters are fixed
# apart from one hyperparameter
# parameters: list of the parameters which are possible input values for function
# objective: objective of the optimizer function, possible functions are
# objective_ols or the returned values of maker_gradient_ridge or maker_gradient_lasso
# returns:
# a list containing a Pandas DataFrame with the columns Parameters, Result Objective and Number of iterations and dictionary with the optimum for each parameter
def tuning(function, parameters, objective):
    result_objective = np.zeros(len(parameters))
    result_iterations = np.zeros(len(parameters))
    result_optimum = {}
    for i, parameter in enumerate(parameters):
        print(f"test parameter {parameter}")
        try:
            function_run = function(parameter)
            result_objective[i] = objective(function_run[0])
            result_iterations[i] = function_run[1]
            result_optimum[parameter] = function_run[0]
        except RuntimeWarning:
            result_objective[i] = np.inf
            result_iterations[i] = np.inf
            print("error")

    dataframe = pd.DataFrame({"Parameters": parameters, "Result Objective":result_objective, "Number of Iterations": result_iterations})
    return [dataframe, result_optimum]

In [34]:
#find learning rate for normal gradient descent ols non stochastic
function = lambda param: gradient_descent_normal(X_train, y_train, gradient_ols, learning_rate =param, max_iter = 10000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_normal_ols_nonstochastic = tuning(function, parameters, maker_objective_ols(X_train, y_train))
print(results_normal_ols_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
test parameter 1
error
test parameter 10
error
   Parameters  Result Objective  Number of Iterations
0    0.000001          0.894485               10000.0
1    0.000010          1.751385               10000.0
2    0.000100          0.249079               10000.0
3    0.001000          0.050915               10000.0
4    0.010000          0.027417               10000.0
5    0.100000          0.025538                7052.0
6    1.000000               inf                   0.0
7   10.000000               inf                   0.0


In [35]:
values_normal_ols_nonstochastic = results_normal_ols_nonstochastic[1][0.1]
print(values_normal_ols_nonstochastic)

[ 0.          0.00337317 -0.66959392 -0.00999685  0.47904601  0.00554336]


In [36]:
#find learning rate for normal gradient descent ridge non stochastic
lmbda = 0.1
function = lambda param: gradient_descent_normal(X_train, y_train, maker_gradient_ridge(lmbda), learning_rate =param, max_iter = 10000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_normal_ridge_nonstochastic = tuning(function, parameters, maker_objective_ridge(X_train, y_train, lmbda = lmbda))
print(results_normal_ridge_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
test parameter 0.1
test parameter 1
error
test parameter 10
error
   Parameters  Result Objective  Number of Iterations
0    0.000001         10.547166               10000.0
1    0.000010          7.075605               10000.0
2    0.000100          0.847404               10000.0
3    0.001000          0.047890               10000.0
4    0.010000          0.048526                7304.0
5    0.100000          0.048526                 743.0
6    1.000000               inf                   0.0
7   10.000000               inf                   0.0


In [37]:
values_normal_ridge_nonstochastic = results_normal_ridge_nonstochastic[1][0.0001]
print(values_normal_ridge_nonstochastic)

[ 0.          1.31206975 -0.88268098 -0.62116035  0.70449394 -0.66243896]


In [38]:
Ridge(alpha =0.1).fit(X_train, y_train).coef_

array([ 0.        ,  0.00302554, -0.6676647 , -0.00904645,  0.47710896,
        0.00488546])

In [39]:
gradient_descent_normal(X_train, y_train, maker_gradient_ridge(lmbda), learning_rate =0.0001, max_iter = 50000, precision = 0.00000001, stochastic = False, batch_size=100)

calculation limit exceeded


[array([ 0.        , -0.20772495, -0.79202489,  0.25354047,  0.60757   ,
        -0.05980012]),
 50000]

In [40]:
#find learning rate for normal gradient descent lasso non stochastic
lmbda = 0.1
function = lambda param: gradient_descent_normal(X_train, y_train, maker_gradient_lasso(lmbda), learning_rate =param, max_iter = 50000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_normal_lasso_nonstochastic = tuning(function, parameters, maker_objective_lasso(X_train, y_train, lmbda = lmbda))
print(results_normal_lasso_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
calculation limit exceeded
test parameter 1
error
test parameter 10
error
   Parameters  Result Objective  Number of Iterations
0    0.000001          8.980139               50000.0
1    0.000010          0.871942               50000.0
2    0.000100          0.239399               50000.0
3    0.001000          0.062085               50000.0
4    0.010000          0.062230               50000.0
5    0.100000          0.064887               50000.0
6    1.000000               inf                   0.0
7   10.000000               inf                   0.0


In [41]:
values_normal_lasso_nonstochastic = results_normal_lasso_nonstochastic[1][0.001]
print(values_normal_lasso_nonstochastic)

[ 0.00000000e+00 -2.59732813e-05 -1.59179595e-01  3.94478382e-05
  2.50245257e-05  4.81079302e-05]


In [42]:
#find learning rate for adam gradient descent lasso non stochastic
lmbda = 0.1
function = lambda param: gradient_descent_adam(X_train, y_train, maker_gradient_lasso(lmbda), learning_rate =param, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-7,max_iter = 50000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_adam_lasso_nonstochastic = tuning(function, parameters, maker_objective_lasso(X_train, y_train, lmbda = lmbda))
print(results_normal_lasso_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
calculation limit exceeded
test parameter 1
calculation limit exceeded
test parameter 10
calculation limit exceeded
   Parameters  Result Objective  Number of Iterations
0    0.000001          8.980139               50000.0
1    0.000010          0.871942               50000.0
2    0.000100          0.239399               50000.0
3    0.001000          0.062085               50000.0
4    0.010000          0.062230               50000.0
5    0.100000          0.064887               50000.0
6    1.000000               inf                   0.0
7   10.000000               inf                   0.0


In [43]:
values_normal_lasso_nonstochastic = results_normal_lasso_nonstochastic[1][0.001]
print(values_normal_lasso_nonstochastic)

[ 0.00000000e+00 -2.59732813e-05 -1.59179595e-01  3.94478382e-05
  2.50245257e-05  4.81079302e-05]


In [44]:
Lasso(alpha = 0.1).fit(X_train, y_train).coef_

array([ 0.       , -0.       , -0.1086299, -0.       , -0.       ,
       -0.       ])

In [45]:
#find learning rate for normal gradient descent ols stochastic
function = lambda param: gradient_descent_normal(X_train, y_train, gradient_ols, learning_rate =param, max_iter = 10000, precision = 0.000001, stochastic = True, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_normal_ols_stochastic = tuning(function, parameters, maker_objective_ols(X_train, y_train))
print(results_normal_ols_stochastic[0])

test parameter 1e-06
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
calculation limit exceeded
test parameter 1
test parameter 10
error
   Parameters  Result Objective  Number of Iterations
0    0.000001          2.400984                   1.0
1    0.000010          5.419392               10000.0
2    0.000100          5.930559               10000.0
3    0.001000          0.072172               10000.0
4    0.010000          0.048857               10000.0
5    0.100000          0.025540               10000.0
6    1.000000          0.025538                4469.0
7   10.000000               inf                   0.0


In [46]:
values_normal_ols_stochastic = results_normal_ols_stochastic[1][1]
print(values_normal_ols_stochastic)

[ 0.          0.00329015 -0.66959466 -0.00975837  0.47904672  0.00537725]


In [47]:
#find learning rate for momentum gradient descent ols
function = lambda param: gradient_descent_momentum(X_train, y_train, gradient_ols, learning_rate =param, momentum = 0.3, max_iter = 10000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_momentum_ols_nonstochastic = tuning(function, parameters, maker_objective_ols(X_train, y_train))
print(results_momentum_ols_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
test parameter 1
error
test parameter 10
error
   Parameters  Result Objective  Number of Iterations
0    0.000001          7.084845               10000.0
1    0.000010          6.774898               10000.0
2    0.000100          0.092126               10000.0
3    0.001000          0.055868               10000.0
4    0.010000          0.025627               10000.0
5    0.100000          0.025538                4368.0
6    1.000000               inf                   0.0
7   10.000000               inf                   0.0


In [48]:
values_momentum_ols_nonstochastic = results_momentum_ols_nonstochastic[1][0.1]
print(values_momentum_ols_nonstochastic)

[ 0.          0.00290373 -0.66959811 -0.00864841  0.47905     0.00460413]


In [49]:
#find learning rate for adagrad gradient descent ols
function = lambda param: gradient_descent_adagrad(X_train, y_train, gradient_ols, learning_rate =param, epsilon = 1e-7, max_iter = 10000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_adagrad_ols_nonstochastic = tuning(function, parameters, maker_objective_ols(X_train, y_train))
print(results_adagrad_ols_nonstochastic[0])

test parameter 1e-06
test parameter 1e-05
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
calculation limit exceeded
test parameter 1
test parameter 10
   Parameters  Result Objective  Number of Iterations
0    0.000001         24.641544                   5.0
1    0.000010         11.940741                 500.0
2    0.000100         21.821853               10000.0
3    0.001000         11.633731               10000.0
4    0.010000          0.164456               10000.0
5    0.100000          0.025690               10000.0
6    1.000000          0.025538                3378.0
7   10.000000          0.025538                3711.0


In [50]:
values_adagrad_ols_nonstochastic = results_adagrad_ols_nonstochastic[1][10]
print(values_adagrad_ols_nonstochastic)

[ 0.          0.00294851 -0.66959765 -0.00877704  0.47904956  0.00469377]


In [51]:
#find learning rate for rmsprop gradient descent ols
function = lambda param: gradient_descent_rmsprop(X_train, y_train, gradient_ols, learning_rate =param, epsilon = 1e-7, rho= 0.9, max_iter = 10000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_rmsprop_ols_nonstochastic = tuning(function, parameters, maker_objective_ols(X_train, y_train))
print(results_rmsprop_ols_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
calculation limit exceeded
test parameter 0.1
calculation limit exceeded
test parameter 1
calculation limit exceeded
test parameter 10
calculation limit exceeded
   Parameters  Result Objective  Number of Iterations
0    0.000001         34.785867               10000.0
1    0.000010         20.407346               10000.0
2    0.000100          0.263431               10000.0
3    0.001000          0.025542               10000.0
4    0.010000          0.025864               10000.0
5    0.100000          0.058106               10000.0
6    1.000000          3.282334               10000.0
7   10.000000        325.705101               10000.0


In [52]:
values_rmsprop_ols_nonstochastic = results_rmsprop_ols_nonstochastic[1][0.001]
print(values_rmsprop_ols_nonstochastic)

[ 0.          0.00359395 -0.66910239 -0.00870584  0.47954231  0.00548765]


In [53]:
#find learning rate for adam gradient descent ols
function = lambda param: gradient_descent_adam(X_train, y_train, gradient_ols, learning_rate =param, epsilon = 1e-7, beta_1= 0.9, beta_2= 0.999, max_iter = 10000, precision = 0.000001, stochastic = False, batch_size=100)
parameters =[10**i for i in range(-6, 2)]
results_adam_ols_nonstochastic = tuning(function, parameters, maker_objective_ols(X_train, y_train))
print(results_adam_ols_nonstochastic[0])

test parameter 1e-06
calculation limit exceeded
test parameter 1e-05
calculation limit exceeded
test parameter 0.0001
calculation limit exceeded
test parameter 0.001
calculation limit exceeded
test parameter 0.01
test parameter 0.1
test parameter 1
test parameter 10
   Parameters  Result Objective  Number of Iterations
0    0.000001         10.629663               10000.0
1    0.000010         15.830854               10000.0
2    0.000100          0.807611               10000.0
3    0.001000          0.028278               10000.0
4    0.010000          0.025538                5728.0
5    0.100000          0.025538                1998.0
6    1.000000          0.025538                 276.0
7   10.000000          0.025538                 557.0


In [54]:
values_adam_ols_nonstochastic = results_adam_ols_nonstochastic[1][10]
print(values_adam_ols_nonstochastic)

[ 0.          0.00310434 -0.66959632 -0.00922465  0.4790483   0.00500549]
