In [265]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# i think this combines everything that i did below so i'll remove/clean up the extra stuff above once i confirm this function works correctly

In [272]:
def get_losses(samples, dim, optimizer_list, criterion, test_runs):
    loss_list = []
    
    samples = samples #number of samples from each distribution
    n = dim

    for i in range(test_runs):
        
        test_run_loss = []
        
        # get random samples from normal(0,1) distribution
        x_dataset = torch.randn((samples, n))
        linear_factor = torch.randn(n, 1) # We need to use the same linear factor for the test data
        y_dataset = torch.matmul(x_dataset, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise

        # get testing samples
        x_test = torch.randn((samples, n))
        y_test = torch.matmul(x_test, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise. same linear factor as training data
        
        for optimizer in optimizer_list:
            
            # Linear regression model
            model = torch.nn.Sequential(
            torch.nn.Linear(n, 1)
            )
            
            train(optimizer, x_dataset, y_dataset, model, criterion)
            test_run_loss.append(test(model, x_test, y_test, criterion))
            
        loss_list.append(test_run_loss)
        
    return loss_list

In [273]:
"""
Calculate the average loss for each optimizer over several test runs.
"""
def calc_task_avg_loss(loss_list):
    avg_loss = len(loss_list[0])*[0]
    for test_run in range(len(loss_list)):
        for optimizer in range(len(loss_list[test_run])):
            avg_loss[optimizer] += loss_list[test_run][optimizer]

    for i in range(len(avg_loss)):
        avg_loss[i] /= len(loss_list) 
    
    return avg_loss

In [274]:
# list of optimizers to loop through
optimizer_list=[]
optimizer_list.append(optim.SGD(model.parameters(), lr=0.01))
optimizer_list.append(optim.SGD(model.parameters(), lr=0.01,momentum=0.9))
optimizer_list.append(optim.SGD(model.parameters(), lr=0.01,momentum=0.9,nesterov=True))
optimizer_list.append(optim.Adagrad(model.parameters(), lr=0.01))
optimizer_list.append(optim.RMSprop(model.parameters(), lr=0.01))
optimizer_list.append(optim.Adam(model.parameters(), lr=0.01))

# Loss
criterion = torch.nn.MSELoss()

In [275]:
all_losses = [] # store average losses for each type of parameterization
test_runs = 10

In [276]:
# exactly parameterized
samples = 100 #number of samples from each distribution
dim = 100

exact_param = get_losses(samples, dim, optimizer_list, criterion, test_runs)
all_losses.append(calc_task_avg_loss(exact_param))

loss = 103.57756042480469
Loss: 96.18888854980469
loss = 106.38829040527344
Loss: 98.74159240722656
loss = 107.45361328125
Loss: 98.94036865234375
loss = 105.07817077636719
Loss: 94.28800964355469
loss = 108.47374725341797
Loss: 99.03488159179688
loss = 104.93363189697266
Loss: 96.7685775756836
loss = 126.70720672607422
Loss: 98.69624328613281
loss = 129.86695861816406
Loss: 100.77242279052734
loss = 123.75627899169922
Loss: 100.0716323852539
loss = 123.70311737060547
Loss: 99.52873229980469
loss = 126.77615356445312
Loss: 105.5424575805664
loss = 127.20201873779297
Loss: 105.12014770507812
loss = 108.95611572265625
Loss: 94.65900421142578
loss = 106.96150207519531
Loss: 92.64361572265625
loss = 105.92987060546875
Loss: 95.18988037109375
loss = 105.5101547241211
Loss: 92.11715698242188
loss = 108.13522338867188
Loss: 96.44393920898438
loss = 106.72901153564453
Loss: 93.26188659667969
loss = 108.00425720214844
Loss: 88.32024383544922
loss = 104.83251953125
Loss: 88.68097686767578
loss =

KeyboardInterrupt: 

In [None]:
# overparameterized
samples = 100 #number of samples from each distribution
dim = 200

over_param = get_losses(samples, dim, optimizer_list, criterion, test_runs)
all_losses.append(calc_task_avg_loss(over_param))

In [None]:
# underparameterized
samples = 100 #number of samples from each distribution
dim = 3

under_param = get_losses(samples, dim, optimizer_list, criterion, test_runs)
all_losses.append(calc_task_avg_loss(under_param))

In [921]:
index = ['linear_regression_gaussian_exact_param','linear_regression_gaussian_over_param','linear_regression_gaussian_over_param']
col = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']
df = pd.DataFrame(data=all_losses, index=index, columns=col)
df

Unnamed: 0,SGD,Momentum,Nesterov,Adagrad,RMSProp,Adam
logistic_regression_gaussian_exact_param,4.17068,4.158137,4.142331,4.137851,4.140847,4.156715
logistic_regression_gaussian_over_param,4.060101,4.064485,4.097406,4.081241,4.067134,4.0751
logistic_regression_gaussian_over_param,4.501864,4.477825,4.482738,4.490011,4.51353,4.515857


In [None]:
print(all_losses)

# repetitive stuff below, will delete later

In [None]:
# Loss
loss = torch.nn.MSELoss()

In [None]:
def train(optimizer, x_dataset, y_dataset, model,loss):
    for t in range(1000):
        # Set the gradients to 0
        optimizer.zero_grad()
        
        # Compute the current predicted y from x_dataset
        y_predicted = model(x_dataset)

        current_loss = loss(y_predicted, y_dataset)
        
        # Compute the gradient of the loss with respect to A and b
        current_loss.backward()
        
        # Update A and b accordingly.
        optimizer.step()
    print(f"loss = {current_loss}")
#     print(f"t = {t}, loss = {current_loss}, A = {A.detach().numpy()}, b = {b.item()}")

In [None]:
def test(model, x_test, y_test, loss_fn):
    # Returns accuracy, loss.
    
    # Get predicted probability vectors from test data.
    y_predicted = model(x_test)

    loss = loss_fn(y_predicted, y_test)
    
    print('Loss: {}'.format(loss.item()))
    
    return loss.item()

In [None]:
all_loss = []

# Exactly parameterized
Number of parameters = number of samples

In [None]:
n = 100 # number of predictor variables
samples = 100

x_dataset = torch.randn((samples, n))
linear_factor = torch.randn(n, 1) # We need to use the same linear factor for the test data
y_dataset = torch.matmul(x_dataset, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise

In [None]:
x_test = torch.randn((samples, n))
y_test = torch.matmul(x_test, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise. same linear factor as training data

In [None]:
model = torch.nn.Sequential(
    torch.nn.Linear(n, 1)
)

In [None]:
exact_param_loss = []

## SGD

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
exact_param_loss.append(test_loss)

## SGD Momentum

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test, y_test, loss)
exact_param_loss.append(test_loss)

## SGD Nesterov

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9,nesterov=True)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
exact_param_loss.append(test_loss)

## Adagrad

In [None]:
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
exact_param_loss.append(test_loss)

## RMSprop

In [None]:
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
exact_param_loss.append(test_loss)

## Adam

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
exact_param_loss.append(test_loss)

In [None]:
all_loss.append(exact_param_loss)

# Overparameterized
Number of parameters >> number of samples

In [None]:
n = 30 # number of predictor variables
samples = 20

# get random samples from normal(0,1) distribution
x_dataset = torch.randn((samples, n))
linear_factor = torch.randn(n, 1) # We need to use the same linear factor for the test data
y_dataset = torch.matmul(x_dataset, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise

In [None]:
model = torch.nn.Sequential(
    torch.nn.Linear(n, 1)
)

In [None]:
x_test = torch.randn((samples, n))
y_test = torch.matmul(x_test, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise. same linear factor as training data

In [None]:
overparam_loss = []

## SGD

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
overparam_loss.append(test_loss)

## SGD Momentum

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
overparam_loss.append(test_loss)

## SGD Nesterov

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9,nesterov=True)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
overparam_loss.append(test_loss)

## Adagrad

In [None]:
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
overparam_loss.append(test_loss)

## RMSprop

In [None]:
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
overparam_loss.append(test_loss)

## Adam

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
overparam_loss.append(test_loss)

In [None]:
all_loss.append(overparam_loss)

# Underparameterized
Number of parameters << number of samples

In [None]:
n = 10 # number of predictor variables
samples = 100

# get random samples from normal(0,1) distribution
x_dataset = torch.randn((samples, n))
linear_factor = torch.randn(n, 1) # We need to use the same linear factor for the test data
y_dataset = torch.matmul(x_dataset, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise

In [None]:
model = torch.nn.Sequential(
    torch.nn.Linear(n, 1)
)

In [None]:
x_test = torch.randn((samples, n))
y_test = torch.matmul(x_test, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise. same linear factor as training data

In [None]:
underparam_loss = []

## SGD

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
underparam_loss.append(test_loss)

## SGD Momentum

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
underparam_loss.append(test_loss)

## SGD Nesterov

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9,nesterov=True)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
underparam_loss.append(test_loss)

## Adagrad

In [None]:
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
underparam_loss.append(test_loss)

## RMSprop

In [None]:
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
underparam_loss.append(test_loss)

## Adam

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(optimizer, x_dataset, y_dataset, model,loss)

In [None]:
test_loss = test(model, x_test,y_test, loss)
underparam_loss.append(test_loss)

In [None]:
all_loss.append(underparam_loss)

In [None]:
index = ['Linear regression random samples - exactly parameterized','Linear regression random samples - overparameterized','Linear regression random samples - underparameterized']
col = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']
df = pd.DataFrame(data=all_loss, index=index, columns=col)
df

In [None]:
df.to_csv('linear_regression_gaussian_loss.csv')

# Normalize results

In [None]:
all_loss = np.asarray(all_loss)
all_loss
normalized_test_losses = []

for i in range(len(all_loss)):
    mean = np.mean(all_loss[i])
    minus_mean = all_loss[i] - mean
    normalized_test_losses.append((minus_mean)/np.linalg.norm(minus_mean))
print(normalized_test_losses)

In [None]:
index = ['linear_regression_gaussian_exact_param','linear_regression_gaussian_over_param','linear_regression_gaussian_under_param']
col = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']
df = pd.DataFrame(data=normalized_test_losses, index=index, columns=col)
df

In [None]:
df.to_csv('linear_regression_gaussian_normalized_loss.csv')

# Overparameterized dimensions vs. losses graph

In [None]:
def get_losses(samples, dim, optimizer, criterion):
    samples = samples #number of samples from each distribution
    n = dim

    # get random samples from normal(0,1) distribution
    x_dataset = torch.randn((samples, n))
    linear_factor = torch.randn(n, 1) # We need to use the same linear factor for the test data
    y_dataset = torch.matmul(x_dataset, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise
    
    # get testing samples
    x_test = torch.randn((samples, n))
    y_test = torch.matmul(x_test, linear_factor) + 0.1*torch.randn((samples, 1)) # Linear transform + random noise. same linear factor as training data
    
    # Linear regression model
    model = torch.nn.Sequential(
    torch.nn.Linear(n, 1)
    )
    
    train(optimizer, x_dataset, y_dataset, model, criterion)
    return test(model, x_test, y_test, criterion)

In [None]:
optimizer_list=[]
optimizer_list.append(optim.SGD(model.parameters(), lr=0.01))
optimizer_list.append(optim.SGD(model.parameters(), lr=0.01,momentum=0.9))
optimizer_list.append(optim.SGD(model.parameters(), lr=0.01,momentum=0.9,nesterov=True))
optimizer_list.append(optim.Adagrad(model.parameters(), lr=0.01))
optimizer_list.append(optim.RMSprop(model.parameters(), lr=0.01))
optimizer_list.append(optim.Adam(model.parameters(), lr=0.01))

optimizer_names=['SGD', 'Momentum', 'Nesterov', 'Adagrad', 'RMSprop', 'Adam']

In [None]:
#INDIVIDUAL GRAPHS

samples = 100
criterion = torch.nn.MSELoss()

for opt in range(len(optimizer_list)):
    losses = []
    sample_sizes = []
    for dim in range(1,500,20):
        losses.append(get_losses(samples, dim, optimizer_list[opt], criterion))
        sample_sizes.append(dim)
    plt.plot(sample_sizes, losses)
    plt.title('Linear regression losses for ' + optimizer_names[opt]+' with '+str(samples)+' samples')
    plt.xlabel('Dimensions')
    plt.ylabel('Loss')
    plt.show()
#     plt.legend(['SGD', 'Momentum', 'Nesterov', 'Adagrad', 'RMSprop', 'Adam'], loc='upper left')

In [None]:
#OVERLAY GRAPH

samples = 100
criterion = torch.nn.MSELoss()

for optimizer in optimizer_list:
    losses = []
    sample_sizes = []
    for dim in range(1,500,20):
        losses.append(get_losses(samples, dim, optimizer, criterion))
        sample_sizes.append(dim)
    plt.plot(sample_sizes, losses)
    plt.title('Linear regression losses with '+str(samples)+' samples')
    plt.xlabel('Dimensions')
    plt.ylabel('Loss')
    plt.legend(['SGD', 'Momentum', 'Nesterov', 'Adagrad', 'RMSprop', 'Adam'], loc='upper left',bbox_to_anchor=(1, 0.5))