In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt

# I think this combines everything I did below, so I'll remove/clean up the extra cells below once I confirm these functions work correctly

In [None]:
def train_test_trajectory(optimizer, model, x_dataset, y_dataset, x_test, y_test, criterion, epochs):
    """Train ``model`` with full-batch updates and record the test loss after each one.

    Args:
        optimizer: ``torch.optim`` optimizer; it must hold ``model``'s
            parameters, otherwise ``step()`` does not train ``model``.
        model: callable module applied to ``x_dataset`` and ``x_test``.
        x_dataset, y_dataset: training inputs and targets, used as one batch.
        x_test, y_test: held-out inputs and targets, evaluated after every step.
        criterion: loss callable ``criterion(prediction, target)`` returning a
            scalar tensor.
        epochs: number of optimizer steps to take.

    Returns:
        list[float]: the test loss after each step (empty when ``epochs`` is 0).
    """
    test_trajectory = []
    for _ in range(epochs):
        # One full-batch gradient step on the training data.
        optimizer.zero_grad()
        train_loss = criterion(model(x_dataset), y_dataset)
        train_loss.backward()
        optimizer.step()

        # Evaluation only: no autograd graph is needed for the test pass.
        with torch.no_grad():
            test_loss = criterion(model(x_test), y_test)
        test_trajectory.append(test_loss.item())

    return test_trajectory

In [None]:
def get_final_losses(samples, dim, optimizer_list, criterion, test_runs):
    """Return the final test loss of every optimizer for each independent run.

    For each run and each entry of ``optimizer_list`` a fresh two-class
    logistic-regression model is trained (via ``train``) on a newly sampled
    two-Gaussian dataset and scored (via ``test``) on a newly sampled test set.

    Args:
        samples: number of points drawn from each Gaussian, per split.
        dim: input dimensionality.
        optimizer_list: optimizer instances used as templates. Each one is
            re-created with the same class and ``defaults`` for the fresh
            model; per-parameter-group overrides are not carried over.
        criterion: loss passed to ``train`` and ``test``.
        test_runs: number of repetitions.

    Returns:
        list[list[float]]: ``test_runs`` rows with one test loss per optimizer.
    """
    output_dim = 2

    # means of the distributions
    mean1 = 0
    mean2 = 3 / math.sqrt(dim)

    def sample_split():
        """Draw ``samples`` points per Gaussian; label 1 for the first, 0 for the second."""
        points = np.vstack((np.random.normal(mean1, 1, size=(samples, dim)),
                            np.random.normal(mean2, 1, size=(samples, dim))))
        labels = [1] * int(samples) + [0] * int(samples)
        return (torch.tensor(points, dtype=torch.float32),
                torch.tensor(labels, dtype=torch.long))

    loss_list = []
    for _ in range(test_runs):
        test_run_loss = []
        for template in optimizer_list:
            # Logistic regression: one output per class (not per sample).
            model = torch.nn.Sequential(
                torch.nn.Linear(dim, output_dim),
                torch.nn.LogSoftmax(dim=1),
            )

            # The optimizer passed in was built around some other model's
            # parameters, so stepping it would leave this model untrained.
            # Rebuild the same optimizer type/hyperparameters for this model.
            optimizer = type(template)(model.parameters(), **template.defaults)

            x_dataset, y_dataset = sample_split()
            x_test, y_test = sample_split()

            train(samples, optimizer, model, x_dataset, y_dataset, criterion)
            test_run_loss.append(test(model, x_test, y_test, criterion))

        loss_list.append(test_run_loss)

    return loss_list

In [None]:
def get_trajectory_losses(samples, dim, test_runs, epochs):
    """Collect per-epoch test-loss trajectories for six optimizers over repeated runs.

    Every (run, optimizer) pair gets a freshly initialised two-class
    logistic-regression model, its own optimizer and newly sampled train/test
    data, so no trajectory is warm-started by an earlier one.

    Args:
        samples: number of points drawn from each Gaussian, per split.
        dim: input dimensionality.
        test_runs: number of repetitions.
        epochs: number of optimizer steps per trajectory.

    Returns:
        list[list[float]]: ``test_runs`` rows; each row concatenates the
        trajectories returned by ``train_test_trajectory`` for SGD, momentum,
        Nesterov, Adagrad, RMSprop and Adam, in that order.
    """
    output_dim = 2

    # means of the distributions
    mean1 = 0
    mean2 = 3 / math.sqrt(dim)

    criterion = nn.NLLLoss()

    # Factories rather than instances: an optimizer is tied to the parameters
    # it was constructed with, and each trajectory needs its own model.
    optimizer_factories = [
        lambda params: optim.SGD(params, lr=0.01),
        lambda params: optim.SGD(params, lr=0.01, momentum=0.9),
        lambda params: optim.SGD(params, lr=0.01, momentum=0.9, nesterov=True),
        lambda params: optim.Adagrad(params, lr=0.01),
        lambda params: optim.RMSprop(params, lr=0.01),
        lambda params: optim.Adam(params, lr=0.01),
    ]

    def sample_split():
        """Draw ``samples`` points per Gaussian; label 1 for the first, 0 for the second."""
        points = np.vstack((np.random.normal(mean1, 1, size=(samples, dim)),
                            np.random.normal(mean2, 1, size=(samples, dim))))
        labels = [1] * int(samples) + [0] * int(samples)
        return (torch.tensor(points, dtype=torch.float32),
                torch.tensor(labels, dtype=torch.long))

    loss_list = []
    for _ in range(test_runs):
        test_run_loss = []

        for make_optimizer in optimizer_factories:
            # Logistic regression: one output per class (not per sample).
            model = torch.nn.Sequential(
                torch.nn.Linear(dim, output_dim),
                torch.nn.LogSoftmax(dim=1),
            )
            optimizer = make_optimizer(model.parameters())

            x_dataset, y_dataset = sample_split()
            x_test, y_test = sample_split()

            test_run_loss.extend(
                train_test_trajectory(optimizer, model, x_dataset, y_dataset,
                                      x_test, y_test, criterion, epochs)
            )

        loss_list.append(test_run_loss)

    return loss_list

In [None]:
def calc_task_avg_loss(loss_list):
    """Calculate the average loss for each optimizer over several test runs.

    Args:
        loss_list: sequence of runs; each run is a sequence with one loss per
            optimizer. The first run fixes the number of columns.

    Returns:
        list: column-wise mean over the runs, or ``[]`` when there are no runs.

    Raises:
        IndexError: if a later run has more entries than the first one.
    """
    n_runs = len(loss_list)
    if n_runs == 0:
        # Nothing to average; the unguarded version failed on loss_list[0].
        return []

    totals = [0] * len(loss_list[0])
    for run_losses in loss_list:
        for column, value in enumerate(run_losses):
            totals[column] += value

    return [total / n_runs for total in totals]

In [None]:
# Fix both RNGs so Restart & Run All reproduces the same numbers:
# the data is drawn with np.random.normal, the model weights with torch.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

all_losses = []  # rows are appended by the three experiment cells below
test_runs = 10
output_dim = 2
epochs = 50

In [None]:
# exactly parameterized
# Exactly parameterized setting: 20 points per distribution in 20 dimensions.
samples, dim = 20, 20

exact_param = get_trajectory_losses(samples, dim, test_runs, epochs)

# One row per test run goes into the shared table.
all_losses.extend(exact_param)

In [None]:
# overparameterized
# Overparameterized setting: 10 points per distribution in 300 dimensions.
samples, dim = 10, 300

over_param = get_trajectory_losses(samples, dim, test_runs, epochs)

# One row per test run goes into the shared table.
all_losses.extend(over_param)

In [None]:
# underparameterized
# Underparameterized setting: 50 points per distribution in 3 dimensions.
samples, dim = 50, 3

under_param = get_trajectory_losses(samples, dim, test_runs, epochs)

# One row per test run goes into the shared table.
all_losses.extend(under_param)

In [None]:
# Row labels: (task, run). Column labels: (optimizer, epoch).
tasks = ['logistic_regression_gaussian_exact_param','logistic_regression_gaussian_over_param','logistic_regression_gaussian_under_param']
runs = range(test_runs)
optimizers = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']
epoch_ind = range(epochs)

df = pd.DataFrame(
    data=all_losses,
    index=pd.MultiIndex.from_product([tasks, runs]),
    columns=pd.MultiIndex.from_product([optimizers, epoch_ind]),
)

df

In [None]:
# Write the trajectory table to a CSV file (path is relative to the working directory).
df.to_csv('logistic_regression_gaussian_trajectory.csv')

In [None]:
# Summary table: mean final-epoch test loss per task (rows) and optimizer (columns).
index = ['logistic_regression_gaussian_exact_param','logistic_regression_gaussian_over_param','logistic_regression_gaussian_under_param']
col = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']

# all_losses has one row per (task, run) and one column per (optimizer, epoch),
# so it cannot be labelled as a 3 x 6 table directly. Keep the last epoch of
# every trajectory and average it over the runs.
trajectories = np.asarray(all_losses).reshape(len(index), test_runs, len(col), epochs)
final_losses = trajectories[:, :, :, -1].mean(axis=1)

df = pd.DataFrame(data=final_losses, index=index, columns=col)
df

# IGNORE EVERYTHING BELOW! Repetitive/testing code, will be deleted later

In [None]:
def train(n, optimizer, model, x_dataset, y_dataset, criterion,
          x_test=None, y_test=None, epochs=500, verbose=True):
    """Train ``model`` with full-batch updates and return the last training loss.

    Args:
        n: unused; kept so existing positional calls keep working.
        optimizer: ``torch.optim`` optimizer; it must hold ``model``'s
            parameters, otherwise ``step()`` does not train ``model``.
        model: callable module applied to ``x_dataset`` (and ``x_test``).
        x_dataset, y_dataset: training inputs and targets, used as one batch.
        criterion: loss callable returning a scalar tensor.
        x_test, y_test: optional held-out data. When both are given the test
            loss is evaluated after every step; the function no longer reads
            notebook-level ``x_test`` / ``y_test`` variables.
        epochs: number of optimizer steps (default 500, the previous fixed value).
        verbose: print the losses after every step.

    Returns:
        float: training loss computed in the final iteration (before its update).

    Raises:
        ValueError: if ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    evaluate_on_test = x_test is not None and y_test is not None

    for _ in range(epochs):
        # One full-batch gradient step on the training data.
        optimizer.zero_grad()
        current_loss = criterion(model(x_dataset), y_dataset)
        current_loss.backward()
        optimizer.step()

        if verbose:
            print(f"loss = {current_loss}")

        if evaluate_on_test:
            # Evaluation only: no autograd graph is needed for the test pass.
            with torch.no_grad():
                loss = criterion(model(x_test), y_test)
            if verbose:
                print('Loss: {}'.format(loss.item()))

    return current_loss.item()

In [None]:
def test(model, x_test, y_test, loss_fn):
    # Returns accuracy, loss.
    
    # Get predicted probability vectors from test data.
    y_predicted = model(x_test)
    
    loss = loss_fn(y_predicted, y_test)
    
    # Get index with highest probability.
    predicted_labels = torch.argmax(y_predicted, dim=1)
    
    correct = (predicted_labels == y_test).sum()
    
#     print('Accuracy: {}'.format(correct.item()/len(y_test)))
#     print('Loss: {}'.format(loss.item()))
    
    return loss.item()

In [None]:
# def test(model, x_dataset, y_dataset):
#     total = 0
#     correct = 0
#     ind = 0
#     loss = 0
#     for sample in x_dataset:
#         sample = sample.unsqueeze(dim=0) #add an extra dimension to sample point bc pytorch syntax; e.g. [0,0] -> [[0,0]]
        
#         output = model(sample) 
#         print("OUTPUT")
#         print(output)
#         print()
        
#         _, predicted = torch.max(output.data, 1) #use _ to discard first output; get class with highest probability
#         print("PREDICTED CLASS")
#         print(predicted)
#         print()
        
#         print("ACTUAL CLASS")
#         print(y_dataset[ind]) # actual class of sample point
        
#         # count number of correct classifications
#         print(predicted)
#         if predicted == y_dataset[ind]:
#             correct += 1
#             print("CORRECT******")
        
#         # trying to set labels correctly and find loss here; getting value error 
#         label = 0
#         if y_dataset[ind] == 1:
#             label = torch.tensor([1,0])
#         else: 
#             label = torch.tensor([0,1])
#         print("LABEL ", label)
#         print("OUTPUT ", output)
#         loss = criterion(output, label)
        
#         print(loss)    
    
#     total += y_dataset.size(0) # Total number of labels
#     acc = correct.item()/total
#     print('Loss: {}'.format(loss))
#     print('Test accuracy over {} data points: {}%'.format(total_data, test_acc * 100))
#     print()

In [None]:
# Overall loss table: the exact-, over- and under-parameterized sections below each append one row.
test_losses = []

# Exactly parameterized
Number of parameters = number of samples

In [None]:
# Exactly parameterized setting: 20 points per distribution, 20 features, 2 classes.
samples, dim, output_dim = 20, 20, 2

# Means of the two Gaussians; the second is shifted by 3/sqrt(dim) in every coordinate.
mean1, mean2 = 0, 3 / math.sqrt(dim)

In [None]:
# Training set: `samples` points from each of the two unit-variance Gaussians.
gaussian1_points = np.random.normal(mean1, 1, size=(samples, dim))
gaussian2_points = np.random.normal(mean2, 1, size=(samples, dim))
x_dataset = torch.Tensor(np.vstack((gaussian1_points, gaussian2_points)))

# Labels follow the stacking order: 1 for the first Gaussian, 0 for the second.
gaussian1_labels = [1] * int(samples)
gaussian2_labels = [0] * int(samples)
y_dataset = torch.tensor(gaussian1_labels + gaussian2_labels, dtype=torch.long)

In [None]:
# Test set: a fresh draw of `samples` points from each of the two Gaussians.
gaussian1_points = np.random.normal(mean1, 1, size=(samples, dim))
gaussian2_points = np.random.normal(mean2, 1, size=(samples, dim))
x_test = torch.Tensor(np.vstack((gaussian1_points, gaussian2_points)))

# Labels follow the stacking order: 1 for the first Gaussian, 0 for the second.
gaussian1_labels = [1] * int(samples)
gaussian2_labels = [0] * int(samples)
y_test = torch.tensor(gaussian1_labels + gaussian2_labels, dtype=torch.long)

In [None]:
# Logistic regression: a linear layer with one output per class, followed by log-softmax.
linear_layer = torch.nn.Linear(dim, output_dim)
model = torch.nn.Sequential(linear_layer, torch.nn.LogSoftmax(dim=1))

# NLLLoss is paired with the LogSoftmax at the end of the model.
criterion = nn.NLLLoss()

In [None]:
# Test losses for the exactly parameterized setting; the cells below append one value per optimizer.
# (This rebinds the name used earlier in the notebook for the trajectory results.)
exact_param = []

## SGD

In [None]:
def train_test_trajectory(optimizer, model, x_dataset, y_dataset, x_test, y_test, criterion, epochs):
    """Verbose variant: train ``model`` and record the test loss after every step.

    NOTE(review): this cell redefines ``train_test_trajectory`` from earlier in
    the notebook; once run, this printing version is the one that gets called.

    Args:
        optimizer: ``torch.optim`` optimizer; it must hold ``model``'s
            parameters, otherwise ``step()`` does not train ``model``.
        model: callable module applied to ``x_dataset`` and ``x_test``.
        x_dataset, y_dataset: training inputs and targets, used as one batch.
        x_test, y_test: held-out inputs and targets, evaluated after every step.
        criterion: loss callable returning a scalar tensor.
        epochs: number of optimizer steps to take (previously ignored in
            favour of a fixed 20).

    Returns:
        list[float]: the test loss after each of the ``epochs`` steps.
    """
    test_trajectory = []
    for _ in range(epochs):
        # One full-batch gradient step on the training data.
        optimizer.zero_grad()
        current_loss = criterion(model(x_dataset), y_dataset)
        current_loss.backward()
        optimizer.step()

        print(f"loss = {current_loss}")

        # Evaluation only: no autograd graph is needed for the test pass.
        with torch.no_grad():
            loss = criterion(model(x_test), y_test)

        print('Loss: {}'.format(loss.item()))
        test_trajectory.append(loss.item())

    return test_trajectory

In [None]:
# Try the trajectory helper with plain SGD; the cell displays the per-epoch test losses.
optimizer = optim.SGD(params=model.parameters(), lr=0.01)
sgd_test_trajectory = train_test_trajectory(
    optimizer, model, x_dataset, y_dataset, x_test, y_test, criterion, epochs
)
sgd_test_trajectory

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from whatever earlier training left in `model`.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the SGD-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
exact_param.append(test_loss)

## SGD Momentum

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the momentum-SGD-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
exact_param.append(test_loss)

## SGD Nesterov

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Nesterov-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
exact_param.append(test_loss)

## Adagrad

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Adagrad-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
exact_param.append(test_loss)

## RMSprop

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the RMSprop-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
exact_param.append(test_loss)

## Adam

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Adam-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
exact_param.append(test_loss)

In [None]:
# Append the exactly parameterized losses (one per optimizer) as a row of the overall loss list.
test_losses.append(exact_param)

# Overparameterized
Number of parameters >> number of samples

In [None]:
# Overparameterized setting: 50 points per distribution, 200 features, 2 classes.
samples, dim, output_dim = 50, 200, 2

# Means of the two Gaussians; the second is shifted by 3/sqrt(dim) in every coordinate.
mean1, mean2 = 0, 3 / math.sqrt(dim)

In [None]:
# Training set: `samples` points from each of the two unit-variance Gaussians.
gaussian1_points = np.random.normal(mean1, 1, size=(samples, dim))
gaussian2_points = np.random.normal(mean2, 1, size=(samples, dim))
x_dataset = torch.Tensor(np.vstack((gaussian1_points, gaussian2_points)))

# Labels follow the stacking order: 1 for the first Gaussian, 0 for the second.
gaussian1_labels = [1] * int(samples)
gaussian2_labels = [0] * int(samples)
y_dataset = torch.tensor(gaussian1_labels + gaussian2_labels, dtype=torch.long)

In [None]:
# Test set: a fresh draw of `samples` points from each of the two Gaussians.
gaussian1_points = np.random.normal(mean1, 1, size=(samples, dim))
gaussian2_points = np.random.normal(mean2, 1, size=(samples, dim))
x_test = torch.Tensor(np.vstack((gaussian1_points, gaussian2_points)))

# Labels follow the stacking order: 1 for the first Gaussian, 0 for the second.
gaussian1_labels = [1] * int(samples)
gaussian2_labels = [0] * int(samples)
y_test = torch.tensor(gaussian1_labels + gaussian2_labels, dtype=torch.long)

In [None]:
# Logistic regression model
# One output per class (output_dim), as in the exactly parameterized section.
# Sizing the layer by `samples` added output units for classes that never
# occur in the 0/1 labels and inflated the parameter count.
model = torch.nn.Sequential(
    torch.nn.Linear(dim, output_dim),
    torch.nn.LogSoftmax(dim=1)
)

# Use NLL since we include softmax as part of model
criterion = nn.NLLLoss()

In [None]:
# Test losses for the overparameterized setting; the cells below append one value per optimizer.
over_param = []

## SGD

In [None]:
# Re-initialise the linear layer (model[0]) so re-running this cell trains from
# fresh weights instead of continuing from whatever is currently in `model`.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the SGD-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
over_param.append(test_loss)

## SGD Momentum

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the momentum-SGD-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
over_param.append(test_loss)

## SGD Nesterov

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Nesterov-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
over_param.append(test_loss)

## Adagrad

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Adagrad-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
over_param.append(test_loss)

## RMSprop

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the RMSprop-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
over_param.append(test_loss)

## Adam

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Adam-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
over_param.append(test_loss)

In [None]:
# Append the overparameterized losses (one per optimizer) as a row of the overall loss list.
test_losses.append(over_param)

# Underparameterized
Number of parameters << number of samples

In [None]:
# Underparameterized setting: 50 points per distribution, 3 features, 2 classes.
samples, dim, output_dim = 50, 3, 2

# Means of the two Gaussians; the second is shifted by 3/sqrt(dim) in every coordinate.
mean1, mean2 = 0, 3 / math.sqrt(dim)

In [None]:
# Training set: `samples` points from each of the two unit-variance Gaussians.
gaussian1_points = np.random.normal(mean1, 1, size=(samples, dim))
gaussian2_points = np.random.normal(mean2, 1, size=(samples, dim))
x_dataset = torch.Tensor(np.vstack((gaussian1_points, gaussian2_points)))

# Labels follow the stacking order: 1 for the first Gaussian, 0 for the second.
gaussian1_labels = [1] * int(samples)
gaussian2_labels = [0] * int(samples)
y_dataset = torch.tensor(gaussian1_labels + gaussian2_labels, dtype=torch.long)

In [None]:
# Test set: a fresh draw of `samples` points from each of the two Gaussians.
gaussian1_points = np.random.normal(mean1, 1, size=(samples, dim))
gaussian2_points = np.random.normal(mean2, 1, size=(samples, dim))
x_test = torch.Tensor(np.vstack((gaussian1_points, gaussian2_points)))

# Labels follow the stacking order: 1 for the first Gaussian, 0 for the second.
gaussian1_labels = [1] * int(samples)
gaussian2_labels = [0] * int(samples)
y_test = torch.tensor(gaussian1_labels + gaussian2_labels, dtype=torch.long)

In [None]:
# Logistic regression model
# One output per class (output_dim), as in the exactly parameterized section.
# With Linear(dim, samples) this "underparameterized" model had
# dim*samples + samples = 200 parameters for 100 training points.
model = torch.nn.Sequential(
    torch.nn.Linear(dim, output_dim),
    torch.nn.LogSoftmax(dim=1)
)

# Use NLL since we include softmax as part of model
criterion = nn.NLLLoss()

In [None]:
# Test losses for the underparameterized setting; the cells below append one value per optimizer.
under_param = []

## SGD

In [None]:
# Re-initialise the linear layer (model[0]) so re-running this cell trains from
# fresh weights instead of continuing from whatever is currently in `model`.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the SGD-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
under_param.append(test_loss)

## SGD Momentum

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the momentum-SGD-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
under_param.append(test_loss)

## SGD Nesterov

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Nesterov-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
under_param.append(test_loss)

## Adagrad

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.Adagrad(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Adagrad-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
under_param.append(test_loss)

## RMSprop

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.RMSprop(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the RMSprop-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
under_param.append(test_loss)

## Adam

In [None]:
# Re-initialise the linear layer (model[0]) so this run starts from fresh weights
# instead of continuing from the previous optimizer's solution.
model[0].reset_parameters()
optimizer = optim.Adam(model.parameters(), lr=0.01)
train(samples, optimizer, model, x_dataset, y_dataset, criterion)

In [None]:
# Record the held-out loss of the Adam-trained model.
test_loss = test(model, x_test, y_test, loss_fn=criterion)
under_param.append(test_loss)

In [None]:
# Append the underparameterized losses (one per optimizer) as a row of the overall loss list.
test_losses.append(under_param)

In [None]:
# Row labels follow the order in which the settings were appended to test_losses:
# exact, over, under (the third label previously repeated "over_param").
index = ['logistic_regression_gaussian_exact_param','logistic_regression_gaussian_over_param','logistic_regression_gaussian_under_param']
col = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']
df = pd.DataFrame(data=test_losses, index=index, columns=col)
df

In [None]:
# Write the per-setting, per-optimizer test-loss table to a CSV file (path is relative to the working directory).
df.to_csv('logistic_regression_gaussian_loss.csv')

# Normalize results

In [None]:
# Show the raw (un-normalized) losses before they are centred and scaled below.
print(test_losses)

In [None]:
# Work on an array so each row (one setting) supports vector arithmetic.
test_losses = np.asarray(test_losses)

# For every setting: subtract the row mean, then scale the centred row to unit Euclidean norm.
normalized_test_losses = []
for setting_losses in test_losses:
    centered = setting_losses - np.mean(setting_losses)
    normalized_test_losses.append(centered / np.linalg.norm(centered))

print(normalized_test_losses)

In [None]:
# Row labels follow the order of the rows in test_losses: exact, over, under
# (the third label previously repeated "over_param").
index = ['logistic_regression_gaussian_exact_param','logistic_regression_gaussian_over_param','logistic_regression_gaussian_under_param']
col = ['SGD','Momentum','Nesterov','Adagrad','RMSProp','Adam']
df = pd.DataFrame(data=normalized_test_losses, index=index, columns=col)
df

In [None]:
# Write the normalized loss table to a CSV file (path is relative to the working directory).
df.to_csv('logistic_regression_gaussian_normalized_loss.csv')

# Overparameterized dimensions vs. losses graph

In [None]:
def get_losses(samples, dim, optimizer, criterion):
    """Train a fresh logistic-regression model on newly sampled data and return its test loss.

    Args:
        samples: number of points drawn from each Gaussian, per split.
        dim: input dimensionality.
        optimizer: optimizer instance used as a template. It is re-created
            with the same class and ``defaults`` for the model built here,
            because an optimizer constructed elsewhere holds another model's
            parameters and would never update this one. Per-parameter-group
            overrides are not carried over.
        criterion: loss passed to ``train`` and ``test``.

    Returns:
        The value returned by ``test`` for the freshly sampled test set.
    """
    output_dim = 2

    # means of the distributions
    mean1 = 0
    mean2 = 3 / math.sqrt(dim)

    def sample_split():
        """Draw ``samples`` points per Gaussian; label 1 for the first, 0 for the second."""
        points = np.vstack((np.random.normal(mean1, 1, size=(samples, dim)),
                            np.random.normal(mean2, 1, size=(samples, dim))))
        labels = [1] * int(samples) + [0] * int(samples)
        return (torch.tensor(points, dtype=torch.float32),
                torch.tensor(labels, dtype=torch.long))

    x_dataset, y_dataset = sample_split()
    x_test, y_test = sample_split()

    # Logistic regression: one output per class (not per sample).
    model = torch.nn.Sequential(
        torch.nn.Linear(dim, output_dim),
        torch.nn.LogSoftmax(dim=1),
    )

    # Same optimizer type and hyperparameters, but bound to this model.
    model_optimizer = type(optimizer)(model.parameters(), **optimizer.defaults)

    train(samples, model_optimizer, model, x_dataset, y_dataset, criterion)
    return test(model, x_test, y_test, criterion)

In [None]:
# Display names, in the same order as the optimizers below.
optimizer_names = ['SGD', 'Momentum', 'Nesterov', 'Adagrad', 'RMSprop', 'Adam']

# The six optimizers under comparison, all with lr=0.01 and built on the current `model`.
optimizer_list = [
    optim.SGD(model.parameters(), lr=0.01),
    optim.SGD(model.parameters(), lr=0.01, momentum=0.9),
    optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True),
    optim.Adagrad(model.parameters(), lr=0.01),
    optim.RMSprop(model.parameters(), lr=0.01),
    optim.Adam(model.parameters(), lr=0.01),
]

In [None]:
# INDIVIDUAL GRAPHS: one figure per optimizer, test loss as a function of input dimension.

samples = 100
criterion = nn.NLLLoss()

for opt, current_optimizer in enumerate(optimizer_list):
    # `sample_sizes` holds the x-axis values, i.e. the input dimensions tried.
    sample_sizes, losses = [], []
    for dim in range(1, 500, 20):
        sample_sizes.append(dim)
        losses.append(get_losses(samples, dim, current_optimizer, criterion))

    plt.plot(sample_sizes, losses)
    plt.title(f'Logistic regression losses for {optimizer_names[opt]} with {samples} samples')
    plt.xlabel('Dimensions')
    plt.ylabel('Loss')
    plt.show()

In [None]:
#OVERLAY GRAPH: all optimizers on one figure, test loss as a function of input dimension.

samples = 100
criterion = nn.NLLLoss()
dimensions = list(range(1, 500, 20))

for name, optimizer in zip(optimizer_names, optimizer_list):
    losses = [get_losses(samples, dim, optimizer, criterion) for dim in dimensions]
    # Label each curve as it is drawn so the legend cannot drift out of sync
    # with the plotting order.
    plt.plot(dimensions, losses, label=name)

# Figure-level decorations are set once, after all curves are drawn.
plt.title('Logistic regression losses with '+str(samples)+' samples')
plt.xlabel('Dimensions')
plt.ylabel('Loss')
plt.legend(loc='upper left', bbox_to_anchor=(1, 0.5))
plt.show()