In [1]:
import torch, torch.nn as nn
torch.cuda.is_available()

True

In [4]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 4060 Laptop GPU'

In [5]:
import numpy as np
import os
from torch.utils.data import TensorDataset, DataLoader 
from torch.optim.lr_scheduler import StepLR
import matplotlib.pyplot as plt
import mnist1d
import random

In [7]:
args = mnist1d.data.get_dataset_args()
data = mnist1d.data.get_dataset(args, path = './mnist1d_data.pkl', download = False, regenerate=False)

# the training and test datasets, look at the formats
print("Examples in training set: {}".format(len(data['y'])))
print("Examples in test set: {}".format(len(data['y_test'])))
print("Length of each example: {}".format(data['x'].shape[-1]))

Successfully loaded data from ./mnist1d_data.pkl
Examples in training set: 4000
Examples in test set: 1000
Length of each example: 40


In [8]:
# Load in the data
train_data_x = data['x'].transpose()
train_data_y = data['y']
val_data_x = data['x_test'].transpose()
val_data_y = data['y_test']
# Print out sizes
print("Train data: %d examples (columns), each of which has %d dimensions (rows)"%((train_data_x.shape[1],train_data_x.shape[0])))
print("Validation data: %d examples (columns), each of which has %d dimensions (rows)"%((val_data_x.shape[1],val_data_x.shape[0])))

Train data: 4000 examples (columns), each of which has 40 dimensions (rows)
Validation data: 1000 examples (columns), each of which has 40 dimensions (rows)


In [22]:
def print_variance(name, data):

    # first dimension is batch elements
    # second dimension(columns) is neurons 
    np_data = data.detach().numpy()

    # compute variance across neurons and average these variances over the members of the batch
    neuron_variance = np.mean(np.var(np_data, axis = 0))

    # print out the name and the variance
    print("%s variance=%f"%(name, neuron_variance))
    

In [11]:
def weights_init(layer_in):
    if isinstance(layer_in, nn.Linear):
        nn.init.kaiming_uniform_(layer_in.weight)
        layer_in.bias.data.fill_(0.0)

In [16]:
def run_one_step_of_model(model, x_train, y_train):
    # choose cross-entropy loss function
    loss_function = nn.CrossEntropyLoss()

    # construct SGD optimizer and initialize learning rate and momentum
    optimizer = torch.optim.SGD(model.parameters(), lr = 0.05, momentum = 0.9)

    # load the data into a class that creates the batches
    data_loader = DataLoader(TensorDataset(x_train, y_train), batch_size = 200, shuffle = True, worker_init_fn = np.random.seed(1))

    # initialize model weights
    model.apply(weights_init)

    # get a batch
    for i, data in enumerate(data_loader):
        x_batch, y_batch = data
        optimizer.zero_grad()
        pred = model(x_batch)
        loss = loss_function(pred, y_batch)
        loss.backward()
        optimizer.step()
        # because this is only to see one step of the model
        break

In [17]:
# convert training data to torch tensors
x_train = torch.tensor(train_data_x.transpose().astype("float32"))
y_train = torch.tensor(train_data_y.astype("long"))

In [24]:
# a simple residual model with 5 residual branches in a row
class ResidualNetwork(torch.nn.Module):
    def __init__(self, input_size, output_size, hidden_size = 100):
        super(ResidualNetwork, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, hidden_size)
        self.linear4 = nn.Linear(hidden_size, hidden_size)
        self.linear5 = nn.Linear(hidden_size, hidden_size)
        self.linear6 = nn.Linear(hidden_size, hidden_size)
        self.linear7 = nn.Linear(hidden_size, output_size)

    def count_params(self):
        return sum([p.view(-1).shape[0] for p in self.parameters()])

    def forward(self, x):
        print_variance("Input",x)
        f = self.linear1(x)
        print_variance("First preactivation",f)
        res1 = f+ self.linear2(f.relu())
        print_variance("After first residual connection",res1)
        res2 = res1 + self.linear3(res1.relu())
        print_variance("After second residual connection",res2)
        res3 = res2 + self.linear4(res2.relu())
        print_variance("After third residual connection",res3)
        res4 = res3 + self.linear5(res3.relu())
        print_variance("After fourth residual connection",res4)
        res5 = res4 + self.linear6(res4.relu())
        print_variance("After fifth residual connection",res5)
        return self.linear7(res5)

In [26]:
# define the model and run for one step
n_hidden = 100
n_input = 40
n_output = 10
model = ResidualNetwork(n_input, n_output, n_hidden)
run_one_step_of_model(model, x_train, y_train)

Input variance=1.003261
First preactivation variance=2.099782
After first residual connection variance=3.596879
After second residual connection variance=6.531073
After third residual connection variance=11.242975
After fourth residual connection variance=20.700207
After fifth residual connection variance=38.224503


RuntimeError: expected scalar type Long but found Int