Run MLP (first layer weights fixed) on mnist and compute bias and variance

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import queue
import os
import sys
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import copy 
import pandas as pd
class MLP(nn.Module):
    def __init__(self, p, d, o):
        """RF_models
        
        Args:
            p (int): the hidden size
            d (int): the input feature dimension
            o (int): the output dimension
        """
        super(MLP, self).__init__()

        self.fc1 = nn.Linear(d, p, bias=False)
        self.fc2 = nn.Linear(p, o, bias=False)
        self.p = p
        self.d = d
        self.o = o 
    def forward(self, x):
        o = F.relu(self.fc1(x))
        o = self.fc2(o)
        return o
class Ensemble_Two_Layer_NN(object):
    def __init__(self, n_classifiers, p, d=784, o=10):
        """Ensemble_Two_Layer_NN
        
        Args:
            p (int): the hidden size
            d (int, optional): the input feature dimension
            o (int, optional): the output dimension
            coef (float, optional): the ridge regression penalty coefficient
        """
        self.n_classifiers = n_classifiers
        self.p = p
        self.d = d 
        self.o = o 
        self.coef = coef
        self.learners = queue.LifoQueue(maxsize = self.n_classifiers)
        self.MODEL_TYPE = MLP
    def __len__(self):
        return len(self.learners.queue)
    
    def put_model_rho(self, model, rho):
        self.learners.put([model, rho])
    def get_init_model(self, cuda=True):
        model = self.MODEL_TYPE(self.p, self.d, self.o)
        if cuda:
            model.cuda()
        return model
    def cuda(self):
        if len(self) == 0:
            return 
        else:
            for model, rho in self.learners.queue:
                model.cuda()
            return
    def train(self):
        if len(self)!=0:
            for model, rho in self.learners.queue:
                model.train()
    def eval(self):
        if len(self)!=0:
            for model, rho in self.learners.queue:
                model.eval()
    def forward(self, x):
        Bs = x.size(0)
        if len(self) == 0:
            zeros = torch.zeros(Bs, self.o)
            zeros = zeros.to(x.device)
            return zeros
        else:
            outputs = torch.zeros(Bs, self.o)
            outputs = outputs.to(x.device) 
            for model, rho in self.learners.queue:
                output = model(x)
                outputs += rho*output
            return outputs


In [2]:
def get_subsample_dataset(trainset, subset):
    trainsubset = copy.deepcopy(trainset)
    trainsubset.data = [trainsubset.data[index] for index in subset]
    trainsubset.targets = [trainsubset.targets[index] for index in subset]
    return trainsubset
def fix_width_number(width, n_classifiers):
    return max(1, width//n_classifiers)

# Training
def train(net, trainset, permute_index, train_size, num_iters, lr, batch_size, coef):
    net.train()
    subsample_indexes = np.random.choice(permute_index, size=train_size)
    trainsubset = get_subsample_dataset(trainset, subsample_indexes)
    trainloader = torch.utils.data.DataLoader(trainsubset, batch_size=batch_size, shuffle=True)

    for i_c in range(net.n_classifiers):
        i_iter = 0
        model = net.get_init_model(cuda=True)
        rho = 1/net.n_classifiers
        optimizer = torch.optim.SGD(model.fc2.parameters(), lr=lr)
        #lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = num_iters//3, gamma = 0.1)
        while i_iter < num_iters:
            for inputs, targets in trainloader:
                Bs = inputs.size(0)
                inputs = inputs.reshape(Bs, -1)
                inputs, targets = inputs.cuda(), targets.cuda()
                targets_onehot = torch.FloatTensor(targets.size(0), net.o).cuda()
                targets_onehot.zero_()
                targets_onehot.scatter_(1, targets.view(-1, 1).long(), 1)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets_onehot)
                l2_reg = 0
                for param in model.fc2.parameters():
                    l2_reg += coef * torch.norm(param)
                mse_loss = loss.item()
                l2_loss = l2_reg.item()
                loss += l2_reg
                loss.backward()
                optimizer.step()
                #lr_scheduler.step()
                string = "Train {} model: Iters [{}/{}] mse: {:.4f}, l2_loss: {:.4f}, train_loss:{:.4f}".format(i_c+1, i_iter, num_iters,  mse_loss, l2_loss, loss.item())
                sys.stdout.write(string+"\r")
                sys.stdout.flush()
                i_iter +=1
        net.put_model_rho(model, rho)
    # after all models were trained, estimate the mse error
    train_loss = 0
    correct = 0
    total = 0
    for inputs, targets in trainloader:
        Bs = inputs.size(0)
        inputs = inputs.reshape(Bs, -1)
        inputs, targets = inputs.cuda(), targets.cuda()
        targets_onehot = torch.FloatTensor(targets.size(0), net.o).cuda()
        targets_onehot.zero_()
        targets_onehot.scatter_(1, targets.view(-1, 1).long(), 1)
        outputs = net.forward(inputs)
        loss = criterion(outputs, targets_onehot)
        train_loss = loss.item() * outputs.numel()
        _, predicted = outputs.max(1)
        correct = predicted.eq(targets).sum().item()
        total = targets.size(0)
    return train_loss/ total , 100. * correct / total

# Test
def test(net, testloader):
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.cuda(), targets.cuda()
            Bs = inputs.size(0)
            inputs = inputs.reshape(Bs, -1)
            targets_onehot = torch.FloatTensor(targets.size(0), net.o).cuda()
            targets_onehot.zero_()
            targets_onehot.scatter_(1, targets.view(-1, 1).long(), 1)
            outputs = net.forward(inputs)
            loss = criterion(outputs, targets_onehot)
            test_loss += loss.item() * outputs.numel()
            _, predicted = outputs.max(1)
            correct += predicted.eq(targets).sum().item()
            total += targets.size(0)
    return test_loss / total, 100. * correct / total

def compute_bias_variance(net, testloader, trial, OUTPUST_SUM, OUTPUTS_SUMNORMSQUARED):
    net.eval()
    bias2 = 0
    variance = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.cuda(), targets.cuda()
            Bs = inputs.size(0)
            inputs = inputs.reshape(Bs, -1)
            targets_onehot = torch.FloatTensor(targets.size(0), net.o).cuda()
            targets_onehot.zero_()
            targets_onehot.scatter_(1, targets.view(-1, 1).long(), 1)
            outputs = net.forward(inputs)
            OUTPUST_SUM[total:(total + targets.size(0)), :] += outputs
            OUTPUTS_SUMNORMSQUARED[total:total + targets.size(0)] += outputs.norm(dim=1) ** 2.0

            bias2 += (OUTPUST_SUM[total:total + targets.size(0), :] / (trial + 1) - targets_onehot).norm() ** 2.0
            variance += OUTPUTS_SUMNORMSQUARED[total:total + targets.size(0)].sum()/(trial + 1) - (OUTPUST_SUM[total:total + targets.size(0), :]/(trial + 1)).norm() ** 2.0
            total += targets.size(0)

    return bias2 / total, variance / total


In [3]:
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform_train)
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=4)
# loss definition
criterion = nn.MSELoss(reduction='mean').cuda()

def run_exps_sgd(train_sizes, N_Ds, P_Ns, trainset, test_size, feature_dim, num_classes, num_trials, coef,
             outdir, save_csv, num_iters, lr, batch_size_list, K = 1):
    df = pd.DataFrame()
    for batch_size in batch_size_list:
        for train_size in train_sizes:
            hidden_sizes = P_Ns * train_size
            hidden_sizes = np.unique([int(np.around(x)) for x in hidden_sizes])
            for hidden_size in hidden_sizes:
                TRAIN_ACC_SUM = 0.0
                TEST_ACC_SUM = 0.0
                TRAIN_LOSS_SUM = 0.0
                TEST_LOSS_SUM = 0.0
                permute_index = np.random.permutation(len(trainset))
                OUTPUST_SUM = torch.Tensor(test_size, num_classes).zero_().cuda()
                OUTPUTS_SUMNORMSQUARED = torch.Tensor(test_size).zero_().cuda()
                for trial in range(num_trials):
                    net = Ensemble_Two_Layer_NN(n_classifiers = K, p = fix_width_number(hidden_size, K), d=feature_dim, o=num_classes)
                    net.cuda()
                    train_loss, train_acc = train(net, trainset, permute_index, train_size,
                                                 num_iters, lr, batch_size, coef)
                    test_loss, test_acc = test(net, testloader)

                    TRAIN_LOSS_SUM += train_loss
                    TEST_LOSS_SUM += test_loss
                    TRAIN_ACC_SUM += train_acc
                    TEST_ACC_SUM += test_acc

                    # compute bias and variance
                    bias2, variance = compute_bias_variance(net, testloader, trial, OUTPUST_SUM, OUTPUTS_SUMNORMSQUARED)
                    variance_unbias = variance * num_trials / (num_trials - 1.0)
                    bias2_unbias = TEST_LOSS_SUM / (trial + 1) - variance_unbias
                    print('Train size: [{}] hidden size: [{}] batch size: [{}] trial: {}, train_loss: {:.6f}, train acc: {}, test loss: {:.6f}, test acc: {}, bias2: {}, variance: {}'.format(
                        train_size, hidden_size, batch_size,
                        trial, TRAIN_LOSS_SUM / (trial + 1), TRAIN_ACC_SUM / (trial + 1), TEST_LOSS_SUM / (trial + 1),
                        TEST_ACC_SUM / (trial + 1), bias2_unbias, variance_unbias))
                    torch.cuda.empty_cache()
                print('#'*50)
                df = df.append({'train_size': train_size, 'hidden_size':hidden_size, 'batch_size': batch_size,
                                'train_loss': TRAIN_LOSS_SUM / (trial + 1), 'train_acc': TRAIN_ACC_SUM / (trial + 1),
                                'test_loss': TEST_LOSS_SUM / (trial + 1), 'test_acc': TEST_ACC_SUM / (trial + 1), 
                               'variance': variance_unbias.item(),
                               'bias2': bias2_unbias.item()}, ignore_index=True)
                df.to_csv(os.path.join(outdir, save_csv))
    df.to_csv(os.path.join(outdir, save_csv))

In [None]:
num_classes = 10
num_trials = 50
coef = 0.001
N_Ds = [1]
feature_dim = 784
lr = 0.01 
batch_size_list = [10, 784]
train_sizes = [int(np.around(x*feature_dim)) for x in N_Ds]
test_size = 10000
P_Ns = 10** np.linspace(-2, 1, 50)

for num_iters in [500, 5000]:
    outdir = 'mnist_SGD/num_iters_{}_coef={}'.format(num_iters, coef)
    print(outdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    run_exps_sgd(train_sizes, N_Ds, P_Ns, trainset, test_size, feature_dim, num_classes, num_trials, coef,
             outdir, 'singleNN_output.csv', num_iters = num_iters, lr=lr, batch_size_list=batch_size_list,  K = 1)
    run_exps_sgd(train_sizes, N_Ds, P_Ns, trainset, test_size, feature_dim, num_classes, num_trials, coef,
                 outdir, 'ensembleNNK=2_output.csv', num_iters = num_iters, lr=lr, batch_size_list=batch_size_list,  K = 2)

mnist_SGD/num_iters_500_coef=0.001
Train size: [784] hidden size: [8] batch size: [10] trial: 0, train_loss: 1.204033, train acc: 0.0, test loss: 1.120322, test acc: 8.62, bias2: 1.1203222274780273, variance: -3.8925479506346505e-10
Train size: [784] hidden size: [8] batch size: [10] trial: 1, train_loss: 1.045916, train acc: 12.5, test loss: 1.061827, test acc: 10.95, bias2: 0.978081464767456, variance: 0.08374582976102829
Train size: [784] hidden size: [8] batch size: [10] trial: 2, train_loss: 1.080925, train acc: 8.333333333333334, test loss: 1.069227, test acc: 12.196666666666665, bias2: 0.9447241425514221, variance: 0.12450306862592697
Train size: [784] hidden size: [8] batch size: [10] trial: 3, train_loss: 1.076859, train acc: 12.5, test loss: 1.078112, test acc: 11.542499999999999, bias2: 0.9364995956420898, variance: 0.1416119635105133
Train size: [784] hidden size: [8] batch size: [10] trial: 4, train_loss: 1.124586, train acc: 10.0, test loss: 1.097511, test acc: 11.5639999

Train size: [784] hidden size: [8] batch size: [10] trial: 38, train_loss: 1.132124, train acc: 14.102564102564102, test loss: 1.132697, test acc: 10.90076923076923, bias2: 0.9025207757949829, variance: 0.2301764190196991
Train size: [784] hidden size: [8] batch size: [10] trial: 39, train_loss: 1.134295, train acc: 13.75, test loss: 1.132459, test acc: 10.96275, bias2: 0.9016899466514587, variance: 0.2307688444852829
Train size: [784] hidden size: [8] batch size: [10] trial: 40, train_loss: 1.132978, train acc: 13.414634146341463, test loss: 1.132258, test acc: 10.879999999999999, bias2: 0.9017515182495117, variance: 0.23050665855407715
Train size: [784] hidden size: [8] batch size: [10] trial: 41, train_loss: 1.132569, train acc: 13.095238095238095, test loss: 1.132137, test acc: 11.003095238095238, bias2: 0.9021018743515015, variance: 0.23003509640693665
Train size: [784] hidden size: [8] batch size: [10] trial: 42, train_loss: 1.135850, train acc: 12.790697674418604, test loss: 1.1

Train size: [784] hidden size: [9] batch size: [10] trial: 26, train_loss: 1.141658, train acc: 11.11111111111111, test loss: 1.141780, test acc: 11.015925925925924, bias2: 0.9157178401947021, variance: 0.22606220841407776
Train size: [784] hidden size: [9] batch size: [10] trial: 27, train_loss: 1.132208, train acc: 11.607142857142858, test loss: 1.141164, test acc: 10.982142857142856, bias2: 0.9160647392272949, variance: 0.225099578499794
Train size: [784] hidden size: [9] batch size: [10] trial: 28, train_loss: 1.132772, train acc: 12.068965517241379, test loss: 1.142178, test acc: 10.932413793103446, bias2: 0.9162260890007019, variance: 0.22595246136188507
Train size: [784] hidden size: [9] batch size: [10] trial: 29, train_loss: 1.128860, train acc: 12.5, test loss: 1.139393, test acc: 11.196333333333333, bias2: 0.9136346578598022, variance: 0.22575844824314117
Train size: [784] hidden size: [9] batch size: [10] trial: 30, train_loss: 1.132430, train acc: 12.903225806451612, test 

Train size: [784] hidden size: [10] batch size: [10] trial: 14, train_loss: 1.058351, train acc: 15.0, test loss: 1.114067, test acc: 11.677333333333333, bias2: 0.8914790749549866, variance: 0.22258810698986053
Train size: [784] hidden size: [10] batch size: [10] trial: 15, train_loss: 1.052210, train acc: 14.0625, test loss: 1.112781, test acc: 11.521875, bias2: 0.8893368244171143, variance: 0.2234443724155426
Train size: [784] hidden size: [10] batch size: [10] trial: 16, train_loss: 1.058893, train acc: 14.705882352941176, test loss: 1.112560, test acc: 11.375882352941176, bias2: 0.8894376158714294, variance: 0.22312255203723907
Train size: [784] hidden size: [10] batch size: [10] trial: 17, train_loss: 1.052693, train acc: 15.277777777777779, test loss: 1.107952, test acc: 11.764444444444443, bias2: 0.8862380981445312, variance: 0.22171422839164734
Train size: [784] hidden size: [10] batch size: [10] trial: 18, train_loss: 1.063205, train acc: 14.473684210526315, test loss: 1.10989

Train size: [784] hidden size: [12] batch size: [10] trial: 1, train_loss: 1.083749, train acc: 12.5, test loss: 1.157697, test acc: 12.36, bias2: 1.0130289793014526, variance: 0.14466798305511475
Train size: [784] hidden size: [12] batch size: [10] trial: 2, train_loss: 1.082794, train acc: 8.333333333333334, test loss: 1.140960, test acc: 12.42, bias2: 0.9670823216438293, variance: 0.1738772839307785
Train size: [784] hidden size: [12] batch size: [10] trial: 3, train_loss: 1.128002, train acc: 6.25, test loss: 1.135188, test acc: 13.4525, bias2: 0.9420695304870605, variance: 0.19311869144439697
Train size: [784] hidden size: [12] batch size: [10] trial: 4, train_loss: 1.142591, train acc: 10.0, test loss: 1.143471, test acc: 13.16, bias2: 0.9401010274887085, variance: 0.20336973667144775
Train size: [784] hidden size: [12] batch size: [10] trial: 5, train_loss: 1.145640, train acc: 8.333333333333334, test loss: 1.141009, test acc: 12.448333333333332, bias2: 0.936216413974762, varian

Train size: [784] hidden size: [12] batch size: [10] trial: 39, train_loss: 1.123286, train acc: 7.5, test loss: 1.122020, test acc: 11.354000000000003, bias2: 0.9055833220481873, variance: 0.21643655002117157
Train size: [784] hidden size: [12] batch size: [10] trial: 40, train_loss: 1.121111, train acc: 7.317073170731708, test loss: 1.121172, test acc: 11.42317073170732, bias2: 0.9039830565452576, variance: 0.21718914806842804
Train size: [784] hidden size: [12] batch size: [10] trial: 41, train_loss: 1.119098, train acc: 7.142857142857143, test loss: 1.121058, test acc: 11.485714285714288, bias2: 0.9028453826904297, variance: 0.21821270883083344
Train size: [784] hidden size: [12] batch size: [10] trial: 42, train_loss: 1.115707, train acc: 7.558139534883721, test loss: 1.119785, test acc: 11.55418604651163, bias2: 0.9025183320045471, variance: 0.21726687252521515
Train size: [784] hidden size: [12] batch size: [10] trial: 43, train_loss: 1.112116, train acc: 8.522727272727273, test

Train size: [784] hidden size: [14] batch size: [10] trial: 27, train_loss: 1.096996, train acc: 17.857142857142858, test loss: 1.080493, test acc: 13.233928571428573, bias2: 0.8571274280548096, variance: 0.22336605191230774
Train size: [784] hidden size: [14] batch size: [10] trial: 28, train_loss: 1.093051, train acc: 18.103448275862068, test loss: 1.083862, test acc: 13.23103448275862, bias2: 0.85953688621521, variance: 0.2243250459432602
Train size: [784] hidden size: [14] batch size: [10] trial: 29, train_loss: 1.087768, train acc: 17.5, test loss: 1.084341, test acc: 13.135666666666667, bias2: 0.8615744709968567, variance: 0.2227664589881897
Train size: [784] hidden size: [14] batch size: [10] trial: 30, train_loss: 1.092988, train acc: 16.93548387096774, test loss: 1.085417, test acc: 13.086774193548386, bias2: 0.8628588318824768, variance: 0.2225581854581833
Train size: [784] hidden size: [14] batch size: [10] trial: 31, train_loss: 1.090969, train acc: 16.40625, test loss: 1.0

Train size: [784] hidden size: [16] batch size: [10] trial: 14, train_loss: 1.069706, train acc: 11.666666666666666, test loss: 1.075754, test acc: 12.395999999999999, bias2: 0.882538378238678, variance: 0.19321519136428833
Train size: [784] hidden size: [16] batch size: [10] trial: 15, train_loss: 1.061128, train acc: 14.0625, test loss: 1.078062, test acc: 12.3325, bias2: 0.8834012150764465, variance: 0.1946612149477005
Train size: [784] hidden size: [16] batch size: [10] trial: 16, train_loss: 1.065216, train acc: 14.705882352941176, test loss: 1.081598, test acc: 12.23470588235294, bias2: 0.8837672472000122, variance: 0.1978311538696289
Train size: [784] hidden size: [16] batch size: [10] trial: 17, train_loss: 1.072595, train acc: 15.277777777777779, test loss: 1.081334, test acc: 12.294444444444444, bias2: 0.88181471824646, variance: 0.1995190531015396
Train size: [784] hidden size: [16] batch size: [10] trial: 18, train_loss: 1.069298, train acc: 14.473684210526315, test loss: 1

Train size: [784] hidden size: [18] batch size: [10] trial: 1, train_loss: 0.992941, train acc: 25.0, test loss: 1.108407, test acc: 10.254999999999999, bias2: 0.9942967891693115, variance: 0.11410989612340927
Train size: [784] hidden size: [18] batch size: [10] trial: 2, train_loss: 1.053018, train acc: 16.666666666666668, test loss: 1.093917, test acc: 11.06, bias2: 0.9526398777961731, variance: 0.14127711951732635
Train size: [784] hidden size: [18] batch size: [10] trial: 3, train_loss: 1.054478, train acc: 12.5, test loss: 1.075357, test acc: 11.9025, bias2: 0.9215575456619263, variance: 0.1537996232509613
Train size: [784] hidden size: [18] batch size: [10] trial: 4, train_loss: 1.013457, train acc: 15.0, test loss: 1.087301, test acc: 11.53, bias2: 0.9208225011825562, variance: 0.16647863388061523
Train size: [784] hidden size: [18] batch size: [10] trial: 5, train_loss: 1.018178, train acc: 16.666666666666668, test loss: 1.080758, test acc: 12.306666666666667, bias2: 0.90640300

Train size: [784] hidden size: [18] batch size: [10] trial: 39, train_loss: 1.030706, train acc: 14.375, test loss: 1.062446, test acc: 13.857, bias2: 0.8597567677497864, variance: 0.20268933475017548
Train size: [784] hidden size: [18] batch size: [10] trial: 40, train_loss: 1.031846, train acc: 14.634146341463415, test loss: 1.061082, test acc: 14.010243902439024, bias2: 0.8581013679504395, variance: 0.20298026502132416
Train size: [784] hidden size: [18] batch size: [10] trial: 41, train_loss: 1.033598, train acc: 14.285714285714286, test loss: 1.061889, test acc: 13.948095238095236, bias2: 0.8591501712799072, variance: 0.2027388960123062
Train size: [784] hidden size: [18] batch size: [10] trial: 42, train_loss: 1.030847, train acc: 13.953488372093023, test loss: 1.062552, test acc: 13.868837209302324, bias2: 0.8597719669342041, variance: 0.20278015732765198
Train size: [784] hidden size: [18] batch size: [10] trial: 43, train_loss: 1.029505, train acc: 13.636363636363637, test los

Train size: [784] hidden size: [21] batch size: [10] trial: 27, train_loss: 1.086295, train acc: 9.821428571428571, test loss: 1.048362, test acc: 15.059285714285716, bias2: 0.8413791656494141, variance: 0.20698300004005432
Train size: [784] hidden size: [21] batch size: [10] trial: 28, train_loss: 1.077047, train acc: 11.206896551724139, test loss: 1.047746, test acc: 15.054137931034484, bias2: 0.8412617444992065, variance: 0.2064843475818634
Train size: [784] hidden size: [21] batch size: [10] trial: 29, train_loss: 1.082843, train acc: 10.833333333333334, test loss: 1.048632, test acc: 15.040666666666668, bias2: 0.8415267467498779, variance: 0.20710527896881104
Train size: [784] hidden size: [21] batch size: [10] trial: 30, train_loss: 1.086871, train acc: 10.483870967741936, test loss: 1.049948, test acc: 14.906774193548388, bias2: 0.8431769609451294, variance: 0.2067708671092987
Train size: [784] hidden size: [21] batch size: [10] trial: 31, train_loss: 1.091327, train acc: 10.156

Train size: [784] hidden size: [24] batch size: [10] trial: 14, train_loss: 1.050522, train acc: 21.666666666666668, test loss: 1.047852, test acc: 15.562000000000001, bias2: 0.8496250510215759, variance: 0.19822664558887482
Train size: [784] hidden size: [24] batch size: [10] trial: 15, train_loss: 1.046935, train acc: 20.3125, test loss: 1.052134, test acc: 15.208125, bias2: 0.8527204394340515, variance: 0.19941382110118866
Train size: [784] hidden size: [24] batch size: [10] trial: 16, train_loss: 1.041263, train acc: 20.58823529411765, test loss: 1.053003, test acc: 15.22, bias2: 0.8525752425193787, variance: 0.20042817294597626
Train size: [784] hidden size: [24] batch size: [10] trial: 17, train_loss: 1.044550, train acc: 19.444444444444443, test loss: 1.053319, test acc: 15.006666666666668, bias2: 0.8534479141235352, variance: 0.19987055659294128
Train size: [784] hidden size: [24] batch size: [10] trial: 18, train_loss: 1.038777, train acc: 19.736842105263158, test loss: 1.0526

Train size: [784] hidden size: [28] batch size: [10] trial: 1, train_loss: 0.944503, train acc: 25.0, test loss: 0.995271, test acc: 18.86, bias2: 0.9013617038726807, variance: 0.09390953183174133
Train size: [784] hidden size: [28] batch size: [10] trial: 2, train_loss: 0.951991, train acc: 16.666666666666668, test loss: 0.999170, test acc: 17.87333333333333, bias2: 0.8813141584396362, variance: 0.11785539984703064
Train size: [784] hidden size: [28] batch size: [10] trial: 3, train_loss: 0.920703, train acc: 18.75, test loss: 0.993010, test acc: 19.85, bias2: 0.8533374071121216, variance: 0.13967260718345642
Train size: [784] hidden size: [28] batch size: [10] trial: 4, train_loss: 1.003970, train acc: 15.0, test loss: 1.013047, test acc: 18.852, bias2: 0.855333149433136, variance: 0.15771359205245972
Train size: [784] hidden size: [28] batch size: [10] trial: 5, train_loss: 1.024031, train acc: 12.5, test loss: 1.012093, test acc: 18.541666666666668, bias2: 0.8471009135246277, varia

Train size: [784] hidden size: [28] batch size: [10] trial: 39, train_loss: 1.018604, train acc: 13.125, test loss: 1.034468, test acc: 16.4335, bias2: 0.8290870189666748, variance: 0.20538057386875153
Train size: [784] hidden size: [28] batch size: [10] trial: 40, train_loss: 1.017923, train acc: 13.414634146341463, test loss: 1.034358, test acc: 16.43951219512195, bias2: 0.8289323449134827, variance: 0.2054256647825241
Train size: [784] hidden size: [28] batch size: [10] trial: 41, train_loss: 1.020717, train acc: 13.69047619047619, test loss: 1.036117, test acc: 16.33095238095238, bias2: 0.8306313157081604, variance: 0.20548586547374725
Train size: [784] hidden size: [28] batch size: [10] trial: 42, train_loss: 1.019502, train acc: 13.372093023255815, test loss: 1.035760, test acc: 16.229999999999997, bias2: 0.8312628865242004, variance: 0.20449714362621307
Train size: [784] hidden size: [28] batch size: [10] trial: 43, train_loss: 1.016743, train acc: 13.636363636363637, test loss:

Train size: [784] hidden size: [32] batch size: [10] trial: 27, train_loss: 1.043700, train acc: 16.964285714285715, test loss: 1.013230, test acc: 17.56107142857143, bias2: 0.8157926797866821, variance: 0.19743679463863373
Train size: [784] hidden size: [32] batch size: [10] trial: 28, train_loss: 1.046680, train acc: 16.379310344827587, test loss: 1.011907, test acc: 17.692413793103448, bias2: 0.8147544860839844, variance: 0.19715283811092377
Train size: [784] hidden size: [32] batch size: [10] trial: 29, train_loss: 1.046867, train acc: 15.833333333333334, test loss: 1.010948, test acc: 17.688, bias2: 0.8144014477729797, variance: 0.19654697179794312
Train size: [784] hidden size: [32] batch size: [10] trial: 30, train_loss: 1.049247, train acc: 15.32258064516129, test loss: 1.011406, test acc: 17.600645161290323, bias2: 0.8159233331680298, variance: 0.19548259675502777
Train size: [784] hidden size: [32] batch size: [10] trial: 31, train_loss: 1.049690, train acc: 15.625, test loss

Train size: [784] hidden size: [37] batch size: [10] trial: 15, train_loss: 0.984128, train acc: 14.0625, test loss: 1.013133, test acc: 17.63375, bias2: 0.8226878643035889, variance: 0.1904446929693222
Train size: [784] hidden size: [37] batch size: [10] trial: 16, train_loss: 0.993340, train acc: 13.235294117647058, test loss: 1.014934, test acc: 17.425294117647056, bias2: 0.8249324560165405, variance: 0.19000160694122314
Train size: [784] hidden size: [37] batch size: [10] trial: 17, train_loss: 1.015071, train acc: 12.5, test loss: 1.012533, test acc: 17.58111111111111, bias2: 0.8223371505737305, variance: 0.19019605219364166
Train size: [784] hidden size: [37] batch size: [10] trial: 18, train_loss: 1.007637, train acc: 13.157894736842104, test loss: 1.009670, test acc: 17.72631578947368, bias2: 0.8214914798736572, variance: 0.18817868828773499
Train size: [784] hidden size: [37] batch size: [10] trial: 19, train_loss: 0.999600, train acc: 13.75, test loss: 1.007276, test acc: 17.

Train size: [784] hidden size: [43] batch size: [10] trial: 2, train_loss: 0.810418, train acc: 41.666666666666664, test loss: 0.985160, test acc: 18.19, bias2: 0.8611629605293274, variance: 0.12399719655513763
Train size: [784] hidden size: [43] batch size: [10] trial: 3, train_loss: 0.798824, train acc: 43.75, test loss: 0.994550, test acc: 18.67, bias2: 0.8513871431350708, variance: 0.1431632936000824
Train size: [784] hidden size: [43] batch size: [10] trial: 4, train_loss: 0.887699, train acc: 35.0, test loss: 0.996420, test acc: 17.992, bias2: 0.8443819284439087, variance: 0.15203773975372314
Train size: [784] hidden size: [43] batch size: [10] trial: 5, train_loss: 0.913316, train acc: 29.166666666666668, test loss: 0.997138, test acc: 17.498333333333335, bias2: 0.8410930633544922, variance: 0.1560450941324234
Train size: [784] hidden size: [43] batch size: [10] trial: 6, train_loss: 0.911613, train acc: 28.571428571428573, test loss: 0.999083, test acc: 17.42, bias2: 0.83758121

Train size: [784] hidden size: [43] batch size: [10] trial: 40, train_loss: 0.981363, train acc: 19.51219512195122, test loss: 0.991508, test acc: 19.626341463414633, bias2: 0.7972748875617981, variance: 0.19423359632492065
Train size: [784] hidden size: [43] batch size: [10] trial: 41, train_loss: 0.977401, train acc: 19.642857142857142, test loss: 0.992226, test acc: 19.473809523809525, bias2: 0.798336386680603, variance: 0.1938895881175995
Train size: [784] hidden size: [43] batch size: [10] trial: 42, train_loss: 0.975736, train acc: 19.767441860465116, test loss: 0.992491, test acc: 19.433488372093024, bias2: 0.7982658743858337, variance: 0.194225013256073
Train size: [784] hidden size: [43] batch size: [10] trial: 43, train_loss: 0.979304, train acc: 19.318181818181817, test loss: 0.993834, test acc: 19.293636363636363, bias2: 0.799594521522522, variance: 0.1942390352487564
Train size: [784] hidden size: [43] batch size: [10] trial: 44, train_loss: 0.977353, train acc: 19.4444444

Train size: [784] hidden size: [49] batch size: [10] trial: 28, train_loss: 0.928811, train acc: 29.310344827586206, test loss: 0.968502, test acc: 22.889655172413796, bias2: 0.7749244570732117, variance: 0.19357770681381226
Train size: [784] hidden size: [49] batch size: [10] trial: 29, train_loss: 0.926412, train acc: 30.0, test loss: 0.968560, test acc: 22.939666666666668, bias2: 0.7737197875976562, variance: 0.19484025239944458
Train size: [784] hidden size: [49] batch size: [10] trial: 30, train_loss: 0.926051, train acc: 29.032258064516128, test loss: 0.967446, test acc: 23.046451612903226, bias2: 0.7721718549728394, variance: 0.19527441263198853
Train size: [784] hidden size: [49] batch size: [10] trial: 31, train_loss: 0.920553, train acc: 29.6875, test loss: 0.967309, test acc: 23.105937500000003, bias2: 0.7715654373168945, variance: 0.19574397802352905
Train size: [784] hidden size: [49] batch size: [10] trial: 32, train_loss: 0.924861, train acc: 28.78787878787879, test loss

Train size: [784] hidden size: [56] batch size: [10] trial: 16, train_loss: 0.939530, train acc: 23.529411764705884, test loss: 0.944876, test acc: 25.672941176470587, bias2: 0.7601415514945984, variance: 0.18473435938358307
Train size: [784] hidden size: [56] batch size: [10] trial: 17, train_loss: 0.944759, train acc: 23.61111111111111, test loss: 0.945005, test acc: 25.443333333333335, bias2: 0.7601546049118042, variance: 0.18485046923160553
Train size: [784] hidden size: [56] batch size: [10] trial: 18, train_loss: 0.951526, train acc: 23.68421052631579, test loss: 0.947816, test acc: 24.98315789473684, bias2: 0.7621357440948486, variance: 0.18567994236946106
Train size: [784] hidden size: [56] batch size: [10] trial: 19, train_loss: 0.953072, train acc: 23.75, test loss: 0.950480, test acc: 24.771, bias2: 0.7641326785087585, variance: 0.18634752929210663
Train size: [784] hidden size: [56] batch size: [10] trial: 20, train_loss: 0.948467, train acc: 23.80952380952381, test loss: 0

Train size: [784] hidden size: [65] batch size: [10] trial: 3, train_loss: 0.802469, train acc: 56.25, test loss: 0.923496, test acc: 30.005000000000003, bias2: 0.7734870314598083, variance: 0.15000943839550018
Train size: [784] hidden size: [65] batch size: [10] trial: 4, train_loss: 0.801204, train acc: 55.0, test loss: 0.930826, test acc: 28.620000000000005, bias2: 0.7685441374778748, variance: 0.162281796336174
Train size: [784] hidden size: [65] batch size: [10] trial: 5, train_loss: 0.786730, train acc: 54.166666666666664, test loss: 0.929933, test acc: 28.683333333333337, bias2: 0.764243483543396, variance: 0.16568981111049652
Train size: [784] hidden size: [65] batch size: [10] trial: 6, train_loss: 0.804909, train acc: 53.57142857142857, test loss: 0.931913, test acc: 28.050000000000004, bias2: 0.761407732963562, variance: 0.17050519585609436
Train size: [784] hidden size: [65] batch size: [10] trial: 7, train_loss: 0.880972, train acc: 46.875, test loss: 0.932462, test acc: 2

Train size: [784] hidden size: [65] batch size: [10] trial: 41, train_loss: 0.925566, train acc: 30.357142857142858, test loss: 0.936022, test acc: 26.433809523809536, bias2: 0.7451562881469727, variance: 0.19086576998233795
Train size: [784] hidden size: [65] batch size: [10] trial: 42, train_loss: 0.926235, train acc: 30.813953488372093, test loss: 0.935723, test acc: 26.449302325581403, bias2: 0.7446738481521606, variance: 0.1910490095615387
Train size: [784] hidden size: [65] batch size: [10] trial: 43, train_loss: 0.927835, train acc: 30.681818181818183, test loss: 0.935691, test acc: 26.512954545454555, bias2: 0.7443085312843323, variance: 0.19138221442699432
Train size: [784] hidden size: [65] batch size: [10] trial: 44, train_loss: 0.923663, train acc: 31.11111111111111, test loss: 0.935271, test acc: 26.535777777777785, bias2: 0.7442985773086548, variance: 0.1909724920988083
Train size: [784] hidden size: [65] batch size: [10] trial: 45, train_loss: 0.923373, train acc: 30.978

Train size: [784] hidden size: [75] batch size: [10] trial: 28, train_loss: 0.915298, train acc: 27.586206896551722, test loss: 0.913587, test acc: 28.83896551724138, bias2: 0.7314231991767883, variance: 0.18216349184513092
Train size: [784] hidden size: [75] batch size: [10] trial: 29, train_loss: 0.914589, train acc: 27.5, test loss: 0.912759, test acc: 28.91866666666667, bias2: 0.7304732203483582, variance: 0.18228618800640106
Train size: [784] hidden size: [75] batch size: [10] trial: 30, train_loss: 0.910476, train acc: 27.419354838709676, test loss: 0.913214, test acc: 28.82903225806452, bias2: 0.7303594946861267, variance: 0.18285471200942993
Train size: [784] hidden size: [75] batch size: [10] trial: 31, train_loss: 0.907587, train acc: 28.125, test loss: 0.912784, test acc: 28.9759375, bias2: 0.729246199131012, variance: 0.18353740870952606
Train size: [784] hidden size: [75] batch size: [10] trial: 32, train_loss: 0.900991, train acc: 29.545454545454547, test loss: 0.913133, 

Train size: [784] hidden size: [86] batch size: [10] trial: 16, train_loss: 0.873607, train acc: 35.294117647058826, test loss: 0.883929, test acc: 32.83705882352941, bias2: 0.7129069566726685, variance: 0.17102178931236267
Train size: [784] hidden size: [86] batch size: [10] trial: 17, train_loss: 0.877552, train acc: 34.72222222222222, test loss: 0.883901, test acc: 32.83444444444444, bias2: 0.7114765644073486, variance: 0.17242403328418732
Train size: [784] hidden size: [86] batch size: [10] trial: 18, train_loss: 0.887137, train acc: 32.89473684210526, test loss: 0.883147, test acc: 32.89947368421053, bias2: 0.7103437185287476, variance: 0.17280323803424835
Train size: [784] hidden size: [86] batch size: [10] trial: 19, train_loss: 0.905195, train acc: 32.5, test loss: 0.883696, test acc: 32.874, bias2: 0.7104905843734741, variance: 0.1732049584388733
Train size: [784] hidden size: [86] batch size: [10] trial: 20, train_loss: 0.896487, train acc: 33.333333333333336, test loss: 0.88

Train size: [784] hidden size: [99] batch size: [10] trial: 3, train_loss: 0.808417, train acc: 43.75, test loss: 0.881921, test acc: 33.585, bias2: 0.7427623867988586, variance: 0.13915841281414032
Train size: [784] hidden size: [99] batch size: [10] trial: 4, train_loss: 0.815655, train acc: 45.0, test loss: 0.885008, test acc: 33.038, bias2: 0.7383334636688232, variance: 0.14667443931102753
Train size: [784] hidden size: [99] batch size: [10] trial: 5, train_loss: 0.817865, train acc: 50.0, test loss: 0.880989, test acc: 33.36666666666667, bias2: 0.7302241921424866, variance: 0.15076442062854767
Train size: [784] hidden size: [99] batch size: [10] trial: 6, train_loss: 0.834671, train acc: 46.42857142857143, test loss: 0.877238, test acc: 33.90714285714286, bias2: 0.7221453785896301, variance: 0.15509247779846191
Train size: [784] hidden size: [99] batch size: [10] trial: 7, train_loss: 0.827603, train acc: 43.75, test loss: 0.875436, test acc: 33.7275, bias2: 0.7189012765884399, va

Train size: [784] hidden size: [99] batch size: [10] trial: 41, train_loss: 0.844111, train acc: 40.476190476190474, test loss: 0.874849, test acc: 34.14976190476191, bias2: 0.6965570449829102, variance: 0.17829176783561707
Train size: [784] hidden size: [99] batch size: [10] trial: 42, train_loss: 0.842350, train acc: 40.116279069767444, test loss: 0.875228, test acc: 34.094883720930234, bias2: 0.6966922283172607, variance: 0.17853543162345886
Train size: [784] hidden size: [99] batch size: [10] trial: 43, train_loss: 0.847555, train acc: 39.77272727272727, test loss: 0.875044, test acc: 34.137727272727275, bias2: 0.6964812278747559, variance: 0.17856323719024658
Train size: [784] hidden size: [99] batch size: [10] trial: 44, train_loss: 0.845332, train acc: 40.0, test loss: 0.874872, test acc: 34.17666666666667, bias2: 0.6965411901473999, variance: 0.1783306896686554
Train size: [784] hidden size: [99] batch size: [10] trial: 45, train_loss: 0.842064, train acc: 40.21739130434783, te

Train size: [784] hidden size: [114] batch size: [10] trial: 29, train_loss: 0.887097, train acc: 30.833333333333332, test loss: 0.843660, test acc: 39.59333333333334, bias2: 0.6704445481300354, variance: 0.1732156127691269
Train size: [784] hidden size: [114] batch size: [10] trial: 30, train_loss: 0.876418, train acc: 32.25806451612903, test loss: 0.844372, test acc: 39.474193548387106, bias2: 0.6708307266235352, variance: 0.1735411286354065
Train size: [784] hidden size: [114] batch size: [10] trial: 31, train_loss: 0.865470, train acc: 34.375, test loss: 0.843753, test acc: 39.60500000000001, bias2: 0.6695733070373535, variance: 0.17417961359024048
Train size: [784] hidden size: [114] batch size: [10] trial: 32, train_loss: 0.866586, train acc: 34.09090909090909, test loss: 0.843778, test acc: 39.60696969696971, bias2: 0.669545590877533, variance: 0.17423273622989655
Train size: [784] hidden size: [114] batch size: [10] trial: 33, train_loss: 0.857072, train acc: 36.029411764705884

Train size: [784] hidden size: [131] batch size: [10] trial: 17, train_loss: 0.805462, train acc: 40.27777777777778, test loss: 0.823689, test acc: 42.01111111111111, bias2: 0.6593926548957825, variance: 0.16429655253887177
Train size: [784] hidden size: [131] batch size: [10] trial: 18, train_loss: 0.809004, train acc: 40.78947368421053, test loss: 0.823967, test acc: 41.95578947368421, bias2: 0.6588780879974365, variance: 0.16508927941322327
Train size: [784] hidden size: [131] batch size: [10] trial: 19, train_loss: 0.823193, train acc: 41.25, test loss: 0.825369, test acc: 41.75150000000001, bias2: 0.6591647267341614, variance: 0.1662038415670395
Train size: [784] hidden size: [131] batch size: [10] trial: 20, train_loss: 0.818604, train acc: 40.476190476190474, test loss: 0.825605, test acc: 41.736666666666665, bias2: 0.6589177846908569, variance: 0.1666867733001709
Train size: [784] hidden size: [131] batch size: [10] trial: 21, train_loss: 0.822442, train acc: 39.77272727272727,

Train size: [784] hidden size: [151] batch size: [10] trial: 4, train_loss: 0.792861, train acc: 40.0, test loss: 0.790641, test acc: 46.608000000000004, bias2: 0.6581280827522278, variance: 0.13251309096813202
Train size: [784] hidden size: [151] batch size: [10] trial: 5, train_loss: 0.781100, train acc: 41.666666666666664, test loss: 0.792684, test acc: 45.925000000000004, bias2: 0.6549063324928284, variance: 0.13777762651443481
Train size: [784] hidden size: [151] batch size: [10] trial: 6, train_loss: 0.806123, train acc: 39.285714285714285, test loss: 0.799905, test acc: 44.73142857142857, bias2: 0.6585438251495361, variance: 0.14136077463626862
Train size: [784] hidden size: [151] batch size: [10] trial: 7, train_loss: 0.820376, train acc: 37.5, test loss: 0.800513, test acc: 44.6575, bias2: 0.6576491594314575, variance: 0.14286361634731293
Train size: [784] hidden size: [151] batch size: [10] trial: 8, train_loss: 0.818086, train acc: 36.111111111111114, test loss: 0.803040, te

Train size: [784] hidden size: [151] batch size: [10] trial: 42, train_loss: 0.727220, train acc: 54.651162790697676, test loss: 0.792549, test acc: 46.32767441860464, bias2: 0.6290304660797119, variance: 0.16351833939552307
Train size: [784] hidden size: [151] batch size: [10] trial: 43, train_loss: 0.729150, train acc: 54.54545454545455, test loss: 0.793196, test acc: 46.24045454545453, bias2: 0.6296448707580566, variance: 0.1635509431362152
Train size: [784] hidden size: [151] batch size: [10] trial: 44, train_loss: 0.729883, train acc: 55.0, test loss: 0.793388, test acc: 46.22244444444443, bias2: 0.629638671875, variance: 0.16374975442886353
Train size: [784] hidden size: [151] batch size: [10] trial: 45, train_loss: 0.728059, train acc: 54.34782608695652, test loss: 0.793081, test acc: 46.31043478260868, bias2: 0.6290709376335144, variance: 0.16401048004627228
Train size: [784] hidden size: [151] batch size: [10] trial: 46, train_loss: 0.736047, train acc: 53.723404255319146, tes

Train size: [784] hidden size: [174] batch size: [10] trial: 30, train_loss: 0.780719, train acc: 43.54838709677419, test loss: 0.770488, test acc: 49.90709677419355, bias2: 0.608406662940979, variance: 0.16208158433437347
Train size: [784] hidden size: [174] batch size: [10] trial: 31, train_loss: 0.780537, train acc: 44.53125, test loss: 0.770996, test acc: 49.736875, bias2: 0.6090013980865479, variance: 0.16199447214603424
Train size: [784] hidden size: [174] batch size: [10] trial: 32, train_loss: 0.773900, train acc: 45.45454545454545, test loss: 0.770407, test acc: 49.89636363636363, bias2: 0.608522355556488, variance: 0.1618848294019699
Train size: [784] hidden size: [174] batch size: [10] trial: 33, train_loss: 0.770059, train acc: 46.3235294117647, test loss: 0.769666, test acc: 49.99617647058823, bias2: 0.6078739166259766, variance: 0.1617916375398636
Train size: [784] hidden size: [174] batch size: [10] trial: 34, train_loss: 0.777597, train acc: 45.714285714285715, test los

Train size: [784] hidden size: [201] batch size: [10] trial: 18, train_loss: 0.737266, train acc: 55.26315789473684, test loss: 0.743447, test acc: 52.82736842105263, bias2: 0.5921521186828613, variance: 0.15129472315311432
Train size: [784] hidden size: [201] batch size: [10] trial: 19, train_loss: 0.735873, train acc: 55.0, test loss: 0.742297, test acc: 52.936, bias2: 0.5908892154693604, variance: 0.15140804648399353
Train size: [784] hidden size: [201] batch size: [10] trial: 20, train_loss: 0.744321, train acc: 54.76190476190476, test loss: 0.743425, test acc: 52.75571428571429, bias2: 0.5913791060447693, variance: 0.15204554796218872
Train size: [784] hidden size: [201] batch size: [10] trial: 21, train_loss: 0.737456, train acc: 56.81818181818182, test loss: 0.744201, test acc: 52.740454545454554, bias2: 0.5918523073196411, variance: 0.1523488610982895
Train size: [784] hidden size: [201] batch size: [10] trial: 22, train_loss: 0.726915, train acc: 58.69565217391305, test loss: 

Train size: [784] hidden size: [231] batch size: [10] trial: 5, train_loss: 0.715771, train acc: 58.333333333333336, test loss: 0.726727, test acc: 55.51666666666666, bias2: 0.5983800292015076, variance: 0.128346785902977
Train size: [784] hidden size: [231] batch size: [10] trial: 6, train_loss: 0.720430, train acc: 57.142857142857146, test loss: 0.724753, test acc: 55.324285714285715, bias2: 0.5931917428970337, variance: 0.13156111538410187
Train size: [784] hidden size: [231] batch size: [10] trial: 7, train_loss: 0.719286, train acc: 59.375, test loss: 0.724589, test acc: 55.332499999999996, bias2: 0.5897013545036316, variance: 0.13488727807998657
Train size: [784] hidden size: [231] batch size: [10] trial: 8, train_loss: 0.725076, train acc: 55.55555555555556, test loss: 0.726868, test acc: 55.086666666666666, bias2: 0.5893543362617493, variance: 0.13751356303691864
Train size: [784] hidden size: [231] batch size: [10] trial: 9, train_loss: 0.742805, train acc: 50.0, test loss: 0.

Train size: [784] hidden size: [231] batch size: [10] trial: 42, train_loss: 0.700869, train acc: 58.13953488372093, test loss: 0.722739, test acc: 55.74209302325582, bias2: 0.5701666474342346, variance: 0.1525726318359375
Train size: [784] hidden size: [231] batch size: [10] trial: 43, train_loss: 0.702722, train acc: 58.52272727272727, test loss: 0.723000, test acc: 55.741136363636365, bias2: 0.57039475440979, variance: 0.15260551869869232
Train size: [784] hidden size: [231] batch size: [10] trial: 44, train_loss: 0.701953, train acc: 58.333333333333336, test loss: 0.723413, test acc: 55.732666666666674, bias2: 0.5704893469810486, variance: 0.15292410552501678
Train size: [784] hidden size: [231] batch size: [10] trial: 45, train_loss: 0.703615, train acc: 58.15217391304348, test loss: 0.723613, test acc: 55.707608695652176, bias2: 0.5706753730773926, variance: 0.1529376357793808
Train size: [784] hidden size: [231] batch size: [10] trial: 46, train_loss: 0.700643, train acc: 58.510

Train size: [784] hidden size: [266] batch size: [10] trial: 29, train_loss: 0.676242, train acc: 60.0, test loss: 0.695046, test acc: 59.72933333333333, bias2: 0.546562910079956, variance: 0.1484832763671875
Train size: [784] hidden size: [266] batch size: [10] trial: 30, train_loss: 0.674461, train acc: 59.67741935483871, test loss: 0.695014, test acc: 59.692580645161286, bias2: 0.546433687210083, variance: 0.1485801786184311
Train size: [784] hidden size: [266] batch size: [10] trial: 31, train_loss: 0.673659, train acc: 59.375, test loss: 0.694628, test acc: 59.670937499999994, bias2: 0.5459476709365845, variance: 0.1486799567937851
Train size: [784] hidden size: [266] batch size: [10] trial: 32, train_loss: 0.673804, train acc: 59.09090909090909, test loss: 0.695269, test acc: 59.563333333333325, bias2: 0.5467227101325989, variance: 0.14854639768600464
Train size: [784] hidden size: [266] batch size: [10] trial: 33, train_loss: 0.672498, train acc: 60.294117647058826, test loss: 0

Train size: [784] hidden size: [306] batch size: [10] trial: 16, train_loss: 0.626026, train acc: 64.70588235294117, test loss: 0.671860, test acc: 62.49647058823531, bias2: 0.533360481262207, variance: 0.13849905133247375
Train size: [784] hidden size: [306] batch size: [10] trial: 17, train_loss: 0.625953, train acc: 65.27777777777777, test loss: 0.671234, test acc: 62.48555555555557, bias2: 0.532437801361084, variance: 0.1387958526611328
Train size: [784] hidden size: [306] batch size: [10] trial: 18, train_loss: 0.627097, train acc: 65.78947368421052, test loss: 0.672206, test acc: 62.454210526315805, bias2: 0.5329301357269287, variance: 0.13927538692951202
Train size: [784] hidden size: [306] batch size: [10] trial: 19, train_loss: 0.638793, train acc: 65.0, test loss: 0.671123, test acc: 62.59600000000002, bias2: 0.531326413154602, variance: 0.13979651033878326
Train size: [784] hidden size: [306] batch size: [10] trial: 20, train_loss: 0.621897, train acc: 66.66666666666667, tes

Train size: [784] hidden size: [353] batch size: [10] trial: 3, train_loss: 0.702573, train acc: 56.25, test loss: 0.652396, test acc: 64.39999999999999, bias2: 0.5450870990753174, variance: 0.10730880498886108
Train size: [784] hidden size: [353] batch size: [10] trial: 4, train_loss: 0.678354, train acc: 65.0, test loss: 0.649915, test acc: 64.72, bias2: 0.53582763671875, variance: 0.11408726871013641
Train size: [784] hidden size: [353] batch size: [10] trial: 5, train_loss: 0.704409, train acc: 62.5, test loss: 0.650175, test acc: 65.01666666666667, bias2: 0.5312063694000244, variance: 0.11896886676549911
Train size: [784] hidden size: [353] batch size: [10] trial: 6, train_loss: 0.708633, train acc: 64.28571428571429, test loss: 0.651859, test acc: 64.47285714285714, bias2: 0.5291358232498169, variance: 0.12272291630506516
Train size: [784] hidden size: [353] batch size: [10] trial: 7, train_loss: 0.710897, train acc: 62.5, test loss: 0.650271, test acc: 64.63624999999999, bias2: 

Train size: [784] hidden size: [353] batch size: [10] trial: 41, train_loss: 0.653463, train acc: 63.69047619047619, test loss: 0.652499, test acc: 64.22809523809524, bias2: 0.5127179622650146, variance: 0.13978107273578644
Train size: [784] hidden size: [353] batch size: [10] trial: 42, train_loss: 0.656329, train acc: 62.7906976744186, test loss: 0.652458, test acc: 64.25325581395349, bias2: 0.5125740170478821, variance: 0.1398843675851822
Train size: [784] hidden size: [353] batch size: [10] trial: 43, train_loss: 0.658364, train acc: 62.5, test loss: 0.652993, test acc: 64.16772727272726, bias2: 0.5129818916320801, variance: 0.14001062512397766
Train size: [784] hidden size: [353] batch size: [10] trial: 44, train_loss: 0.661614, train acc: 61.666666666666664, test loss: 0.652583, test acc: 64.24822222222221, bias2: 0.5126466155052185, variance: 0.1399361491203308
Train size: [784] hidden size: [353] batch size: [10] trial: 45, train_loss: 0.658073, train acc: 62.5, test loss: 0.65

Train size: [784] hidden size: [406] batch size: [10] trial: 29, train_loss: 0.556413, train acc: 73.33333333333333, test loss: 0.630532, test acc: 66.99066666666667, bias2: 0.4960654377937317, variance: 0.13446645438671112
Train size: [784] hidden size: [406] batch size: [10] trial: 30, train_loss: 0.549970, train acc: 74.19354838709677, test loss: 0.631162, test acc: 66.91064516129033, bias2: 0.4965769648551941, variance: 0.1345854550600052
Train size: [784] hidden size: [406] batch size: [10] trial: 31, train_loss: 0.549526, train acc: 74.21875, test loss: 0.631498, test acc: 66.89031250000002, bias2: 0.4967639446258545, variance: 0.13473419845104218
Train size: [784] hidden size: [406] batch size: [10] trial: 32, train_loss: 0.546848, train acc: 74.24242424242425, test loss: 0.631818, test acc: 66.8257575757576, bias2: 0.49706393480300903, variance: 0.134754478931427
Train size: [784] hidden size: [406] batch size: [10] trial: 33, train_loss: 0.546081, train acc: 75.0, test loss: 0

Train size: [784] hidden size: [468] batch size: [10] trial: 17, train_loss: 0.534915, train acc: 77.77777777777777, test loss: 0.611202, test acc: 69.14888888888888, bias2: 0.4848952889442444, variance: 0.12630635499954224
Train size: [784] hidden size: [468] batch size: [10] trial: 18, train_loss: 0.536885, train acc: 78.94736842105263, test loss: 0.610991, test acc: 69.11631578947367, bias2: 0.4844403862953186, variance: 0.12655025720596313
Train size: [784] hidden size: [468] batch size: [10] trial: 19, train_loss: 0.536127, train acc: 80.0, test loss: 0.610878, test acc: 69.11299999999999, bias2: 0.4840010404586792, variance: 0.1268768161535263
Train size: [784] hidden size: [468] batch size: [10] trial: 20, train_loss: 0.538503, train acc: 79.76190476190476, test loss: 0.610383, test acc: 69.15095238095238, bias2: 0.4834281802177429, variance: 0.12695498764514923
Train size: [784] hidden size: [468] batch size: [10] trial: 21, train_loss: 0.538497, train acc: 79.54545454545455, t

Train size: [784] hidden size: [538] batch size: [10] trial: 5, train_loss: 0.510536, train acc: 83.33333333333333, test loss: 0.590432, test acc: 71.73833333333333, bias2: 0.47866806387901306, variance: 0.1117643341422081
Train size: [784] hidden size: [538] batch size: [10] trial: 6, train_loss: 0.497629, train acc: 82.14285714285714, test loss: 0.591438, test acc: 71.50428571428571, bias2: 0.4773346483707428, variance: 0.1141030415892601
Train size: [784] hidden size: [538] batch size: [10] trial: 7, train_loss: 0.508817, train acc: 81.25, test loss: 0.592757, test acc: 71.28375, bias2: 0.47745680809020996, variance: 0.11529989540576935
Train size: [784] hidden size: [538] batch size: [10] trial: 8, train_loss: 0.501426, train acc: 83.33333333333333, test loss: 0.592084, test acc: 71.51777777777778, bias2: 0.4752461910247803, variance: 0.11683767288923264
Train size: [784] hidden size: [538] batch size: [10] trial: 9, train_loss: 0.498539, train acc: 82.5, test loss: 0.592658, test 

Train size: [784] hidden size: [538] batch size: [10] trial: 43, train_loss: 0.533334, train acc: 73.86363636363636, test loss: 0.594011, test acc: 71.05636363636364, bias2: 0.46576958894729614, variance: 0.12824107706546783
Train size: [784] hidden size: [538] batch size: [10] trial: 44, train_loss: 0.529610, train acc: 74.44444444444444, test loss: 0.593965, test acc: 71.03666666666666, bias2: 0.4655400514602661, variance: 0.1284245252609253
Train size: [784] hidden size: [538] batch size: [10] trial: 45, train_loss: 0.529676, train acc: 74.45652173913044, test loss: 0.593889, test acc: 71.03565217391304, bias2: 0.4654706120491028, variance: 0.12841849029064178
Train size: [784] hidden size: [538] batch size: [10] trial: 46, train_loss: 0.527787, train acc: 75.0, test loss: 0.593826, test acc: 71.05723404255319, bias2: 0.46539297699928284, variance: 0.12843260169029236
Train size: [784] hidden size: [538] batch size: [10] trial: 47, train_loss: 0.527905, train acc: 75.0, test loss: 0

Train size: [784] hidden size: [620] batch size: [10] trial: 31, train_loss: 0.555564, train acc: 73.4375, test loss: 0.572892, test acc: 72.95843749999999, bias2: 0.4504297971725464, variance: 0.12246204912662506
Train size: [784] hidden size: [620] batch size: [10] trial: 32, train_loss: 0.554842, train acc: 72.72727272727273, test loss: 0.572716, test acc: 72.9351515151515, bias2: 0.45018258690834045, variance: 0.12253352254629135
Train size: [784] hidden size: [620] batch size: [10] trial: 33, train_loss: 0.554446, train acc: 72.79411764705883, test loss: 0.572987, test acc: 72.93911764705882, bias2: 0.45036637783050537, variance: 0.1226203590631485
Train size: [784] hidden size: [620] batch size: [10] trial: 34, train_loss: 0.563325, train acc: 71.42857142857143, test loss: 0.572913, test acc: 72.93771428571428, bias2: 0.4502269923686981, variance: 0.122686006128788
Train size: [784] hidden size: [620] batch size: [10] trial: 35, train_loss: 0.563518, train acc: 71.52777777777777,

Train size: [784] hidden size: [714] batch size: [10] trial: 19, train_loss: 0.450698, train acc: 86.25, test loss: 0.560344, test acc: 74.05250000000001, bias2: 0.442166805267334, variance: 0.11817687749862671
Train size: [784] hidden size: [714] batch size: [10] trial: 20, train_loss: 0.447087, train acc: 85.71428571428571, test loss: 0.559817, test acc: 74.0047619047619, bias2: 0.44151419401168823, variance: 0.11830317229032516
Train size: [784] hidden size: [714] batch size: [10] trial: 21, train_loss: 0.440920, train acc: 86.36363636363636, test loss: 0.560026, test acc: 73.92545454545456, bias2: 0.44134068489074707, variance: 0.1186850517988205
Train size: [784] hidden size: [714] batch size: [10] trial: 22, train_loss: 0.437936, train acc: 85.8695652173913, test loss: 0.560276, test acc: 73.90913043478261, bias2: 0.44148802757263184, variance: 0.11878758668899536
Train size: [784] hidden size: [714] batch size: [10] trial: 23, train_loss: 0.452303, train acc: 84.375, test loss: 

Train size: [784] hidden size: [822] batch size: [10] trial: 7, train_loss: 0.481749, train acc: 81.25, test loss: 0.544117, test acc: 76.01624999999999, bias2: 0.4362856447696686, variance: 0.10783151537179947
Train size: [784] hidden size: [822] batch size: [10] trial: 8, train_loss: 0.482076, train acc: 80.55555555555556, test loss: 0.543735, test acc: 75.92666666666666, bias2: 0.43426597118377686, variance: 0.10946856439113617
Train size: [784] hidden size: [822] batch size: [10] trial: 9, train_loss: 0.472617, train acc: 82.5, test loss: 0.543953, test acc: 75.854, bias2: 0.43359729647636414, variance: 0.11035605520009995
Train size: [784] hidden size: [822] batch size: [10] trial: 10, train_loss: 0.463144, train acc: 84.0909090909091, test loss: 0.544178, test acc: 75.71454545454544, bias2: 0.4329591393470764, variance: 0.11121874302625656
Train size: [784] hidden size: [822] batch size: [10] trial: 11, train_loss: 0.474339, train acc: 83.33333333333333, test loss: 0.543443, test

Train size: [784] hidden size: [822] batch size: [10] trial: 44, train_loss: 0.490843, train acc: 81.11111111111111, test loss: 0.544650, test acc: 75.6262222222222, bias2: 0.4259685277938843, variance: 0.11868184804916382
Train size: [784] hidden size: [822] batch size: [10] trial: 45, train_loss: 0.489208, train acc: 80.97826086956522, test loss: 0.544977, test acc: 75.56717391304345, bias2: 0.4262065291404724, variance: 0.11877019703388214
Train size: [784] hidden size: [822] batch size: [10] trial: 46, train_loss: 0.487571, train acc: 80.85106382978724, test loss: 0.544908, test acc: 75.57063829787232, bias2: 0.4261223077774048, variance: 0.1187860295176506
Train size: [784] hidden size: [822] batch size: [10] trial: 47, train_loss: 0.486730, train acc: 80.72916666666667, test loss: 0.544646, test acc: 75.58979166666664, bias2: 0.42600274085998535, variance: 0.11864370107650757
Train size: [784] hidden size: [822] batch size: [10] trial: 48, train_loss: 0.490267, train acc: 80.1020

Train size: [784] hidden size: [946] batch size: [10] trial: 32, train_loss: 0.460661, train acc: 81.81818181818181, test loss: 0.529450, test acc: 76.7548484848485, bias2: 0.41574573516845703, variance: 0.1137046292424202
Train size: [784] hidden size: [946] batch size: [10] trial: 33, train_loss: 0.457679, train acc: 81.61764705882354, test loss: 0.529974, test acc: 76.7214705882353, bias2: 0.41593027114868164, variance: 0.11404378712177277
Train size: [784] hidden size: [946] batch size: [10] trial: 34, train_loss: 0.458847, train acc: 81.42857142857143, test loss: 0.530153, test acc: 76.72885714285715, bias2: 0.41572749614715576, variance: 0.11442570388317108
Train size: [784] hidden size: [946] batch size: [10] trial: 35, train_loss: 0.451098, train acc: 81.94444444444444, test loss: 0.530184, test acc: 76.69277777777778, bias2: 0.4155452847480774, variance: 0.11463868618011475
Train size: [784] hidden size: [946] batch size: [10] trial: 36, train_loss: 0.459868, train acc: 80.405

Train size: [784] hidden size: [1089] batch size: [10] trial: 19, train_loss: 0.445875, train acc: 80.0, test loss: 0.518507, test acc: 77.70750000000001, bias2: 0.40704137086868286, variance: 0.11146591603755951
Train size: [784] hidden size: [1089] batch size: [10] trial: 20, train_loss: 0.441486, train acc: 80.95238095238095, test loss: 0.517902, test acc: 77.77190476190476, bias2: 0.40634405612945557, variance: 0.11155813187360764
Train size: [784] hidden size: [1089] batch size: [10] trial: 21, train_loss: 0.444564, train acc: 81.81818181818181, test loss: 0.517534, test acc: 77.79954545454547, bias2: 0.4058868885040283, variance: 0.11164726316928864
Train size: [784] hidden size: [1089] batch size: [10] trial: 22, train_loss: 0.438042, train acc: 82.6086956521739, test loss: 0.517938, test acc: 77.72000000000001, bias2: 0.4055732190608978, variance: 0.1123645007610321
Train size: [784] hidden size: [1089] batch size: [10] trial: 23, train_loss: 0.435981, train acc: 82.29166666666

Train size: [784] hidden size: [1254] batch size: [10] trial: 6, train_loss: 0.576218, train acc: 71.42857142857143, test loss: 0.504530, test acc: 78.79714285714286, bias2: 0.4060634672641754, variance: 0.0984666645526886
Train size: [784] hidden size: [1254] batch size: [10] trial: 7, train_loss: 0.538163, train acc: 75.0, test loss: 0.503223, test acc: 78.94375000000001, bias2: 0.4030042886734009, variance: 0.10021843016147614
Train size: [784] hidden size: [1254] batch size: [10] trial: 8, train_loss: 0.534624, train acc: 75.0, test loss: 0.503229, test acc: 78.97, bias2: 0.40128594636917114, variance: 0.10194332897663116
Train size: [784] hidden size: [1254] batch size: [10] trial: 9, train_loss: 0.512419, train acc: 77.5, test loss: 0.502617, test acc: 79.036, bias2: 0.399528831243515, variance: 0.10308828204870224
Train size: [784] hidden size: [1254] batch size: [10] trial: 10, train_loss: 0.481083, train acc: 79.54545454545455, test loss: 0.501862, test acc: 79.13727272727273,

Train size: [784] hidden size: [1254] batch size: [10] trial: 43, train_loss: 0.448423, train acc: 84.6590909090909, test loss: 0.504141, test acc: 79.09181818181817, bias2: 0.3928091526031494, variance: 0.1113317608833313
Train size: [784] hidden size: [1254] batch size: [10] trial: 44, train_loss: 0.443886, train acc: 85.0, test loss: 0.503993, test acc: 79.11822222222222, bias2: 0.3928366005420685, variance: 0.11115672439336777
Train size: [784] hidden size: [1254] batch size: [10] trial: 45, train_loss: 0.440130, train acc: 85.32608695652173, test loss: 0.504274, test acc: 79.07934782608694, bias2: 0.3929316997528076, variance: 0.11134223639965057
Train size: [784] hidden size: [1254] batch size: [10] trial: 46, train_loss: 0.440707, train acc: 85.1063829787234, test loss: 0.504264, test acc: 79.07021276595744, bias2: 0.3928728699684143, variance: 0.11139076203107834
Train size: [784] hidden size: [1254] batch size: [10] trial: 47, train_loss: 0.439043, train acc: 85.41666666666667

Train size: [784] hidden size: [1444] batch size: [10] trial: 30, train_loss: 0.421682, train acc: 88.70967741935483, test loss: 0.494799, test acc: 80.0067741935484, bias2: 0.38407811522483826, variance: 0.11072057485580444
Train size: [784] hidden size: [1444] batch size: [10] trial: 31, train_loss: 0.426029, train acc: 88.28125, test loss: 0.494753, test acc: 79.9859375, bias2: 0.38415879011154175, variance: 0.11059443652629852
Train size: [784] hidden size: [1444] batch size: [10] trial: 32, train_loss: 0.425209, train acc: 87.87878787878788, test loss: 0.494871, test acc: 79.98, bias2: 0.3841838836669922, variance: 0.1106867641210556
Train size: [784] hidden size: [1444] batch size: [10] trial: 33, train_loss: 0.429479, train acc: 88.23529411764706, test loss: 0.494746, test acc: 79.97470588235295, bias2: 0.38410982489585876, variance: 0.11063617467880249
Train size: [784] hidden size: [1444] batch size: [10] trial: 34, train_loss: 0.432779, train acc: 87.85714285714286, test loss

Train size: [784] hidden size: [1663] batch size: [10] trial: 17, train_loss: 0.438110, train acc: 84.72222222222223, test loss: 0.480045, test acc: 80.78611111111113, bias2: 0.3759210407733917, variance: 0.10412445664405823
Train size: [784] hidden size: [1663] batch size: [10] trial: 18, train_loss: 0.434147, train acc: 85.52631578947368, test loss: 0.479925, test acc: 80.76789473684212, bias2: 0.3755747079849243, variance: 0.10435032844543457
Train size: [784] hidden size: [1663] batch size: [10] trial: 19, train_loss: 0.432992, train acc: 86.25, test loss: 0.480022, test acc: 80.77750000000002, bias2: 0.37547412514686584, variance: 0.1045481339097023
Train size: [784] hidden size: [1663] batch size: [10] trial: 20, train_loss: 0.429280, train acc: 86.9047619047619, test loss: 0.479885, test acc: 80.79238095238097, bias2: 0.37534844875335693, variance: 0.1045365184545517
Train size: [784] hidden size: [1663] batch size: [10] trial: 21, train_loss: 0.431316, train acc: 86.36363636363

Train size: [784] hidden size: [1915] batch size: [10] trial: 4, train_loss: 0.404711, train acc: 85.0, test loss: 0.484901, test acc: 80.492, bias2: 0.3860829770565033, variance: 0.09881790727376938
Train size: [784] hidden size: [1915] batch size: [10] trial: 5, train_loss: 0.372305, train acc: 87.5, test loss: 0.484728, test acc: 80.68833333333335, bias2: 0.3828427791595459, variance: 0.1018853485584259
Train size: [784] hidden size: [1915] batch size: [10] trial: 6, train_loss: 0.369442, train acc: 85.71428571428571, test loss: 0.484289, test acc: 80.62428571428572, bias2: 0.3805115818977356, variance: 0.10377725958824158
Train size: [784] hidden size: [1915] batch size: [10] trial: 7, train_loss: 0.357837, train acc: 87.5, test loss: 0.483086, test acc: 80.68875, bias2: 0.37813618779182434, variance: 0.10495001077651978
Train size: [784] hidden size: [1915] batch size: [10] trial: 8, train_loss: 0.371047, train acc: 88.88888888888889, test loss: 0.481634, test acc: 80.858888888888

Train size: [784] hidden size: [1915] batch size: [10] trial: 42, train_loss: 0.383027, train acc: 89.53488372093024, test loss: 0.476645, test acc: 81.18, bias2: 0.36463651061058044, variance: 0.1120089590549469
Train size: [784] hidden size: [1915] batch size: [10] trial: 43, train_loss: 0.381047, train acc: 89.77272727272727, test loss: 0.476301, test acc: 81.18227272727273, bias2: 0.36444544792175293, variance: 0.11185585707426071
Train size: [784] hidden size: [1915] batch size: [10] trial: 44, train_loss: 0.384716, train acc: 89.44444444444444, test loss: 0.476135, test acc: 81.1848888888889, bias2: 0.3643832504749298, variance: 0.11175153404474258
Train size: [784] hidden size: [1915] batch size: [10] trial: 45, train_loss: 0.386912, train acc: 89.1304347826087, test loss: 0.476595, test acc: 81.16304347826087, bias2: 0.36430972814559937, variance: 0.11228539049625397
Train size: [784] hidden size: [1915] batch size: [10] trial: 46, train_loss: 0.384171, train acc: 89.3617021276

Train size: [784] hidden size: [2204] batch size: [10] trial: 29, train_loss: 0.376730, train acc: 90.0, test loss: 0.470650, test acc: 81.60633333333334, bias2: 0.3560020625591278, variance: 0.11464757472276688
Train size: [784] hidden size: [2204] batch size: [10] trial: 30, train_loss: 0.382105, train acc: 89.51612903225806, test loss: 0.470065, test acc: 81.65064516129033, bias2: 0.3556644916534424, variance: 0.11440016329288483
Train size: [784] hidden size: [2204] batch size: [10] trial: 31, train_loss: 0.386876, train acc: 89.0625, test loss: 0.470012, test acc: 81.6509375, bias2: 0.35556697845458984, variance: 0.11444514244794846
Train size: [784] hidden size: [2204] batch size: [10] trial: 32, train_loss: 0.384117, train acc: 89.39393939393939, test loss: 0.469973, test acc: 81.67454545454545, bias2: 0.35563698410987854, variance: 0.1143362820148468
Train size: [784] hidden size: [2204] batch size: [10] trial: 33, train_loss: 0.384847, train acc: 89.70588235294117, test loss: 

Train size: [784] hidden size: [2538] batch size: [10] trial: 16, train_loss: 0.336907, train acc: 88.23529411764706, test loss: 0.460861, test acc: 82.11352941176472, bias2: 0.3504468500614166, variance: 0.11041444540023804
Train size: [784] hidden size: [2538] batch size: [10] trial: 17, train_loss: 0.330720, train acc: 88.88888888888889, test loss: 0.460488, test acc: 82.0777777777778, bias2: 0.3501225411891937, variance: 0.11036523431539536
Train size: [784] hidden size: [2538] batch size: [10] trial: 18, train_loss: 0.337284, train acc: 89.47368421052632, test loss: 0.461148, test acc: 82.18157894736844, bias2: 0.3497510254383087, variance: 0.11139661073684692
Train size: [784] hidden size: [2538] batch size: [10] trial: 19, train_loss: 0.336149, train acc: 90.0, test loss: 0.461009, test acc: 82.11750000000002, bias2: 0.34981676936149597, variance: 0.11119236797094345
Train size: [784] hidden size: [2538] batch size: [10] trial: 20, train_loss: 0.338591, train acc: 89.28571428571

Train size: [784] hidden size: [2922] batch size: [10] trial: 3, train_loss: 0.303278, train acc: 87.5, test loss: 0.442692, test acc: 83.285, bias2: 0.36016905307769775, variance: 0.0825224369764328
Train size: [784] hidden size: [2922] batch size: [10] trial: 4, train_loss: 0.295671, train acc: 90.0, test loss: 0.445991, test acc: 83.066, bias2: 0.35570892691612244, variance: 0.09028183668851852
Train size: [784] hidden size: [2922] batch size: [10] trial: 5, train_loss: 0.305305, train acc: 91.66666666666667, test loss: 0.447040, test acc: 83.01666666666667, bias2: 0.35230934619903564, variance: 0.09473048150539398
Train size: [784] hidden size: [2922] batch size: [10] trial: 6, train_loss: 0.306617, train acc: 92.85714285714286, test loss: 0.446155, test acc: 83.07857142857142, bias2: 0.34864988923072815, variance: 0.0975046157836914
Train size: [784] hidden size: [2922] batch size: [10] trial: 7, train_loss: 0.323302, train acc: 90.625, test loss: 0.447410, test acc: 83.06625, bia

Train size: [784] hidden size: [2922] batch size: [10] trial: 41, train_loss: 0.339025, train acc: 91.66666666666667, test loss: 0.451514, test acc: 82.80452380952383, bias2: 0.3362269401550293, variance: 0.115287184715271
Train size: [784] hidden size: [2922] batch size: [10] trial: 42, train_loss: 0.334642, train acc: 91.86046511627907, test loss: 0.451636, test acc: 82.79000000000002, bias2: 0.33628860116004944, variance: 0.11534759402275085
Train size: [784] hidden size: [2922] batch size: [10] trial: 43, train_loss: 0.331241, train acc: 92.04545454545455, test loss: 0.451680, test acc: 82.78750000000001, bias2: 0.33635929226875305, variance: 0.11532028764486313
Train size: [784] hidden size: [2922] batch size: [10] trial: 44, train_loss: 0.330311, train acc: 92.22222222222223, test loss: 0.451723, test acc: 82.78044444444446, bias2: 0.3364003002643585, variance: 0.11532294750213623
Train size: [784] hidden size: [2922] batch size: [10] trial: 45, train_loss: 0.329239, train acc: 9

Train size: [784] hidden size: [3365] batch size: [10] trial: 28, train_loss: 0.312273, train acc: 94.82758620689656, test loss: 0.446667, test acc: 83.14793103448272, bias2: 0.32874318957328796, variance: 0.11792409420013428
Train size: [784] hidden size: [3365] batch size: [10] trial: 29, train_loss: 0.311459, train acc: 95.0, test loss: 0.447275, test acc: 83.1243333333333, bias2: 0.3288353681564331, variance: 0.11843980848789215
Train size: [784] hidden size: [3365] batch size: [10] trial: 30, train_loss: 0.308377, train acc: 95.16129032258064, test loss: 0.446518, test acc: 83.16612903225804, bias2: 0.3287332057952881, variance: 0.11778513342142105
Train size: [784] hidden size: [3365] batch size: [10] trial: 31, train_loss: 0.304006, train acc: 95.3125, test loss: 0.446604, test acc: 83.11906249999997, bias2: 0.3287034034729004, variance: 0.11790032684803009
Train size: [784] hidden size: [3365] batch size: [10] trial: 32, train_loss: 0.301783, train acc: 95.45454545454545, test 

Train size: [784] hidden size: [3874] batch size: [10] trial: 15, train_loss: 0.337424, train acc: 90.625, test loss: 0.446446, test acc: 83.41499999999999, bias2: 0.3238745331764221, variance: 0.12257169187068939
Train size: [784] hidden size: [3874] batch size: [10] trial: 16, train_loss: 0.334791, train acc: 89.70588235294117, test loss: 0.445731, test acc: 83.42470588235292, bias2: 0.3231709599494934, variance: 0.1225598156452179
Train size: [784] hidden size: [3874] batch size: [10] trial: 17, train_loss: 0.323486, train acc: 90.27777777777777, test loss: 0.445612, test acc: 83.44666666666666, bias2: 0.32269999384880066, variance: 0.12291152030229568
Train size: [784] hidden size: [3874] batch size: [10] trial: 18, train_loss: 0.322088, train acc: 89.47368421052632, test loss: 0.444960, test acc: 83.4736842105263, bias2: 0.3216678500175476, variance: 0.12329240143299103
Train size: [784] hidden size: [3874] batch size: [10] trial: 19, train_loss: 0.320308, train acc: 90.0, test lo

Train size: [784] hidden size: [4461] batch size: [10] trial: 2, train_loss: 0.295549, train acc: 100.0, test loss: 0.442066, test acc: 84.93666666666667, bias2: 0.35023799538612366, variance: 0.0918276235461235
Train size: [784] hidden size: [4461] batch size: [10] trial: 3, train_loss: 0.328872, train acc: 100.0, test loss: 0.445007, test acc: 84.705, bias2: 0.33997535705566406, variance: 0.1050315648317337
Train size: [784] hidden size: [4461] batch size: [10] trial: 4, train_loss: 0.329608, train acc: 100.0, test loss: 0.448121, test acc: 84.07000000000001, bias2: 0.3344365060329437, variance: 0.1136847510933876
Train size: [784] hidden size: [4461] batch size: [10] trial: 5, train_loss: 0.308173, train acc: 100.0, test loss: 0.447450, test acc: 83.94666666666667, bias2: 0.3273119330406189, variance: 0.12013788521289825
Train size: [784] hidden size: [4461] batch size: [10] trial: 6, train_loss: 0.307666, train acc: 100.0, test loss: 0.449162, test acc: 83.67285714285715, bias2: 0.

Train size: [784] hidden size: [4461] batch size: [10] trial: 40, train_loss: 0.304752, train acc: 92.6829268292683, test loss: 0.450471, test acc: 83.26512195121951, bias2: 0.310279905796051, variance: 0.14019136130809784
Train size: [784] hidden size: [4461] batch size: [10] trial: 41, train_loss: 0.311307, train acc: 92.26190476190476, test loss: 0.451600, test acc: 83.19190476190477, bias2: 0.31002867221832275, variance: 0.14157158136367798
Train size: [784] hidden size: [4461] batch size: [10] trial: 42, train_loss: 0.314519, train acc: 92.44186046511628, test loss: 0.451311, test acc: 83.18790697674419, bias2: 0.31015896797180176, variance: 0.1411520093679428
Train size: [784] hidden size: [4461] batch size: [10] trial: 43, train_loss: 0.316886, train acc: 92.61363636363636, test loss: 0.450507, test acc: 83.21181818181817, bias2: 0.31000906229019165, variance: 0.14049777388572693
Train size: [784] hidden size: [4461] batch size: [10] trial: 44, train_loss: 0.315786, train acc: 9

Train size: [784] hidden size: [5136] batch size: [10] trial: 27, train_loss: 0.276222, train acc: 95.53571428571429, test loss: 0.447258, test acc: 83.38892857142856, bias2: 0.30120325088500977, variance: 0.14605435729026794
Train size: [784] hidden size: [5136] batch size: [10] trial: 28, train_loss: 0.276889, train acc: 95.6896551724138, test loss: 0.446479, test acc: 83.4155172413793, bias2: 0.3010547161102295, variance: 0.14542441070079803
Train size: [784] hidden size: [5136] batch size: [10] trial: 29, train_loss: 0.279386, train acc: 95.83333333333333, test loss: 0.448451, test acc: 83.30899999999998, bias2: 0.30134087800979614, variance: 0.14711034297943115
Train size: [784] hidden size: [5136] batch size: [10] trial: 30, train_loss: 0.284141, train acc: 95.16129032258064, test loss: 0.447543, test acc: 83.37870967741934, bias2: 0.3011189103126526, variance: 0.1464242786169052
Train size: [784] hidden size: [5136] batch size: [10] trial: 31, train_loss: 0.284184, train acc: 95

Train size: [784] hidden size: [5914] batch size: [10] trial: 14, train_loss: 0.239165, train acc: 96.66666666666667, test loss: 0.434573, test acc: 84.53, bias2: 0.2941276431083679, variance: 0.14044538140296936
Train size: [784] hidden size: [5914] batch size: [10] trial: 15, train_loss: 0.236212, train acc: 96.875, test loss: 0.434285, test acc: 84.49125000000001, bias2: 0.2935589551925659, variance: 0.14072616398334503
Train size: [784] hidden size: [5914] batch size: [10] trial: 16, train_loss: 0.246116, train acc: 97.05882352941177, test loss: 0.433032, test acc: 84.60470588235296, bias2: 0.29295891523361206, variance: 0.14007271826267242
Train size: [784] hidden size: [5914] batch size: [10] trial: 17, train_loss: 0.244351, train acc: 97.22222222222223, test loss: 0.433743, test acc: 84.54444444444445, bias2: 0.29244738817214966, variance: 0.14129577577114105
Train size: [784] hidden size: [5914] batch size: [10] trial: 18, train_loss: 0.239078, train acc: 97.36842105263158, tes

Train size: [784] hidden size: [6809] batch size: [10] trial: 1, train_loss: 0.561169, train acc: 75.0, test loss: 0.497081, test acc: 82.02, bias2: 0.3802022933959961, variance: 0.11687906086444855
Train size: [784] hidden size: [6809] batch size: [10] trial: 2, train_loss: 0.511801, train acc: 75.0, test loss: 0.484579, test acc: 82.61333333333333, bias2: 0.345751017332077, variance: 0.13882794976234436
Train size: [784] hidden size: [6809] batch size: [10] trial: 3, train_loss: 0.437984, train acc: 81.25, test loss: 0.472905, test acc: 82.76249999999999, bias2: 0.3208661377429962, variance: 0.15203863382339478
Train size: [784] hidden size: [6809] batch size: [10] trial: 4, train_loss: 0.412191, train acc: 85.0, test loss: 0.467879, test acc: 82.922, bias2: 0.3098335862159729, variance: 0.1580454707145691
Train size: [784] hidden size: [6809] batch size: [10] trial: 5, train_loss: 0.397062, train acc: 87.5, test loss: 0.467737, test acc: 83.17666666666666, bias2: 0.3077338635921478,

Train size: [784] hidden size: [6809] batch size: [10] trial: 39, train_loss: 0.297378, train acc: 93.75, test loss: 0.460163, test acc: 83.36724999999998, bias2: 0.2814124822616577, variance: 0.1787509173154831
Train size: [784] hidden size: [6809] batch size: [10] trial: 40, train_loss: 0.296940, train acc: 93.90243902439025, test loss: 0.460151, test acc: 83.36853658536585, bias2: 0.2813723683357239, variance: 0.17877823114395142
Train size: [784] hidden size: [6809] batch size: [10] trial: 41, train_loss: 0.297025, train acc: 94.04761904761905, test loss: 0.459919, test acc: 83.37309523809523, bias2: 0.28122079372406006, variance: 0.1786980926990509
Train size: [784] hidden size: [6809] batch size: [10] trial: 42, train_loss: 0.295807, train acc: 94.18604651162791, test loss: 0.459369, test acc: 83.37697674418604, bias2: 0.2810121774673462, variance: 0.17835655808448792
Train size: [784] hidden size: [6809] batch size: [10] trial: 43, train_loss: 0.292335, train acc: 94.31818181818

Train size: [784] hidden size: [7840] batch size: [10] trial: 27, train_loss: 0.275629, train acc: 96.42857142857143, test loss: 0.461706, test acc: 83.33821428571427, bias2: 0.2749752104282379, variance: 0.1867305040359497
Train size: [784] hidden size: [7840] batch size: [10] trial: 28, train_loss: 0.277024, train acc: 96.55172413793103, test loss: 0.467004, test acc: 83.12793103448274, bias2: 0.2749149203300476, variance: 0.1920892745256424
Train size: [784] hidden size: [7840] batch size: [10] trial: 29, train_loss: 0.278619, train acc: 96.66666666666667, test loss: 0.467075, test acc: 83.13133333333332, bias2: 0.27511781454086304, variance: 0.19195759296417236
Train size: [784] hidden size: [7840] batch size: [10] trial: 30, train_loss: 0.284932, train acc: 96.7741935483871, test loss: 0.469163, test acc: 83.09774193548387, bias2: 0.27373650670051575, variance: 0.19542652368545532
Train size: [784] hidden size: [7840] batch size: [10] trial: 31, train_loss: 0.279216, train acc: 96

Train size: [784] hidden size: [8] batch size: [784] trial: 14, train_loss: 1.191639, train acc: 9.93197278911565, test loss: 1.194558, test acc: 10.392666666666667, bias2: 0.9515081644058228, variance: 0.2430494874715805
Train size: [784] hidden size: [8] batch size: [784] trial: 15, train_loss: 1.189567, train acc: 10.291772959183675, test loss: 1.193589, test acc: 10.736250000000002, bias2: 0.9501278400421143, variance: 0.24346080422401428
Train size: [784] hidden size: [8] batch size: [784] trial: 16, train_loss: 1.194884, train acc: 10.249099639855942, test loss: 1.199769, test acc: 10.58529411764706, bias2: 0.9533774256706238, variance: 0.24639193713665009
Train size: [784] hidden size: [8] batch size: [784] trial: 17, train_loss: 1.191041, train acc: 10.615079365079366, test loss: 1.196418, test acc: 10.84888888888889, bias2: 0.9474222660064697, variance: 0.24899528920650482
Train size: [784] hidden size: [8] batch size: [784] trial: 18, train_loss: 1.185662, train acc: 10.76799

Train size: [784] hidden size: [9] batch size: [784] trial: 0, train_loss: 1.204514, train acc: 15.306122448979592, test loss: 1.181130, test acc: 16.02, bias2: 1.1811295747756958, variance: -1.9462739753173253e-10
Train size: [784] hidden size: [9] batch size: [784] trial: 1, train_loss: 1.175583, train acc: 12.244897959183675, test loss: 1.163819, test acc: 12.545, bias2: 1.050841212272644, variance: 0.11297750473022461
Train size: [784] hidden size: [9] batch size: [784] trial: 2, train_loss: 1.171470, train acc: 13.095238095238097, test loss: 1.160519, test acc: 13.5, bias2: 0.9969092607498169, variance: 0.1636100858449936
Train size: [784] hidden size: [9] batch size: [784] trial: 3, train_loss: 1.161559, train acc: 12.595663265306124, test loss: 1.148477, test acc: 13.48, bias2: 0.9792909622192383, variance: 0.16918602585792542
Train size: [784] hidden size: [9] batch size: [784] trial: 4, train_loss: 1.156134, train acc: 12.704081632653063, test loss: 1.150639, test acc: 13.76, 

Train size: [784] hidden size: [9] batch size: [784] trial: 37, train_loss: 1.152669, train acc: 12.385875402792696, test loss: 1.153871, test acc: 12.256578947368418, bias2: 0.9021762609481812, variance: 0.25169476866722107
Train size: [784] hidden size: [9] batch size: [784] trial: 38, train_loss: 1.153727, train acc: 12.460753532182101, test loss: 1.155991, test acc: 12.31282051282051, bias2: 0.9020634889602661, variance: 0.25392699241638184
Train size: [784] hidden size: [9] batch size: [784] trial: 39, train_loss: 1.154391, train acc: 12.487244897959183, test loss: 1.156380, test acc: 12.354249999999997, bias2: 0.9009068012237549, variance: 0.25547364354133606
Train size: [784] hidden size: [9] batch size: [784] trial: 40, train_loss: 1.154197, train acc: 12.456445993031357, test loss: 1.156395, test acc: 12.304390243902438, bias2: 0.9021612405776978, variance: 0.254233717918396
Train size: [784] hidden size: [9] batch size: [784] trial: 41, train_loss: 1.153983, train acc: 12.469

Train size: [784] hidden size: [10] batch size: [784] trial: 24, train_loss: 1.134417, train acc: 11.750000000000005, test loss: 1.140524, test acc: 11.8012, bias2: 0.9075508713722229, variance: 0.2329729050397873
Train size: [784] hidden size: [10] batch size: [784] trial: 25, train_loss: 1.132748, train acc: 11.607142857142861, test loss: 1.138982, test acc: 11.738076923076923, bias2: 0.9071618318557739, variance: 0.23182055354118347
Train size: [784] hidden size: [10] batch size: [784] trial: 26, train_loss: 1.132145, train acc: 11.734693877551024, test loss: 1.138898, test acc: 11.747407407407408, bias2: 0.9069051742553711, variance: 0.23199306428432465
Train size: [784] hidden size: [10] batch size: [784] trial: 27, train_loss: 1.131416, train acc: 11.853134110787176, test loss: 1.137130, test acc: 11.853214285714285, bias2: 0.9056394696235657, variance: 0.23149091005325317
Train size: [784] hidden size: [10] batch size: [784] trial: 28, train_loss: 1.129913, train acc: 11.9150246

Train size: [784] hidden size: [12] batch size: [784] trial: 11, train_loss: 1.117781, train acc: 12.606292517006805, test loss: 1.129045, test acc: 11.798333333333334, bias2: 0.9004815220832825, variance: 0.22856326401233673
Train size: [784] hidden size: [12] batch size: [784] trial: 12, train_loss: 1.115095, train acc: 12.500000000000002, test loss: 1.125691, test acc: 11.716923076923079, bias2: 0.9000654220581055, variance: 0.22562573850154877
Train size: [784] hidden size: [12] batch size: [784] trial: 13, train_loss: 1.114536, train acc: 12.663994169096211, test loss: 1.125724, test acc: 11.867142857142857, bias2: 0.8960017561912537, variance: 0.22972184419631958
Train size: [784] hidden size: [12] batch size: [784] trial: 14, train_loss: 1.116249, train acc: 12.51700680272109, test loss: 1.126522, test acc: 11.694666666666668, bias2: 0.8986268043518066, variance: 0.22789467871189117
Train size: [784] hidden size: [12] batch size: [784] trial: 15, train_loss: 1.113074, train acc:

Train size: [784] hidden size: [12] batch size: [784] trial: 48, train_loss: 1.105667, train acc: 12.609329446064145, test loss: 1.112737, test acc: 11.85469387755102, bias2: 0.8858990669250488, variance: 0.22683753073215485
Train size: [784] hidden size: [12] batch size: [784] trial: 49, train_loss: 1.105951, train acc: 12.469387755102046, test loss: 1.112689, test acc: 11.737400000000001, bias2: 0.8864831924438477, variance: 0.22620533406734467
##################################################
Train size: [784] hidden size: [14] batch size: [784] trial: 0, train_loss: 1.086425, train acc: 11.096938775510203, test loss: 1.077872, test acc: 12.79, bias2: 1.077872395515442, variance: 1.9462740308284765e-09
Train size: [784] hidden size: [14] batch size: [784] trial: 1, train_loss: 1.076776, train acc: 11.926020408163264, test loss: 1.074114, test acc: 12.485, bias2: 0.9669901132583618, variance: 0.10712350904941559
Train size: [784] hidden size: [14] batch size: [784] trial: 2, train_l

Train size: [784] hidden size: [14] batch size: [784] trial: 35, train_loss: 1.109668, train acc: 11.890589569160996, test loss: 1.111111, test acc: 11.893888888888888, bias2: 0.8856576681137085, variance: 0.22545363008975983
Train size: [784] hidden size: [14] batch size: [784] trial: 36, train_loss: 1.108041, train acc: 12.051847766133477, test loss: 1.109652, test acc: 12.049999999999999, bias2: 0.8841719031333923, variance: 0.22548027336597443
Train size: [784] hidden size: [14] batch size: [784] trial: 37, train_loss: 1.108098, train acc: 12.0703544575725, test loss: 1.109889, test acc: 12.024736842105261, bias2: 0.8845218420028687, variance: 0.22536706924438477
Train size: [784] hidden size: [14] batch size: [784] trial: 38, train_loss: 1.108200, train acc: 12.04212454212454, test loss: 1.110105, test acc: 12.034358974358971, bias2: 0.8832783699035645, variance: 0.22682641446590424
Train size: [784] hidden size: [14] batch size: [784] trial: 39, train_loss: 1.108602, train acc: 1

Train size: [784] hidden size: [16] batch size: [784] trial: 21, train_loss: 1.080672, train acc: 13.085575139146567, test loss: 1.084916, test acc: 13.209545454545452, bias2: 0.875697910785675, variance: 0.20921771228313446
Train size: [784] hidden size: [16] batch size: [784] trial: 22, train_loss: 1.083920, train acc: 12.899290150842944, test loss: 1.088056, test acc: 13.002173913043476, bias2: 0.8788078427314758, variance: 0.20924778282642365
Train size: [784] hidden size: [16] batch size: [784] trial: 23, train_loss: 1.082364, train acc: 13.100552721088434, test loss: 1.086103, test acc: 13.17708333333333, bias2: 0.8770731091499329, variance: 0.20902973413467407
Train size: [784] hidden size: [16] batch size: [784] trial: 24, train_loss: 1.083255, train acc: 13.045918367346937, test loss: 1.087123, test acc: 13.081999999999999, bias2: 0.878318727016449, variance: 0.20880381762981415
Train size: [784] hidden size: [16] batch size: [784] trial: 25, train_loss: 1.082786, train acc: 1

Train size: [784] hidden size: [18] batch size: [784] trial: 7, train_loss: 1.064890, train acc: 15.768494897959183, test loss: 1.070954, test acc: 14.923749999999998, bias2: 0.8700046539306641, variance: 0.20094917714595795
Train size: [784] hidden size: [18] batch size: [784] trial: 8, train_loss: 1.063238, train acc: 15.986394557823129, test loss: 1.070562, test acc: 15.16333333333333, bias2: 0.862562358379364, variance: 0.20799951255321503
Train size: [784] hidden size: [18] batch size: [784] trial: 9, train_loss: 1.061203, train acc: 16.301020408163264, test loss: 1.068961, test acc: 15.403999999999996, bias2: 0.8610393404960632, variance: 0.20792119204998016
Train size: [784] hidden size: [18] batch size: [784] trial: 10, train_loss: 1.063659, train acc: 15.978664192949907, test loss: 1.070711, test acc: 15.389999999999997, bias2: 0.8614453077316284, variance: 0.20926591753959656
Train size: [784] hidden size: [18] batch size: [784] trial: 11, train_loss: 1.066062, train acc: 15.

Train size: [784] hidden size: [18] batch size: [784] trial: 44, train_loss: 1.070932, train acc: 14.067460317460322, test loss: 1.078306, test acc: 13.958444444444448, bias2: 0.8637605309486389, variance: 0.21454519033432007
Train size: [784] hidden size: [18] batch size: [784] trial: 45, train_loss: 1.071581, train acc: 13.977928127772852, test loss: 1.079041, test acc: 13.891521739130438, bias2: 0.8647059202194214, variance: 0.21433541178703308
Train size: [784] hidden size: [18] batch size: [784] trial: 46, train_loss: 1.070827, train acc: 13.973621363438998, test loss: 1.078043, test acc: 13.891063829787237, bias2: 0.8644285202026367, variance: 0.21361485123634338
Train size: [784] hidden size: [18] batch size: [784] trial: 47, train_loss: 1.070079, train acc: 14.057185374149663, test loss: 1.077415, test acc: 13.927708333333335, bias2: 0.8630849719047546, variance: 0.21433013677597046
Train size: [784] hidden size: [18] batch size: [784] trial: 48, train_loss: 1.068845, train acc

Train size: [784] hidden size: [21] batch size: [784] trial: 31, train_loss: 1.072412, train acc: 14.325573979591837, test loss: 1.076490, test acc: 14.4253125, bias2: 0.8537614345550537, variance: 0.22272837162017822
Train size: [784] hidden size: [21] batch size: [784] trial: 32, train_loss: 1.072880, train acc: 14.243197278911564, test loss: 1.076968, test acc: 14.306666666666667, bias2: 0.85454261302948, variance: 0.2224254608154297
Train size: [784] hidden size: [21] batch size: [784] trial: 33, train_loss: 1.074278, train acc: 14.116896758703481, test loss: 1.077764, test acc: 14.146470588235294, bias2: 0.8555147647857666, variance: 0.22224962711334229
Train size: [784] hidden size: [21] batch size: [784] trial: 34, train_loss: 1.074994, train acc: 14.07069970845481, test loss: 1.078972, test acc: 14.008000000000001, bias2: 0.8569488525390625, variance: 0.22202298045158386
Train size: [784] hidden size: [21] batch size: [784] trial: 35, train_loss: 1.073062, train acc: 14.2715419

Train size: [784] hidden size: [24] batch size: [784] trial: 18, train_loss: 1.044944, train acc: 16.004296455424278, test loss: 1.046066, test acc: 15.412105263157894, bias2: 0.8438825607299805, variance: 0.20218394696712494
Train size: [784] hidden size: [24] batch size: [784] trial: 19, train_loss: 1.044353, train acc: 16.237244897959187, test loss: 1.044704, test acc: 15.6725, bias2: 0.8406249284744263, variance: 0.2040794938802719
Train size: [784] hidden size: [24] batch size: [784] trial: 20, train_loss: 1.044204, train acc: 16.192905733722064, test loss: 1.046289, test acc: 15.554761904761904, bias2: 0.8415765166282654, variance: 0.20471243560314178
Train size: [784] hidden size: [24] batch size: [784] trial: 21, train_loss: 1.046127, train acc: 15.972866419294991, test loss: 1.047966, test acc: 15.42590909090909, bias2: 0.842377781867981, variance: 0.20558786392211914
Train size: [784] hidden size: [24] batch size: [784] trial: 22, train_loss: 1.045144, train acc: 15.938331854

Train size: [784] hidden size: [28] batch size: [784] trial: 5, train_loss: 1.032438, train acc: 16.645408163265305, test loss: 1.040913, test acc: 16.235000000000003, bias2: 0.8575948476791382, variance: 0.18331803381443024
Train size: [784] hidden size: [28] batch size: [784] trial: 6, train_loss: 1.035051, train acc: 16.636297376093292, test loss: 1.046249, test acc: 15.915714285714287, bias2: 0.8552430272102356, variance: 0.1910061091184616
Train size: [784] hidden size: [28] batch size: [784] trial: 7, train_loss: 1.037462, train acc: 16.677295918367346, test loss: 1.045564, test acc: 15.988750000000001, bias2: 0.8516020774841309, variance: 0.19396235048770905
Train size: [784] hidden size: [28] batch size: [784] trial: 8, train_loss: 1.038196, train acc: 16.865079365079364, test loss: 1.044263, test acc: 16.352222222222224, bias2: 0.8468583822250366, variance: 0.19740435481071472
Train size: [784] hidden size: [28] batch size: [784] trial: 9, train_loss: 1.034069, train acc: 16.5

Train size: [784] hidden size: [28] batch size: [784] trial: 42, train_loss: 1.027010, train acc: 16.29093497864262, test loss: 1.031644, test acc: 16.153953488372096, bias2: 0.82834792137146, variance: 0.2032957226037979
Train size: [784] hidden size: [28] batch size: [784] trial: 43, train_loss: 1.026604, train acc: 16.40480055658627, test loss: 1.031364, test acc: 16.22068181818182, bias2: 0.8273810148239136, variance: 0.20398281514644623
Train size: [784] hidden size: [28] batch size: [784] trial: 44, train_loss: 1.026624, train acc: 16.414399092970523, test loss: 1.031566, test acc: 16.218000000000004, bias2: 0.8276480436325073, variance: 0.2039184719324112
Train size: [784] hidden size: [28] batch size: [784] trial: 45, train_loss: 1.027332, train acc: 16.323757763975156, test loss: 1.032281, test acc: 16.122826086956525, bias2: 0.8281747102737427, variance: 0.20410633087158203
Train size: [784] hidden size: [28] batch size: [784] trial: 46, train_loss: 1.027249, train acc: 16.39

Train size: [784] hidden size: [32] batch size: [784] trial: 29, train_loss: 1.030951, train acc: 16.313775510204085, test loss: 1.038085, test acc: 15.857666666666667, bias2: 0.8312140703201294, variance: 0.20687106251716614
Train size: [784] hidden size: [32] batch size: [784] trial: 30, train_loss: 1.032369, train acc: 16.260697827518108, test loss: 1.039043, test acc: 15.842258064516129, bias2: 0.8311039209365845, variance: 0.20793911814689636
Train size: [784] hidden size: [32] batch size: [784] trial: 31, train_loss: 1.032494, train acc: 16.127232142857146, test loss: 1.038705, test acc: 15.7571875, bias2: 0.8313926458358765, variance: 0.20731234550476074
Train size: [784] hidden size: [32] batch size: [784] trial: 32, train_loss: 1.032101, train acc: 16.110080395794682, test loss: 1.037939, test acc: 15.719090909090909, bias2: 0.831456184387207, variance: 0.2064824104309082
Train size: [784] hidden size: [32] batch size: [784] trial: 33, train_loss: 1.031747, train acc: 16.10144

Train size: [784] hidden size: [37] batch size: [784] trial: 16, train_loss: 1.002017, train acc: 18.734993997599034, test loss: 1.010304, test acc: 18.10705882352941, bias2: 0.8157590627670288, variance: 0.1945449411869049
Train size: [784] hidden size: [37] batch size: [784] trial: 17, train_loss: 1.001981, train acc: 18.48072562358276, test loss: 1.010775, test acc: 17.858888888888888, bias2: 0.81706702709198, variance: 0.19370773434638977
Train size: [784] hidden size: [37] batch size: [784] trial: 18, train_loss: 0.996835, train acc: 19.24006444683136, test loss: 1.006300, test acc: 18.436842105263157, bias2: 0.8115907907485962, variance: 0.19470903277397156
Train size: [784] hidden size: [37] batch size: [784] trial: 19, train_loss: 0.996630, train acc: 19.349489795918366, test loss: 1.006654, test acc: 18.494499999999995, bias2: 0.8113729357719421, variance: 0.1952807456254959
Train size: [784] hidden size: [37] batch size: [784] trial: 20, train_loss: 0.996097, train acc: 19.21

Train size: [784] hidden size: [43] batch size: [784] trial: 3, train_loss: 1.001208, train acc: 20.567602040816325, test loss: 1.014194, test acc: 19.572499999999998, bias2: 0.8495717644691467, variance: 0.16462214291095734
Train size: [784] hidden size: [43] batch size: [784] trial: 4, train_loss: 0.985316, train acc: 21.53061224489796, test loss: 0.999164, test acc: 20.651999999999997, bias2: 0.8279045820236206, variance: 0.17125993967056274
Train size: [784] hidden size: [43] batch size: [784] trial: 5, train_loss: 0.982071, train acc: 20.897108843537413, test loss: 0.998876, test acc: 20.106666666666666, bias2: 0.8235414028167725, variance: 0.17533423006534576
Train size: [784] hidden size: [43] batch size: [784] trial: 6, train_loss: 0.980930, train acc: 21.15524781341108, test loss: 0.996714, test acc: 20.464285714285715, bias2: 0.8162661790847778, variance: 0.18044784665107727
Train size: [784] hidden size: [43] batch size: [784] trial: 7, train_loss: 0.981932, train acc: 21.22

Train size: [784] hidden size: [43] batch size: [784] trial: 40, train_loss: 0.990941, train acc: 20.423718267794918, test loss: 0.998486, test acc: 19.835365853658534, bias2: 0.796785295009613, variance: 0.20170027017593384
Train size: [784] hidden size: [43] batch size: [784] trial: 41, train_loss: 0.991155, train acc: 20.37475704567541, test loss: 0.998945, test acc: 19.777619047619044, bias2: 0.7969004511833191, variance: 0.20204462110996246
Train size: [784] hidden size: [43] batch size: [784] trial: 42, train_loss: 0.991753, train acc: 20.328073089700993, test loss: 0.999647, test acc: 19.700232558139533, bias2: 0.7971358895301819, variance: 0.20251131057739258
Train size: [784] hidden size: [43] batch size: [784] trial: 43, train_loss: 0.992160, train acc: 20.283511131725415, test loss: 1.000184, test acc: 19.65772727272727, bias2: 0.797062337398529, variance: 0.2031216025352478
Train size: [784] hidden size: [43] batch size: [784] trial: 44, train_loss: 0.992550, train acc: 20.

Train size: [784] hidden size: [49] batch size: [784] trial: 27, train_loss: 0.981978, train acc: 21.036807580174926, test loss: 0.988426, test acc: 20.702857142857145, bias2: 0.7886454463005066, variance: 0.1997804194688797
Train size: [784] hidden size: [49] batch size: [784] trial: 28, train_loss: 0.982346, train acc: 20.98434201266714, test loss: 0.988887, test acc: 20.672413793103452, bias2: 0.7888860702514648, variance: 0.2000008523464203
Train size: [784] hidden size: [49] batch size: [784] trial: 29, train_loss: 0.981693, train acc: 21.152210884353742, test loss: 0.988423, test acc: 20.75666666666667, bias2: 0.7883393168449402, variance: 0.20008395612239838
Train size: [784] hidden size: [49] batch size: [784] trial: 30, train_loss: 0.980344, train acc: 21.30513495720869, test loss: 0.987363, test acc: 20.890645161290326, bias2: 0.786981463432312, variance: 0.20038186013698578
Train size: [784] hidden size: [49] batch size: [784] trial: 31, train_loss: 0.980606, train acc: 21.2

Train size: [784] hidden size: [56] batch size: [784] trial: 14, train_loss: 0.960904, train acc: 22.70408163265306, test loss: 0.975273, test acc: 21.468666666666667, bias2: 0.7938330173492432, variance: 0.18143954873085022
Train size: [784] hidden size: [56] batch size: [784] trial: 15, train_loss: 0.961884, train acc: 22.536670918367346, test loss: 0.975414, test acc: 21.414375000000003, bias2: 0.7936206459999084, variance: 0.18179315328598022
Train size: [784] hidden size: [56] batch size: [784] trial: 16, train_loss: 0.962678, train acc: 22.486494597839137, test loss: 0.975468, test acc: 21.417058823529413, bias2: 0.7922438979148865, variance: 0.18322379887104034
Train size: [784] hidden size: [56] batch size: [784] trial: 17, train_loss: 0.962206, train acc: 22.49858276643991, test loss: 0.975391, test acc: 21.400000000000002, bias2: 0.7917029857635498, variance: 0.18368837237358093
Train size: [784] hidden size: [56] batch size: [784] trial: 18, train_loss: 0.962558, train acc: 

Train size: [784] hidden size: [65] batch size: [784] trial: 0, train_loss: 0.984988, train acc: 18.367346938775512, test loss: 0.991639, test acc: 18.83, bias2: 0.991639256477356, variance: 1.9462739753173253e-10
Train size: [784] hidden size: [65] batch size: [784] trial: 1, train_loss: 0.985284, train acc: 19.961734693877553, test loss: 0.993627, test acc: 18.915, bias2: 0.8928084969520569, variance: 0.10081858187913895
Train size: [784] hidden size: [65] batch size: [784] trial: 2, train_loss: 0.980989, train acc: 20.450680272108844, test loss: 0.985287, test acc: 18.926666666666666, bias2: 0.8531190752983093, variance: 0.13216781616210938
Train size: [784] hidden size: [65] batch size: [784] trial: 3, train_loss: 0.972554, train acc: 22.002551020408163, test loss: 0.980465, test acc: 20.035, bias2: 0.8325610756874084, variance: 0.1479043960571289
Train size: [784] hidden size: [65] batch size: [784] trial: 4, train_loss: 0.960015, train acc: 23.622448979591837, test loss: 0.970961

Train size: [784] hidden size: [65] batch size: [784] trial: 37, train_loss: 0.941332, train acc: 25.35915682062299, test loss: 0.954371, test acc: 24.0371052631579, bias2: 0.762285053730011, variance: 0.19208616018295288
Train size: [784] hidden size: [65] batch size: [784] trial: 38, train_loss: 0.941296, train acc: 25.399005756148615, test loss: 0.954227, test acc: 24.10948717948719, bias2: 0.7619543671607971, variance: 0.1922730803489685
Train size: [784] hidden size: [65] batch size: [784] trial: 39, train_loss: 0.941943, train acc: 25.296556122448983, test loss: 0.955089, test acc: 23.97725000000001, bias2: 0.7625026702880859, variance: 0.19258633255958557
Train size: [784] hidden size: [65] batch size: [784] trial: 40, train_loss: 0.941818, train acc: 25.24888003982081, test loss: 0.954946, test acc: 23.935365853658546, bias2: 0.762492299079895, variance: 0.1924535185098648
Train size: [784] hidden size: [65] batch size: [784] trial: 41, train_loss: 0.941521, train acc: 25.25813

Train size: [784] hidden size: [75] batch size: [784] trial: 24, train_loss: 0.915896, train acc: 28.974489795918366, test loss: 0.928640, test acc: 27.4312, bias2: 0.7416142225265503, variance: 0.18702617287635803
Train size: [784] hidden size: [75] batch size: [784] trial: 25, train_loss: 0.915321, train acc: 28.973704866562006, test loss: 0.927867, test acc: 27.512692307692305, bias2: 0.7413104772567749, variance: 0.18655681610107422
Train size: [784] hidden size: [75] batch size: [784] trial: 26, train_loss: 0.914992, train acc: 29.020219198790624, test loss: 0.927245, test acc: 27.539259259259257, bias2: 0.7406559586524963, variance: 0.18658868968486786
Train size: [784] hidden size: [75] batch size: [784] trial: 27, train_loss: 0.914306, train acc: 29.222849854227402, test loss: 0.927237, test acc: 27.638571428571428, bias2: 0.7397431135177612, variance: 0.18749377131462097
Train size: [784] hidden size: [75] batch size: [784] trial: 28, train_loss: 0.913936, train acc: 29.345531

Train size: [784] hidden size: [86] batch size: [784] trial: 11, train_loss: 0.905325, train acc: 31.451955782312925, test loss: 0.916598, test acc: 29.734166666666667, bias2: 0.7393314838409424, variance: 0.1772661805152893
Train size: [784] hidden size: [86] batch size: [784] trial: 12, train_loss: 0.900914, train acc: 32.133045525902666, test loss: 0.912432, test acc: 30.38769230769231, bias2: 0.7333955764770508, variance: 0.17903614044189453
Train size: [784] hidden size: [86] batch size: [784] trial: 13, train_loss: 0.899693, train acc: 31.869533527696795, test loss: 0.911518, test acc: 30.204285714285714, bias2: 0.7322250604629517, variance: 0.17929263412952423
Train size: [784] hidden size: [86] batch size: [784] trial: 14, train_loss: 0.902362, train acc: 31.462585034013607, test loss: 0.913876, test acc: 29.691333333333333, bias2: 0.7341156005859375, variance: 0.17976032197475433
Train size: [784] hidden size: [86] batch size: [784] trial: 15, train_loss: 0.903663, train acc: 

Train size: [784] hidden size: [86] batch size: [784] trial: 48, train_loss: 0.895709, train acc: 31.92419825072886, test loss: 0.907799, test acc: 30.48326530612245, bias2: 0.7216089963912964, variance: 0.18618974089622498
Train size: [784] hidden size: [86] batch size: [784] trial: 49, train_loss: 0.895747, train acc: 31.86989795918367, test loss: 0.907860, test acc: 30.4434, bias2: 0.7218377590179443, variance: 0.18602246046066284
##################################################
Train size: [784] hidden size: [99] batch size: [784] trial: 0, train_loss: 0.873874, train acc: 35.45918367346939, test loss: 0.878850, test acc: 33.84, bias2: 0.8788496255874634, variance: -9.731370154142382e-10
Train size: [784] hidden size: [99] batch size: [784] trial: 1, train_loss: 0.873197, train acc: 33.609693877551024, test loss: 0.892312, test acc: 32.265, bias2: 0.7946229577064514, variance: 0.09768948704004288
Train size: [784] hidden size: [99] batch size: [784] trial: 2, train_loss: 0.883198

Train size: [784] hidden size: [99] batch size: [784] trial: 35, train_loss: 0.868396, train acc: 35.14739229024943, test loss: 0.884399, test acc: 33.45222222222222, bias2: 0.7017043232917786, variance: 0.182694211602211
Train size: [784] hidden size: [99] batch size: [784] trial: 36, train_loss: 0.868042, train acc: 35.26268615554329, test loss: 0.883873, test acc: 33.62, bias2: 0.7010136842727661, variance: 0.1828596442937851
Train size: [784] hidden size: [99] batch size: [784] trial: 37, train_loss: 0.867700, train acc: 35.4189044038668, test loss: 0.883435, test acc: 33.80105263157894, bias2: 0.6996955871582031, variance: 0.18373893201351166
Train size: [784] hidden size: [99] batch size: [784] trial: 38, train_loss: 0.868007, train acc: 35.30546834118262, test loss: 0.883741, test acc: 33.71153846153845, bias2: 0.6998098492622375, variance: 0.1839313507080078
Train size: [784] hidden size: [99] batch size: [784] trial: 39, train_loss: 0.868014, train acc: 35.3252551020408, test 

Train size: [784] hidden size: [114] batch size: [784] trial: 22, train_loss: 0.842886, train acc: 39.34117125110914, test loss: 0.860831, test acc: 37.14652173913043, bias2: 0.6783302426338196, variance: 0.18250036239624023
Train size: [784] hidden size: [114] batch size: [784] trial: 23, train_loss: 0.842678, train acc: 39.45046768707483, test loss: 0.860712, test acc: 37.15333333333333, bias2: 0.6781383752822876, variance: 0.1825738251209259
Train size: [784] hidden size: [114] batch size: [784] trial: 24, train_loss: 0.842333, train acc: 39.40306122448979, test loss: 0.859978, test acc: 37.199999999999996, bias2: 0.6776875257492065, variance: 0.18229083716869354
Train size: [784] hidden size: [114] batch size: [784] trial: 25, train_loss: 0.842304, train acc: 39.40835949764521, test loss: 0.859762, test acc: 37.18461538461538, bias2: 0.6777052879333496, variance: 0.18205660581588745
Train size: [784] hidden size: [114] batch size: [784] trial: 26, train_loss: 0.841886, train acc: 3

Train size: [784] hidden size: [131] batch size: [784] trial: 9, train_loss: 0.822025, train acc: 41.339285714285715, test loss: 0.841595, test acc: 39.173, bias2: 0.6822537779808044, variance: 0.1593409925699234
Train size: [784] hidden size: [131] batch size: [784] trial: 10, train_loss: 0.820655, train acc: 41.454081632653065, test loss: 0.840824, test acc: 39.24909090909091, bias2: 0.6804652214050293, variance: 0.16035854816436768
Train size: [784] hidden size: [131] batch size: [784] trial: 11, train_loss: 0.821048, train acc: 41.12457482993197, test loss: 0.841806, test acc: 38.92, bias2: 0.6809136867523193, variance: 0.16089263558387756
Train size: [784] hidden size: [131] batch size: [784] trial: 12, train_loss: 0.821923, train acc: 40.86538461538461, test loss: 0.841615, test acc: 38.84307692307693, bias2: 0.6805365085601807, variance: 0.16107890009880066
Train size: [784] hidden size: [131] batch size: [784] trial: 13, train_loss: 0.822939, train acc: 40.75255102040817, test 

Train size: [784] hidden size: [131] batch size: [784] trial: 46, train_loss: 0.825104, train acc: 41.155557967868006, test loss: 0.844281, test acc: 38.96234042553191, bias2: 0.6688446998596191, variance: 0.17543604969978333
Train size: [784] hidden size: [131] batch size: [784] trial: 47, train_loss: 0.825284, train acc: 41.17240646258504, test loss: 0.844626, test acc: 38.948125, bias2: 0.6688216328620911, variance: 0.17580460011959076
Train size: [784] hidden size: [131] batch size: [784] trial: 48, train_loss: 0.825858, train acc: 41.14431486880467, test loss: 0.845236, test acc: 38.89530612244898, bias2: 0.6691255569458008, variance: 0.17611002922058105
Train size: [784] hidden size: [131] batch size: [784] trial: 49, train_loss: 0.825533, train acc: 41.16071428571429, test loss: 0.844988, test acc: 38.9262, bias2: 0.6688116788864136, variance: 0.1761762499809265
##################################################
Train size: [784] hidden size: [151] batch size: [784] trial: 0, tr

Train size: [784] hidden size: [151] batch size: [784] trial: 33, train_loss: 0.792727, train acc: 46.3985594237695, test loss: 0.811629, test acc: 43.931470588235285, bias2: 0.6413179636001587, variance: 0.1703113168478012
Train size: [784] hidden size: [151] batch size: [784] trial: 34, train_loss: 0.792944, train acc: 46.30830903790086, test loss: 0.811497, test acc: 43.90257142857142, bias2: 0.6412261128425598, variance: 0.17027120292186737
Train size: [784] hidden size: [151] batch size: [784] trial: 35, train_loss: 0.793179, train acc: 46.37896825396824, test loss: 0.811394, test acc: 43.990555555555545, bias2: 0.6408498883247375, variance: 0.17054419219493866
Train size: [784] hidden size: [151] batch size: [784] trial: 36, train_loss: 0.792670, train acc: 46.421676778819624, test loss: 0.810974, test acc: 44.05324324324323, bias2: 0.640315592288971, variance: 0.1706586480140686
Train size: [784] hidden size: [151] batch size: [784] trial: 37, train_loss: 0.793802, train acc: 46

Train size: [784] hidden size: [174] batch size: [784] trial: 20, train_loss: 0.767295, train acc: 49.56268221574344, test loss: 0.789282, test acc: 46.92666666666666, bias2: 0.6260040998458862, variance: 0.16327808797359467
Train size: [784] hidden size: [174] batch size: [784] trial: 21, train_loss: 0.767912, train acc: 49.37963821892393, test loss: 0.789886, test acc: 46.72363636363636, bias2: 0.6261956691741943, variance: 0.1636902093887329
Train size: [784] hidden size: [174] batch size: [784] trial: 22, train_loss: 0.768500, train acc: 49.26796805678793, test loss: 0.790288, test acc: 46.69043478260869, bias2: 0.626688539981842, variance: 0.16359944641590118
Train size: [784] hidden size: [174] batch size: [784] trial: 23, train_loss: 0.769594, train acc: 49.303784013605444, test loss: 0.790582, test acc: 46.68541666666666, bias2: 0.6259295344352722, variance: 0.16465264558792114
Train size: [784] hidden size: [174] batch size: [784] trial: 24, train_loss: 0.768778, train acc: 49

Train size: [784] hidden size: [201] batch size: [784] trial: 7, train_loss: 0.731630, train acc: 54.9266581632653, test loss: 0.759698, test acc: 50.66374999999999, bias2: 0.6158559918403625, variance: 0.14384214580059052
Train size: [784] hidden size: [201] batch size: [784] trial: 8, train_loss: 0.729498, train acc: 55.13038548752834, test loss: 0.757399, test acc: 51.13666666666666, bias2: 0.6122391819953918, variance: 0.14515946805477142
Train size: [784] hidden size: [201] batch size: [784] trial: 9, train_loss: 0.731735, train acc: 54.7704081632653, test loss: 0.758808, test acc: 51.029999999999994, bias2: 0.6107613444328308, variance: 0.14804649353027344
Train size: [784] hidden size: [201] batch size: [784] trial: 10, train_loss: 0.733459, train acc: 54.56864564007421, test loss: 0.759242, test acc: 51.13545454545454, bias2: 0.6090264320373535, variance: 0.1502160280942917
Train size: [784] hidden size: [201] batch size: [784] trial: 11, train_loss: 0.731245, train acc: 54.783

Train size: [784] hidden size: [201] batch size: [784] trial: 44, train_loss: 0.734262, train acc: 54.050453514739246, test loss: 0.759532, test acc: 50.94444444444444, bias2: 0.5993260741233826, variance: 0.1602059006690979
Train size: [784] hidden size: [201] batch size: [784] trial: 45, train_loss: 0.734254, train acc: 54.08717834960072, test loss: 0.759544, test acc: 50.95934782608696, bias2: 0.5993079543113708, variance: 0.16023601591587067
Train size: [784] hidden size: [201] batch size: [784] trial: 46, train_loss: 0.733978, train acc: 54.103343465045604, test loss: 0.759371, test acc: 50.97319148936171, bias2: 0.5992680788040161, variance: 0.1601032316684723
Train size: [784] hidden size: [201] batch size: [784] trial: 47, train_loss: 0.733869, train acc: 54.10820578231294, test loss: 0.759051, test acc: 51.02916666666667, bias2: 0.5990695953369141, variance: 0.15998145937919617
Train size: [784] hidden size: [201] batch size: [784] trial: 48, train_loss: 0.733629, train acc: 5

Train size: [784] hidden size: [231] batch size: [784] trial: 31, train_loss: 0.706430, train acc: 57.56138392857142, test loss: 0.733050, test acc: 54.6028125, bias2: 0.5793747901916504, variance: 0.1536751687526703
Train size: [784] hidden size: [231] batch size: [784] trial: 32, train_loss: 0.706085, train acc: 57.5525664811379, test loss: 0.733130, test acc: 54.555454545454545, bias2: 0.5790455341339111, variance: 0.15408435463905334
Train size: [784] hidden size: [231] batch size: [784] trial: 33, train_loss: 0.707091, train acc: 57.424219687875144, test loss: 0.733665, test acc: 54.4585294117647, bias2: 0.5794014930725098, variance: 0.15426382422447205
Train size: [784] hidden size: [231] batch size: [784] trial: 34, train_loss: 0.706779, train acc: 57.438046647230316, test loss: 0.733990, test acc: 54.41971428571428, bias2: 0.5793778896331787, variance: 0.15461163222789764
Train size: [784] hidden size: [231] batch size: [784] trial: 35, train_loss: 0.706660, train acc: 57.42630

Train size: [784] hidden size: [266] batch size: [784] trial: 18, train_loss: 0.683410, train acc: 61.43931256713211, test loss: 0.709733, test acc: 58.06842105263158, bias2: 0.5606421828269958, variance: 0.1490909457206726
Train size: [784] hidden size: [266] batch size: [784] trial: 19, train_loss: 0.683248, train acc: 61.4094387755102, test loss: 0.709085, test acc: 58.156499999999994, bias2: 0.5600540041923523, variance: 0.1490306407213211
Train size: [784] hidden size: [266] batch size: [784] trial: 20, train_loss: 0.683612, train acc: 61.35204081632653, test loss: 0.709783, test acc: 57.999999999999986, bias2: 0.5603952407836914, variance: 0.14938823878765106
Train size: [784] hidden size: [266] batch size: [784] trial: 21, train_loss: 0.683682, train acc: 61.276669758812616, test loss: 0.711019, test acc: 57.75499999999999, bias2: 0.5610964894294739, variance: 0.14992208778858185
Train size: [784] hidden size: [266] batch size: [784] trial: 22, train_loss: 0.683158, train acc: 6

Train size: [784] hidden size: [306] batch size: [784] trial: 4, train_loss: 0.657900, train acc: 63.57142857142858, test loss: 0.691012, test acc: 59.452, bias2: 0.5716373920440674, variance: 0.11937415599822998
Train size: [784] hidden size: [306] batch size: [784] trial: 5, train_loss: 0.664193, train acc: 62.92517006802722, test loss: 0.694483, test acc: 59.18, bias2: 0.5694274306297302, variance: 0.1250554919242859
Train size: [784] hidden size: [306] batch size: [784] trial: 6, train_loss: 0.665731, train acc: 62.937317784256564, test loss: 0.694158, test acc: 59.54285714285714, bias2: 0.5654891729354858, variance: 0.12866836786270142
Train size: [784] hidden size: [306] batch size: [784] trial: 7, train_loss: 0.663029, train acc: 62.89859693877551, test loss: 0.695192, test acc: 58.99249999999999, bias2: 0.5630285143852234, variance: 0.13216346502304077
Train size: [784] hidden size: [306] batch size: [784] trial: 8, train_loss: 0.660489, train acc: 62.925170068027214, test loss

Train size: [784] hidden size: [306] batch size: [784] trial: 41, train_loss: 0.658268, train acc: 63.42626336248783, test loss: 0.690050, test acc: 59.84380952380951, bias2: 0.5432194471359253, variance: 0.14683018624782562
Train size: [784] hidden size: [306] batch size: [784] trial: 42, train_loss: 0.658232, train acc: 63.42548647365921, test loss: 0.690203, test acc: 59.84697674418604, bias2: 0.5431394577026367, variance: 0.14706382155418396
Train size: [784] hidden size: [306] batch size: [784] trial: 43, train_loss: 0.658006, train acc: 63.44503710575137, test loss: 0.690199, test acc: 59.83272727272726, bias2: 0.5431816577911377, variance: 0.1470169723033905
Train size: [784] hidden size: [306] batch size: [784] trial: 44, train_loss: 0.658440, train acc: 63.381519274376394, test loss: 0.690382, test acc: 59.85155555555554, bias2: 0.543315052986145, variance: 0.14706721901893616
Train size: [784] hidden size: [306] batch size: [784] trial: 45, train_loss: 0.658627, train acc: 63

Train size: [784] hidden size: [353] batch size: [784] trial: 28, train_loss: 0.623674, train acc: 67.5228712174525, test loss: 0.661109, test acc: 63.67896551724136, bias2: 0.5206835269927979, variance: 0.1404251903295517
Train size: [784] hidden size: [353] batch size: [784] trial: 29, train_loss: 0.624077, train acc: 67.53826530612245, test loss: 0.661221, test acc: 63.650999999999975, bias2: 0.5205336809158325, variance: 0.14068715274333954
Train size: [784] hidden size: [353] batch size: [784] trial: 30, train_loss: 0.624314, train acc: 67.51974983541805, test loss: 0.661691, test acc: 63.586774193548365, bias2: 0.5207969546318054, variance: 0.14089356362819672
Train size: [784] hidden size: [353] batch size: [784] trial: 31, train_loss: 0.623866, train acc: 67.52630739795919, test loss: 0.660983, test acc: 63.64156249999998, bias2: 0.5206345915794373, variance: 0.14034868776798248
Train size: [784] hidden size: [353] batch size: [784] trial: 32, train_loss: 0.623923, train acc: 6

Train size: [784] hidden size: [406] batch size: [784] trial: 15, train_loss: 0.605650, train acc: 70.08928571428572, test loss: 0.641893, test acc: 66.04624999999999, bias2: 0.5116873979568481, variance: 0.13020583987236023
Train size: [784] hidden size: [406] batch size: [784] trial: 16, train_loss: 0.606220, train acc: 70.078031212485, test loss: 0.641533, test acc: 66.10117647058823, bias2: 0.5107542276382446, variance: 0.13077853620052338
Train size: [784] hidden size: [406] batch size: [784] trial: 17, train_loss: 0.605376, train acc: 70.03968253968254, test loss: 0.641105, test acc: 66.07555555555555, bias2: 0.5102327466011047, variance: 0.1308719664812088
Train size: [784] hidden size: [406] batch size: [784] trial: 18, train_loss: 0.605965, train acc: 69.99865735767992, test loss: 0.642049, test acc: 65.94684210526316, bias2: 0.5103105902671814, variance: 0.13173888623714447
Train size: [784] hidden size: [406] batch size: [784] trial: 19, train_loss: 0.606124, train acc: 69.9

Train size: [784] hidden size: [468] batch size: [784] trial: 2, train_loss: 0.575932, train acc: 72.78911564625851, test loss: 0.612448, test acc: 68.87, bias2: 0.5262056589126587, variance: 0.08624199032783508
Train size: [784] hidden size: [468] batch size: [784] trial: 3, train_loss: 0.578227, train acc: 72.54464285714286, test loss: 0.617679, test acc: 68.17500000000001, bias2: 0.5210427045822144, variance: 0.09663677215576172
Train size: [784] hidden size: [468] batch size: [784] trial: 4, train_loss: 0.578968, train acc: 72.47448979591837, test loss: 0.621145, test acc: 67.71000000000001, bias2: 0.5170609951019287, variance: 0.1040838286280632
Train size: [784] hidden size: [468] batch size: [784] trial: 5, train_loss: 0.575648, train acc: 72.83163265306122, test loss: 0.621872, test acc: 67.42500000000001, bias2: 0.5121167898178101, variance: 0.10975529253482819
Train size: [784] hidden size: [468] batch size: [784] trial: 6, train_loss: 0.579335, train acc: 72.649416909621, te

Train size: [784] hidden size: [468] batch size: [784] trial: 39, train_loss: 0.581321, train acc: 72.63392857142854, test loss: 0.621529, test acc: 68.07674999999998, bias2: 0.49009621143341064, variance: 0.1314324289560318
Train size: [784] hidden size: [468] batch size: [784] trial: 40, train_loss: 0.581413, train acc: 72.62319561971127, test loss: 0.621192, test acc: 68.12682926829267, bias2: 0.48983752727508545, variance: 0.1313544064760208
Train size: [784] hidden size: [468] batch size: [784] trial: 41, train_loss: 0.581541, train acc: 72.64941690962097, test loss: 0.621357, test acc: 68.15690476190474, bias2: 0.48977476358413696, variance: 0.13158206641674042
Train size: [784] hidden size: [468] batch size: [784] trial: 42, train_loss: 0.581246, train acc: 72.65365448504981, test loss: 0.621090, test acc: 68.19534883720928, bias2: 0.4894142150878906, variance: 0.13167595863342285
Train size: [784] hidden size: [468] batch size: [784] trial: 43, train_loss: 0.581671, train acc: 

Train size: [784] hidden size: [538] batch size: [784] trial: 26, train_loss: 0.561294, train acc: 74.14021164021165, test loss: 0.602348, test acc: 70.03444444444443, bias2: 0.47577399015426636, variance: 0.12657387554645538
Train size: [784] hidden size: [538] batch size: [784] trial: 27, train_loss: 0.561562, train acc: 74.18002915451895, test loss: 0.602409, test acc: 70.05107142857142, bias2: 0.4756424129009247, variance: 0.1267661154270172
Train size: [784] hidden size: [538] batch size: [784] trial: 28, train_loss: 0.561266, train acc: 74.30066854327939, test loss: 0.602513, test acc: 70.08620689655172, bias2: 0.4755009114742279, variance: 0.12701252102851868
Train size: [784] hidden size: [538] batch size: [784] trial: 29, train_loss: 0.561147, train acc: 74.29421768707483, test loss: 0.602715, test acc: 70.03466666666667, bias2: 0.47552764415740967, variance: 0.12718747556209564
Train size: [784] hidden size: [538] batch size: [784] trial: 30, train_loss: 0.561626, train acc: 

Train size: [784] hidden size: [620] batch size: [784] trial: 13, train_loss: 0.538960, train acc: 76.77660349854229, test loss: 0.584546, test acc: 71.73357142857142, bias2: 0.465985506772995, variance: 0.11856047064065933
Train size: [784] hidden size: [620] batch size: [784] trial: 14, train_loss: 0.539382, train acc: 76.60714285714288, test loss: 0.584508, test acc: 71.70333333333333, bias2: 0.46520334482192993, variance: 0.11930496245622635
Train size: [784] hidden size: [620] batch size: [784] trial: 15, train_loss: 0.539070, train acc: 76.63424744897961, test loss: 0.584393, test acc: 71.809375, bias2: 0.4648485481739044, variance: 0.11954466253519058
Train size: [784] hidden size: [620] batch size: [784] trial: 16, train_loss: 0.539471, train acc: 76.62815126050421, test loss: 0.585162, test acc: 71.7535294117647, bias2: 0.4649440348148346, variance: 0.12021789699792862
Train size: [784] hidden size: [620] batch size: [784] trial: 17, train_loss: 0.539833, train acc: 76.5943877

Train size: [784] hidden size: [714] batch size: [784] trial: 0, train_loss: 0.511490, train acc: 79.71938775510205, test loss: 0.564658, test acc: 73.19, bias2: 0.5646582245826721, variance: 1.0120625226761604e-08
Train size: [784] hidden size: [714] batch size: [784] trial: 1, train_loss: 0.515350, train acc: 79.71938775510205, test loss: 0.563078, test acc: 74.33500000000001, bias2: 0.5037022829055786, variance: 0.059375397861003876
Train size: [784] hidden size: [714] batch size: [784] trial: 2, train_loss: 0.511638, train acc: 79.29421768707483, test loss: 0.563408, test acc: 73.76666666666667, bias2: 0.4822056293487549, variance: 0.08120274543762207
Train size: [784] hidden size: [714] batch size: [784] trial: 3, train_loss: 0.513393, train acc: 79.55994897959184, test loss: 0.564535, test acc: 74.0275, bias2: 0.4714549779891968, variance: 0.09307967126369476
Train size: [784] hidden size: [714] batch size: [784] trial: 4, train_loss: 0.513481, train acc: 78.59693877551021, test 

Train size: [784] hidden size: [714] batch size: [784] trial: 37, train_loss: 0.513630, train acc: 79.10848549946292, test loss: 0.565199, test acc: 73.70631578947366, bias2: 0.44554635882377625, variance: 0.11965300887823105
Train size: [784] hidden size: [714] batch size: [784] trial: 38, train_loss: 0.514026, train acc: 79.01622187336471, test loss: 0.565250, test acc: 73.71358974358972, bias2: 0.4455745816230774, variance: 0.11967581510543823
Train size: [784] hidden size: [714] batch size: [784] trial: 39, train_loss: 0.514001, train acc: 79.04017857142856, test loss: 0.564925, test acc: 73.74924999999998, bias2: 0.4453010857105255, variance: 0.11962398141622543
Train size: [784] hidden size: [714] batch size: [784] trial: 40, train_loss: 0.513966, train acc: 79.05052264808361, test loss: 0.564705, test acc: 73.77634146341461, bias2: 0.44502073526382446, variance: 0.11968434602022171
Train size: [784] hidden size: [714] batch size: [784] trial: 41, train_loss: 0.513903, train acc:

In [None]:
import matplotlib
import matplotlib.pyplot as plt
font = {
        'size'   : 18}
matplotlib.rc('font', **font)
figsize = (16, 5)
import seaborn as sns
sns.set_style('darkgrid')
import pandas as pd

def plot_bias_var(df, N_D, ymin=0, ymax=1.0):
    fig1, axes1 = plt.subplots(1, 3, figsize=figsize)
    axes1[0].set_xscale('log')
    axes1[1].set_xscale('log')
    axes1[2].set_xscale('log')
    cur_df = df[df['train_size']/feature_dim==N_D]
    test_loss = cur_df['test_loss']
    bias2 = cur_df['bias2']
    var = cur_df['variance']
    P_N = cur_df['hidden_size']/cur_df['train_size']
    axes1[0].plot(P_N, test_loss)
    axes1[0].set_xlabel("P/N")
    axes1[0].set_ylabel("Test Loss")
    axes1[0].set_ylim(ymin, ymax)
    axes1[1].plot(P_N, bias2)
    axes1[1].set_xlabel("P/N")
    axes1[1].set_ylabel("Bias Square")
    axes1[1].set_ylim(ymin, ymax)
    axes1[2].plot(P_N, var)
    axes1[2].set_xlabel("P/N")
    axes1[2].set_ylabel("Variance")
    axes1[2].set_ylim(ymin, ymax)
    fig1.suptitle("Bias-Variance Decomposition (N/D={:.2f})".format(N_D))
    plt.show()
def plot_single_vs_ensemble(dfs_list, Ks_list, N_D, feature_dim, ymin=0, ymax=1.0):
    assert len(dfs_list) == len(Ks_list)
    fig1, axes1 = plt.subplots(1, 3, figsize=figsize)
    for i in range(3):
        axes1[i].set_xscale('log')
    dfs_list = [df[df['train_size']/feature_dim==N_D] for df in dfs_list]
    for cur_df, K in zip(dfs_list, Ks_list):
        test_loss = cur_df['test_loss']
        bias2 = cur_df['bias2']
        var = cur_df['variance']
        P_N = cur_df['hidden_size']/cur_df['train_size']
        axes1[0].plot(P_N, test_loss, label='K={}'.format(K))
        axes1[1].plot(P_N, bias2, label='K={}'.format(K))
        axes1[2].plot(P_N, var, label='K={}'.format(K))
    
    axes1[0].set_xlabel("P/N")
    axes1[0].set_ylabel("Test Loss")
    axes1[0].set_ylim(ymin, ymax)
    
    axes1[1].set_xlabel("P/N")
    axes1[1].set_ylabel("Bias Square")
    axes1[1].set_ylim(ymin, ymax)
    
    axes1[2].set_xlabel("P/N")
    axes1[2].set_ylabel("Variance")
    axes1[2].set_ylim(ymin, ymax)
    fig1.suptitle("Bias-Variance Decomposition (N/D={:.2f})".format(N_D))
    plt.legend()
    plt.show()


In [None]:
# K2_df = pd.read_csv(os.path.join(outdir, 'ensembleNNK=2_output.csv'))
# K1_df = pd.read_csv(os.path.join(outdir, 'singleNN_output.csv'))
# plot_single_vs_ensemble([K1_df, K2_df], [1, 2], N_Ds[0], 784,)