Default to float64 for CPU tests because of numerical instability on CPU
tfjgeorge committed Sep 15, 2020
1 parent d114ae5 commit 6cfb3b1
Showing 4 changed files with 56 additions and 27 deletions.
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,9 @@
+import torch
+import pytest
+
+
+@pytest.fixture(scope="session", autouse=True)
+def default_to_float64_on_cpu(request):
+    if not torch.cuda.is_available():
+        torch.set_default_dtype(torch.float64)
+        torch.set_default_tensor_type(torch.DoubleTensor)
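
A minimal sketch (editor's illustration, not part of the commit) of the failure mode this fixture guards against: float32 accumulation on CPU leaves absolute errors far larger than float64 does, enough to break tight comparisons between two mathematically identical quantities.

import torch

torch.manual_seed(0)
a = torch.randn(1000, 1000, dtype=torch.float32)

# the same Gram matrix, accumulated in float32 vs float64
gram32 = (a @ a.t()).double()
gram64 = a.double() @ a.double().t()

# typically somewhere around 1e-4 to 1e-3 in absolute terms: enough to
# fail a tight allclose-style check even though both results are "correct"
print((gram32 - gram64).abs().max())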
22 changes: 15 additions & 7 deletions tests/tasks.py
@@ -12,9 +12,17 @@
 
 if torch.cuda.is_available():
     device = 'cuda'
+
+    def to_device(tensor):
+        return tensor.to(device)
 else:
     device = 'cpu'
+
+    # on cpu we need to use double as otherwise ill-conditioning in sums
+    # causes numerical instability
+    def to_device(tensor):
+        return tensor.double()
 
 class FCNet(nn.Module):
     def __init__(self, in_size=10, out_size=10, n_hidden=2, hidden_size=15,
                        nonlinearity=nn.ReLU, normalization='none'):
@@ -90,7 +98,7 @@ def get_linear_fc_task():
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(),
@@ -122,7 +130,7 @@ def get_linear_conv_task():
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(),
@@ -156,7 +164,7 @@ def get_batchnorm_fc_linear_task():
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     lc_full = LayerCollection.from_model(net)
     layer_collection = LayerCollection()
@@ -197,7 +205,7 @@ def get_batchnorm_conv_linear_task():
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     lc_full = LayerCollection.from_model(net)
     layer_collection = LayerCollection()
@@ -249,7 +257,7 @@ def get_batchnorm_nonlinear_task():
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(),
@@ -274,7 +282,7 @@ def get_fullyconnect_task(normalization='none'):
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(),
@@ -296,7 +304,7 @@ def get_conv_task(normalization='none'):
     net.to(device)
 
     def output_fn(input, target):
-        return net(input.to(device))
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(),
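
A hedged sketch (names hypothetical, not from the diff) of why the cpu branch of to_device casts rather than moves: with the conftest fixture active, module parameters are created in float64, so float32 batches coming out of a DataLoader must be cast before the forward pass or PyTorch raises a dtype-mismatch error.

import torch
import torch.nn as nn

torch.set_default_dtype(torch.float64)   # what tests/conftest.py does on cpu

net = nn.Linear(10, 2)                   # parameters now default to float64
x32 = torch.randn(5, 10, dtype=torch.float32)  # DataLoaders typically yield float32

# net(x32) would raise a dtype-mismatch error; casting the input, as
# to_device() does on cpu, keeps inputs and parameters consistent:
y = net(x32.double())
print(y.dtype)  # torch.float64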
4 changes: 2 additions & 2 deletions tests/test_jacobian_ekfac.py
@@ -1,7 +1,7 @@
 from nngeometry.generator.jacobian import Jacobian
 from nngeometry.object.pspace import PMatBlockDiag, PMatKFAC, PMatEKFAC
 import torch
-from tasks import get_fullyconnect_task, get_conv_task
+from tasks import get_fullyconnect_task, get_conv_task, device
 from nngeometry.object.vector import random_pvector
 from utils import check_ratio, check_tensors

@@ -55,7 +55,7 @@ def test_pspace_ekfac_vs_direct():
                          n_output=n_output)
 
     M_ekfac = PMatEKFAC(generator)
-    v = random_pvector(lc, device='cuda')
+    v = random_pvector(lc, device=device)
 
     # the second time we will have called update_diag
     for i in range(2):
48 changes: 30 additions & 18 deletions tests/test_jacobian_kfac.py
@@ -13,6 +13,20 @@
 from tasks import get_fullyconnect_task
 
 
+if torch.cuda.is_available():
+    device = 'cuda'
+
+    def to_device(tensor):
+        return tensor.to(device)
+else:
+    device = 'cpu'
+
+    # on cpu we need to use double as otherwise ill-conditioning in sums
+    # causes numerical instability
+    def to_device(tensor):
+        return tensor.double()
+
+
 class Net(nn.Module):
     def __init__(self, in_size=10, out_size=10, n_hidden=2, hidden_size=25,
                        nonlinearity=nn.ReLU):
@@ -62,24 +76,33 @@ def forward(self, x):
 def get_fullyconnect_kfac_task(bs=300):
     train_set = get_dataset('train')
     train_set = Subset(train_set, range(1000))
-    train_set = to_onexdataset(train_set, 'cuda')
+    train_set = to_onexdataset(train_set, device)
     train_loader = DataLoader(
         dataset=train_set,
         batch_size=bs,
         shuffle=False)
 
     net = Net(in_size=10)
-    net.to('cuda')
+    net.to(device)
 
     def output_fn(input, target):
-        input = input.to('cuda')
-        return net(input)
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(), net,
             output_fn, 10)
 
 
+def to_onexdataset(dataset, device):
+    # this weird dataset only uses a single input x repeated, it is only
+    # designed to test kfac since in this case KFAC and regular Fisher
+    # are the same
+    loader = torch.utils.data.DataLoader(dataset, len(dataset))
+    x, t = next(iter(loader))
+    x = x[0, :].repeat(x.size(0), 1)
+    return torch.utils.data.TensorDataset(x.to(device), t.to(device))
+
+
 def get_convnet_kfc_task(bs=300):
     train_set = datasets.MNIST(root=default_datapath,
                                train=True,
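
Why a single repeated input makes KFAC exact (an editor's gloss using standard KFAC algebra, not part of the commit): for a layer with input activation a and backpropagated output gradient g, the exact Fisher block and its KFAC approximation are

F_{\text{exact}} = \mathbb{E}\left[(g g^{\top}) \otimes (a a^{\top})\right],
\qquad
F_{\text{KFAC}} = \mathbb{E}\left[g g^{\top}\right] \otimes \mathbb{E}\left[a a^{\top}\right].

When every sample shares the same input, a a^{\top} is a constant and factors out of the expectation:

\mathbb{E}\left[(g g^{\top}) \otimes (a a^{\top})\right]
  = \mathbb{E}\left[g g^{\top}\right] \otimes (a a^{\top}) = F_{\text{KFAC}},

so on this dataset KFAC coincides with the regular Fisher, which is what the to_onexdataset comment relies on.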
@@ -91,27 +114,16 @@ def get_convnet_kfc_task(bs=300):
         batch_size=bs,
         shuffle=False)
     net = ConvNet()
-    net.to('cuda')
+    net.to(device)
 
     def output_fn(input, target):
-        input = input.to('cuda')
-        return net(input)
+        return net(to_device(input))
 
     layer_collection = LayerCollection.from_model(net)
     return (train_loader, layer_collection, net.parameters(), net,
             output_fn, 10)
 
 
-def to_onexdataset(dataset, device):
-    # this weird dataset only uses a single input x repeated, it is only
-    # designed to test kfac since in this case KFAC and regular Fisher
-    # are the same
-    loader = torch.utils.data.DataLoader(dataset, len(dataset))
-    x, t = next(iter(loader))
-    x = x[0, :].repeat(x.size(0), 1)
-    return torch.utils.data.TensorDataset(x.to(device), t.to(device))
-
-
 def test_jacobian_kfac_vs_pblockdiag():
     """
     Compares blockdiag and kfac representation on datasets/architectures
@@ -162,7 +174,7 @@ def test_jacobian_kfac():
                           M_kfac.get_diag(split_weight_bias=True))
 
     # sample random vector
-    random_v = random_pvector(lc, 'cuda')
+    random_v = random_pvector(lc, device)
 
     # Test mv
     mv_direct = torch.mv(G_kfac_split, random_v.get_flat_representation())
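
The mv test above checks a structured representation's matrix-vector product against multiplication by its dense form. A sketch of the same check in plain torch (stand-alone illustration, not nngeometry's API), using the Kronecker identity (A ⊗ B) vec(X) = vec(B X Aᵀ) that KFAC-style representations exploit to avoid materializing the dense matrix:

import torch

torch.manual_seed(0)
A = torch.randn(4, 4, dtype=torch.float64)
B = torch.randn(3, 3, dtype=torch.float64)
v = torch.randn(12, dtype=torch.float64)

def kron_mv(A, B, v):
    # (A kron B) @ v without forming the dense Kronecker product,
    # using column-major vec/unvec
    X = v.reshape(A.size(1), B.size(1)).t()
    return (B @ X @ A.t()).t().reshape(-1)

dense = torch.kron(A, B)  # dense counterpart, playing the role of G_kfac
assert torch.allclose(kron_mv(A, B, v), torch.mv(dense, v))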
