## Run package tests

In [1]:
# import sys
# import unittest
# import network_moments.torch.gaussian as gnm
# runner = unittest.TextTestRunner(sys.stdout, verbosity=2)
# load = unittest.TestLoader().loadTestsFromModule
# result = runner.run(unittest.TestSuite([
#     load(gnm.relu.tests),
#     load(gnm.affine.tests),
#     load(gnm.net.adf.tests),
# ]))

## TODOs:

 - Fix relu covariance in case of zero-mean and zero-variance
 - Support scalar variance for affine.batch_moments()
 - Make computing AS faster for diagonal input variance
 - Backpropagation through the moments

In [2]:
import torch
from time import time
import matplotlib.pyplot as plt
from torchvision import datasets
import network_moments.torch.gaussian as gnm
from torch.distributions import MultivariateNormal as gaussian

# Optional notebook magic (left disabled) for the interactive widget backend.
# %matplotlib widget
# Apply matplotlib's 'dark_background' style sheet to the figures created below.
plt.style.use('dark_background')

def timeit(stmt):
    '''Time a statement with IPython's ``%timeit`` line magic.

    Args:
        stmt: The statement to benchmark; it is interpolated into the
            argument line of the magic after the ``-o -q`` flags.

    Returns:
        The ``(average, stdev)`` attributes of the object returned
        by the magic.
    '''
    shell = get_ipython()
    magic_args = '-o -q {}'.format(stmt)
    result = shell.run_line_magic('timeit', magic_args)
    return result.average, result.stdev

def get_gaussian(dims, sigma=1, zero_mean=False, mbatch=1, vbatch=1,
                 dtype=torch.float64, device='cpu'):
    '''Sample random parameters of a Gaussian distribution.

    Args:
        dims: Number of dimensions of the random variable.
        sigma: Scale of the random mean; ``sigma ** 2`` is forwarded as
            ``norm`` to ``gnm.utils.rand.definite``.
        zero_mean: Whether to zero the mean after it has been sampled.
        mbatch: Number of means to sample.
        vbatch: Forwarded as ``batch`` to ``gnm.utils.rand.definite``.
        dtype: The data type of the created tensors.
        device: The device of the created tensors.

    Returns:
        ``(mean, cov, var)`` where ``mean`` has shape ``(mbatch, dims)`` and
        ``var`` is the diagonal of ``cov`` over its last two dimensions.
    '''
    tensor_kwargs = dict(dtype=dtype, device=device)

    # The mean is sampled even when `zero_mean` is set, so the random
    # number generator is advanced in both cases.
    mean = torch.randn(mbatch, dims, **tensor_kwargs) * sigma
    if zero_mean:
        mean.zero_()

    cov = gnm.utils.rand.definite(dims, norm=sigma ** 2,
                                  batch=vbatch, **tensor_kwargs)
    return mean, cov, cov.diagonal(dim1=-2, dim2=-1)


def get_net(num_layers=4, dims=2, bias_in_first_layer=True, verbose=False):
    '''Build a random fully-connected ReLU network in three flavours.

    The network alternates ``Linear(dims, dims)`` and in-place ``ReLU``
    layers, ends with a ``Linear`` layer, and is converted to double
    precision and put in evaluation mode.

    Args:
        num_layers: Number of ``Linear`` layers.
        dims: Input and output size of every ``Linear`` layer.
        bias_in_first_layer: Whether the first ``Linear`` layer has a bias
            (all the following ones always do).
        verbose: Whether to print the network and the linearization index.

    Returns:
        ``(net, lrs, tsl)``: the network, the result of
        ``gnm.net.Sequential.split_layers(net)`` and the result of
        ``gnm.net.Sequential.encapsulate`` applied to the network split
        before and after the linearization layer.
    '''
    modules = []
    for depth in range(num_layers):
        has_bias = bias_in_first_layer or depth > 0
        modules.append(torch.nn.Linear(dims, dims, bias=has_bias))
        modules.append(torch.nn.ReLU(inplace=True))
    modules = modules[:-1]  # the network ends with a Linear layer
    net = gnm.net.Sequential(*modules).double().eval()
    if verbose:
        print(net)

    # index of linearization layer
    relu = 2 * max(num_layers - 1, 1) - 1
    lrs = gnm.net.Sequential.split_layers(net)
    head, middle, tail = net[:relu], net[relu:relu + 1], net[relu + 1:]
    tsl = gnm.net.Sequential.encapsulate(head, middle, tail)
    if verbose:
        print('Linearizing around layer {}.'.format(relu % len(net)))
    return net, lrs, tsl


def get_image(loader, index):
    '''Fetch a single image by its global position in a batched loader.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches, where
            ``images`` supports ``len()`` and slicing.
        index: Zero-based position of the image counted across batches.

    Returns:
        The one-element slice of the batch that holds the requested image,
        or ``None`` (after printing a message) if the loader does not
        contain that many images.
    '''
    # Position of the requested image relative to the current batch; the
    # caller's `index` is left untouched so it can be reported accurately.
    remaining = index
    for images, _ in loader:
        batch_size = len(images)
        if remaining < batch_size:
            return images[remaining:remaining + 1]
        remaining -= batch_size
    print('Couldn\'t find the image at index {} !!'.format(index))
    return None


def error(a, b):
    '''Mean absolute difference between two tensors.

    Args:
        a: The reference tensor.
        b: The tensor to compare against the reference.

    Returns:
        The mean of ``|a - b|`` as a Python float, or ``float('inf')``
        if the two tensors do not have the same size.
    '''
    if a.size() == b.size():
        difference = a - b
        return difference.abs().mean().item()
    return float('inf')
# Alternative (relative) error measures:
#     return ((a / b) - 1).abs().mean().item()
#     return ((a - b).norm() / a.norm()).item()

In [4]:
# Load a LeNet model and the MNIST training set.
# `cuda` selects the device of the model and the data-loading options below.
cuda = False
lenet = gnm.net.LeNet().to('cuda' if cuda else 'cpu')
# NOTE(review): torch.load is called without `map_location`; a checkpoint
# saved from another device may need it to load here - confirm.
lenet.load_state_dict(torch.load('models/mnist/lenet.pt'))
# The dataset uses the transforms provided by the model itself.
mnist_train = datasets.MNIST('data/mnist', train=True,
                             transform=lenet.default_transforms())
# shuffle=False keeps the dataset order, so a position in the loader
# always refers to the same image.
mnist_train_loader = torch.utils.data.DataLoader(mnist_train,
                                                 batch_size=5000,
                                                 pin_memory=cuda,
                                                 num_workers=4 if cuda else 0,
                                                 shuffle=False,
                                                 drop_last=False)
# Uncomment to evaluate the model accuracy on the training set.
# print(lenet.accuracy(mnist_train_loader, 'cuda' if cuda else 'cpu'))

0.9975166666666667


In [None]:
# Load an AlexNet model and the ImageNet validation set.
# `cuda` selects the device of the model.
cuda = False
alexnet = gnm.net.AlexNet().to('cuda' if cuda else 'cpu')
# NOTE(review): torch.load is called without `map_location`; a checkpoint
# saved from another device may need it to load here - confirm.
alexnet.load_state_dict(torch.load('models/imagenet/alexnet.pt'))
# The dataset uses the transforms provided by the model itself.
imagenet_valid = datasets.ImageFolder('data/imagenet/val/',
                                      transform=alexnet.default_transforms())
# NOTE(review): unlike the MNIST loader, `pin_memory` and `num_workers`
# are hard-coded here and do not depend on `cuda` - confirm this is intended.
imagenet_valid_loader = torch.utils.data.DataLoader(imagenet_valid,
                                                    batch_size=64,
                                                    pin_memory=True,
                                                    num_workers=4,
                                                    shuffle=False,
                                                    drop_last=False)
# Uncomment to evaluate the model accuracy on the validation set.
# print(alexnet.accuracy(imagenet_valid_loader, 'cuda' if cuda else 'cpu'))

## Performance comparison (forward pass vs ADF vs TSL)

NOTE: ADF is 10 times slower than the forward pass and TSL is 10 times slower than ADF

In [None]:
# dims = 100
# mu, cov, var = get_gaussian(dims)
# net, lrs, tsl = get_net(num_layers=7, dims=dims)

# # forward pass time
# %timeit net(mu)
# # 189 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

# # independent ADF time
# %timeit gnm.net.adf.gaussian(lrs, mu, var, independent=True)
# # 1.35 ms ± 9.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

# # independent TSL time
# %timeit gnm.net.adf.gaussian(tsl, mu, var, independent=True)
# # 14.9 ms ± 651 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

# # TSL time
# %timeit gnm.net.adf.gaussian(tsl, mu, cov, independent=False)
# # 36.1 ms ± 4.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

## Profile $\mathbf{A}\mathbf{\Sigma}\mathbf{A}^\top$ if we know $\mathbf{A}$ versus our trick

We are computing the covariance of the linearization of a neural network around a certain point. One could say that this is a memory and computation tradeoff where we can just save the linearizations and compute the covariance directly.

NOTE: Ordered from fastest to slowest, on both CPU and GPU, the methods are: Cache, Trick, ASAT

In [None]:
def trick_fn(net, mu, cov):
    '''Compute the output covariance with the "trick" being profiled.

    Evaluates ``AS = gnm.utils.jac_at_x(net, mu, gnm.utils.sqrtm(cov))``
    without tracking gradients and returns ``AS^T AS``.

    Args:
        net: The network.
        mu: The input mean (the point passed to ``jac_at_x``).
        cov: The input covariance matrix.

    Returns:
        The matrix ``AS.t().mm(AS)``.
    '''
    with torch.no_grad():
        cov_root = gnm.utils.sqrtm(cov)
        AS = gnm.utils.jac_at_x(net, mu, cov_root)
    AS_t = AS.t()
    return AS_t.mm(AS)

def asat_fn(net, mu, cov):
    '''Compute ``A cov A^T`` where ``A`` is obtained by linearizing the network.

    Args:
        net: The network.
        mu: The input mean; it is reshaped to a single-row batch before
            being passed to ``gnm.utils.linearize``.
        cov: The input covariance matrix.

    Returns:
        The matrix ``A cov A^T`` with ``A`` being the first element of the
        result of ``gnm.utils.linearize``.
    '''
    batched_mu = mu.view(1, -1)
    linearization = gnm.utils.linearize(net, batched_mu, True)
    A = linearization[0]
    A_cov = A.mm(cov)
    return A_cov.mm(A.t())

def cache_fn(A, cov):
    '''Compute ``A cov A^T`` for an already available (cached) matrix ``A``.

    Args:
        A: A 2D matrix.
        cov: A 2D square matrix with as many rows as ``A`` has columns.

    Returns:
        The matrix product ``A cov A^T``.
    '''
    A_cov = A.mm(cov)
    return A_cov.mm(A.t())

def _asat_average_time(fn, trials, device):
    '''Average wall-clock time in seconds of `trials` calls to `fn()`.

    CUDA work is queued asynchronously, so when `device` is a CUDA device
    it is synchronized before starting and before stopping the clock;
    otherwise only the launch overhead would be measured.
    '''
    target = torch.device(device)
    on_cuda = target.type == 'cuda'
    if on_cuda:
        torch.cuda.synchronize(target)
    start = time()
    for _ in range(trials):
        fn()
    if on_cuda:
        torch.cuda.synchronize(target)
    return (time() - start) / trials


def asat_cache_trick(device, trials=7, dim=1000, layers=10, sigma=10, dtype=torch.float64, file=None):
    '''Profile three ways of computing the covariance `A cov A^T` and plot them.

    For every input size, the average running time of `asat_fn()`,
    `trick_fn()` and `cache_fn()` is measured on a random ReLU network.
    The results are loaded from `file` when possible and saved to it
    after being computed.

    Args:
        device: The device to run the benchmark on.
        trials: Number of runs to average each measurement over.
        dim: Size of the hidden layers of the network.
        layers: Number of `Linear` layers in the network.
        sigma: Forwarded to `get_gaussian()` for the input distribution.
        dtype: The data type of the network and the input.
        file: Optional path of the results cache. If it is `None`, missing
            or cannot be read as a `(dims, times)` pair, the benchmark is
            run (and saved to `file` if it is not `None`).
    '''
    dims = times = None
    if file is not None:
        try:
            dims, times = torch.load(file)
        except Exception:
            # Missing, unreadable or malformed cache: run the benchmark.
            # (KeyboardInterrupt and SystemExit are deliberately not caught.)
            dims = times = None

    if times is None:
        times = []
        dims = [2**i for i in range(13, 15)]
        for n in gnm.utils.verbosify(dims):
            # n inputs -> hidden layers of size dim -> n outputs
            net = gnm.net.Sequential(*[
                layer for i in range(1, layers + 1) for layer in (
                    torch.nn.Linear(n if i == 1 else dim,
                                    n if i == layers else dim),
                    torch.nn.ReLU(inplace=True),
                )
            ][:-1]).to(device, dtype).eval()

            mu, cov, _ = get_gaussian(n, sigma=sigma, dtype=dtype, device=device)
            mu, cov = mu[0], cov[0]

            asat = _asat_average_time(
                lambda: asat_fn(net, mu, cov), trials, device)
            trick = _asat_average_time(
                lambda: trick_fn(net, mu, cov), trials, device)

            # The linearization is computed once, outside the timed region.
            A = gnm.utils.linearize(net, mu.view(1, -1), True)[0]
            cache = _asat_average_time(
                lambda: cache_fn(A, cov), trials, device)

            times.append((asat, trick, cache))
        if file is not None:
            torch.save((dims, times), file)

    plt.figure()
    plt.tick_params('y', right=True)
    plt.plot(dims, [t[0] for t in times], '+-b', label='ASA^T')
    plt.plot(dims, [t[1] for t in times], '+-r', label='Trick')
    plt.plot(dims, [t[2] for t in times], '+-g', label='Cache')
    plt.ylabel('Average time in seconds')
    plt.xlabel('Number of dimensions')
    plt.xscale('log')
    plt.yscale('log')
    plt.title('Profiling on ' + str(device))
    plt.legend()
    plt.show()

# Profile on both devices. Each call first tries to load previously saved
# results from its `file` and only runs the benchmark if that fails.
asat_cache_trick(device='cpu', file='data/static/asat_cpu.pt')
asat_cache_trick(device='cuda', file='data/static/asat_gpu.pt')

## Computing the covariance of ReLU for general Gaussian input

Which is more accurate when computing the output covariance of ReLU?
 - To copy the input covariance
 - Replace the output variance with Hinton's expressions

Test all four combinations {(copy, replace), (copy), (replace), ()}

NOTE: The best approach is to replace

In [None]:
def covariance_computation(n=100, sigma=100, count=100000):
    '''Compare four ways of approximating the output covariance of ReLU.

    A Monte-Carlo estimate of the covariance of ``max(x, 0)`` for a random
    Gaussian ``x`` is used as the reference. The approximations start from
    either a copy of the input covariance or
    ``gnm.relu.zero_mean_covariance(cov)``, and optionally replace the
    diagonal with the variance returned by ``gnm.relu.moments``.
    The error of each combination is printed.

    Args:
        n: Number of dimensions of the input.
        sigma: Scale of the random mean; ``sigma ** 2`` is the ``norm``
            of the random covariance matrix.
        count: Number of Monte-Carlo samples.
    '''
    dtype = torch.float64
    mu = sigma * torch.randn(n, dtype=dtype)
    cov = gnm.utils.rand.definite(n, norm=sigma ** 2, dtype=dtype)
    relu_samples = gaussian(mu, cov).sample((count,)).clamp(min=0.0)
    mc_cov = gnm.utils.cov(relu_samples)

    def ocov(copy=False, replace=True):
        '''Approximate the output covariance with the chosen strategy.'''
        estimate = cov.clone() if copy else gnm.relu.zero_mean_covariance(cov)
        if replace:
            _, out_var = gnm.relu.moments(mu, cov.diag())
            estimate.diagonal(dim1=-2, dim2=-1).copy_(out_var)
        return estimate

    strategies = (
        ('copy only:', True, False),
        ('copy and replace:', True, True),
        ('neither:', False, False),
        ('replace:', False, True),  # gnm.relu.batch_moments is using this method
    )
    for label, copy, replace in strategies:
        print(label, error(mc_cov, ocov(copy, replace)))

# Run the comparison with the default settings (n=100, sigma=100, count=100000).
covariance_computation()

## ADF vs two-stage vs one-stage linearization tightness

NOTE: 2-stage and 1-stage sometimes give the same error

TODO: Binary classification visualization then test on MNIST and LeNet with two points (one is close to the decision boundary and the other is far)

In [None]:
# Seed PyTorch's random number generator so the random draws below are repeatable.
torch.manual_seed(0)

def _plot_error_curves(sigmas, errors, moment, title):
    '''Plot the error of every method against `sigmas` in a new figure.

    Args:
        sigmas: The values of the x-axis.
        errors: One entry per sigma; each entry holds, for the methods
            (ADF, Li2, Ln2, Ln1) in this order, a (mean error,
            variance error) pair.
        moment: 0 to plot the mean errors and 1 for the variance errors.
        title: The title of the figure.
    '''
    curves = (('b', 'ADF'), ('o-w', 'Li2'), ('+-r', 'Ln2'), ('g', 'Ln1'))
    plt.figure()
    for method, (fmt, label) in enumerate(curves):
        plt.plot(sigmas, [e[method][moment] for e in errors], fmt, label=label)
    plt.ylabel('Error')
    plt.xlabel('sigma')
    plt.title(title)
    plt.legend()
    plt.show()


def adf_vs_sl(dims=2, num_layers=4, bias_in_first_layer=True, zero_mean=False,
              samples_count=int(1e7), sigmas=(10,), verbose=False):
    '''Compare ADF against two-stage and one-stage linearization.

    The output mean and variance of a random network are estimated with
    Monte-Carlo sampling for every noise level in `sigmas`, and compared
    (using `error()`) to the moments computed by `gnm.net.adf.gaussian`
    in four configurations: ADF, two-stage linearization, independent
    two-stage linearization and one-stage linearization. If more than one
    sigma is given, the mean and variance errors are plotted against sigma.

    Args:
        dims: Forwarded to `get_net()` and `get_gaussian()`.
        num_layers: Forwarded to `get_net()`.
        bias_in_first_layer: Forwarded to `get_net()`.
        zero_mean: Forwarded to `get_gaussian()`.
        samples_count: Number of Monte-Carlo samples.
        sigmas: The noise levels; the input covariance is scaled by
            `sigma ** 2` for each of them.
        verbose: Whether to print the errors of every method.
    '''
    net, lrs, tsl = get_net(num_layers, dims, bias_in_first_layer, verbose)
    mu, cov, var = get_gaussian(dims, 1, zero_mean)
    # Zero-mean samples drawn once; they are scaled and shifted per sigma.
    samples = gaussian(mu[0] * 0, cov[0]).sample((samples_count,))

    def score(label, reference, moments):
        '''Return the (mean, variance) errors of `moments` w.r.t. `reference`.'''
        errs = (error(reference[0], moments[0]),
                error(reference[1], moments[1]))
        if verbose:
            print(label, list(errs))
        return errs

    errors = []
    for sigma in (sigmas if verbose else gnm.utils.verbosify(sigmas)):
        v = sigma ** 2
        # The Monte-Carlo reference is never differentiated, so the forward
        # pass over all the samples is run without building an autograd graph.
        with torch.no_grad():
            out = net(samples * sigma + mu[0])
            mc = (out.mean(dim=0, keepdim=True),
                  out.var(dim=0, keepdim=True))
        del out

        adf = score('ADF errors           :', mc, gnm.net.adf.gaussian(
            lrs, mu, var * v, independent=True))
        lin2 = score('2-stage linearization:', mc, gnm.net.adf.gaussian(
            tsl, mu, cov * v, independent=False, linearize=True))
        lin2i = score('Indep. 2-stage lin   :', mc, gnm.net.adf.gaussian(
            tsl, mu, cov * v, independent=True, linearize=True))
        lin1 = score('1-stage linearization:', mc, gnm.net.adf.gaussian(
            [lambda x: net(x)],  # pylint: disable=W0108
            mu, cov * v, independent=False, linearize=True))
        # Stored in the order of the plotted curves: ADF, Li2, Ln2, Ln1.
        errors.append((adf, lin2i, lin2, lin1))

    if len(sigmas) > 1:
        _plot_error_curves(sigmas, errors, 0, 'Mean Errors')
        # NOTE: a log-scaled y-axis (plt.yscale('log')) was tried for the
        # variance plot and left disabled.
        _plot_error_curves(sigmas, errors, 1, 'Variance Errors')


# Disabled sanity check: a 2-layer network without a bias in the first layer,
# a zero-mean input and sigma = 1.
# print('Sanity checks:')  # small linearization errors
# adf_vs_sl(num_layers=2, bias_in_first_layer=False,
#           zero_mean=True, sigmas=(1,), verbose=True)

# Sweep sigma over the ten evenly spaced values 1, 2, ..., 10 with 5 layers.
print('\nBehaviour of sigmas:')
adf_vs_sl(num_layers=5, sigmas=torch.linspace(1, 10, 10).numpy().tolist())

## Variance tightness with similar computation constraints

To compute the output variance using our expressions, we need to do a number of forward passes. One might argue that a Monte-Carlo estimate that uses as many samples as the number of forward passes our expressions need could actually be tighter than our method. So, we need to verify this for varying dimensionality and noise levels.

In [None]:
def constrained_variance(dim=100, layers=3, sigma=1, count=10000,
                         dtype=torch.float64, device='cpu'):
    '''Compare our variance against Monte-Carlo with the same budget.

    For input sizes n = 2, 4, ..., 4096, the output variance of a random
    network is estimated with `count` Monte-Carlo samples (the reference),
    with only `n` Monte-Carlo samples, and with `gnm.net.adf.gaussian` on
    the network split around one of its ReLU layers. The difference
    between the error of the small Monte-Carlo estimate and the error of
    our expressions is printed for every n.

    Args:
        dim: Size of the hidden layers of the network.
        layers: Number of `Linear` layers in the network.
        sigma: Forwarded to `get_gaussian()` for the input distribution.
        count: Number of samples of the reference Monte-Carlo estimate.
        dtype: The data type of the network and the input.
        device: The device of the network and the input.
    '''
    # index of linearization layer (it only depends on the number of layers)
    relu = 2 * max(layers - 1, 1) - 1
    for n in [2**i for i in range(1, 13)]:
        # n inputs -> hidden layers of size dim -> n outputs
        net = gnm.net.Sequential(*[
            layer for i in range(1, layers + 1) for layer in (
                torch.nn.Linear(n if i == 1 else dim,
                                n if i == layers else dim),
                torch.nn.ReLU(inplace=True),
            )
        ][:-1]).to(device, dtype).eval()
        tsl = gnm.net.Sequential.encapsulate(
            net[:relu], net[relu:relu + 1], net[relu + 1:])
        mu, cov, _ = get_gaussian(n, sigma=sigma, dtype=dtype, device=device)

        dist = gaussian(mu[0], cov[0])
        samples = dist.sample((count,))
        mc_var = net(samples).var(dim=0)

        samples = dist.sample((n,))
        small_mc_var = net(samples).var(dim=0)
        # gnm.net.adf.gaussian returns two values, which every other call in
        # this file unpacks as (mean, variance); only the second is needed.
        _, exp_var = gnm.net.adf.gaussian(tsl, mu, cov, independent=False)
        # NOTE(review): error() returns inf when the sizes differ; confirm
        # that `exp_var` has the same size as `mc_var` for independent=False
        # (it may carry a batch dimension or be a full covariance matrix).

        small_mc_error = error(mc_var, small_mc_var)
        exp_error = error(mc_var, exp_var)

        # Positive when our expressions are closer to the reference.
        print(small_mc_error - exp_error)

## Gaussianity test of the output of each layer of neural networks

ADF is assuming that the output of each layer in the neural network is uncorrelated Gaussian.
The Gaussianity of data samples can be tested using either [hypothesis testing](https://link.springer.com/content/pdf/10.1007%2Fs00362-002-0119-6.pdf) (whether there is sufficient evidence that the data is Gaussian or not) or by estimating the PDF as a histogram and comparing it to the PDF of the best Gaussian fit. [This](https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Multivariate_normality_tests) is an example of a hypothesis test for a multivariate Gaussian, but it first assumes that the covariance matrix is full-rank. However, in our case, the covariance matrix is most probably rank deficient since the affine transformations are themselves rank deficient. Plus, after the ReLU, some units might actually be almost deterministically zero, which makes this assumption even more restrictive. However, in this case, we can work with each element and [test its Gaussianity](https://machinelearningmastery.com/a-gentle-introduction-to-normality-tests-in-python/) independently.

NOTE: Each affine after a ReLU manages, somehow, to return the distribution to Gaussian

In [4]:
def fit_normal_and_display(x):
    '''Plot the estimated PDF of the samples against its Gaussian fit.

    The statistics come from ``gnm.utils.stats.gaussian.fit(x)``; its
    ``'pdf'`` and ``'fit'`` entries are plotted over its ``'xs'`` entry and
    labelled with the ``'similarity'``, ``'mean'`` and ``'std'`` entries.

    Args:
        x: The data samples.
    '''
    plt.figure()
    stats = gnm.utils.stats.gaussian.fit(x)
    grid = stats['xs'].numpy()

    pdf_label = 'PDF fits ~{:.2f}%'.format(100 * stats['similarity'])
    plt.plot(grid, stats['pdf'].numpy(), 'c', label=pdf_label)

    fit_label = 'N({:.2f}, {:.2f}^2)'.format(stats['mean'], stats['std'])
    plt.plot(grid, stats['fit'].numpy(), 'g', label=fit_label)

    plt.legend()
    plt.show()
    
def display_layer_gaussianity(normality):
    '''Plot the histogram of the Gaussianity scores of the units of a layer.

    The scores are binned over [0, 1] and displayed on a 0-100 axis; the
    title reports the mean score as a percentage, the number of units and
    the number of bins.

    Args:
        normality: Tensor with one Gaussianity score per unit.
    '''
    num_bins = gnm.utils.stats.num_hist_bins(normality, min_bins=50)
    counts = normality.histc(num_bins, min=0, max=1)
    percentages = torch.linspace(0, 100, num_bins)

    plt.figure()
    plt.plot(percentages.numpy(), counts.numpy())
    plt.xlabel('Gaussianity fit')
    plt.ylabel('Number of units')
    title = 'Mean fit: {:.2f}% ({} units : {} bins)'.format(
        100 * normality.mean(), normality.numel(), num_bins)
    plt.title(title)
    plt.show()
    
def network_gaussianity(net, mu, sigma, count):
    '''Display the Gaussianity of the output of every layer of a network.

    Samples are drawn from a Gaussian around ``mu`` and pushed through the
    layers returned by ``gnm.net.Sequential.split_layers(net)``. After each
    layer (except ``gnm.utils.Flatten``), the layer is printed and the
    histogram of the Gaussianity scores of its outputs is displayed.

    Args:
        net: The network.
        mu: The input mean; the samples are reshaped to ``mu.shape[1:]``
            (presumably a batch with a single input - confirm).
        sigma: The input covariance. A tensor with more than one element is
            used as the covariance matrix (a 1D tensor is used as its
            diagonal); otherwise, a random matrix with norm ``sigma ** 2``
            is generated with ``gnm.utils.rand.definite``.
        count: Number of samples.
    '''
    is_covariance = torch.is_tensor(sigma) and sigma.numel() > 1
    if not is_covariance:
        sigma = gnm.utils.rand.definite(mu.numel(), norm=sigma ** 2,
                                        dtype=mu.dtype, device=mu.device)
    elif sigma.dim() == 1:
        sigma = sigma.diag()

    banner = ('Testing the Gaussianity of the output of each layer in {}\n'
              'with a Gaussian input that has a covariance matrix of norm {}\n'
              'using Monte-Carlo estimation with {} samples around the image\n')
    print(banner.format(type(net).__name__, sigma.norm(), count))

    flat_samples = gaussian(mu.view(-1), sigma).sample((count,))
    x = flat_samples.view(-1, *mu.shape[1:])
    for layer in gnm.net.Sequential.split_layers(net):
        x = layer(x)
        if isinstance(layer, gnm.utils.Flatten):
            continue

        # A layer that groups other layers is printed one member per line.
        members = layer.layers if hasattr(layer, 'layers') else (layer,)
        for member in members:
            print(member)

        outputs = x.view(x.size(0), -1)
        normality = gnm.utils.stats.gaussian.gaussianity(outputs, std_threshold=1e-5)
        normality[normality > 0.99] = 0  # remove deterministic (variance = 0)
#         fit_normal_and_display(outputs[:, normality.argmin()])
        display_layer_gaussianity(normality)

In [7]:
# network_gaussianity(lenet, get_image(mnist_train_loader, 1) + 127.5, sigma=64, count=1000)

In [None]:
# network_gaussianity(lenet, torch.randn(1, 1, 28, 28), sigma=0.3, count=1000)

In [11]:
# net = gnm.net.Sequential(
#     torch.nn.Linear(1, 2),
#     torch.nn.ReLU(inplace=True),
#     torch.nn.Linear(2, 4),
#     torch.nn.ReLU(inplace=True),
#     torch.nn.Linear(4, 15),
#     torch.nn.ReLU(inplace=True),
#     torch.nn.Linear(15, 50),
#     torch.nn.ReLU(inplace=True),
#     torch.nn.Linear(50, 500),
# ).double().eval()
# network_gaussianity(net, torch.randn(1, 1).double(), sigma=0.3, count=1000)

## Computing the output variance of affine for Gaussian input $\mathbf{x}\sim\mathcal{N}\left(\mathbf{\mu}, \mathbf{\Sigma} =\text{diag}\left(\mathbf{v}\right)\right)$

The variance of affine ($\mathbf{A}\mathbf{x}+\mathbf{b}$) is $\text{diag}\left(\mathbf{A}\mathbf{\Sigma}\mathbf{A}^\top\right) = \text{diag}\left(\mathbf{A}\text{diag}\left(\mathbf{v}\right)\mathbf{A}^\top\right) = (\mathbf{A}^2)\mathbf{v}$ (because $\mathbf{v} \geq \mathbf{0}$), where $\mathbf{A}^2$ denotes the element-wise square of $\mathbf{A}$

If $\mathbf{v} = \sigma^2\mathbf{1}$, the variance will be $\sigma^2\text{diag}\left(\mathbf{A}\mathbf{A}^\top\right) = \sigma^2\left(\mathbf{A}^2\right)\mathbf{1} = \sigma^2\ \text{sum_columns}\left(\mathbf{A}^2\right)$

In [2]:
import torch
import network_moments.torch.gaussian as gnm

class Net(gnm.net.Sequential):
    '''Small ReLU network with a closed-form output mean for Gaussian input.

    The network starts with either `Conv2d(3, 2, kernel_size=5)` followed by
    a `Flatten`, or `Linear(5, 72)`. Then comes a ReLU and `Linear(72, 10)`,
    optionally followed by two more ReLU + Linear pairs.

    Args:
        conv: Whether the first layer is a convolution (else it is linear).
        shallow: Whether to omit the two trailing ReLU + Linear pairs.
    '''

    def __init__(self, conv=True, shallow=False):
        if conv:
            part1 = [
                torch.nn.Conv2d(3, 2, kernel_size=5),
                gnm.utils.Flatten(),
            ]
        else:
            part1 = [torch.nn.Linear(5, 72)]
        part2 = [] if shallow else [
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(10, 2),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(2, 2),
        ]
        super().__init__(
            *part1,
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(72, 10),
            *part2,
        )

    def mean(self, mu, var):
        '''Compute the output mean of the network for gaussian input.

        The mean and standard deviation after the first (affine) layer are
        computed from `mu` and `var`, passed through `gnm.relu.mean` for the
        first ReLU and the result is forwarded through the remaining layers.

        Args:
            mu: Input mean (Batch, *Size).
            var: The input variance (Size) or a scalar.

        Returns:
            The output mean of the network.

        Raises:
            ValueError: If the layers following the first one are not the
                expected ones (ReLU after Linear; Flatten then ReLU
                after Conv2d).
            NotImplementedError: If the first layer is neither a Linear
                nor a Conv2d.
        '''
        layer = self[0]
        if not torch.is_tensor(var):
            var = torch.tensor(var, dtype=mu.dtype, device=mu.device)
        if isinstance(layer, torch.nn.Linear):
            if not isinstance(self[1], torch.nn.ReLU):
                raise ValueError('The second layer of the network must be a ReLU')
            layers = self[2:]
            w = layer.weight
            affine_mu = layer(mu)
            # Output variance for a diagonal input covariance: (w * w) v
            if var.numel() == 1:
                affine_std = ((w * w).sum(1) * var).sqrt().unsqueeze_(0)
            else:
                affine_std = (w * w).mv(var).sqrt().unsqueeze_(0)
        elif isinstance(layer, torch.nn.Conv2d):
            if not isinstance(self[1], gnm.utils.Flatten):
                raise ValueError('The second layer of the network must be a gnm.utils.Flatten')
            if not isinstance(self[2], torch.nn.ReLU):
                raise ValueError('The third layer of the network must be a ReLU')
            layers = self[3:]
            w = layer.weight
            affine_mu = self[1](layer(mu))
            if var.numel() == 1:
                var = var.repeat(1, *mu.shape[1:])
            else:
                var = var.view(1, *mu.shape[1:])
            # Same computation as the linear case: convolve the variance with
            # the squared weights (and no bias), using the layer's settings.
            affine_std = self[1](torch.nn.functional.conv2d(var, w**2,
                                                            stride=layer.stride,
                                                            padding=layer.padding,
                                                            dilation=layer.dilation,
                                                            groups=layer.groups).sqrt())
        else:
            msg = 'Don\'t know how to compute the moments for {}'
            # NotImplemented is a non-callable constant meant for binary
            # special methods; the exception to raise is NotImplementedError.
            raise NotImplementedError(msg.format(type(layer)))
        relu_mu = gnm.relu.mean(affine_mu, affine_std, std=True)
        return self.forward(relu_mu, layers=layers)

    @staticmethod
    def test_linear(shallow=True, n=7, sigma=10, count=100000, dtype=torch.float64):
        '''Compare `mean()` against Monte-Carlo for a linear first layer.

        For each of `n` random input means, prints the mean absolute ratio
        between the output of `mean()` and the Monte-Carlo estimate,
        rounded to one decimal.

        Args:
            shallow: Forwarded to the constructor of the network.
            n: Number of random input means to test.
            sigma: Scale of the random means; the random variances are
                scaled by `sigma**2`.
            count: Number of Monte-Carlo samples.
            dtype: The data type of the network and the input.
        '''
        net = Net(conv=False, shallow=shallow).to(dtype)
        if not isinstance(net[0], torch.nn.Linear):
            raise ValueError('The first layer of the network must be a torch.nn.Linear')
        print(net)
        mu = sigma * torch.randn(n, net[0].in_features, dtype=dtype)
        var = sigma**2 * torch.rand(mu.size(-1), dtype=dtype)
        # Zero-mean samples drawn once and shifted by every mean below.
        normal_samples = torch.distributions.MultivariateNormal(
            mu[0, ...] * 0, var.diag()).sample((count,))
        for i in range(mu.size(0)):
            with torch.no_grad():
                samples = normal_samples + mu[i, ...]
                out_mu = net.mean(mu[i:i+1, ...], var)
                mc_mu = net(samples).mean(0, keepdim=True)
            print(round((out_mu / mc_mu).abs().mean().item(), 1))

    @staticmethod
    def test_conv2d(shallow=True, n=7, sigma=10, count=100000, dtype=torch.float64):
        '''Compare `mean()` against Monte-Carlo for a convolutional first layer.

        For each of `n` random 10x10 input means, prints the mean absolute
        ratio between the output of `mean()` and the Monte-Carlo estimate,
        rounded to one decimal.

        Args:
            shallow: Forwarded to the constructor of the network.
            n: Number of random input means to test.
            sigma: Scale of the random means; the random variances are
                scaled by `sigma**2`.
            count: Number of Monte-Carlo samples.
            dtype: The data type of the network and the input.
        '''
        net = Net(conv=True, shallow=shallow).to(dtype)
        if not isinstance(net[0], torch.nn.Conv2d):
            raise ValueError('The first layer of the network must be a torch.nn.Conv2d')
        print(net)
        mu = sigma * torch.randn(n, net[0].in_channels, 10, 10, dtype=dtype)
        var = sigma**2 * torch.rand(*mu.shape[1:], dtype=dtype)
        # Zero-mean samples drawn once and shifted by every mean below.
        normal_samples = torch.distributions.MultivariateNormal(
            mu[0, ...].view(-1) * 0, var.view(-1).diag()).sample((count,)).view(count, *mu[0].size())
        for i in range(mu.size(0)):
            with torch.no_grad():
                samples = normal_samples + mu[i, ...]
                out_mu = net.mean(mu[i:i+1, ...], var)
                mc_mu = net(samples).mean(0, keepdim=True)
            print(round((out_mu / mc_mu).abs().mean().item(), 1))

    def loss(self, x, gamma=0.4):
        '''Sum of the network output plus a weighted sum of its moments.

        Args:
            x: The input of the network.
            gamma: The weight of the moments term.

        Returns:
            `forward(x).sum() + gamma * mu.sum()` where `mu` is computed by
            `self.moments` with an input variance of 0.3.
        '''
        # NOTE(review): this class does not define `moments` (only `mean`);
        # confirm that the base class provides it or that `mean` was meant.
        mu = self.moments(x, torch.tensor(0.3, dtype=x.dtype, device=x.device))
        return self.forward(x).sum() + gamma * mu.sum()

# Net.test_linear(shallow=True)
# Net.test_linear(shallow=False)
# Net.test_conv2d(shallow=True)
# Net.test_conv2d(shallow=False)