In [1]:
# In linear networks, PDLT says that covariance for activation at some neuron
# will be $G^{(l)}_2 = C_W^l 1/n_0 \sum_{j=1}^{n_0} x_{j}^2
#
# The intralayer interaction of the magnitudes then is
# E[(z_j z_j - G_2)(z_k z_k - G_2)] = G_4 - G_2^2 = 2(l-1)/n G_2^2

# TODO: Vary l and n to see around when the binomial approximation really breaks down

In [38]:
from collections import defaultdict

import numpy as np
import torch

In [75]:
# So we'll randomly initialize an input vector
# Then randomly initialize the network 1_000 times and store the activation at each
# layer. Then we'll compute the covariances of these activations

input_dim = 10
# Random input
input = torch.randn((1, input_dim))

attempts = 1_000

activations_per_layer = defaultdict(list)

In [76]:
n = 1000
l = 3
C_W = 1

for attempt in range(attempts):
    if attempt % 10 == 0:
        print(attempt)
    x = input
    layers = []
    for i in range(l):
        first_dim = input_dim if i == 0 else n
        layer = torch.nn.Linear(first_dim, n)
        # initialize layer
        torch.nn.init.normal_(layer.weight, std=(C_W / first_dim) ** 0.5)
        torch.nn.init.zeros_(layer.bias)
        layers.append(layer)

    model = torch.nn.Sequential(*layers)
    # Forward pass and compute covariance (passing layer by layer to avoid using hooks)
    for i, layer in enumerate(model):
        x = layer(x)
        activations_per_layer[i].append(x.detach().numpy().flatten())

In [77]:
len(activations_per_layer), len(activations_per_layer[0])

(3, 1000)

In [78]:
# Covariance of activations per layer
covariances = []
for layer, activations in activations_per_layer.items():
    arr = np.array(activations)
    assert arr.shape == (attempts, n)  # cols are different variables, rows are observatiosn
    cov = np.cov(arr, rowvar=False)
    covariances.append(cov)

In [95]:
# Off-diagonal ones should be 0
for layer, cov in enumerate(covariances):
    maximum_deviation = 0
    for i in range(n):
        for j in range(n):
            if i != j:
                var = cov[i][j]
                if var > maximum_deviation:
                    maximum_deviation = var
    print(f"{layer=} {maximum_deviation=}")

layer=0 maximum_deviation=0.0610354025876653
layer=1 maximum_deviation=0.059889834126678164
layer=2 maximum_deviation=0.05731890901522189


In [90]:
# What's inner product of input?
inner_product = sum(component**2 for component in input.numpy().flatten()) / len(
    input.numpy().flatten()
)
inner_product

0.39986245322023706

In [96]:
# Diagonal entries should scale with C_W
for layer, cov in enumerate(covariances):
    maximum_deviation = 0
    for i in range(n):
        abs_deviation = abs(cov[i][i] - C_W**layer * inner_product)
        if abs_deviation > maximum_deviation:
            maximum_deviation = abs_deviation
    print(f"{layer=} {maximum_deviation=}")

layer=0 maximum_deviation=0.06427837598920177
layer=1 maximum_deviation=0.059946861309742694
layer=2 maximum_deviation=0.06606357322730416


In [122]:
# Now let's try the fourth cumulant
# For every pair of activations per layer, subtract the covariance of each activation
# and get product of deviations
fourth_cumulant = []
for layer in range(l):
    print(f"{layer=}")
    cov = covariances[layer]
    fourth_cumulant.append([])
    # Let's just use the first attempt
    activations = activations_per_layer[layer][0]
    # Choose two disjoint neurons
    maximum_ = 0
    i_, j_ = (0, 0)
    for i in range(n):
        fourth_cumulant[layer].append([])
        for j in range(n):
            deviation_i = activations[i] ** 2 - cov[i][i]
            deviation_j = activations[j] ** 2 - cov[j][j]
            prod = deviation_i * deviation_j
            if prod > maximum_ and i != j:
                maximum_ = prod
                i_, j_ = i, j

            fourth_cumulant[layer][i].append(prod)
    print(maximum_, i_, j_)

    fourth_cumulant[layer] = np.array(fourth_cumulant[layer])

layer=0
38.866870436205375 115 538
layer=1
21.8250891684642 125 766
layer=2
12.861546160345714 4 842


In [123]:
fourth_cumulant[0].shape

(1000, 1000)

In [125]:
# Off-diagonal ones should be 2(l-1)/n * G_2^2
for layer in range(l):
    print(f"{layer=}")
    cov = covariances[layer]
    deviations_products = fourth_cumulant[layer]

    expected_correlation = 2 * (layer - 1) / n * (C_W**layer * inner_product) ** 2

    total_deviations_sum = 0
    for i in range(n):
        for j in range(n):
            if i != j:
                total_deviations_sum += deviations_products[i][j]
    mean_deviation = total_deviations_sum / (n**2 - n)
    print(f"{layer=} {mean_deviation=} {expected_correlation}")

layer=0
layer=0 mean_deviation=-0.0003021667286407805 -0.00031977996299061256
layer=1
layer=1 mean_deviation=7.721738728385593e-05 0.0
layer=2
layer=2 mean_deviation=0.00023255435845350234 0.00031977996299061256
