In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
import sys
import torch as t
from torch import nn, Tensor
from torch.distributions.categorical import Categorical
from torch.nn import functional as F
from dataclasses import dataclass
import numpy as np
import einops
from jaxtyping import Float, Int
from typing import Optional, Callable, Union, List, Tuple, Dict
from functools import partial
from tqdm.notebook import tqdm
from dataclasses import dataclass
from rich import print as rprint
from rich.table import Table
from IPython.display import display, HTML
from pathlib import Path
from scipy.optimize import fsolve
#import matplotlib.pyplot as plt

# Put the parent of the current working directory on the import path
# (presumably where the project modules imported below live -- confirm).
sys.path.append(os.fspath(Path.cwd().parent))

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = t.device("cuda") if t.cuda.is_available() else t.device("cpu")

from superposition_lib import Config, Model




In [2]:
# Show which device was selected in the setup cell.
device

device(type='cuda')

In [2]:
# Experiment 1: sweep the feature probability across instances while keeping
# the grouping parameters (group size, semantic distance, p_transfer) fixed.
n_features = 80
n_hidden = 20

inner_goup_distance = 1.
group_size = 5
p_transfer = 0.9

# 1.0 ** i equals 1 for every i, so every feature gets the same importance.
# The rearrange adds a leading unit axis: (n_features,) -> (1, n_features).
importance = (1.0 ** t.arange(n_features))
importance = einops.rearrange(importance, "features -> () features")

# One instance per feature probability, evenly spaced over [0.01, 0.9].
# The rearrange adds a trailing unit axis: (n_instances,) -> (n_instances, 1).
n_instances = 50
feature_probability = t.linspace(0.01, 0.9, n_instances)
feature_probability = einops.rearrange(feature_probability, "instances -> instances ()")

# Every instance uses the same partition of the features into consecutive
# groups of `group_size` members with identical grouping parameters.
groupings = [
    [
        dict(
            members=list(range(start, start + group_size)),
            semantic_distance=inner_goup_distance,
            p_transfer=p_transfer,
        )
        for start in range(0, n_features, group_size)
    ]
    for _ in range(n_instances)
]

cfg = Config(
    # Pass n_instances directly: len(feature_probability.squeeze()) raises
    # TypeError when there is a single instance, because squeeze() then
    # returns a 0-d tensor.
    n_instances = n_instances,
    n_features = n_features,
    n_hidden = n_hidden,
)

model = Model(
    cfg = cfg,
    device = device,
    importance = importance,
    feature_probability = feature_probability,
    groupings = groupings,
)
model.optimize(steps=10_000)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [7]:
# Persist the experiment 1 artefacts under data/experiment_1 (created on demand).
os.makedirs("data/experiment_1", exist_ok=True)

# Each entry pairs an object with its destination file: the trained model
# weights, the swept feature probabilities, and the feature groupings.
for _payload, _destination in (
    (model.state_dict(), "data/experiment_1/model.pth"),
    (feature_probability, "data/experiment_1/feature_probability.pth"),
    (groupings, "data/experiment_1/groupings.pth"),
):
    t.save(_payload, _destination)

In [8]:
# Experiment 2: sweep p_transfer across instances while keeping the feature
# probability (0.4) and the semantic distance within groups fixed.
n_features = 80
n_hidden = 20

inner_goup_distance = 1.
group_size = 5

# 1.0 ** i equals 1 for every i, so every feature gets the same importance.
# The rearrange adds a leading unit axis: (n_features,) -> (1, n_features).
importance = (1.0 ** t.arange(n_features))
# Disabled alternative importance schedule, kept for reference:
#importance = t.cat([t.tensor([1-0.1*(i+1)/2 for i in range(group_size)])**(j+1) for j in  range(0, n_features, group_size)])
importance = einops.rearrange(importance, "features -> () features")

# One instance per p_transfer value, evenly spaced over [0.01, 0.9].
n_instances = 25
p_transfer = t.linspace(0.01, 0.9, n_instances)

# Same feature probability for every instance.
# The rearrange adds a trailing unit axis: (n_instances,) -> (n_instances, 1).
feature_probability = t.ones(n_instances)*0.4
feature_probability = einops.rearrange(feature_probability, "instances -> instances ()")

# One list of groups per instance; the groups partition the features into
# consecutive runs of `group_size` and differ between instances only in p_transfer.
groupings = [
    [
        dict(
            members=list(range(start, start + group_size)),
            semantic_distance=inner_goup_distance,
            p_transfer=p_t,
        )
        for start in range(0, n_features, group_size)
    ]
    for p_t in p_transfer
]

cfg = Config(
    # Pass n_instances directly: len(feature_probability.squeeze()) raises
    # TypeError when there is a single instance, because squeeze() then
    # returns a 0-d tensor.
    n_instances = n_instances,
    n_features = n_features,
    n_hidden = n_hidden,
)

model = Model(
    cfg = cfg,
    device = device,
    importance = importance,
    feature_probability = feature_probability,
    groupings = groupings,
)
model.optimize(steps=10_000)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [9]:
# Persist the experiment 2 artefacts under data/experiment_2 (created on demand).
os.makedirs("data/experiment_2", exist_ok=True)

# Each entry pairs an object with its destination file: the trained model
# weights, the swept p_transfer values, and the feature groupings.
for _payload, _destination in (
    (model.state_dict(), "data/experiment_2/model.pth"),
    (p_transfer, "data/experiment_2/p_transfer.pth"),
    (groupings, "data/experiment_2/groupings.pth"),
):
    t.save(_payload, _destination)

In [2]:
# Experiment 3: sweep the semantic distance within groups across instances
# while keeping the feature probability (0.4) and p_transfer (0.2) fixed.
n_features = 80
n_hidden = 20
group_size = 5

# 1.0 ** i equals 1 for every i, so every feature gets the same importance.
# The rearrange adds a leading unit axis: (n_features,) -> (1, n_features).
importance = (1.0 ** t.arange(n_features))
# Disabled alternative importance schedule, kept for reference:
#importance = t.cat([t.tensor([1-0.1*(i+1)/2 for i in range(group_size)])**(j+1) for j in  range(0, n_features, group_size)])
importance = einops.rearrange(importance, "features -> () features")
p_transfer = 0.2

# One instance per semantic distance, log-spaced from 1e-4 to 1e0.
n_instances = 25
inner_goup_distance = t.logspace(-4, 0, n_instances)

# Same feature probability for every instance.
# The rearrange adds a trailing unit axis: (n_instances,) -> (n_instances, 1).
feature_probability = t.ones(n_instances)*0.4
feature_probability = einops.rearrange(feature_probability, "instances -> instances ()")

# One list of groups per instance; the groups partition the features into
# consecutive runs of `group_size` and differ between instances only in
# semantic_distance.
groupings = [
    [
        dict(
            members=list(range(start, start + group_size)),
            semantic_distance=igd,
            p_transfer=p_transfer,
        )
        for start in range(0, n_features, group_size)
    ]
    for igd in inner_goup_distance
]

cfg = Config(
    # Pass n_instances directly: len(feature_probability.squeeze()) raises
    # TypeError when there is a single instance, because squeeze() then
    # returns a 0-d tensor.
    n_instances = n_instances,
    n_features = n_features,
    n_hidden = n_hidden,
)

model = Model(
    cfg = cfg,
    device = device,
    importance = importance,
    feature_probability = feature_probability,
    groupings = groupings,
)
model.optimize(steps=10_000)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [3]:
# Move the model to the CPU; the next cell saves its state_dict.
model.to("cpu")

Model()

In [4]:
# Persist the experiment 3 artefacts under data/experiment_3 (created on demand).
os.makedirs("data/experiment_3", exist_ok=True)

# Each entry pairs an object with its destination file: the trained model
# weights, the swept semantic distances, and the feature groupings.
for _payload, _destination in (
    (model.state_dict(), "data/experiment_3/model.pth"),
    (inner_goup_distance, "data/experiment_3/inner_goup_distance.pth"),
    (groupings, "data/experiment_3/groupings.pth"),
):
    t.save(_payload, _destination)

In [14]:
# Rebuild the semantic-distance grid: 25 log-spaced values from 1e-2 to 1e1.
# NOTE(review): this range differs from the logspace(-4, 0, ...) grid used in
# the experiment 3 training cell -- confirm which one is intended.
n_instances = 25
inner_goup_distance = t.logspace(start=-2, end=1, steps=n_instances)

In [8]:
# Plot the learned weights `model.W`, with one subplot title per entry of
# `p_transfer` (presumably one subplot per instance -- confirm against
# plot_features_in_Nd).
# NOTE(review): `plot_features_in_Nd` is not imported in any visible cell;
# on a fresh kernel this call would raise NameError unless it is imported
# elsewhere.
# NOTE(review): the subplot titles iterate `p_transfer`, which the
# experiment 3 cell sets to the scalar 0.2 (not iterable). This cell appears
# to expect the experiment 2 state, where `p_transfer` is a tensor -- confirm.
# NOTE(review): the title states I_i = 0.9^i, but the visible training cells
# set importance to 1.0 ** i -- confirm which is intended.
plot_features_in_Nd(
    model.W,
    height = 600,
    width = 1400,
    title = "ReLU output model: n_features = 80, d_hidden = 20, I<sub>i</sub> = 0.9<sup>i</sup>",
    subplot_titles = [f"p<sub>transfer</sub> = {p_t}" for p_t in p_transfer],
)

tensor([1.0000e-02, 1.3335e-02, 1.7783e-02, 2.3714e-02, 3.1623e-02, 4.2170e-02,
        5.6234e-02, 7.4989e-02, 1.0000e-01, 1.3335e-01, 1.7783e-01, 2.3714e-01,
        3.1623e-01, 4.2170e-01, 5.6234e-01, 7.4989e-01, 1.0000e+00, 1.3335e+00,
        1.7783e+00, 2.3714e+00, 3.1623e+00, 4.2170e+00, 5.6234e+00, 7.4989e+00,
        1.0000e+01])