In [1]:
# !pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121
# !pip install tqdm
# !pip install matplotlib 
# !pip install plotly
# !pip install pandas
# !pip install wandb
!pip install einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


In [2]:
# adapted from https://colab.research.google.com/drive/1F6_1_cWXE5M7WocUcpQWp3v8z4b1jL20 (https://arxiv.org/abs/2301.05217), thanks!

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
import tqdm

import random
import time

from pathlib import Path
import pickle
import os
import sys

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "colab"
import plotly.graph_objects as go

from torch.utils.data import DataLoader

from functools import *
import pandas as pd
import gc

# import comet_ml
import wandb
import itertools

# Root directory under which models, configs and embeddings are written.
root_path = '/kaggle/working'
# from analysis.utils import extract_embeddings

# SECURITY: never hard-code the W&B API key in the notebook. The key is read from
# the WANDB_API_KEY environment variable (e.g. exported from a Kaggle secret before
# this cell runs). When the variable is unset, key=None lets wandb fall back to its
# own credential lookup / interactive prompt.
# NOTE(review): the key that used to be committed here should be revoked.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

class HookPoint(nn.Module):
    """Identity module marking a place in the network where hooks can be attached.

    ``forward`` returns its input unchanged; the module exists only so that
    forward/backward hooks can observe the tensor flowing through it. Handles of
    registered hooks are kept so they can be removed again.
    """
    def __init__(self):
        super().__init__()
        self.fwd_hooks = []
        self.bwd_hooks = []
        # Default name so that a hook firing before give_name() was called receives
        # name=None instead of failing with AttributeError.
        self.name = None
    def give_name(self, name):
        """Set the name that is passed to every hook as the ``name`` keyword."""
        self.name = name
    def add_hook(self, hook, dir='fwd'):
        """Register ``hook(tensor, name=...)`` in direction ``dir`` ('fwd' or 'bwd').

        Raises:
            ValueError: if ``dir`` is neither 'fwd' nor 'bwd'.
        """
        def full_hook(module, module_input, module_output):
            # Adapt the torch hook signature to the simpler (tensor, name) form.
            return hook(module_output, name=self.name)
        if dir=='fwd':
            handle = self.register_forward_hook(full_hook)
            self.fwd_hooks.append(handle)
        elif dir=='bwd':
            # NOTE(review): register_backward_hook is deprecated in recent PyTorch in
            # favour of register_full_backward_hook; kept as-is to preserve behaviour.
            handle = self.register_backward_hook(full_hook)
            self.bwd_hooks.append(handle)
        else:
            raise ValueError(f"Invalid direction {dir}")
    def remove_hooks(self, dir='fwd'):
        """Remove the hooks registered for ``dir`` ('fwd', 'bwd' or 'both').

        Raises:
            ValueError: if ``dir`` is not one of the three accepted values.
        """
        if (dir=='fwd') or (dir=='both'):
            for hook in self.fwd_hooks:
                hook.remove()
            self.fwd_hooks = []
        if (dir=='bwd') or (dir=='both'):
            for hook in self.bwd_hooks:
                hook.remove()
            self.bwd_hooks = []
        if dir not in ['fwd', 'bwd', 'both']:
            raise ValueError(f"Invalid direction {dir}")
    def forward(self, x):
        return x

class Embed(nn.Module):
    """Token embedding stored as a ``(d_model, d_vocab)`` parameter ``W_E``."""
    def __init__(self, d_vocab, d_model):
        super().__init__()
        init_scale = np.sqrt(d_model)
        self.W_E = nn.Parameter(torch.randn(d_model, d_vocab) / init_scale)
    def forward(self, x):
        # Select one column of W_E per token id, then move the model axis last:
        # (d_model, batch, pos) -> (batch, pos, d_model).
        selected_columns = self.W_E[:, x]
        return torch.einsum('dbp -> bpd', selected_columns)

class Unembed(nn.Module):
    """Projection to vocabulary logits through a ``(d_model, d_vocab)`` parameter ``W_U``."""
    def __init__(self, d_vocab, d_model):
        super().__init__()
        init_scale = np.sqrt(d_vocab)
        self.W_U = nn.Parameter(torch.randn(d_model, d_vocab) / init_scale)
    def forward(self, x):
        # Contract the last axis of x with the d_model axis of W_U.
        logits = torch.matmul(x, self.W_U)
        return logits

# Positional Embeddings
class PosEmbed(nn.Module):
    """Learned positional embedding ``W_pos`` of shape ``(max_ctx, d_model)`` added to the input."""
    def __init__(self, max_ctx, d_model):
        super().__init__()
        init_scale = np.sqrt(d_model)
        self.W_pos = nn.Parameter(torch.randn(max_ctx, d_model) / init_scale)
    def forward(self, x):
        # The second-to-last axis of x is the position axis; only that many rows are added.
        n_positions = x.shape[-2]
        return x + self.W_pos[:n_positions]

# Attention
class Attention(nn.Module):
    """Multi-head attention whose attention matrix is blended with a constant.

    The matrix applied to the values is ``softmax(scores) * attn_coeff + (1 - attn_coeff)``,
    so ``attn_coeff == 1`` is ordinary softmax attention and ``attn_coeff == 0`` gives
    every position weight 1. A lower-triangular ``mask`` buffer is registered but is
    not applied to the scores in ``forward``.
    """
    def __init__(self, d_model, num_heads, d_head, n_ctx, attn_coeff):
        super().__init__()
        init_scale = np.sqrt(d_model)
        # Creation order K, Q, V, O is kept so the random-initialisation stream is unchanged.
        self.W_K = nn.Parameter(torch.randn(num_heads, d_head, d_model)/init_scale)
        self.W_Q = nn.Parameter(torch.randn(num_heads, d_head, d_model)/init_scale)
        self.W_V = nn.Parameter(torch.randn(num_heads, d_head, d_model)/init_scale)
        self.W_O = nn.Parameter(torch.randn(d_model, d_head * num_heads)/init_scale)
        self.attn_coeff = attn_coeff
        lower_triangular = torch.tril(torch.ones((n_ctx, n_ctx)))
        self.register_buffer('mask', lower_triangular)
        self.d_head = d_head
        # Hook points are registered in the same order as before (k, q, v, z, attn, attn_pre).
        for hook_name in ('hook_k', 'hook_q', 'hook_v', 'hook_z', 'hook_attn', 'hook_attn_pre'):
            setattr(self, hook_name, HookPoint())

    def forward(self, x):
        def per_head(weight):
            # (head, d_head, d_model) x (batch, pos, d_model) -> (batch, head, pos, d_head)
            return torch.einsum('ihd,bpd->biph', weight, x)

        k = self.hook_k(per_head(self.W_K))
        q = self.hook_q(per_head(self.W_Q))
        v = self.hook_v(per_head(self.W_V))
        # scores[b, i, q, p]: query position q against key position p; no mask is applied.
        scores = torch.einsum('biph,biqh->biqp', k, q)
        scaled = self.hook_attn_pre(scores/np.sqrt(self.d_head))
        probs = F.softmax(scaled, dim=-1)
        coeff = self.attn_coeff
        attn_matrix = self.hook_attn(probs*coeff+(1-coeff))
        z = self.hook_z(torch.einsum('biph,biqp->biqh', v, attn_matrix))
        # Concatenate the heads along the feature axis before the output projection.
        merged_heads = einops.rearrange(z, 'b i q h -> b q (i h)')
        return torch.einsum('df,bqf->bqd', self.W_O, merged_heads)

# +
class MLP(nn.Module):
    """Two-layer feed-forward block: d_model -> d_mlp -> activation -> d_model.

    ``act_type`` must be one of 'ReLU', 'GeLU' or 'Tanh' (checked at the end of
    ``__init__``). ``hook_pre`` sees the pre-activation and ``hook_post`` the
    post-activation tensor.
    """
    def __init__(self, d_model, d_mlp, act_type):
        super().__init__()
        self.W_in = nn.Parameter(torch.randn(d_mlp, d_model) / np.sqrt(d_mlp))
        self.b_in = nn.Parameter(torch.zeros(d_mlp))
        self.W_out = nn.Parameter(torch.randn(d_model, d_mlp) / np.sqrt(d_model))
        self.b_out = nn.Parameter(torch.zeros(d_model))
        self.act_type = act_type
        # self.ln = LayerNorm(d_mlp, model=self.model)
        self.hook_pre = HookPoint()
        self.hook_post = HookPoint()
        assert act_type in ['ReLU', 'GeLU', 'Tanh']

    def forward(self, x):
        pre_activation = self.hook_pre(torch.einsum('md,bpd->bpm', self.W_in, x) + self.b_in)
        # Look the activation up by name; an unrecognised name leaves the tensor unchanged.
        activation = {'ReLU': F.relu, 'GeLU': F.gelu, 'Tanh': F.tanh}.get(self.act_type)
        if activation is None:
            post_activation = pre_activation
        else:
            post_activation = activation(pre_activation)
        post_activation = self.hook_post(post_activation)
        return torch.einsum('dm,bpm->bpd', self.W_out, post_activation) + self.b_out

class MyLinear(nn.Module):
    """Square linear layer (d_model -> d_model) followed by an activation.

    ``act_type`` must be one of 'ReLU', 'GeLU' or 'Tanh' (checked at the end of
    ``__init__``). ``hook_pre`` sees the layer input and ``hook_post`` the activated output.
    """
    def __init__(self, d_model, act_type):
        super().__init__()
        self.W_in = nn.Parameter(torch.randn(d_model, d_model) / np.sqrt(d_model))
        self.b_in = nn.Parameter(torch.zeros(d_model))
        self.act_type = act_type
        self.hook_pre = HookPoint()
        self.hook_post = HookPoint()
        assert act_type in ['ReLU', 'GeLU', 'Tanh']

    def forward(self, x):
        layer_input = self.hook_pre(x)
        projected = torch.einsum('md,bpd->bpm', self.W_in, layer_input) + self.b_in
        # Look the activation up by name; an unrecognised name leaves the tensor unchanged.
        activation = {'ReLU': F.relu, 'GeLU': F.gelu, 'Tanh': F.tanh}.get(self.act_type)
        activated = projected if activation is None else activation(projected)
        return self.hook_post(activated)
        
# Transformer Block
class TransformerBlock(nn.Module):
    """One residual block: attention followed by an MLP of width ``4 * d_model``.

    No normalisation layer is applied. Hook points expose the residual stream
    before attention, between the two sub-layers and after the MLP, as well as
    each sub-layer's output.
    """
    def __init__(self, d_model, d_head, num_heads, n_ctx, act_type, attn_coeff):
        super().__init__()
        self.attn = Attention(d_model, num_heads, d_head, n_ctx, attn_coeff=attn_coeff)
        mlp_width = d_model*4
        self.mlp = MLP(d_model, mlp_width, act_type)
        self.hook_attn_out = HookPoint()
        self.hook_mlp_out = HookPoint()
        self.hook_resid_pre = HookPoint()
        self.hook_resid_mid = HookPoint()
        self.hook_resid_post = HookPoint()

    def forward(self, x):
        # Attention sub-layer with residual connection.
        attn_input = self.hook_resid_pre(x)
        attn_out = self.hook_attn_out(self.attn(attn_input))
        resid_mid = self.hook_resid_mid(x + attn_out)
        # MLP sub-layer with residual connection.
        mlp_out = self.hook_mlp_out(self.mlp(resid_mid))
        resid_post = self.hook_resid_post(resid_mid + mlp_out)
        return resid_post


# -

# Full transformer
class Transformer(nn.Module): # Model B
    """Embedding + positional embedding + a stack of TransformerBlocks + unembedding.

    ``use_ln``, ``use_cache`` and ``cache`` are stored on the instance but are not
    read by any method of this class. Every HookPoint in the module tree is given
    its dotted module path as name at construction time.
    """
    def __init__(self, num_layers, d_vocab, d_model, d_head, num_heads, n_ctx, act_type, attn_coeff, use_cache=False, use_ln=True):
        super().__init__()
        assert 0<=attn_coeff<=1
        print('parameters', num_layers, d_vocab, d_model, d_head, num_heads, n_ctx, act_type, attn_coeff, use_cache, use_ln)
        self.cache = {}
        self.use_cache = use_cache

        self.embed = Embed(d_vocab, d_model)
        self.pos_embed = PosEmbed(n_ctx, d_model)
        self.unembed = Unembed(d_vocab, d_model)
        self.use_ln = use_ln
        layers = []
        for _ in range(num_layers):
            layers.append(TransformerBlock(d_model, d_head, num_heads, n_ctx, act_type, attn_coeff))
        self.blocks = nn.ModuleList(layers)

        for module_path, submodule in self.named_modules():
            if type(submodule) is HookPoint:
                submodule.give_name(module_path)

    def forward(self, x):
        hidden = self.pos_embed(self.embed(x))
        for block in self.blocks:
            hidden = block(hidden)
        return self.unembed(hidden)

    def set_use_cache(self, use_cache):
        self.use_cache = use_cache

    def hook_points(self):
        """Return every sub-module whose dotted name contains 'hook'."""
        found = []
        for module_path, submodule in self.named_modules():
            if 'hook' in module_path:
                found.append(submodule)
        return found

    def remove_all_hooks(self):
        """Remove all forward and backward hooks from every hook point."""
        for hook_point in self.hook_points():
            for direction in ('fwd', 'bwd'):
                hook_point.remove_hooks(direction)

    def cache_all(self, cache, incl_bwd=False):
        """Attach hooks that store detached activations (and optionally gradients) in ``cache``.

        Forward values are stored under the hook point's name, backward values
        under ``name + '_grad'``.
        """
        def store_activation(tensor, name):
            cache[name] = tensor.detach()
        def store_gradient(tensor, name):
            cache[name+'_grad'] = tensor[0].detach()
        for hook_point in self.hook_points():
            hook_point.add_hook(store_activation, 'fwd')
            if incl_bwd:
                hook_point.add_hook(store_gradient, 'bwd')

    def parameters_norm(self):
        """Return the L2 norm over all parameters as a Python float."""
        squared_total = sum(torch.sum(p*p).item() for p in self.parameters())
        return squared_total**0.5

    def l2_norm(self):
        """Return the sum of squared parameters (the squared L2 norm, no square root)."""
        return sum(torch.sum(p*p) for p in self.parameters())

    def parameters_flattened(self):
        """Return all parameters concatenated into one 1-D numpy array."""
        flat_parts = [p.view(-1) for p in self.parameters()]
        return torch.cat(flat_parts).detach().cpu().numpy()


class Linearformer(nn.Module): # Model A???
    """Attention-free variant: token embeddings are concatenated and passed through MyLinear layers.

    Each token is embedded into ``d_model // n_ctx`` dimensions. ``forward`` flattens
    the per-position embeddings into a single position, zero-pads the result by
    ``d_model % n_ctx`` and feeds it through ``num_layers`` MyLinear blocks before
    unembedding. ``d_head``, ``num_heads``, ``attn_coeff`` and ``use_ln`` are accepted
    for signature parity with ``Transformer``; apart from being printed/stored they
    are not used here.
    """
    def __init__(self, num_layers, d_vocab, d_model, d_head, num_heads, n_ctx, act_type, attn_coeff, use_cache=False, use_ln=True):
        super().__init__()
        print('parameters(L)', num_layers, d_vocab, d_model, d_head, num_heads, n_ctx, act_type, attn_coeff, use_cache, use_ln)
        self.cache = {}
        self.use_cache = use_cache
        self.attn_coeff = attn_coeff
        # Kept on the instance so forward() checks against this model's own width
        # instead of whatever global named d_model happens to exist in the notebook.
        self.d_model = d_model

        self.embed = Embed(d_vocab, d_model//n_ctx)
        # pos embed is being commented in original code
        self.pos_embed = PosEmbed(n_ctx, d_model)
        self.unembed = Unembed(d_vocab, d_model)
        self.use_ln = use_ln
        self.blocks = nn.ModuleList([MyLinear(d_model, act_type) for i in range(num_layers)])
        # Right-pads the feature axis with zeros to make up for the integer division above.
        self.padder = nn.ConstantPad1d((0,d_model%n_ctx),0)

        for name, module in self.named_modules():
            if isinstance(module, HookPoint):
                module.give_name(name)
    
    def forward(self, x):
        x = self.embed(x)
        # (batch, pos, d) -> (batch, 1, pos*d): concatenate the per-position embeddings.
        x = x.reshape(-1,1,x.shape[1]*x.shape[2])
        x = self.padder(x)
        assert len(x.shape)==3 and x.shape[1:]==(1,self.d_model), \
            f"expected (batch, 1, {self.d_model}), got {tuple(x.shape)}"
        x = self.pos_embed(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.unembed(x)
        return x

    def set_use_cache(self, use_cache):
        self.use_cache = use_cache
    
    def hook_points(self):
        """Return every sub-module whose dotted name contains 'hook'."""
        return [module for name, module in self.named_modules() if 'hook' in name]

    def remove_all_hooks(self):
        """Remove all forward and backward hooks from every hook point."""
        for hp in self.hook_points():
            hp.remove_hooks('fwd')
            hp.remove_hooks('bwd')
    
    def cache_all(self, cache, incl_bwd=False):
        """Attach hooks that store detached activations (and optionally gradients) in ``cache``."""
        def save_hook(tensor, name):
            cache[name] = tensor.detach()
        def save_hook_back(tensor, name):
            cache[name+'_grad'] = tensor[0].detach()
        for hp in self.hook_points():
            hp.add_hook(save_hook, 'fwd')
            if incl_bwd:
                hp.add_hook(save_hook_back, 'bwd')
    
    def parameters_norm(self):
        """Return the L2 norm over all parameters as a Python float."""
        return sum([torch.sum(p*p).item() for p in self.parameters()])**0.5
    
    def l2_norm(self):
        """Return the sum of squared parameters (the squared L2 norm, no square root)."""
        return sum([torch.sum(p*p) for p in self.parameters()])
    
    def parameters_flattened(self):
        """Return all parameters concatenated into one 1-D numpy array."""
        return torch.cat([p.view(-1) for p in self.parameters()]).detach().cpu().numpy()

# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# DEVICE='cuda:'+str(random.randint(0,1))
print(DEVICE)
class MyAddDataSet(torch.utils.data.Dataset):
    """Exhaustive dataset of all ``C * C`` operand pairs and their ``func`` value.

    Args:
        func: callable receiving the operand tuple produced by ``np.unravel_index``
            and returning the integer label.
        C: number of distinct values per operand.
        diff_vocab: if truthy, the second operand is shifted by ``C`` so both
            operands use disjoint token ids (vocabulary size doubles).
        eqn_sign: if truthy, an extra token (id ``vocab - 1``) is appended to every
            input and the sequence length ``dim`` grows by one.

    ``inputs`` and ``outputs`` are stored as long tensors on ``DEVICE``.
    """
    def __init__(self, func, C, diff_vocab=False, eqn_sign=False):
        self.func = func
        n_operands = 2
        self.dim = n_operands
        self.C = C
        self.inputs = []
        self.outputs = []
        self.vocab = C
        if diff_vocab:
            self.vocab *= 2
        if eqn_sign:
            self.vocab += 1
            self.dim += 1
        self.vocab_out = 0
        grid_shape = (C,) * n_operands
        for flat_index in range(C ** n_operands):
            operands = np.unravel_index(flat_index, grid_shape)
            label = self.func(operands)
            first, second = operands[0], operands[1]
            tokens = [first, second + C] if diff_vocab else [first, second]
            if eqn_sign:
                tokens.append(self.vocab - 1)
            self.inputs.append(tokens)
            self.outputs.append(label)
            # Track the smallest output vocabulary that contains every label.
            self.vocab_out = max(self.vocab_out, label + 1)
        if self.vocab_out != C:
            print(f'warning {self.vocab_out=} neq to {C=}')
        self.inputs = torch.tensor(self.inputs, dtype=torch.long, device=DEVICE)
        self.outputs = torch.tensor(self.outputs, dtype=torch.long, device=DEVICE)

    def __len__(self):
        return len(self.outputs)

    def __getitem__(self, idx):
        sample_input = self.inputs[idx]
        sample_label = self.outputs[idx]
        return sample_input, sample_label

def cross_entropy_high_precision(logits, labels):
    """Mean cross-entropy computed in float64.

    Args:
        logits: tensor of shape (batch, vocab).
        labels: integer tensor of shape (batch,).

    Returns:
        Scalar float64 tensor: the mean negative log-probability of the labels.
    """
    # float32 log_softmax underflows on very confident predictions (it can only
    # return multiples of ~1.2e-7), which causes loss spikes and poor gradients,
    # so the computation is carried out in float64.
    log_probs = F.log_softmax(logits.to(torch.float64), dim=-1)
    label_column = labels[:, None]
    picked = torch.gather(log_probs, dim=-1, index=label_column)
    return -torch.mean(picked)

def run_experiment(config):
    """Train a ``Transformer`` on the task described by ``config`` and log to wandb.

    Config keys read here:
        required: 'funcs' (source of a Python lambda mapping an operand tuple to a
            label), 'C', 'diff_vocab', 'eqn_sign', 'n_heads', 'd_model',
            'attn_coeff', 'frac', 'trainfrac', 'save_embeddings'.
        optional: 'n_layers' (1), 'd_head' (d_model // n_heads), 'act_fn' ('ReLU'),
            'batch_size' (full dataset), 'lr' (1e-3), 'weight_decay' (1e-4),
            'epoch' (10000), 'early_stop' (pair: accuracy threshold, patience).

    Side effects: stores the evaluated function under ``config['func']`` and starts
    a wandb run, which is returned unfinished so the caller can call ``run.finish()``.

    Returns:
        dict with per-step metric lists ('losses', 'accs', 'losses_val', 'accs_val',
        'norms'), the trained 'model', the 'config', summary statistics, the full
        'dataset', the collected 'embeddings' and the wandb 'run'.
    """
    print('parsing func',config['funcs'])
    # SECURITY NOTE: eval() executes arbitrary code; only pass trusted 'funcs' strings.
    config['func']=eval(config['funcs'])
    full_dataset = MyAddDataSet(func=config['func'],C=config['C'],diff_vocab=config['diff_vocab'],eqn_sign=config['eqn_sign'])
    model = Transformer(
        num_layers=config.get('n_layers',1),
        num_heads=config['n_heads'],
        d_model=config['d_model'],
        d_head=config.get('d_head',config['d_model']//config['n_heads']),
        attn_coeff=config['attn_coeff'],
        d_vocab=full_dataset.vocab,
        # The default must be one of the names MLP accepts ('ReLU', 'GeLU', 'Tanh').
        act_type=config.get('act_fn','ReLU'),
        n_ctx=full_dataset.dim,
    )
    model.to(DEVICE)

    # 'frac' splits the full dataset into train/test; 'trainfrac' (if not None)
    # then keeps only that share of the training part.
    train_frac = config['trainfrac']
    train_size = int(config['frac'] * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
    if train_frac is not None:
        new_train_size = int(train_frac * train_size)
        remove_size = train_size - new_train_size
        train_dataset, _ = torch.utils.data.random_split(train_dataset, [new_train_size, remove_size])
    print('random split',len(train_dataset),len(test_dataset))
    batch_size = config.get('batch_size',len(full_dataset))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    lr = config.get('lr',1e-3)
    weight_decay = config.get('weight_decay',1e-4)
    opt = optim.AdamW(model.parameters(),lr=lr,weight_decay=weight_decay,betas=(0.9,0.98))
    # Linear warmup over the first 10 scheduler steps (one step per training batch).
    scheduler = optim.lr_scheduler.LambdaLR(opt, lambda step: min(step/10, 1))
    print(lr,weight_decay)
    print(opt,scheduler)

    losses=[]
    accs=[]
    losses_val=[]
    accs_val=[]
    norms=[]
    best_train_acc=0.
    best_test_acc=0.
    perfect_train_time=None
    perfect_test_time=None

    embeddings=[]
    pbar = tqdm.tqdm(range(config.get('epoch',10000)))
    gaps=[]
    # Stop once test accuracy has been >= early_stop_a for early_stop_b consecutive
    # evaluations. The default threshold of 2 can never be reached, i.e. no early stop.
    early_stop_a=2
    early_stop_b=1
    if config.get('early_stop',None) is not None:
        early_stop_a, early_stop_b = config['early_stop']
    early_stop_timer=0

    run = wandb.init(reinit=True,config=config,project='modadd_longer')#,settings=wandb.Settings(start_method="spawn"))

    # Stays None when no training step ran (e.g. epoch == 0 or an empty training set).
    final_description = None

    def evaluation(epoch):
        """Return (mean loss, mean accuracy) over the test loader and update early-stop state."""
        nonlocal best_test_acc
        nonlocal perfect_test_time
        nonlocal early_stop_timer
        losses_eval=[]
        accs_eval=[]
        # No gradients are needed for evaluation; skipping graph construction saves
        # memory and time without changing the computed values.
        with torch.no_grad():
            for inp,ans in test_loader:
                out = model(inp)[:,-1,:]
                loss = cross_entropy_high_precision(out,ans)
                acc = torch.sum((out.argmax(dim=1)==ans).float())/len(ans)
                losses_eval.append(loss.item())
                accs_eval.append(acc.item())
        eval_loss, eval_acc = np.mean(losses_eval), np.mean(accs_eval)
        best_test_acc = max(best_test_acc, eval_acc)
        if eval_acc==1. and perfect_test_time is None:
            perfect_test_time = epoch
        if eval_acc>=early_stop_a:
            early_stop_timer+=1
        else:
            early_stop_timer=0
        return eval_loss, eval_acc

    try:
        # Iterate over the progress bar itself so it actually advances.
        for i in pbar:
            if early_stop_timer>=early_stop_b:
                break
            for inp,ans in train_loader:
                # Only the prediction at the last position is trained on.
                out = model(inp)[:,-1,:]
                loss = cross_entropy_high_precision(out,ans)
                # Evaluate before the parameter update, as in the original loop.
                loss_val, acc_val = evaluation(i)
                loss.backward()
                opt.step()
                scheduler.step()
                opt.zero_grad()
                acc = (out.argmax(dim=1)==ans).float().mean()
                norm = sum([torch.sum(p*p).item() for p in model.parameters()])**0.5

                # save every 10 epochs
                # NOTE(review): extract_embeddings is not defined in the visible part of
                # this notebook (its import is commented out) - confirm it is available
                # before enabling config['save_embeddings'].
                if config['save_embeddings'] and i % 10 == 9:
                    embeddings.append(extract_embeddings(model))

                losses.append(loss.item())
                accs.append(acc.item())
                losses_val.append(loss_val)
                accs_val.append(acc_val)
                norms.append(norm)

                best_train_acc=max(best_train_acc,acc.item())
                if acc.item()==1. and perfect_train_time is None:
                    perfect_train_time = i
                gaps.append(best_train_acc-best_test_acc)
                # Store the final description instead of setting it during the loop
                final_description = (
                    f"loss: {loss.item():.3f}, accm: {best_train_acc:.3f}, "
                    f"vloss: {loss_val:.3f}, vaccm: {best_test_acc:.3f}, "
                    f"norm: {norm:.3f}, acc: {acc.item():.3f}, vacc: {acc_val:.3f}"
                )
                run.log({'training_loss': loss.item(),
                'validation_loss': loss_val,
                'training_accuracy': acc.item(),
                'validation_accuracy': acc_val,
                'parameter_norm': norm,
                'best_train_accuracy': best_train_acc,
                'best_test_accuracy': best_test_acc,
                'generalization_gap': best_train_acc-best_test_acc,
                'generalization_delay1': sum(gaps)})
        if final_description is not None:
            print(final_description)
    except KeyboardInterrupt:
        print('Keyboard interrupt. Gracefully exiting...')
    finally:
        # Release the progress bar whether the loop finished, stopped early or failed.
        pbar.close()
    print('Finished.')
    generalization_gap=best_train_acc-best_test_acc
    generalization_delay1=sum(gaps)
    generalization_delay2=sum(max(t-generalization_gap,0) for t in gaps)
    run.summary["generalization_delay2"] = generalization_delay2
    # The run is intentionally left open; the caller is expected to call run.finish().
    return dict(
        losses=losses,
        accs=accs,
        losses_val=losses_val,
        accs_val=accs_val,
        norms=norms,
        model=model,
        config=config,
        generalization_gap=generalization_gap,
        generalization_delay1=generalization_delay1,
        generalization_delay2=generalization_delay2,
        best_train_acc=best_train_acc,
        best_test_acc=best_test_acc,
        perfect_train_time=perfect_train_time,
        perfect_test_time=perfect_test_time,
        dataset=full_dataset,
        embeddings=embeddings,
        run=run
    )

import random
import string
import seaborn as sns
import sys


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


cuda


### weight decay factor = 0.001, 0.1, 5, d_model =64

In [3]:
# # C_list = [19,29,39,49,59,89,119,239]
    
# for count in range(100):
#     # experiment_name = sys.argv[1] # 实验名字，等于保存模型的文件夹名字 ./save/{experiment_name}
#     # modify_part = sys.argv[2] # 改动的部分，保存模型的命名的一部分
#     # 随着loop变化的参数
#     attn_coeff = count * 0.01 # 可以同步设置weight_decay改动
#     weight_decay = 5.
#     epoch = 20000 
    
#     C=59
#     n_layers=1
#     diff_vocab=0
#     eqn_sign=0
#     d_model=64 #用不同的d_model
#     run_name = f"d{d_model}_wd{weight_decay}_attn{attn_coeff:.6f}"
#     print(run_name)
#     config=dict(
#         name='modadd_'+str(C),
#         funcs='lambda x: (x[0]+x[1])%'+str(C),
#         C=C,
#         n_heads=4,
#         d_model=d_model,
#         n_layers=n_layers,
#         attention_dir='casual',
#         # act_fn='GeLU' if random.randint(0,3)==0 else 'ReLU',
#         act_fn='ReLU',
#         epoch=epoch,
#         batch_size=C*C,
#         lr=1e-3,
#         weight_decay=weight_decay,
#         frac=0.8,
#         trainfrac=None, # 训练集大小改变
#         # should adjust the attn_coeff
#         # attn_coeff=frac_coeff,
#         attn_coeff=attn_coeff,
#         runid=run_name,
#         diff_vocab=diff_vocab,
#         eqn_sign=eqn_sign,
#         # use_linear=use_linear,
#         save_embeddings=False,
#     )
#     result_modadd=run_experiment(config)
    
#     # save embeddings, see analysis.utils.extract_embeddings for details
#     # if config['save_embeddings']:
#     #     embed_path = f'result/model_{"B" if config["attn_coeff"] else "A"}_embeddings.npz'
#     #     np.savez_compressed(os.path.join(root_path, embed_path), result_modadd['embeddings'])
    
#     run=result_modadd['run']
#     path = root_path + '/weight_decay'
#     if not os.path.exists(path):
#         os.makedirs(path)
#     model_name=os.path.join(root_path, f'{path}/model_{run_name}.pt')
#     model=result_modadd['model']
#     torch.save(model.state_dict(), model_name)
#     import json
#     config['func']=None
#     with open(os.path.join(root_path, f'{path}/config_{run_name}.json'),'w') as f:
#         json.dump(config,f,separators=(',\n', ': '))
#     run.finish()
    
#     # !python -m wandb offline

### Keep C = 59 and vary the fraction of the training set; attention rate in [0, 1]

In [4]:
import json

# Settings shared by every run of this sweep.
experiment_name = 'trainfrac'
epoch = 20000  # upper bound on epochs; a full run takes a very long time
C=59
n_layers=1
diff_vocab=0
eqn_sign=0
d_model=128
# Models and configs of this sweep are written to <root_path>/save/<experiment_name>.
path = root_path + f'/save/{experiment_name}'
os.makedirs(path, exist_ok=True)

for attention_rate in [0]:
    for trainfrac in [0.4,0.5,0.6,0.7]:
        for i in range(10):
            modify_part = f'frac{trainfrac}'
            attn_coeff = attention_rate
            # Runs with attn_coeff == 0 are labelled "A", all others "B".
            if attn_coeff == 0:
                run_name = f"A_{modify_part}_{i}"
            else:
                run_name = f"B_{modify_part}_{i}"
            print(run_name)
            print(f'd={d_model}')
            config=dict(
                name='modadd_'+str(C),
                funcs='lambda x: (x[0]+x[1])%'+str(C),
                C=C,
                n_heads=4,
                d_model=d_model,
                n_layers=n_layers,
                attention_dir='casual',
                act_fn='ReLU',
                epoch=epoch,
                batch_size=C*C,
                lr=1e-3,
                weight_decay=2.,
                frac=0.8,
                trainfrac=trainfrac,
                attn_coeff=attn_coeff,
                runid=run_name,
                diff_vocab=diff_vocab,
                eqn_sign=eqn_sign,
                save_embeddings=False,
            )
            result_modadd=run_experiment(config)

            # save embeddings, see analysis.utils.extract_embeddings for details
            if config['save_embeddings']:
                embed_path = f'result/model_{"B" if config["attn_coeff"] else "A"}_embeddings.npz'
                embed_file = os.path.join(root_path, embed_path)
                # Make sure the target directory exists before writing.
                os.makedirs(os.path.dirname(embed_file), exist_ok=True)
                np.savez_compressed(embed_file, result_modadd['embeddings'])

            run=result_modadd['run']
            model_name=os.path.join(path, f'model_{run_name}.pt')
            model=result_modadd['model']
            torch.save(model.state_dict(), model_name)
            # run_experiment stored the evaluated lambda in the config; it is not
            # JSON-serialisable, so drop it before dumping.
            config['func']=None
            with open(os.path.join(path, f'config_{run_name}.json'),'w') as f:
                json.dump(config,f,separators=(',\n', ': '))
            run.finish()
    
    # !python -m wandb offline

A_frac0.4_0
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a26f177e1a0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Currently logged in as: [33myhanmowsnoo[0m ([33myhanmowsnoo-royal-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_202355-jahvt2hj[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msweet-water-830[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/jahvt2hj[0m
  0%|          | 0/20000 [07:35<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.002, vaccm: 1.000, norm: 23.562, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▂██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁▂▄▅▅███████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ███▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▇▇▆▅▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▇██████████████████████████████████
[34m[1mwandb[0m:       validation_loss ▆█▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.4_1
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d624d0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_203132-iw53urru[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mjolly-darkness-831[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/iw53urru[0m
  0%|          | 0/20000 [07:49<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.825, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.4_2
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d60070>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_203923-7e3l3qym[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhappy-river-832[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/7e3l3qym[0m
  0%|          | 0/20000 [07:41<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.905, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▄▄▅████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ███▆▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▃▇████████████████████████████████████
[34m[1mwandb[0m:       validation_loss ▆██▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.4_3
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cde6e0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_204706-pbberjh7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmisunderstood-shadow-833[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/pbberjh7[0m
  0%|          | 0/20000 [07:41<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.006, vaccm: 1.000, norm: 23.408, acc: 1.000, vacc: 0.997
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▇▇█████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▇▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▆▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▇██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss ████▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.4_4
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a25ca10a500>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_205449-abaoiq5u[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhearty-shadow-834[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/abaoiq5u[0m
  0%|          | 0/20000 [07:44<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.044, vaccm: 0.989, norm: 24.097, acc: 1.000, vacc: 0.987
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▁▁███████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▃▃▃▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████████
[34m[1mwandb[0m:    generalization_gap ███▅▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▇▆▅▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▂███████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 0.

A_frac0.4_5
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a263f6011b0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_210235-nkbg7t7n[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mwise-dream-835[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/nkbg7t7n[0m
  0%|          | 0/20000 [07:42<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.020, vaccm: 0.999, norm: 24.169, acc: 1.000, vacc: 0.994
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▂▇███████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ██▇▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▂▇█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▇▇▆▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 0.

A_frac0.4_6
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d397e0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_211019-hgrh303x[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhopeful-jazz-836[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/hgrh303x[0m
  0%|          | 0/20000 [07:44<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.799, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▁▂▅██████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▂▃▄████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ██▆▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▅▅▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▃▅█████████████████████████████████
[34m[1mwandb[0m:       validation_loss ▇██▇▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.4_7
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a263ff677c0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_211805-pcy394ju[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdazzling-mountain-837[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/pcy394ju[0m
  0%|          | 0/20000 [07:40<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.554, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▂▆█████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▂▂▃████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █████▆▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▇▇▆▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▂▂█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss ██▇▇▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.4_8
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d39fc0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_212547-dvwugqwm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mneat-voice-838[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/dvwugqwm[0m
  0%|          | 0/20000 [07:41<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.039, vaccm: 0.997, norm: 23.430, acc: 1.000, vacc: 0.989
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▃▃▃▃▆██████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▂▂▅███████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▆▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▇▆▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▃▄████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 0.

A_frac0.4_9
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1113 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cdc670>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_213331-b3mqlwjx[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstellar-leaf-839[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/b3mqlwjx[0m
  0%|          | 0/20000 [07:38<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.012, vaccm: 1.000, norm: 23.628, acc: 1.000, vacc: 0.996
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▂▂███████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▆▆▇████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ██▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▇▅▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▂▃█████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▇▅▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_0
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a263ffa4310>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_214111-h7zezne1[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mbreezy-oath-840[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/h7zezne1[0m
  0%|          | 0/20000 [08:45<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.561, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▄██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▇▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▂██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_1
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2606e5f5e0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_214958-omj7ofmb[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msuper-totem-841[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/omj7ofmb[0m
  0%|          | 0/20000 [08:44<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 24.264, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▅██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁▄▅▇▇███████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▆▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_2
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d623b0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_215844-3h1dwgn5[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvaliant-night-842[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/3h1dwgn5[0m
  0%|          | 0/20000 [08:45<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.471, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▃██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▃▇█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_3
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cde560>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_220731-kj1rqdsx[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdesert-night-843[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/kj1rqdsx[0m
  0%|          | 0/20000 [08:48<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.001, vaccm: 1.000, norm: 23.764, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▇█████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▄██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_4
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605ccbac0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_221621-illkbylp[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msweet-glitter-844[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/illkbylp[0m
  0%|          | 0/20000 [08:47<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.001, vaccm: 1.000, norm: 24.768, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▇▅▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_5
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d2bc40>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_222510-gpmqsbyc[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mazure-star-845[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/gpmqsbyc[0m
  0%|          | 0/20000 [08:42<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.504, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▃▅████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▅▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▇▅▅▅▄▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_6
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cf8eb0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_223354-ivx5fx59[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmild-rain-846[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/ivx5fx59[0m
  0%|          | 0/20000 [08:46<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.768, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▂████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▃█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ███▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▃▇█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_7
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605ccbb20>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_224242-jvbsmj5u[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mlilac-dragon-847[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/jvbsmj5u[0m
  0%|          | 0/20000 [08:46<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.976, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▇█████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁▇██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ██▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▅▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_8
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a25c9d3ce20>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_225130-6rdad7eo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdashing-shape-848[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/6rdad7eo[0m
  0%|          | 0/20000 [08:45<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.616, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▂▂█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.5_9
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1392 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2606e5ece0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_230017-88gujgpm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33meager-pine-849[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/88gujgpm[0m
  0%|          | 0/20000 [08:44<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.844, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▆██████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁▇██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▆██▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▃█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_0
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2606e47940>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_230903-tiamfebr[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mgolden-blaze-850[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/tiamfebr[0m
  0%|          | 0/20000 [10:05<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.001, vaccm: 1.000, norm: 24.788, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▅▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▄██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_1
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cde560>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_231911-25v2fj6g[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mrestful-voice-851[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/25v2fj6g[0m
  0%|          | 0/20000 [10:07<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.767, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▄██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▇▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_2
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d624d0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_232920-tshtyp3n[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mradiant-microwave-852[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/tshtyp3n[0m
  0%|          | 0/20000 [10:07<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 24.113, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ██▇▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss ▇█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_3
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d39b40>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_233929-u3999kd3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mazure-forest-853[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/u3999kd3[0m
  0%|          | 0/20000 [10:01<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.727, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:    generalization_gap ▇█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ████▁▁▄█████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_4
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605ca7040>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_234932-vgyxg093[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msmart-microwave-854[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/vgyxg093[0m
  0%|          | 0/20000 [10:05<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.093, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▇▅▅▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▆██████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_5
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a262bb32050>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241214_235939-hzij7996[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mgolden-armadillo-855[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/hzij7996[0m
  0%|          | 0/20000 [10:04<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.002, vaccm: 1.000, norm: 24.477, acc: 1.000, vacc: 0.999
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▄▄▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_6
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a25ca00d6c0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_000945-arvv7l5h[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdauntless-sun-856[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/arvv7l5h[0m
  0%|          | 0/20000 [10:05<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.640, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁▄██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:       validation_loss █▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_7
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d29d80>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_001952-3sjqn57s[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mclear-river-857[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/3sjqn57s[0m
  0%|          | 0/20000 [10:04<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.544, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▁▂▄████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:    generalization_gap █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▅▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_8
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d2a500>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_002958-dwsdqlcs[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mwobbly-disco-858[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/dwsdqlcs[0m
  0%|          | 0/20000 [10:02<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.788, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:    generalization_gap █▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▅██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.6_9
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1670 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d39510>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_004001-x53uyba7[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvital-voice-859[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/x53uyba7[0m
  0%|          | 0/20000 [10:01<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.515, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_0
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605ca76d0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_005005-ul3jv6lm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdrawn-puddle-860[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/ul3jv6lm[0m
  0%|          | 0/20000 [11:18<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.892, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▆▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_1
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605ca4b20>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_010125-xdd9jpk3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mspring-wind-861[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/xdd9jpk3[0m
  0%|          | 0/20000 [11:22<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.853, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▄██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:    generalization_gap █▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▄██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_2
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a25c9d3f190>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_011249-e9pj31o1[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhelpful-firefly-862[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/e9pj31o1[0m
  0%|          | 0/20000 [11:20<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.673, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▅▅█████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_3
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a25c9d0d210>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_012411-4qoebhcm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcelestial-donkey-863[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/4qoebhcm[0m
  0%|          | 0/20000 [11:19<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.808, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▃██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▁██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▇▆▅▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▂██████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▅▄▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_4
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cc8c10>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_013532-rp6puof2[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcomfy-dust-864[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/rp6puof2[0m
  0%|          | 0/20000 [11:23<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.439, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▃██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ▇█▇▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▆▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_5
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cf9660>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_014657-atjge12y[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mgenial-sun-865[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/atjge12y[0m
  0%|          | 0/20000 [11:10<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.638, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▂██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▇██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ██▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▇██████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▆▅▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_6
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cfada0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_015809-62tkvu3u[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mscarlet-hill-866[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/62tkvu3u[0m
  0%|          | 0/20000 [11:29<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.620, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm ███▇▇▄▄▄▄▄▄▄▄▄▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ██▁▁▁▁██████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_7
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605d38370>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_020940-r43ab2n3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhelpful-leaf-867[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/r43ab2n3[0m
  0%|          | 0/20000 [11:26<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.805, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁▅██████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap ▄█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▅▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_8
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a26f177eec0>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_022108-yq2dq5ez[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcrisp-fire-868[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/yq2dq5ez[0m
  0%|          | 0/20000 [11:19<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 22.838, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁▅██████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▅▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:       validation_loss █▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1


A_frac0.7_9
d=128
parsing func lambda x: (x[0]+x[1])%59
parameters 1 59 128 32 4 2 ReLU 0 False True
random split 1948 697
0.001 2.0
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.98)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.001
    lr: 0.0
    maximize: False
    weight_decay: 2.0
) <torch.optim.lr_scheduler.LambdaLR object at 0x7a2605cc9d80>


  0%|          | 0/20000 [00:00<?, ?it/s][34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241215_023229-3ew66sxp[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mlikely-haze-869[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/yhanmowsnoo-royal-institute-of-technology/modadd_longer/runs/3ew66sxp[0m
  0%|          | 0/20000 [11:18<?, ?it/s]

loss: 0.000, accm: 1.000, vloss: 0.000, vaccm: 1.000, norm: 23.789, acc: 1.000, vacc: 1.000
Finished.



[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:    best_test_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m:   best_train_accuracy ▁███████████████████████████████████████
[34m[1mwandb[0m: generalization_delay1 ▁███████████████████████████████████████
[34m[1mwandb[0m:    generalization_gap █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:        parameter_norm █▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:     training_accuracy ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:         training_loss █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:   validation_accuracy ▁▁▁█████████████████████████████████████
[34m[1mwandb[0m:       validation_loss █▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:    best_test_accuracy 1
