# Linear demo

## One

In [1]:
import torch
from sklearn.datasets import make_regression

# Synthetic single-feature regression data (fixed seed), converted to float32 tensors.
X_np, y_np = make_regression(
	n_samples=100,
	n_features=1,
	noise=10.0,
	random_state=0,
)
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.float32)[:, None]   # column vector, same rank as the predictions

# Linear model y = b + w*x; both parameters start at zero and are tracked by autograd.
w = torch.zeros(1, 1, requires_grad=True)
b = torch.zeros(1, requires_grad=True)
opt = torch.optim.SGD(params=[w, b], lr=0.01)
loss_fn = torch.nn.MSELoss()

# Full-batch gradient descent: every epoch uses all samples once.
for epoch in range(1000):
	preds = torch.mm(X, w) + b   # forward pass
	loss = loss_fn(preds, y)
	opt.zero_grad()              # drop the gradients left over from the previous epoch
	loss.backward()              # autograd fills w.grad and b.grad
	opt.step()                   # the optimizer applies the parameter update

print("params (bias, weight):", b.item(), w.item())
print("final MSE:", loss.item())

params (bias, weight): -0.8141747117042542 42.61933898925781
final MSE: 114.1714859008789


## Part

In [1]:
import torch
from sklearn.datasets import make_regression

In [2]:
# Synthetic single-feature regression data (fixed seed), converted to float32 tensors.
X_np, y_np = make_regression(
	n_samples=100,
	n_features=1,
	noise=10.0,
	random_state=0,
)
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.float32)[:, None]   # column vector

# Preview the first ten entries: the NumPy arrays first, then the tensors built from them.
print(X_np[:10], y_np[:10], X[:10], y[:10], sep="\n")

[[-0.35955316]
 [ 0.97663904]
 [ 0.40234164]
 [-0.81314628]
 [-0.88778575]
 [ 0.44386323]
 [-0.97727788]
 [ 0.42833187]
 [ 0.20827498]
 [-0.31155253]]
[-19.95588561  21.33977271  11.55689458 -16.34206917 -35.70063849
  27.99539547 -56.32353045  17.61041414  21.45106196 -22.35286466]
tensor([[-0.3596],
        [ 0.9766],
        [ 0.4023],
        [-0.8131],
        [-0.8878],
        [ 0.4439],
        [-0.9773],
        [ 0.4283],
        [ 0.2083],
        [-0.3116]])
tensor([[-19.9559],
        [ 21.3398],
        [ 11.5569],
        [-16.3421],
        [-35.7006],
        [ 27.9954],
        [-56.3235],
        [ 17.6104],
        [ 21.4511],
        [-22.3529]])


In [3]:
# Linear model y = b + w*x; both parameters start at zero and are tracked by autograd.
w = torch.zeros(1, 1, requires_grad=True)   # weight, shape (1, 1)
b = torch.zeros(1, requires_grad=True)      # bias, shape (1,)

# Plain SGD over the two leaf tensors, with mean-squared error as the objective.
opt = torch.optim.SGD(params=[w, b], lr=0.01)
loss_fn = torch.nn.MSELoss()

# Last expression of the cell: the notebook displays the optimizer's settings.
opt

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [4]:
def _grad_snapshot(param):
	"""Return an independent copy of ``param.grad``, or ``None`` if no gradient is set.

	Copying matters because a gradient tensor can be modified in place later
	(e.g. ``zero_grad(set_to_none=False)`` or gradient accumulation); a stored
	reference would then silently change inside records taken earlier.
	"""
	grad = param.grad
	return None if grad is None else grad.detach().clone()


def recorder(epoch, step, w, b, loss):
	"""Capture the training state at one instrumentation point as a plain dict.

	Args:
		epoch: Epoch label stored under ``"epoch"``.
		step: Step label stored under ``"step"``.
		w, b: Single-element tensors; ``.item()`` is stored under ``"w"`` / ``"b"``
			and a snapshot of ``.grad`` under ``"w.grad"`` / ``"b.grad"``.
		loss: Single-element tensor; ``.item()`` is stored under ``"loss"``.

	Returns:
		dict with keys ``epoch, step, w, w.grad, b, b.grad, loss`` (in that order).
	"""
	return {
		"epoch": epoch,
		"step": step,
		"w": w.item(),
		"w.grad": _grad_snapshot(w),
		"b": b.item(),
		"b.grad": _grad_snapshot(b),
		"loss": loss.item(),
	}


def recouter(print_fn, epoch, step, w, b, loss, rti=-1):
	"""Record the current state, report it through ``print_fn``, and return it.

	Args:
		print_fn: Callable invoked once with the formatted one-line summary.
		epoch, step, w, b, loss: Forwarded to ``recorder``.
		rti: Index selecting the return value from the four-slot list
			``[record dict, record object, print_fn's return value, record object]``.
			``0`` yields the dict; the default ``-1`` yields the object.

	Returns:
		The element selected by ``rti``.
	"""
	r = recorder(epoch, step, w, b, loss)
	# Attribute-style view of the record. The dotted keys ("w.grad", "b.grad") are
	# not valid identifiers, so they are reachable only through getattr().
	x = type('', (), r)()
	printed = print_fn(f"({r['epoch']}) {r['step']} - w: {r['w']}, w.grad: {r['w.grad']}; b: {r['b']}, b.grad: {r['b.grad']}; loss: {r['loss']}")
	return [r, x, printed, x][rti]

In [5]:
import polars as pl

# records, loss, preds = [], torch.tensor(float('nan')), None
# for epoch in range(6):
# 	print(f"({epoch}) a - w: {w.item()}, w.grad: {w.grad}; b: {b.item()}, b.grad: {b.grad}; loss: {loss}")
# 	records.append({"epoch": epoch, "step": "a", "w": w.item(), "w.grad": w.grad, "b": b.item(), "b.grad": b.grad, "loss": loss.item()})
# 	preds = X.mm(w) + b        # 前向
# 	print(f"({epoch}) b - w: {w.item()}, w.grad: {w.grad}; b: {b.item()}, b.grad: {b.grad}; loss: {loss}")
# 	records.append({"epoch": epoch, "step": "b", "w": w.item(), "w.grad": w.grad, "b": b.item(), "b.grad": b.grad, "loss": loss.item()})
# 	loss = loss_fn(preds, y)
# 	print(f"({epoch}) c - w: {w.item()}, w.grad: {w.grad}; b: {b.item()}, b.grad: {b.grad}; loss: {loss}")
# 	records.append({"epoch": epoch, "step": "c", "w": w.item(), "w.grad": w.grad, "b": b.item(), "b.grad": b.grad, "loss": loss.item()})
# 	opt.zero_grad()
# 	print(f"({epoch}) d - w: {w.item()}, w.grad: {w.grad}; b: {b.item()}, b.grad: {b.grad}; loss: {loss}")
# 	records.append({"epoch": epoch, "step": "d", "w": w.item(), "w.grad": w.grad, "b": b.item(), "b.grad": b.grad, "loss": loss.item()})
# 	loss.backward()             # 自动计算梯度
# 	print(f"({epoch}) e - w: {w.item()}, w.grad: {w.grad}; b: {b.item()}, b.grad: {b.grad}; loss: {loss}")
# 	records.append({"epoch": epoch, "step": "e", "w": w.item(), "w.grad": w.grad, "b": b.item(), "b.grad": b.grad, "loss": loss.item()})
# 	opt.step()                  # 参数更新由优化器完成
# 	print(f"({epoch}) f - w: {w.item()}, w.grad: {w.grad}; b: {b.item()}, b.grad: {b.grad}; loss: {loss}")
# 	records.append({"epoch": epoch, "step": "f", "w": w.item(), "w.grad": w.grad, "b": b.item(), "b.grad": b.grad, "loss": loss.item()})
# records_df = pl.DataFrame(records)

# Instrumented training loop. A snapshot of (w, w.grad, b, b.grad, loss) is printed and
# appended to `records` before each training operation and once after the optimizer
# step, labelled 'a' through 'f'. Passing 0 as the last argument makes recouter return
# the plain record dict, which is what gets stored.
# `loss` starts as a NaN tensor so that loss.item() is valid before the first loss exists.
records, loss, preds = [], torch.tensor(float('nan')), None
for epoch in range(6):
	records.append(recouter(print, epoch, 'a', w, b, loss, 0))
	preds = X.mm(w) + b        # forward pass
	records.append(recouter(print, epoch, 'b', w, b, loss, 0))
	loss = loss_fn(preds, y)
	records.append(recouter(print, epoch, 'c', w, b, loss, 0))
	opt.zero_grad()
	records.append(recouter(print, epoch, 'd', w, b, loss, 0))
	loss.backward()             # autograd computes the gradients
	records.append(recouter(print, epoch, 'e', w, b, loss, 0))
	opt.step()                  # the optimizer applies the parameter update
	records.append(recouter(print, epoch, 'f', w, b, loss, 0))
# One row per snapshot.
records_df = pl.DataFrame(records)

print("params (bias, weight):", b.item(), w.item())
# NOTE: `loss` was computed before the last opt.step(), so it is the MSE of the
# parameters as they were going into the final update.
print("final MSE:", loss.item())
# Last expression of the cell: display the optimizer.
opt

(0) a - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) b - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) c - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) d - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) e - w: 0.0, w.grad: tensor([[-86.7954]]); b: 0.0, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(0) f - w: 0.8679539561271667, w.grad: tensor([[-86.7954]]); b: 0.03469603508710861, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(1) a - w: 0.8679539561271667, w.grad: tensor([[-86.7954]]); b: 0.03469603508710861, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(1) b - w: 0.8679539561271667, w.grad: tensor([[-86.7954]]); b: 0.03469603508710861, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(1) c - w: 0.8679539561271667, w.grad: tensor([[-86.7954]]); b: 0.03469603508710861, b.grad: tensor([-3.4696]); loss: 1887.6624755859375
(1) d - w: 0.8679539561271667, w.grad: None; b: 0.03469603508710861, b.gra

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [6]:
# Display the table of recorded training snapshots.
records_df

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
0,"""a""",0.0,,0.0,,
0,"""b""",0.0,,0.0,,
0,"""c""",0.0,,0.0,,1962.344482
0,"""d""",0.0,,0.0,,1962.344482
0,"""e""",0.0,-86.795395,0.0,-3.469604,1962.344482
…,…,…,…,…,…,…
5,"""b""",4.165996,-79.91571,0.15671,-2.809649,1681.296265
5,"""c""",4.165996,-79.91571,0.15671,-2.809649,1618.006592
5,"""d""",4.165996,,0.15671,,1618.006592
5,"""e""",4.165996,-78.283012,0.15671,-2.657864,1618.006592


In [7]:
# Snapshots recorded during epoch 0 only.
records_df.filter(pl.col("epoch") == 0)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
0,"""a""",0.0,,0.0,,
0,"""b""",0.0,,0.0,,
0,"""c""",0.0,,0.0,,1962.344482
0,"""d""",0.0,,0.0,,1962.344482
0,"""e""",0.0,-86.795395,0.0,-3.469604,1962.344482
0,"""f""",0.867954,-86.795395,0.034696,-3.469604,1962.344482


In [8]:
# Snapshots recorded during epoch 1 only.
records_df.filter(pl.col("epoch") == 1)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
1,"""a""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
1,"""b""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
1,"""c""",0.867954,-86.795395,0.034696,-3.469604,1887.662476
1,"""d""",0.867954,,0.034696,,1887.662476
1,"""e""",0.867954,-85.02166,0.034696,-3.296391,1887.662476
1,"""f""",1.718171,-85.02166,0.06766,-3.296391,1887.662476


In [9]:
# Snapshots recorded during epoch 2 only.
records_df.filter(pl.col("epoch") == 2)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
2,"""a""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""b""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""c""",1.718171,-85.02166,0.06766,-3.296391,1816.008301
2,"""d""",1.718171,,0.06766,,1816.008301
2,"""e""",1.718171,-83.284286,0.06766,-3.128763,1816.008301
2,"""f""",2.551013,-83.284286,0.098948,-3.128763,1816.008301


In [10]:
# Snapshots recorded during epoch 3 only.
records_df.filter(pl.col("epoch") == 3)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
3,"""a""",2.551013,-83.284286,0.098948,-3.128763,1816.008301
3,"""b""",2.551013,-83.284286,0.098948,-3.128763,1816.008301
3,"""c""",2.551013,-83.284286,0.098948,-3.128763,1747.258911
3,"""d""",2.551013,,0.098948,,1747.258911
3,"""e""",2.551013,-81.582542,0.098948,-2.966566,1747.258911
3,"""f""",3.366839,-81.582542,0.128613,-2.966566,1747.258911


# Model Class style

## One

In [1]:
import torch
from sklearn.datasets import make_regression

# Data: synthetic single-feature regression problem (fixed seed) as float32 tensors.
X_np, y_np = make_regression(
	n_samples=100,
	n_features=1,
	noise=10.0,
	random_state=0,
)
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.float32)[:, None]   # column vector

# Bare-tensor implementation: zero-initialised parameters tracked by autograd.
w = torch.zeros(1, 1, requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# The same linear model, wrapped in an nn.Module.
class LinearModel(torch.nn.Module):
	"""Linear model ``x @ w + b`` with a (1, 1) weight and a (1,) bias, both zero-initialised."""

	def __init__(self):
		super().__init__()
		# Registered as Parameters so that model.parameters() yields them (w first, then b).
		self.w = torch.nn.Parameter(torch.zeros(1, 1))
		self.b = torch.nn.Parameter(torch.zeros(1))

	def forward(self, x):
		"""Return ``x @ w + b``; ``x`` must be 2-D with one column to match the (1, 1) weight."""
		return torch.mm(x, self.w) + self.b

model = LinearModel()

# Copy the bare tensors' values into the module so both implementations start from the
# same point; no_grad() keeps the in-place copies out of the autograd graph.
with torch.no_grad():
	model.w.copy_(w)
	model.b.copy_(b)

# Check that, on the same input, both implementations give the same predictions and loss.
preds_bare = torch.mm(X, w) + b
preds_mod = model(X)
print('max abs pred diff:', (preds_bare - preds_mod).abs().max().item())

loss_fn = torch.nn.MSELoss()
print('loss diff:', abs(loss_fn(preds_bare, y).item() - loss_fn(preds_mod, y).item()))

# One optimizer per implementation, with identical hyper-parameters.
opt_bare = torch.optim.SGD(params=[w, b], lr=0.01)
opt_mod = torch.optim.SGD(params=model.parameters(), lr=0.01)

# for epoch in range(1000):
# 	# 裸实现 step
# 	p_bare = X.mm(w) + b
# 	loss_bare = loss_fn(p_bare, y)
# 	opt_bare.zero_grad(); loss_bare.backward(); opt_bare.step()
# 	
# 	# 模型实现 step
# 	p_mod = model(X)
# 	loss_mod = loss_fn(p_mod, y)
# 	opt_mod.zero_grad(); loss_mod.backward(); opt_mod.step()


def _grad_snapshot(param):
	"""Return an independent copy of ``param.grad``, or ``None`` if no gradient is set.

	Copying matters because a gradient tensor can be modified in place later
	(e.g. ``zero_grad(set_to_none=False)`` or gradient accumulation); a stored
	reference would then silently change inside records taken earlier.
	"""
	grad = param.grad
	return None if grad is None else grad.detach().clone()


def recorder(epoch, step, w, b, loss):
	"""Capture the training state at one instrumentation point as a plain dict.

	Args:
		epoch: Epoch label stored under ``"epoch"``.
		step: Step label stored under ``"step"``.
		w, b: Single-element tensors; ``.item()`` is stored under ``"w"`` / ``"b"``
			and a snapshot of ``.grad`` under ``"w.grad"`` / ``"b.grad"``.
		loss: Single-element tensor; ``.item()`` is stored under ``"loss"``.

	Returns:
		dict with keys ``epoch, step, w, w.grad, b, b.grad, loss`` (in that order).
	"""
	return {
		"epoch": epoch,
		"step": step,
		"w": w.item(),
		"w.grad": _grad_snapshot(w),
		"b": b.item(),
		"b.grad": _grad_snapshot(b),
		"loss": loss.item(),
	}


def recouter(print_fn, epoch, step, w, b, loss, rti=-1):
	"""Record the current state, report it through ``print_fn``, and return it.

	Args:
		print_fn: Callable invoked once with the formatted one-line summary.
		epoch, step, w, b, loss: Forwarded to ``recorder``.
		rti: Index selecting the return value from the four-slot list
			``[record dict, record object, print_fn's return value, record object]``.
			``0`` yields the dict; the default ``-1`` yields the object.

	Returns:
		The element selected by ``rti``.
	"""
	r = recorder(epoch, step, w, b, loss)
	# Attribute-style view of the record. The dotted keys ("w.grad", "b.grad") are
	# not valid identifiers, so they are reachable only through getattr().
	x = type('', (), r)()
	printed = print_fn(f"({r['epoch']}) {r['step']} - w: {r['w']}, w.grad: {r['w.grad']}; b: {r['b']}, b.grad: {r['b.grad']}; loss: {r['loss']}")
	return [r, x, printed, x][rti]


# Instrumented side-by-side training of the bare-tensor and nn.Module implementations.
# Every line below performs one operation and then, after the ';', prints and stores a
# snapshot of that implementation's state (recouter with 0 returns the record dict).
records = []
# Chained assignment: both loss names start out bound to the same NaN tensor (so
# loss.item() is valid before the first loss exists) and both predictions start as None.
loss_bare, p_bare = loss_mod, p_mod = torch.tensor(float('nan')), None
for epoch in range(10):
	
	# bare-tensor implementation: one training step (snapshots 'a'..'f')
	print("=== bare ===")                         ; records.append(recouter(print, epoch, 'a (bare)', w, b, loss_bare, 0))
	p_bare = X.mm(w) + b                          ; records.append(recouter(print, epoch, 'b (bare)', w, b, loss_bare, 0))
	loss_bare = loss_fn(p_bare, y)                ; records.append(recouter(print, epoch, 'c (bare)', w, b, loss_bare, 0))
	opt_bare.zero_grad()                          ; records.append(recouter(print, epoch, 'd (bare)', w, b, loss_bare, 0))
	loss_bare.backward()                          ; records.append(recouter(print, epoch, 'e (bare)', w, b, loss_bare, 0))
	opt_bare.step()                               ; records.append(recouter(print, epoch, 'f (bare)', w, b, loss_bare, 0))
	
	# nn.Module implementation: one training step (snapshots 'a'..'f')
	print("=== mod ===")                ; records.append(recouter(print, epoch, 'a (mod)', model.w, model.b, loss_mod, 0))
	p_mod = model(X)                    ; records.append(recouter(print, epoch, 'b (mod)', model.w, model.b, loss_mod, 0))
	loss_mod = loss_fn(p_mod, y)        ; records.append(recouter(print, epoch, 'c (mod)', model.w, model.b, loss_mod, 0))
	opt_mod.zero_grad()                 ; records.append(recouter(print, epoch, 'd (mod)', model.w, model.b, loss_mod, 0))
	loss_mod.backward()                 ; records.append(recouter(print, epoch, 'e (mod)', model.w, model.b, loss_mod, 0))
	opt_mod.step()                      ; records.append(recouter(print, epoch, 'f (mod)', model.w, model.b, loss_mod, 0))

import polars as pl
# One row per snapshot, bare and module rows interleaved in execution order.
records_df = pl.DataFrame(records)

# Compare the final parameters and losses of the two implementations.
# NOTE: loss_bare / loss_mod were computed before the last optimizer step of each loop.
print('final bias diff:', abs(b.item() - model.b.item()))
print('final weight diff:', (w - model.w).abs().max().item())
print('final loss diff:', abs(loss_bare.item() - loss_mod.item()))

# Last expression of the cell: display the snapshot table.
records_df

max abs pred diff: 0.0
loss diff: 0.0
=== bare ===
(0) a (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) b (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) c (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) d (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) e (bare) - w: 0.0, w.grad: tensor([[-86.7954]]); b: 0.0, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(0) f (bare) - w: 0.8679539561271667, w.grad: tensor([[-86.7954]]); b: 0.03469603508710861, b.grad: tensor([-3.4696]); loss: 1962.344482421875
=== mod ===
(0) a (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) b (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) c (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) d (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) e (mod) - w: 0.0, w.grad: tensor([[-86.7954]]); b: 0.0, b.grad: tensor([-3

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
0,"""a (bare)""",0.0,,0.0,,
0,"""b (bare)""",0.0,,0.0,,
0,"""c (bare)""",0.0,,0.0,,1962.344482
0,"""d (bare)""",0.0,,0.0,,1962.344482
0,"""e (bare)""",0.0,-86.795395,0.0,-3.469604,1962.344482
…,…,…,…,…,…,…
9,"""b (mod)""",7.202668,-73.583015,0.254409,-2.231884,1443.112305
9,"""c (mod)""",7.202668,-73.583015,0.254409,-2.231884,1389.472168
9,"""d (mod)""",7.202668,,0.254409,,1389.472168
9,"""e (mod)""",7.202668,-72.080109,0.254409,-2.09923,1389.472168


## Part

In [1]:
import torch
from sklearn.datasets import make_regression

# Data: synthetic single-feature regression problem (fixed seed) as float32 tensors.
X_np, y_np = make_regression(
	n_samples=100,
	n_features=1,
	noise=10.0,
	random_state=0,
)
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.float32)[:, None]   # column vector

# Preview the first ten entries: the NumPy arrays first, then the tensors built from them.
print(X_np[:10], y_np[:10], X[:10], y[:10], sep="\n")

[[-0.35955316]
 [ 0.97663904]
 [ 0.40234164]
 [-0.81314628]
 [-0.88778575]
 [ 0.44386323]
 [-0.97727788]
 [ 0.42833187]
 [ 0.20827498]
 [-0.31155253]]
[-19.95588561  21.33977271  11.55689458 -16.34206917 -35.70063849
  27.99539547 -56.32353045  17.61041414  21.45106196 -22.35286466]
tensor([[-0.3596],
        [ 0.9766],
        [ 0.4023],
        [-0.8131],
        [-0.8878],
        [ 0.4439],
        [-0.9773],
        [ 0.4283],
        [ 0.2083],
        [-0.3116]])
tensor([[-19.9559],
        [ 21.3398],
        [ 11.5569],
        [-16.3421],
        [-35.7006],
        [ 27.9954],
        [-56.3235],
        [ 17.6104],
        [ 21.4511],
        [-22.3529]])


In [2]:
# Bare-tensor implementation: zero-initialised parameters tracked by autograd.
w = torch.zeros(1, 1, requires_grad=True)   # weight, shape (1, 1)
b = torch.zeros(1, requires_grad=True)      # bias, shape (1,)

# The same linear model, wrapped in an nn.Module.
class LinearModel(torch.nn.Module):
	"""Linear model ``x @ w + b`` with a (1, 1) weight and a (1,) bias, both zero-initialised."""

	def __init__(self):
		super().__init__()
		# Registered as Parameters so that model.parameters() yields them (w first, then b).
		self.w = torch.nn.Parameter(torch.zeros(1, 1))
		self.b = torch.nn.Parameter(torch.zeros(1))

	def forward(self, x):
		"""Return ``x @ w + b``; ``x`` must be 2-D with one column to match the (1, 1) weight."""
		return torch.mm(x, self.w) + self.b

model = LinearModel()

# Copy the bare tensors' values into the module so both implementations start from the
# same point; no_grad() keeps the in-place copies out of the autograd graph.
with torch.no_grad():
	model.w.copy_(w)
	model.b.copy_(b)

# Check that, on the same input, both implementations give the same predictions and loss.
preds_bare = torch.mm(X, w) + b
preds_mod = model(X)
print('max abs pred diff:', (preds_bare - preds_mod).abs().max().item())

loss_fn = torch.nn.MSELoss()
print('loss diff:', abs(loss_fn(preds_bare, y).item() - loss_fn(preds_mod, y).item()))

# One optimizer per implementation, with identical hyper-parameters.
opt_bare = torch.optim.SGD(params=[w, b], lr=0.01)
opt_mod = torch.optim.SGD(params=model.parameters(), lr=0.01)

# Last expression of the cell: display both optimizers as a tuple.
opt_bare, opt_mod

max abs pred diff: 0.0
loss diff: 0.0


(SGD (
 Parameter Group 0
     dampening: 0
     differentiable: False
     foreach: None
     fused: None
     lr: 0.01
     maximize: False
     momentum: 0
     nesterov: False
     weight_decay: 0
 ),
 SGD (
 Parameter Group 0
     dampening: 0
     differentiable: False
     foreach: None
     fused: None
     lr: 0.01
     maximize: False
     momentum: 0
     nesterov: False
     weight_decay: 0
 ))

In [3]:
def _grad_snapshot(param):
	"""Return an independent copy of ``param.grad``, or ``None`` if no gradient is set.

	Copying matters because a gradient tensor can be modified in place later
	(e.g. ``zero_grad(set_to_none=False)`` or gradient accumulation); a stored
	reference would then silently change inside records taken earlier.
	"""
	grad = param.grad
	return None if grad is None else grad.detach().clone()


def recorder(epoch, step, w, b, loss):
	"""Capture the training state at one instrumentation point as a plain dict.

	Args:
		epoch: Epoch label stored under ``"epoch"``.
		step: Step label stored under ``"step"``.
		w, b: Single-element tensors; ``.item()`` is stored under ``"w"`` / ``"b"``
			and a snapshot of ``.grad`` under ``"w.grad"`` / ``"b.grad"``.
		loss: Single-element tensor; ``.item()`` is stored under ``"loss"``.

	Returns:
		dict with keys ``epoch, step, w, w.grad, b, b.grad, loss`` (in that order).
	"""
	return {
		"epoch": epoch,
		"step": step,
		"w": w.item(),
		"w.grad": _grad_snapshot(w),
		"b": b.item(),
		"b.grad": _grad_snapshot(b),
		"loss": loss.item(),
	}


def recouter(print_fn, epoch, step, w, b, loss, rti=-1):
	"""Record the current state, report it through ``print_fn``, and return it.

	Args:
		print_fn: Callable invoked once with the formatted one-line summary.
		epoch, step, w, b, loss: Forwarded to ``recorder``.
		rti: Index selecting the return value from the four-slot list
			``[record dict, record object, print_fn's return value, record object]``.
			``0`` yields the dict; the default ``-1`` yields the object.

	Returns:
		The element selected by ``rti``.
	"""
	r = recorder(epoch, step, w, b, loss)
	# Attribute-style view of the record. The dotted keys ("w.grad", "b.grad") are
	# not valid identifiers, so they are reachable only through getattr().
	x = type('', (), r)()
	printed = print_fn(f"({r['epoch']}) {r['step']} - w: {r['w']}, w.grad: {r['w.grad']}; b: {r['b']}, b.grad: {r['b.grad']}; loss: {r['loss']}")
	return [r, x, printed, x][rti]

In [4]:
# for epoch in range(1000):
# 	# 裸实现 step
# 	p_bare = X.mm(w) + b
# 	loss_bare = loss_fn(p_bare, y)
# 	opt_bare.zero_grad(); loss_bare.backward(); opt_bare.step()
# 	
# 	# 模型实现 step
# 	p_mod = model(X)
# 	loss_mod = loss_fn(p_mod, y)
# 	opt_mod.zero_grad(); loss_mod.backward(); opt_mod.step()

# Instrumented side-by-side training of the bare-tensor and nn.Module implementations.
# Every line below performs one operation and then, after the ';', prints and stores a
# snapshot of that implementation's state (recouter with 0 returns the record dict).
records = []
# Chained assignment: both loss names start out bound to the same NaN tensor (so
# loss.item() is valid before the first loss exists) and both predictions start as None.
loss_bare, p_bare = loss_mod, p_mod = torch.tensor(float('nan')), None
for epoch in range(3):
	
	# bare-tensor implementation: one training step (snapshots 'a'..'f')
	print("=== bare ===")                         ; records.append(recouter(print, epoch, 'a (bare)', w, b, loss_bare, 0))
	p_bare = X.mm(w) + b                          ; records.append(recouter(print, epoch, 'b (bare)', w, b, loss_bare, 0))
	loss_bare = loss_fn(p_bare, y)                ; records.append(recouter(print, epoch, 'c (bare)', w, b, loss_bare, 0))
	opt_bare.zero_grad()                          ; records.append(recouter(print, epoch, 'd (bare)', w, b, loss_bare, 0))
	loss_bare.backward()                          ; records.append(recouter(print, epoch, 'e (bare)', w, b, loss_bare, 0))
	opt_bare.step()                               ; records.append(recouter(print, epoch, 'f (bare)', w, b, loss_bare, 0))
	
	# nn.Module implementation: one training step (snapshots 'a'..'f')
	print("=== mod ===")                ; records.append(recouter(print, epoch, 'a (mod)', model.w, model.b, loss_mod, 0))
	p_mod = model(X)                    ; records.append(recouter(print, epoch, 'b (mod)', model.w, model.b, loss_mod, 0))
	loss_mod = loss_fn(p_mod, y)        ; records.append(recouter(print, epoch, 'c (mod)', model.w, model.b, loss_mod, 0))
	opt_mod.zero_grad()                 ; records.append(recouter(print, epoch, 'd (mod)', model.w, model.b, loss_mod, 0))
	loss_mod.backward()                 ; records.append(recouter(print, epoch, 'e (mod)', model.w, model.b, loss_mod, 0))
	opt_mod.step()                      ; records.append(recouter(print, epoch, 'f (mod)', model.w, model.b, loss_mod, 0))

import polars as pl
# One row per snapshot, bare and module rows interleaved in execution order.
records_df = pl.DataFrame(records)

# Compare the final parameters and losses of the two implementations.
# NOTE: loss_bare / loss_mod were computed before the last optimizer step of each loop.
print('final bias diff:', abs(b.item() - model.b.item()))
print('final weight diff:', (w - model.w).abs().max().item())
print('final loss diff:', abs(loss_bare.item() - loss_mod.item()))

# Last expression of the cell: display both optimizers as a tuple.
opt_bare, opt_mod

=== bare ===
(0) a (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) b (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) c (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) d (bare) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) e (bare) - w: 0.0, w.grad: tensor([[-86.7954]]); b: 0.0, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(0) f (bare) - w: 0.8679539561271667, w.grad: tensor([[-86.7954]]); b: 0.03469603508710861, b.grad: tensor([-3.4696]); loss: 1962.344482421875
=== mod ===
(0) a (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) b (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: nan
(0) c (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) d (mod) - w: 0.0, w.grad: None; b: 0.0, b.grad: None; loss: 1962.344482421875
(0) e (mod) - w: 0.0, w.grad: tensor([[-86.7954]]); b: 0.0, b.grad: tensor([-3.4696]); loss: 1962.344482421875
(0) f

(SGD (
 Parameter Group 0
     dampening: 0
     differentiable: False
     foreach: None
     fused: None
     lr: 0.01
     maximize: False
     momentum: 0
     nesterov: False
     weight_decay: 0
 ),
 SGD (
 Parameter Group 0
     dampening: 0
     differentiable: False
     foreach: None
     fused: None
     lr: 0.01
     maximize: False
     momentum: 0
     nesterov: False
     weight_decay: 0
 ))

In [5]:
# Set Polars' table row display setting to 12, then display the snapshot table.
# NOTE: this changes Polars' global display configuration, so it also applies to later cells.
pl.Config.set_tbl_rows(12)
records_df

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
0,"""a (bare)""",0.0,,0.0,,
0,"""b (bare)""",0.0,,0.0,,
0,"""c (bare)""",0.0,,0.0,,1962.344482
0,"""d (bare)""",0.0,,0.0,,1962.344482
0,"""e (bare)""",0.0,-86.795395,0.0,-3.469604,1962.344482
0,"""f (bare)""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
…,…,…,…,…,…,…
2,"""a (mod)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""b (mod)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""c (mod)""",1.718171,-85.02166,0.06766,-3.296391,1816.008301


In [6]:
# Snapshots recorded during epoch 0 only (bare and module rows).
records_df.filter(pl.col("epoch") == 0)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
0,"""a (bare)""",0.0,,0.0,,
0,"""b (bare)""",0.0,,0.0,,
0,"""c (bare)""",0.0,,0.0,,1962.344482
0,"""d (bare)""",0.0,,0.0,,1962.344482
0,"""e (bare)""",0.0,-86.795395,0.0,-3.469604,1962.344482
0,"""f (bare)""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
0,"""a (mod)""",0.0,,0.0,,
0,"""b (mod)""",0.0,,0.0,,
0,"""c (mod)""",0.0,,0.0,,1962.344482
0,"""d (mod)""",0.0,,0.0,,1962.344482


In [7]:
# Snapshots recorded during epoch 1 only (bare and module rows).
records_df.filter(pl.col("epoch") == 1)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
1,"""a (bare)""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
1,"""b (bare)""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
1,"""c (bare)""",0.867954,-86.795395,0.034696,-3.469604,1887.662476
1,"""d (bare)""",0.867954,,0.034696,,1887.662476
1,"""e (bare)""",0.867954,-85.02166,0.034696,-3.296391,1887.662476
1,"""f (bare)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
1,"""a (mod)""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
1,"""b (mod)""",0.867954,-86.795395,0.034696,-3.469604,1962.344482
1,"""c (mod)""",0.867954,-86.795395,0.034696,-3.469604,1887.662476
1,"""d (mod)""",0.867954,,0.034696,,1887.662476


In [8]:
# Snapshots recorded during epoch 2 only (bare and module rows).
records_df.filter(pl.col("epoch") == 2)

epoch,step,w,w.grad,b,b.grad,loss
i64,str,f64,f64,f64,f64,f64
2,"""a (bare)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""b (bare)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""c (bare)""",1.718171,-85.02166,0.06766,-3.296391,1816.008301
2,"""d (bare)""",1.718171,,0.06766,,1816.008301
2,"""e (bare)""",1.718171,-83.284286,0.06766,-3.128763,1816.008301
2,"""f (bare)""",2.551013,-83.284286,0.098948,-3.128763,1816.008301
2,"""a (mod)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""b (mod)""",1.718171,-85.02166,0.06766,-3.296391,1887.662476
2,"""c (mod)""",1.718171,-85.02166,0.06766,-3.296391,1816.008301
2,"""d (mod)""",1.718171,,0.06766,,1816.008301
