In [1]:
# Cell 1: Import dependencies
import torch
from torch import nn, optim

In [2]:
# Cell 2: Prepare example data
# Four one-feature samples; each target is exactly twice its input (y = 2x).
X = torch.tensor([[0.0], [1.0], [2.0], [3.0]])
Y = torch.tensor([[0.0], [2.0], [4.0], [6.0]])

In [3]:
# Cell 3: Define the model factory
def linear_model():
	"""Return a newly initialised ``nn.Linear`` with one input and one output feature."""
	return nn.Linear(1, 1)

In [4]:
# Cell 4: Define the training function
def train(model, x, y, lr=0.1, epochs=200):
	"""Fit ``model`` to ``(x, y)`` with full-batch SGD on mean-squared error.

	Args:
		model: Module whose parameters are optimised in place.
		x: Input tensor passed to ``model``.
		y: Target tensor compared against the model output.
		lr: SGD learning rate.
		epochs: Number of gradient steps; every step uses the whole of ``x``.

	Returns:
		The same ``model`` object, after training.
	"""
	criterion = nn.MSELoss()
	sgd = optim.SGD(model.parameters(), lr=lr)
	for _step in range(epochs):
		# Clear gradients left over from the previous step before accumulating new ones.
		sgd.zero_grad()
		criterion(model(x), y).backward()
		sgd.step()
	return model

In [5]:
# Cell 5: Instantiate and train
# Build the 1-in/1-out linear layer, then fit it to (X, Y) using train()'s default lr and epoch count.
model = linear_model()
model = train(model, X, Y)

In [6]:
# Cell 6: Inference
# Run the forward pass with gradient tracking disabled, then show the predictions for the training inputs.
with torch.no_grad():
	preds = model(X)
print(preds)

tensor([[6.8747e-06],
        [2.0000e+00],
        [4.0000e+00],
        [6.0000e+00]])


In [None]:
#######################################

In [5]:
# Cell 1: Import dependencies and load data
import pandas as pd
import polars as pl
import torch
from torch import nn, optim
from dfply import diamonds

In [19]:
# Build a Polars DataFrame from a copy of the dfply `diamonds` dataset
# (presumably a pandas DataFrame - confirm against dfply).
df = pl.DataFrame(diamonds.copy())
# pandas alternative (unused): df = diamonds.copy()
df

carat,cut,color,clarity,depth,table,price,x,y,z
f64,str,str,str,f64,f64,i64,f64,f64,f64
0.23,"""Ideal""","""E""","""SI2""",61.5,55.0,326,3.95,3.98,2.43
0.21,"""Premium""","""E""","""SI1""",59.8,61.0,326,3.89,3.84,2.31
0.23,"""Good""","""E""","""VS1""",56.9,65.0,327,4.05,4.07,2.31
0.29,"""Premium""","""I""","""VS2""",62.4,58.0,334,4.2,4.23,2.63
0.31,"""Good""","""J""","""SI2""",63.3,58.0,335,4.34,4.35,2.75
…,…,…,…,…,…,…,…,…,…
0.72,"""Ideal""","""D""","""SI1""",60.8,57.0,2757,5.75,5.76,3.5
0.72,"""Good""","""D""","""SI1""",63.1,55.0,2757,5.69,5.75,3.61
0.7,"""Very Good""","""D""","""SI1""",62.8,60.0,2757,5.66,5.68,3.56
0.86,"""Premium""","""H""","""SI2""",61.0,58.0,2757,6.15,6.12,3.74


In [20]:
# Cell 2: Feature encoding and tensor conversion
# One-hot encode the categorical features.
# pandas alternative (unused): df_encoded = pd.get_dummies(df, columns=["cut","color","clarity"], drop_first=True)
# Note: unlike that pandas call (drop_first=True), this to_dummies() call keeps a column for every category level.
df_encoded = df.to_dummies(columns=["cut", "color", "clarity"])
df_encoded

carat,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,depth,table,price,x,y,z
f64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,f64,f64,i64,f64,f64,f64
0.23,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,61.5,55.0,326,3.95,3.98,2.43
0.21,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,59.8,61.0,326,3.89,3.84,2.31
0.23,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,56.9,65.0,327,4.05,4.07,2.31
0.29,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,62.4,58.0,334,4.2,4.23,2.63
0.31,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,63.3,58.0,335,4.34,4.35,2.75
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.72,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,60.8,57.0,2757,5.75,5.76,3.5
0.72,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,63.1,55.0,2757,5.69,5.75,3.61
0.7,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,62.8,60.0,2757,5.66,5.68,3.56
0.86,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,61.0,58.0,2757,6.15,6.12,3.74


In [24]:
# Select the input (feature) and output (target) columns.
# Features: three numeric columns followed by every one-hot column, in df_encoded's column order.
input_cols = ["carat", "depth", "table"]
input_cols += [
	name for name in df_encoded.columns
	if name.startswith(("cut_", "color_", "clarity_"))
]
output_cols = ["price", "x", "y", "z"]

# Materialise each NumPy array once and reuse it for both the preview and the tensor.
features_np = df_encoded.select(input_cols).to_numpy()
targets_np = df_encoded.select(output_cols).to_numpy()
print(features_np)
print(targets_np)

X = torch.tensor(features_np, dtype=torch.float32)
Y = torch.tensor(targets_np, dtype=torch.float32)

print(X)
print(Y)

[[ 0.23 61.5  55.   ...  0.    0.    0.  ]
 [ 0.21 59.8  61.   ...  0.    0.    0.  ]
 [ 0.23 56.9  65.   ...  0.    0.    0.  ]
 ...
 [ 0.7  62.8  60.   ...  0.    0.    0.  ]
 [ 0.86 61.   58.   ...  0.    0.    0.  ]
 [ 0.75 62.2  55.   ...  0.    0.    0.  ]]
[[3.260e+02 3.950e+00 3.980e+00 2.430e+00]
 [3.260e+02 3.890e+00 3.840e+00 2.310e+00]
 [3.270e+02 4.050e+00 4.070e+00 2.310e+00]
 ...
 [2.757e+03 5.660e+00 5.680e+00 3.560e+00]
 [2.757e+03 6.150e+00 6.120e+00 3.740e+00]
 [2.757e+03 5.830e+00 5.870e+00 3.640e+00]]
tensor([[ 0.2300, 61.5000, 55.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2100, 59.8000, 61.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2300, 56.9000, 65.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.7000, 62.8000, 60.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.8600, 61.0000, 58.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7500, 62.2000, 55.0000,  ...,  0.0000,  0.0000,  0.0000]])
tensor([[3.2600e+02, 3.9500e+00, 3.980

In [26]:
# Cell 3: Build the data loader
# Pair each row of X with the matching row of Y and serve them in shuffled mini-batches of 256.
dataset = torch.utils.data.TensorDataset(X, Y)
loader  = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)

# These prints only show the default object reprs (class name and memory address).
print(dataset)
print(loader)

<torch.utils.data.dataset.TensorDataset object at 0x7f32d72796c0>
<torch.utils.data.dataloader.DataLoader object at 0x7f32d7279660>


In [34]:
# Cell 4: Define the model
class DiamondRegressor(nn.Module):
	"""Two-layer perceptron (Linear -> ReLU -> Linear) for multi-output regression.

	Args:
		in_dim: Number of input features per sample.
		out_dim: Number of regression targets per sample.
		hidden_dim: Width of the hidden layer. Defaults to 64, the value that
			was previously hard-coded, so existing two-argument calls are unchanged.
	"""

	def __init__(self, in_dim, out_dim, hidden_dim=64):
		super().__init__()
		# Kept as one Sequential named ``net`` so state_dict keys stay ``net.0.*`` / ``net.2.*``.
		self.net = nn.Sequential(
			nn.Linear(in_dim, hidden_dim),
			nn.ReLU(),
			nn.Linear(hidden_dim, out_dim),
		)

	def forward(self, x):
		"""Apply the network to ``x`` and return its output."""
		return self.net(x)

# Size the network from the data: one input per column of X, one output per column of Y.
model = DiamondRegressor(X.shape[1], Y.shape[1])

In [35]:
# Cell 5: Training function
def train(model, loader, lr=1e-3, epochs=20, epoch_hint=False):
	"""Train ``model`` on mini-batches from ``loader`` with Adam and MSE loss.

	A new Adam optimizer is created on every call, so optimizer state does not
	carry over between successive calls on the same model.

	Args:
		model: Module whose parameters are updated in place.
		loader: Iterable yielding ``(inputs, targets)`` batches; it is iterated
			once per epoch.
		lr: Adam learning rate.
		epochs: Number of passes over ``loader``.
		epoch_hint: When truthy, print the mean of the per-batch losses after
			each epoch (``nan`` if the loader produced no batches).

	Returns:
		The same ``model`` object, after training.
	"""
	opt = optim.Adam(model.parameters(), lr=lr)
	loss_fn = nn.MSELoss()
	for epoch in range(epochs):
		total_loss = 0.0
		count = 0
		for xb, yb in loader:
			preds = model(xb)
			loss = loss_fn(preds, yb)
			opt.zero_grad()
			loss.backward()
			opt.step()
			total_loss += loss.item()
			count += 1
		# Truthiness test rather than ``is True`` so flags such as 1 or numpy bools also enable logging.
		if epoch_hint:
			# An empty loader yields no batches; report NaN instead of dividing by zero.
			mean_loss = total_loss / count if count else float("nan")
			print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")
	return model

# First training run: the default 20 epochs, printing the loss after each one.
model = train(model, loader, epoch_hint = True)

Epoch 1/20, Loss: 7601426.2227
Epoch 2/20, Loss: 6365256.3614
Epoch 3/20, Loss: 4855984.8827
Epoch 4/20, Loss: 4101397.4550
Epoch 5/20, Loss: 3939362.8045
Epoch 6/20, Loss: 3920747.0675
Epoch 7/20, Loss: 3910894.9396
Epoch 8/20, Loss: 3903147.7855
Epoch 9/20, Loss: 3892079.4360
Epoch 10/20, Loss: 3881204.7382
Epoch 11/20, Loss: 3868176.7180
Epoch 12/20, Loss: 3855031.9159
Epoch 13/20, Loss: 3840662.3175
Epoch 14/20, Loss: 3819131.1706
Epoch 15/20, Loss: 3801144.4206
Epoch 16/20, Loss: 3779823.8969
Epoch 17/20, Loss: 3757295.6576
Epoch 18/20, Loss: 3732965.6836
Epoch 19/20, Loss: 3706975.0616
Epoch 20/20, Loss: 3681913.0664


In [36]:
# Continue training the same model: 12 logged epochs, then 1024 more with no per-epoch output.
# Each train() call constructs a new Adam optimizer, so optimizer state restarts at every call.
model = train(model, loader, epoch_hint = True, epochs = 12)
model = train(model, loader, epochs = 1024)

Epoch 1/12, Loss: 3655039.8092
Epoch 2/12, Loss: 3629140.9017
Epoch 3/12, Loss: 3601229.6445
Epoch 4/12, Loss: 3574243.1611
Epoch 5/12, Loss: 3543397.4597
Epoch 6/12, Loss: 3513506.6943
Epoch 7/12, Loss: 3480259.5012
Epoch 8/12, Loss: 3446525.9538
Epoch 9/12, Loss: 3412928.2275
Epoch 10/12, Loss: 3376254.7701
Epoch 11/12, Loss: 3338831.2749
Epoch 12/12, Loss: 3301236.1659


In [37]:
# Seven more logged epochs, to inspect the loss reached after the preceding silent 1024-epoch run.
model = train(model, loader, epoch_hint = True, epochs = 7)

Epoch 1/7, Loss: 334661.9546
Epoch 2/7, Loss: 334475.4819
Epoch 3/7, Loss: 334504.0910
Epoch 4/7, Loss: 334492.6707
Epoch 5/7, Loss: 334636.6874
Epoch 6/7, Loss: 334397.7912
Epoch 7/7, Loss: 334429.2247


In [38]:
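# Cell 6: Single-row inference
# (The commented-out pandas draft directly below is superseded by the Polars predict_row defined after it.)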

# def predict_row(model, row_dict):
# 	df_row = pd.DataFrame([row_dict])
# 	df_row = pd.get_dummies(df_row, columns=["cut","color","clarity"], drop_first=True)
# 	# 对齐所有 one-hot 列
# 	for col in X.shape[1] - 6 and df.columns:
# 		if col not in df_row:
# 			df_row[col] = 0
# 	x = torch.tensor(df_row[X.shape[1]-X.shape[1]:].values, dtype=torch.float32)
# 	with torch.no_grad():
# 		y = model(x)
# 	return y.numpy()[0]

# Assumes input_cols is the column order used during training.
def predict_row(sample, model, input_cols):
	"""Predict the model output for a single raw sample.

	Args:
		sample: Mapping of raw feature name to value; expected to include the
			"cut", "color" and "clarity" fields that are one-hot encoded here.
		model: Module applied to the encoded feature row.
		input_cols: Feature column names, in the order the model expects them.

	Returns:
		NumPy array holding the model output for the one-row batch.
	"""
	# One-row Polars frame with the categorical fields expanded to one-hot columns.
	encoded = pl.DataFrame([sample]).to_dummies(columns=["cut", "color", "clarity"])
	# A single sample only yields dummy columns for its own categories; every
	# other expected column is added as a constant 0.
	absent = [name for name in input_cols if name not in encoded.columns]
	if absent:
		encoded = encoded.with_columns([pl.lit(0).alias(name) for name in absent])
	# Reorder to the expected layout (dropping any extra columns) and convert to float32.
	features = torch.tensor(encoded.select(input_cols).to_numpy(), dtype=torch.float32)
	with torch.no_grad():
		return model(features).numpy()

In [39]:
# Example usage
sample = {
	"carat":0.5, "cut":"Ideal", "color":"E", "clarity":"VS2",
	"depth":61.0, "table":55.0
}

# Call form of the earlier commented-out pandas draft (unused): print(predict_row(model, sample))
print(predict_row(sample, model, input_cols))

[[2122.8896       4.9385304    5.089703     3.0139039]]
