In [1]:
from configuration import Config
from modeling import CasualLM
import torch
from torch.utils.data import TensorDataset, DataLoader

### 1 定义模型

In [2]:
# Hyperparameters of the small model used throughout this notebook.
# NOTE(review): field meanings are inferred from their names (Config is defined
# in configuration.py, not shown here) -- confirm against that module.
model_kwargs = dict(
    vocab_size=5000,
    num_hiddens=256,
    num_layers=12,
    num_heads=16,
    num_mlp_intermediate=1024,
    max_context_length=1024,
    dropout=0.1,
)
config = Config(**model_kwargs)

# Build the language model from the configuration above.
model = CasualLM(config)

### 2 测试推理

In [3]:
# Smoke-test a forward pass on a tiny batch of token ids.
# `x` has shape (2, 20): the sequence 0..9 tiled twice along each axis.
x = torch.arange(0, 10).repeat(2, 2)
print(x)

# Run the check in evaluation mode and without autograd: with dropout active the
# logits would differ on every run, and no gradient graph is needed for inference.
was_training = model.training
model.eval()
with torch.no_grad():
    y = model(x)
# Restore the previous mode so the training cells further down are unaffected.
model.train(was_training)

print(y.logits)

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
tensor([[[-0.1078,  1.1004,  0.5563,  ..., -0.2622, -0.3747, -0.2667],
         [ 0.3528, -0.1032, -0.6390,  ..., -0.3498,  0.4159, -0.2350],
         [ 0.4912, -0.6048,  0.0915,  ...,  0.0234, -0.5541,  0.1849],
         ...,
         [ 0.3970,  0.6454,  0.3539,  ...,  0.1974, -0.0288,  0.0264],
         [ 0.7619,  0.3111,  0.6169,  ...,  0.2740,  0.3187, -0.3959],
         [ 0.2494, -0.1373,  0.7362,  ..., -0.2177,  0.3799, -0.1806]],

        [[-1.0070,  0.9892,  0.3765,  ...,  0.0530, -0.1038, -0.4448],
         [-0.2819,  0.3956, -0.4799,  ...,  0.1168, -0.4340, -0.4608],
         [-0.3284,  0.4360,  0.1284,  ...,  0.0734, -0.1911,  0.1953],
         ...,
         [ 0.6058,  0.7129,  0.4267,  ...,  0.5368, -0.4061, -0.3195],
         [ 0.2839,  0.8878,  0.4793,  ...,  0.2028,  0.3346, -0.4606],
         [-0.0501,  0.2916,  0.6988,  ...,  0.16

### 3 打印参数量

In [4]:
# Total number of scalar entries over every tensor yielded by model.parameters().
total_params = sum(tensor.numel() for tensor in model.parameters())
print(total_params)

12030344


### 4 测试训练

#### 4.1 定义超参数

In [5]:
# Training hyperparameters: passes over the dataset and mini-batch size.
num_epochs, batch_size = 100, 4

# Plain SGD over all model parameters; `trainer` is the optimizer stepped by the
# training loop later in the notebook.
trainer = torch.optim.SGD(params=model.parameters(), lr=0.001)

#### 4.2 构造训练数据

In [6]:
# Synthetic dataset for the training smoke test: 100 random sequences of
# 15 token ids drawn uniformly from [0, 10).
# A dedicated seeded generator keeps the data and the shuffle order identical on
# every Restart & Run All, without touching the global RNG state.
data_rng = torch.Generator().manual_seed(0)
features = torch.randint(0, 10, (100, 15), generator=data_rng)
labels = torch.randint(0, 10, (100, 15), generator=data_rng)

# -100 is the default `ignore_index` of torch.nn.CrossEntropyLoss, so the first
# 10 positions of each sequence are excluded from the loss and only the last 5
# positions are supervised.
labels[:, :10] = -100

dataset = TensorDataset(features, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, generator=data_rng)

#### 4.3 定义损失

In [7]:
# Default cross-entropy: mean reduction, targets equal to -100 are ignored.
lossfn = torch.nn.CrossEntropyLoss()

# CrossEntropyLoss expects the class axis in position 1, so the last two axes of
# the logits are swapped before comparing with the (batch, seq) label tensor.
# The bare last expression displays the loss on the full dataset before training.
initial_logits = model(features).logits
lossfn(torch.transpose(initial_logits, 1, 2), labels)

tensor(8.7146, grad_fn=<NllLoss2DBackward0>)

#### 4.4 开始训练

In [8]:
# Train for `num_epochs` passes over the synthetic dataset, reporting the loss on
# the full dataset after every epoch.
for i in range(num_epochs):
    # Training mode: dropout is active for the optimisation steps.
    model.train()
    for feature, label in dataloader:
        # Logits are transposed so the class axis sits in position 1, as
        # CrossEntropyLoss requires for sequence targets.
        loss = lossfn(model(feature).logits.transpose(1, 2), label)
        trainer.zero_grad()
        loss.backward()
        trainer.step()

    # Evaluation mode: without this, dropout would randomly perturb the reported
    # loss and make epoch-to-epoch comparisons noisy.
    model.eval()
    with torch.no_grad():
        loss = lossfn(model(features).logits.transpose(1, 2), labels)
    print(f"第{i}个epoch：loss大小为{loss}")

# Leave the model in training mode, as it was before this cell ran.
model.train()

第0个epoch：loss大小为7.20601749420166
第1个epoch：loss大小为6.11973762512207
第2个epoch：loss大小为5.319315433502197
第3个epoch：loss大小为4.695573329925537
第4个epoch：loss大小为4.1675238609313965
第5个epoch：loss大小为3.748037815093994
第6个epoch：loss大小为3.4087507724761963
第7个epoch：loss大小为3.163405656814575
第8个epoch：loss大小为2.9779975414276123
第9个epoch：loss大小为2.837214469909668
第10个epoch：loss大小为2.7445359230041504
第11个epoch：loss大小为2.666536331176758
第12个epoch：loss大小为2.6032042503356934
第13个epoch：loss大小为2.565561056137085
第14个epoch：loss大小为2.5243537425994873
第15个epoch：loss大小为2.5040407180786133
第16个epoch：loss大小为2.4810259342193604
第17个epoch：loss大小为2.454805850982666
第18个epoch：loss大小为2.4362950325012207
第19个epoch：loss大小为2.4223792552948
第20个epoch：loss大小为2.4069271087646484
第21个epoch：loss大小为2.3958041667938232
第22个epoch：loss大小为2.398460626602173
第23个epoch：loss大小为2.38771653175354
第24个epoch：loss大小为2.3798112869262695
第25个epoch：loss大小为2.3744165897369385
第26个epoch：loss大小为2.361802339553833
第27个epoch：loss大小为2.362191677093506
第28个epoch：loss大小为2.355