In [1]:
from configuration import Config
from modeling import CasualLM
import torch
from torch.utils.data import TensorDataset, DataLoader

### 1 定义模型

In [2]:
config = Config(
    vocab_size=10,
    num_hiddens=8,
    num_layers=6,
    num_heads=4,
    num_mlp_intermediate=16,
    max_context_length=1024,
    dropout=0.1
)

model = CasualLM(config)

### 2 测试推理

In [4]:
x = torch.arange(0,10).repeat(2, 2)
print(x)
y = model(x)
print(y.logits)

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
tensor([[[ 0.4537,  0.6064,  0.1828,  0.8070,  0.1566, -0.4741,  1.0012,
           0.1089, -1.6774, -0.1002],
         [ 0.9177, -0.5914,  0.4403,  1.5636,  0.0064,  0.2381,  1.3150,
           1.0761, -1.8197, -0.9223],
         [ 0.6540, -0.1636, -0.1460,  1.2669, -0.2130,  0.4608,  1.6212,
           0.6160, -1.6032, -1.0312],
         [ 0.2613, -0.4499,  1.0814,  0.2962,  0.9806, -0.4047, -0.1170,
          -0.0600, -0.5498, -0.5024],
         [ 1.2181,  0.1510, -0.0955,  0.6422,  0.5491, -0.1185,  0.3183,
           0.2473, -1.5293, -0.8475],
         [ 1.0520,  0.0345, -0.3135,  0.4455,  0.1137, -0.1267,  0.9736,
           0.3378, -0.6104, -0.2732],
         [ 0.2827, -0.3446,  0.1079,  1.1703, -0.3649,  0.0671,  1.5342,
           0.6921, -1.6689, -0.7889],
         [-0.0284,  0.1585,  0.8709,  1.3353, -0.0749, -0.3744,  1.2216,
         

### 3 打印参数量

In [5]:
print(sum([param.nelement() for param in model.parameters()]))

3594


### 4 测试训练

#### 4.1 定义超参数

In [6]:
num_epochs = 100
batch_size = 4
trainer = torch.optim.SGD(model.parameters(), lr=1e-3)

#### 4.2 构造训练数据

In [7]:
features = torch.randint(0, 10, (100, 15))
labels = torch.randint(0, 10, (100, 15))
labels[:, :10] = -100
dataset = TensorDataset(features, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

#### 4.3 定义损失

In [8]:
lossfn = torch.nn.CrossEntropyLoss()
lossfn(model(features).logits.transpose(1, 2), labels)

tensor(2.6287, grad_fn=<NllLoss2DBackward0>)

#### 4.4 开始训练

In [10]:
for i in range(num_epochs):
    for feature, label in dataloader:
        loss = lossfn(model(feature).logits.transpose(1, 2), label)
        trainer.zero_grad()
        loss.backward()
        trainer.step()
    with torch.no_grad():
        loss = lossfn(model(features).logits.transpose(1, 2), labels)
        print(f"第{i}个epoch：loss大小为{loss}")

第0个epoch：loss大小为2.622328758239746
第1个epoch：loss大小为2.59209942817688
第2个epoch：loss大小为2.5784034729003906
第3个epoch：loss大小为2.5696887969970703
第4个epoch：loss大小为2.5630176067352295
第5个epoch：loss大小为2.5420303344726562
第6个epoch：loss大小为2.548342704772949
第7个epoch：loss大小为2.539367437362671
第8个epoch：loss大小为2.547559976577759
第9个epoch：loss大小为2.517329692840576
第10个epoch：loss大小为2.5099427700042725
第11个epoch：loss大小为2.488586902618408
第12个epoch：loss大小为2.487199306488037
第13个epoch：loss大小为2.4925119876861572
第14个epoch：loss大小为2.4728024005889893
第15个epoch：loss大小为2.475064277648926
第16个epoch：loss大小为2.4652397632598877
第17个epoch：loss大小为2.447746515274048
第18个epoch：loss大小为2.4462976455688477
第19个epoch：loss大小为2.44828724861145
第20个epoch：loss大小为2.458442211151123
第21个epoch：loss大小为2.449829339981079
第22个epoch：loss大小为2.4360058307647705
第23个epoch：loss大小为2.433671474456787
第24个epoch：loss大小为2.4426229000091553
第25个epoch：loss大小为2.4313805103302
第26个epoch：loss大小为2.4418718814849854
第27个epoch：loss大小为2.409180164337158
第28个epoch：loss大小为2.415