# 模型微调概念和流程

## 制作Dataset

In [17]:
from torch.utils.data import Dataset
from datasets import load_from_disk

class CustomDataset(Dataset):
    """Map-style dataset exposing one split of a dataset saved on disk.

    Each item is a ``(text, label)`` tuple taken from the ``'text'`` and
    ``'label'`` columns of the selected split.

    Args:
        split: Name of the split to expose; one of ``'train'``,
            ``'validation'`` or ``'test'``.
        data_dir: Directory passed to ``load_from_disk``. Defaults to the
            location this notebook originally hard-coded.

    Raises:
        ValueError: If ``split`` is not one of the supported names.
    """

    # Split names accepted by the constructor.
    SPLITS = ('train', 'validation', 'test')
    DEFAULT_DATA_DIR = '/home/yichen/LLM_Learn/data/lansinuote/chn_senti_corp_dataset'

    def __init__(self, split, data_dir=DEFAULT_DATA_DIR):
        # Validate before touching the disk so a typo in the split name fails
        # immediately instead of after the whole dataset has been loaded.
        if split not in self.SPLITS:
            raise ValueError('Invalid split')

        self.dataset = load_from_disk(data_dir)
        self.data = self.dataset[split]

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the ``(text, label)`` pair stored at index ``item``."""
        # Fetch the row once; indexing the underlying dataset twice would
        # materialise the same record two times.
        row = self.data[item]
        return row['text'], row['label']

if __name__ == '__main__':
    # Quick sanity check: load the training split, then report its size and
    # show the first (text, label) pair, one per line.
    train_dataset = CustomDataset('train')
    print(len(train_dataset), train_dataset[0], sep='\n')

9600
('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 1)


## 下游任务模型设计

### 加载预训练模型

查看模型的输入和输出尺寸
```bash
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    ........
(pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    ......
```
`
输入部分的词嵌入层的输入维度是21128，即BERT模型的词表大小，输出维度是768，即BERT模型的embedding size。

输出部分的pooler层的输入维度是768，输出维度是768，即BERT模型的embedding size。

`

In [32]:
from transformers import BertModel
import torch

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Local snapshot directory of the pretrained bert-base-chinese checkpoint.
model_dir = (
    "./model/google-bert/bert-base-chinese/"
    "models--google-bert--bert-base-chinese/"
    "snapshots/c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f"
)

# Load the pretrained backbone from that snapshot.
bert_model = BertModel.from_pretrained(model_dir)
# NOTE: the backbone is not moved to `device` here. To run on the GPU, call
# bert_model.to(device) and move every input tensor to the same device.




### 定义下游任务模型(将主干网络提取的特征进行分类)
`注意上游任务不参与训练`

```python
def forward(self, input_ids, attention_mask, token_type_ids):
```
```
参数名                   说明                       
----------------------  --------------------------  ------------------------------------------------------
input_ids                每个 token 的 ID，表示输入文本

attention_mask           标记哪些位置是有效的（1）或 padding（0）

token_type_ids           区分第一个句子和第二个句子（对于句对任 务），单句时全 0

```

In [34]:
class Model(torch.nn.Module):
    """Linear classification head on top of the module-level ``bert_model``.

    The BERT backbone is used as a fixed feature extractor: it is called under
    ``torch.no_grad()`` and is not registered as a sub-module, so only the
    ``fc`` layer has trainable parameters and appears in ``state_dict()``.
    """

    def __init__(self):
        super().__init__()
        # 768 matches the BERT hidden size; 2 is the number of output classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return un-normalized class scores (logits) for a batch.

        Args:
            input_ids: Token ids of the input text.
            attention_mask: 1 for real tokens, 0 for padding positions.
            token_type_ids: Sentence-segment ids (all 0 for single sentences).

        Returns:
            The output of ``fc`` applied to the first-token hidden state, with
            2 scores per sample. Apply ``softmax(dim=1)`` to obtain
            probabilities; ``argmax(dim=1)`` gives the predicted class.
        """
        # The upstream backbone does not take part in training.
        with torch.no_grad():
            out = bert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

        # Only the downstream head is trained. It reads the hidden state of
        # the first token of every sequence.
        first_token_state = out.last_hidden_state[:, 0]

        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so a softmax here would be applied twice during training and
        # flatten the gradients.
        return self.fc(first_token_state)


### 自定义模型微调

In [35]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from torch.optim import AdamW


# Use CUDA when PyTorch reports it as available, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Number of full passes over the training loader.
epochs = 100

# Local snapshot directory of the pretrained bert-base-chinese checkpoint.
model_dir = (
    "./model/google-bert/bert-base-chinese/"
    "models--google-bert--bert-base-chinese/"
    "snapshots/c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f"
)

# Tokenizer loaded from the same snapshot directory.
token = BertTokenizer.from_pretrained(model_dir)

# Batch encoding
def collate_fn(data):
    """Turn a list of ``(text, label)`` samples into padded model tensors.

    Every sentence is tokenized with the module-level ``token`` tokenizer,
    truncated or padded to exactly 350 tokens, and returned as PyTorch
    tensors.

    Args:
        data: Sequence of samples; element 0 of each sample is the text and
            element 1 is its integer label.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``.
    """
    texts = [sample[0] for sample in data]
    targets = [sample[1] for sample in data]

    encoded = token.batch_encode_plus(
        texts,
        padding="max_length",
        max_length=350,
        truncation=True,
        return_tensors="pt",
        return_length=True,
    )

    return (
        encoded["input_ids"],
        encoded["attention_mask"],
        encoded["token_type_ids"],
        torch.LongTensor(targets),
    )


# Training split wrapped as a map-style dataset.
train_dataset = CustomDataset('train')

# Shuffled batches of 32 samples; the last incomplete batch is dropped and
# each batch is tokenized by collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)

if __name__ == '__main__':
    import os

    print(device)

    # Keep the backbone, the classification head and every batch on the same
    # device; mixing CPU and GPU tensors raises a runtime error.
    bert_model.to(device)
    my_model = Model().to(device)

    optimizer = AdamW(my_model.parameters(), lr=5e-4)

    # NOTE: nn.CrossEntropyLoss applies log-softmax internally, so it expects
    # un-normalized logits from the model.
    loss_func = nn.CrossEntropyLoss()

    # torch.save does not create missing directories.
    os.makedirs("params", exist_ok=True)

    my_model.train()

    for epoch in range(epochs):
        for step, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            out = my_model(input_ids, attention_mask, token_type_ids)

            loss = loss_func(out, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Report loss and batch accuracy every 10 steps.
            if step % 10 == 0:
                preds = out.argmax(dim=1)
                acc = (preds == labels).sum().item() / len(labels)
                print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}, Acc: {acc}")

        # Save the trained head after every epoch. The original referenced an
        # undefined name `model` here, which raised NameError at the end of
        # the first epoch.
        torch.save(my_model.state_dict(), f"params/{epoch}bert.pt")
        print(f"Epoch: {epoch}, Save model params")



cpu
Epoch: 0, Step: 0, Loss: 0.6916692852973938, Acc: 0.5625
Epoch: 0, Step: 10, Loss: 0.6360597610473633, Acc: 0.8125
Epoch: 0, Step: 20, Loss: 0.6370005011558533, Acc: 0.6875


KeyboardInterrupt: 