<a href="https://colab.research.google.com/github/yichen-qi/LLM_Learn/blob/main/model_finetunning_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM模型微调，训练，测试，调用

## 依赖库

In [None]:
!pip install transformers datasets
import torch
from transformers import BertTokenizer, BertModel
from datasets import load_dataset

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


### 导入依赖所有库

In [3]:
from datasets import load_dataset
from torch.utils.data import Dataset
from transformers import BertModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from torch.optim import AdamW


## 制作Dataset

In [4]:
class CustomDataset(Dataset):
    def __init__(self, split):
        self.dataset = load_dataset("lansinuote/ChnSentiCorp")

        if split == 'train':
            self.data = self.dataset['train']
        elif split == 'validation':
            self.data = self.dataset['validation']
        elif split == 'test':
            self.data = self.dataset['test']
        else:
            raise ValueError('Invalid split')
    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        return self.data[item]['text'], self.data[item]['label']

if __name__ == '__main__':
    train_dataset = CustomDataset('train')
    print(len(train_dataset))
    print(train_dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


dataset_infos.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

(…)-00000-of-00001-02f200ca5f2a7868.parquet:   0%|          | 0.00/2.16M [00:00<?, ?B/s]

(…)-00000-of-00001-405befbaa3bcf1a2.parquet:   0%|          | 0.00/276k [00:00<?, ?B/s]

(…)-00000-of-00001-5372924f059fe767.parquet:   0%|          | 0.00/275k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9600 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1200 [00:00<?, ? examples/s]

9600
('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 1)


## 下游任务模型设计

### 加载预训练模型

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

bert_model = BertModel.from_pretrained("google-bert/bert-base-chinese")
bert_model.to(device)

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### 定义下游任务模型(将主干网络提取的特征进行分类)


In [6]:
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        #上游不参与训练
        with torch.no_grad():
            out = bert_model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)

        #下游参与训练

        out = self.fc(out.last_hidden_state[:,0])
        out = out.softmax(dim=1)
        return out

### 自定义模型微调

In [None]:
token = BertTokenizer.from_pretrained("google-bert/bert-base-chinese")
epochs = 100

#数据编码处理
def collate_fn(data):
    sentences = [item[0] for item in data]
    labels = [item[1] for item in data]

    data = token.batch_encode_plus(
        sentences,
        padding="max_length",
        max_length=350,
        truncation=True,
        return_tensors="pt",
        return_length=True
    )
    input_ids = data["input_ids"]
    attention_mask = data["attention_mask"]
    token_type_ids = data["token_type_ids"]
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels

train_dataset = CustomDataset('train')

train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = 32,
    shuffle = True,
    drop_last = True,
    collate_fn = collate_fn
)

print("Using device:", device)

if __name__ == '__main__':
    print(device)
    my_model = Model().to(device)

    optimizer = AdamW(my_model.parameters(), lr=5e-4)

    loss_func = nn.CrossEntropyLoss()

    my_model.train()

    for epoch in range(epochs):
        for step, (input_ids, attention_mask, token_type_ids, labels) in enumerate(train_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            out = my_model(input_ids, attention_mask, token_type_ids)

            loss = loss_func(out, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % 10 == 0:
                out = out.argmax(dim=1)
                acc = (out == labels).sum().item() / len(labels)
                print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}, Acc: {acc}")

        torch.save(my_model.state_dict(), f"params/{epoch}bert.pt")
        print(f"Epoch: {epoch}, Save model params")


## 模型性能测试


In [9]:
test_dataset = CustomDataset('test')

test_loader = DataLoader(
    dataset = test_dataset,
    batch_size = 32,
    shuffle = True,
    drop_last = True,
    collate_fn = collate_fn
)

if __name__ == '__main__':
    acc = 0
    total = 0

    test_model = Model().to(device)
    test_model.load_state_dict(torch.load('params/11bert.pt'))
    test_model.eval()

    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(test_loader):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)

        out = test_model(input_ids, attention_mask, token_type_ids)

        out = out.argmax(dim=1)
        acc += (out == labels).sum().item()
        total += len(labels)

    print("Test Accuracy: {:.4f}".format(acc/total))

Test Accuracy: 0.9037


## 模型调用

In [20]:
sentiment_model = Model().to(device)

names = ["负向评价", "正向评价"]

def collate_fn_sentiment(data):
    sentences = data


    data = token.batch_encode_plus(
        sentences,
        padding="max_length",
        max_length=350,
        truncation=True,
        return_tensors="pt",
        return_length=True
    )

    input_ids = data["input_ids"]
    attention_mask = data["attention_mask"]
    token_type_ids = data["token_type_ids"]
    return input_ids, attention_mask, token_type_ids


def prediction():
    sentiment_model.load_state_dict(torch.load('params/11bert.pt'))
    sentiment_model.eval()

    while True:
        sentence = input("请输入句子(输入q退出)：")
        if sentence == "q":
            print("退出")
            break

        input_ids, attention_mask, token_type_ids = collate_fn_sentiment([sentence])
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)

        with torch.no_grad():
            output = sentiment_model(input_ids, attention_mask, token_type_ids)
            output = output.argmax(dim=1)
            print("模型判定", names[output], "\n")

if __name__ == '__main__':
    prediction()


请输入句子(输入q退出)：<荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!
模型判定 正向评价 

请输入句子(输入q退出)：我爱你
模型判定 正向评价 

请输入句子(输入q退出)：我喜欢你
模型判定 负向评价 

请输入句子(输入q退出)：我爱你
模型判定 正向评价 

请输入句子(输入q退出)：我恨你
模型判定 负向评价 

请输入句子(输入q退出)：我爱这部电影
模型判定 正向评价 

请输入句子(输入q退出)：我讨厌你
模型判定 负向评价 

请输入句子(输入q退出)：q
退出


In [19]:
val_dataset = CustomDataset('validation')
val_dataset[1]

('<荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!',
 1)