<a href="https://colab.research.google.com/github/yuyu990116/transformers_tutorials/blob/main/P1_base_knowledge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#transformers及其相关的库：transformers,Tokenizer,Datasets,Evaluate(各种评价指标)
#PEFT（parameter-efficient fine-tuning）高效微调模型的库，比如LoRA，用少量的参数微调大模型
#Accelerate 分布式训练解决方法，包括大模型的加载与推理解决方法，有限的空间加载大的模型
#Optimum 优化加速库
#Gradio 可视化部署 几行代码实现基于web交互的算法演示系统

In [None]:
!pip install gradio
!pip install sentencepiece
import gradio as gr
from transformers import *

In [None]:
# Build an extractive question-answering pipeline, wrap it in a Gradio
# interface, and start the web demo.
qa_pipeline = pipeline("question-answering", model="uer/roberta-base-chinese-extractive-qa")
qa_demo = gr.Interface.from_pipeline(qa_pipeline)
qa_demo.launch()

In [None]:
# List every task name accepted by `pipeline(...)` together with its registry entry.
from transformers.pipelines import SUPPORTED_TASKS

for task_name, task_spec in SUPPORTED_TASKS.items():
    print(task_name, task_spec)

In [None]:
#pipeline调用,默认pipe.model.device是cpu
# model = xxx.from_pretrained
# tokenizer = xxx.from_pretrained
# device = 0 #这样会加载到gpu上
# pipe = pipeline("your aim",model = model,tokenizer = tokenizer,device = device)

In [None]:
# Load a sentiment classifier and its tokenizer from the same checkpoint,
# then combine them into a text-classification pipeline.
DIANPING_CHECKPOINT = "uer/roberta-base-finetuned-dianping-chinese"
model = AutoModelForSequenceClassification.from_pretrained(DIANPING_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(DIANPING_CHECKPOINT)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
# Display the pipeline object; its repr shows which concrete pipeline class was built.
pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7baf28fce740>

In [None]:
# Import the concrete class of the text-classification pipeline and print it.
from transformers import TextClassificationPipeline
print(TextClassificationPipeline)
# In an IDE, Ctrl+click into this class; per the original note, the arguments of
# its call function are the parameters you can tune when invoking the pipeline.

<class 'transformers.pipelines.text_classification.TextClassificationPipeline'>


In [None]:
# Measure the average latency of one pipeline call.
import time

import torch

N_RUNS = 100
# Only synchronize when CUDA exists; calling torch.cuda.synchronize() without a
# GPU raises an error.
use_cuda = torch.cuda.is_available()

times = []
for i in range(N_RUNS):
    if use_cuda:
        # Wait for pending GPU work so it is not charged to this run.
        torch.cuda.synchronize()
    start = time.perf_counter()
    pipe("我觉得不太行！")
    if use_cuda:
        # GPU kernels run asynchronously: wait for them before stopping the clock.
        torch.cuda.synchronize()
    end = time.perf_counter()
    times.append(end - start)
print(sum(times) / N_RUNS)

In [None]:
# Derive the predicted label directly from the raw model output.
# Build the inputs here so the cell does not depend on a variable that is only
# defined further down the notebook.
inputs = tokenizer("我觉得不太行！", return_tensors="pt").to(model.device)
with torch.inference_mode():  # pure inference: no gradients needed
    out = model(**inputs)
logits = out.logits                        # raw, unnormalized class scores
probs = torch.softmax(logits, dim=-1)      # scores -> probabilities per class
pred = torch.argmax(probs, dim=-1).item()  # index of the most likely class (single sentence)
res = model.config.id2label.get(pred)      # map the class index to its label name

In [None]:
# Tokenizers come in "fast" and "slow" variants; per the original note, fast is the default.
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer
# Per the original note: with a fast tokenizer you can pass return_offsets_mapping=True
# and combine it with inputs.word_ids(), e.g. inputs = tokenizer(sentence, return_offsets_mapping=True).
# Example from the original note: "dreaming" may be split into "dream" + "ing";
# both pieces then share the same word id.
# This lets a QA model map predicted token positions back to the start/end in the original text.

In [None]:
# Passing use_fast=False requests the slow tokenizer implementation.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

In [None]:
# Per the original note: loading chatglm-6b's tokenizer requires trust_remote_code=True
# because the tokenizer implementation is private to that model repository.
# NOTE(review): the code below loads "microsoft/phi-2" but saves it to a directory named
# "chatglm-6b_tokenizer" — confirm which checkpoint is actually intended.
# NOTE(review): trust_remote_code=True allows code shipped in the model repository to run
# locally; only use it with repositories you trust.
# NOTE(review): this rebinds the global `tokenizer` created in an earlier cell.
tokenizer=AutoTokenizer.from_pretrained("microsoft/phi-2",trust_remote_code=True)
tokenizer.save_pretrained("chatglm-6b_tokenizer")
tokenizer=AutoTokenizer.from_pretrained("chatglm-6b_tokenizer",trust_remote_code=True)

In [None]:
#编码器模型——自编码模型 bert 文本分类，命名实体识别，阅读理解
#解码器模型——自回归模型 gpt,bloom,llama 文本生成
#编码-解码器模型——seq to seq模型  bart,T5,glm,mBART 文本摘要，机器翻译
#model head:连接在模型后的层，将模型的编码的表示结果进行映射以解决不同类型的任务

In [None]:
'''
*Model：模型本身，只返回编码结果
*ForCausalLM:解码器模型
*ForMaskedLM:编码器模型 预测mask的token是什么
*ForSeq2SeqLM
*ForMultipleChoice
*ForQuestionAnswering
*ForSequenceClassification
*ForTokenClassification
'''

In [None]:
# Download a model repository from the Hugging Face Hub (two alternatives).
# NOTE(review): both commands clone into ./rbt3, so they look like alternatives
# rather than steps to run back to back — confirm before running both.
!git clone "https://huggingface.co/hfl/rbt3"  # per the original note: downloads everything, including the TF etc. weights
!git lfs clone "https://huggingface.co/hfl/rbt3" --include="*.bin" # per the original note: downloads only the PyTorch (*.bin) weights

In [None]:
# Inspect the configuration parameters available for a model checkpoint.
from transformers import AutoConfig
config = AutoConfig.from_pretrained("./rbt3/")
# Then type `config.` and pause briefly: auto-completion lists the available parameters.

In [None]:
from transformers import BertConfig #这里也是ctrl查看

In [None]:
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification

# Encode one sentence with the locally cloned rbt3 checkpoint.
RBT3_DIR = "rbt3"
sen = "弱小的我也有大梦想！"
tokenizer = AutoTokenizer.from_pretrained(RBT3_DIR)
inputs = tokenizer(sen, return_tensors="pt")

# Attach a 10-way classification head and confirm the label count stored in
# the config (per the original note, the default is 2).
clz_model = AutoModelForSequenceClassification.from_pretrained(RBT3_DIR, num_labels=10)
print(clz_model.config.num_labels)

# Forward pass; the returned output object is displayed as the cell result.
clz_model(**inputs)


In [None]:
# pooled_output是[CLS]位置经过pooler后的表示，size是batch*768（即batch_size*hidden_size）

In [None]:
# Splitting a dataset into train / validation parts (three alternatives).
# NOTE(review): neither `dataset` nor `datasets` is defined by an earlier cell in the
# visible notebook — presumably loaded beforehand; confirm before running.
from torch.utils.data import random_split

# Method 1: torch's random_split, here with fractional lengths (90% / 10%).
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])
len(trainset), len(validset)

# Method 2: take the "train" split and call train_test_split on it.
dataset = datasets["train"]
dataset.train_test_split(test_size=0.1)
# Method 3: same call with stratify_by_column.
# NOTE(review): if method 2 ran first, `dataset` is already the "train" split, so
# indexing it with "train" again looks unintended — confirm.
dataset = dataset["train"]
dataset.train_test_split(test_size=0.1, stratify_by_column="label") # per the original note: splits a classification dataset while keeping the label proportions

In [None]:
# Create the optimizer: Adam over all parameters of `model`, learning rate 2e-5.
from torch.optim import Adam
optimizer = Adam(model.parameters(), lr=2e-5)

In [None]:
# Training and validation
# Put the model on the GPU when one is available; otherwise keep it unchanged.
model = model.cuda() if torch.cuda.is_available() else model
def evaluate():
    """Compute the accuracy of the global ``model`` on the validation data.

    Reads the module-level names ``model``, ``validloader`` and ``validset``.
    The model is switched to eval mode and every batch is scored without
    gradient tracking.

    Returns:
        Number of correct predictions divided by ``len(validset)``. This is a
        0-dim tensor whenever the loader yields at least one batch.
    """
    model.eval()
    on_gpu = torch.cuda.is_available()
    correct = 0
    with torch.inference_mode():
        for batch in validloader:
            if on_gpu:
                # Move every tensor of the batch to the GPU.
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            predictions = model(**batch).logits.argmax(dim=-1)
            matches = predictions.long() == batch["labels"].long()
            correct = correct + matches.float().sum()
    return correct / len(validset)

def train(epoch=3, log_step=100):
    """Train the global ``model`` and print the validation accuracy after each epoch.

    Reads the module-level names ``model``, ``trainloader`` and ``optimizer``,
    and calls ``evaluate()`` at the end of every epoch.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The loss is printed whenever the global step counter is a
            multiple of this value.
    """
    on_gpu = torch.cuda.is_available()
    global_step = 0
    for ep in range(epoch):
        model.train()  # re-enable training mode; evaluate() switched it off
        for batch in trainloader:
            if on_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {loss.item()}")
            global_step += 1
        print(f"ep: {ep}, acc: {evaluate()}")

In [None]:
# Predict the sentiment of a single sentence with the fine-tuned model.
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
# Send the inputs to whichever device the model lives on. An unconditional
# .cuda() would crash on CPU-only machines.
device = next(model.parameters()).device
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)  # class index with the highest score
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

In [None]:
# Shorter way to predict: let a pipeline handle tokenization and label mapping.
import torch
from transformers import pipeline

# Make the pipeline report the human-readable labels defined in `id2_label`.
model.config.id2label = id2_label
# Use GPU 0 only when CUDA is available; -1 selects the CPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
pipe(sen)

In [None]:
# Loading a dataset from the Hub with the Datasets library.
from datasets import load_dataset

# Without `split`, load the whole dataset. It is indexed by split name below
# and by the following cell.
datasets = load_dataset("madao33/new-title-chinese")
# A single slice expression: the first half of the train split.
first_half = load_dataset("madao33/new-title-chinese", split="train[:50%]")
# A list of slice expressions: `dataset` becomes a list with one entry per slice.
dataset = load_dataset("madao33/new-title-chinese", split=["train[:50%]", "train[50%:]"])
datasets["train"].features
#{'title': Value(dtype='string', id=None),
#  'content': Value(dtype='string', id=None)}


In [None]:
# Filtering rows
# Variant 1: Dataset.filter keeps the rows whose title contains "中国".
filter_dataset = datasets["train"].filter(lambda example: "中国" in example["title"])
# Variant 2: Python's built-in filter over the rows of the same split. The rows
# of this dataset have 'title' and 'content' fields (there is no 'text' field),
# so the prefix test is applied to the title.
start_with_ar = list(filter(lambda x: x["title"].startswith("选择"), datasets["train"]))

In [None]:
# Mapping a function over the rows of a dataset
def f(data):
    """Prefix the ``text`` field of one row with ``'My sentence: '``.

    The row is modified in place and the same object is returned.
    """
    prefixed = 'My sentence: ' + data['text']
    data['text'] = prefixed
    return data
# Apply `f` through `dataset.map`, then show the first five values of the "text" column.
# NOTE(review): assumes `dataset` is a single Dataset with a "text" column — confirm
# against the cell that last assigned `dataset`.
datatset_map = dataset.map(f)
datatset_map['text'][:5]

In [None]:
# Saving a processed dataset and loading datasets from local files.
# These names are imported here because no earlier cell brings them into scope.
from datasets import Dataset, load_dataset, load_from_disk

# Round trip: write the mapped dataset to disk and read it back.
datatset_map.save_to_disk("./processed_data")
processed_datasets = load_from_disk("./processed_data")
# Several ways to load CSV data (each line rebinds `dataset`):
dataset = load_dataset("csv", data_files="./ChnSentiCorp_htl_all.csv", split="train")
dataset = Dataset.from_csv("./ChnSentiCorp_htl_all.csv")
# A list of files is loaded as one dataset.
dataset = load_dataset("csv", data_files=["./all_data/ChnSentiCorp_htl_all.csv", "./all_data/ChnSentiCorp_htl_all copy.csv"], split='train')
# Per the original note, `field` is required here — presumably the records are
# nested under the "data" key of the JSON file; confirm against the file.
dataset = load_dataset("json", data_files="./all_data/ChnSentiCorp_htl_all.json", field="data")
# A loading script can also be used, see: https://github.com/zyds/transformers-code/blob/master/01-Getting%20Started/05-datasets/datasets.ipynb

In [None]:
# Building a Dataset from data that is already loaded in another format.
import pandas as pd
from datasets import Dataset  # not imported by any earlier cell

# From a pandas DataFrame.
data = pd.read_csv("./ChnSentiCorp_htl_all.csv")
dataset = Dataset.from_pandas(data)

# From a list: every item must be a dict so that the field name is explicit.
data = [{"text": "abc"}, {"text": "def"}]
# data = ["abc", "def"]
Dataset.from_list(data)
# Dataset({
#     features: ['text'],
#     num_rows: 2
# })

In [None]:
!pip install datasets
from datasets import load_dataset

dataset = load_dataset(path='lansinuote/ChnSentiCorp')



In [None]:
# Display the object returned by load_dataset.
dataset

In [None]:
# Load the tokenizer of the "bert-base-chinese" checkpoint; process_function below uses it.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def process_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Uses the module-level ``tokenizer``. Texts are truncated and padded to
    exactly 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry and a ``"label"`` entry.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry. Note the key is
        ``labels`` (plural), copied from the ``label`` column.
    """
    encoded = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = examples["label"]
    return encoded
tokenized_dataset = dataset.map(process_function, batched=True, remove_columns=dataset["train"].column_names) # per the original note: batched=True speeds up encoding when the tokenizer has a fast implementation
# remove_columns=dataset["train"].column_names drops the original columns, leaving only input_ids, attention_mask, token_type_ids and labels

Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [None]:
def process_function(examples, tokenizer=tokenizer):
    """Tokenize examples without padding and attach their labels.

    This redefines the earlier ``process_function``. The tokenizer is bound as a
    default argument when the function is defined.

    Args:
        examples: Mapping with a ``"text"`` entry and a ``"label"`` entry.
        tokenizer: Callable used to encode the text; defaults to the
            module-level ``tokenizer``.

    Returns:
        The tokenizer output (truncated to at most 128 tokens) with an extra
        ``"labels"`` entry.
    """
    encoded = tokenizer(examples["text"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded
tokenized_dataset = dataset.map(process_function,num_proc=4,remove_columns=dataset["train"].column_names) # per the original note: use multiprocessing when the tokenizer has no fast implementation; num_proc is the number of worker processes, and the tokenizer must then be passed into process_function

In [None]:
# Display the tokenized dataset (splits, remaining columns and row counts).
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1200
    })
})

In [None]:
a=[{'role': 'user', 'content': '你好'}, {'role': 'assistant', 'content': '你好，我是安民智能医学助手，请问有什么问题需要我帮助你解答吗？'}, {'role': 'user', 'content': '我头疼'}, {'role': 'assistant', 'content': '头疼是一种常见的症状，可能由多种原因引起。首先，我们需要了解你的具体症状。你是否有其他不适感，比如恶心、呕吐、视力模糊等？'}, {'role': 'user', 'content': '有点恶心'}, {'role': 'assistant', 'content': '恶心可能是头疼的一个常见伴随症状。除了头疼和恶心，你还有其他不适感吗？比如头晕、乏力、视力模糊等？'}, {'role': 'user', 'content': '头晕'}, {'role': 'assistant', 'content': '头晕和头疼同时出现可能是一种颅内疾病的表现，比如颅内出血、脑血管疾病等。建议你尽快就医，进行详细的检查和评估。'}, {'role': 'user', 'content': '好的'}, {'role': 'assistant', 'content': '如果你有其他疑问或需要进一步帮助，请随时告诉我。我会尽力为你提供帮助。'}]
from datasets import Dataset

# Turn the list of chat-message dicts `a` into a Dataset and print it row by row.
d = Dataset.from_list(a)
for data in d:
    print(data)

In [None]:
# Data collator: pads every batch to the length of its longest sequence.
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Official collate_fn. Per the original note, it requires the dataset to contain
# only the four keys input_ids, token_type_ids, attention_mask and labels.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
# `tokenized_dataset` holds several splits (train/validation/test); a DataLoader
# needs one of them, not the container that maps split names to datasets.
dl = DataLoader(tokenized_dataset["train"], batch_size=16, collate_fn=collator, shuffle=True)

In [None]:
# Tour of the Evaluate library: load a metric, compute it in one shot,
# incrementally and per batch, combine several metrics, and plot a comparison.
!pip install evaluate
import evaluate
accuracy = evaluate.load("accuracy")
# Print the metric's description and the description of its expected inputs.
print(accuracy.description)
print(accuracy.inputs_description)
# One-shot computation over complete lists of references and predictions.
accuracy = evaluate.load("accuracy")
results = accuracy.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
print(results)
# Incremental computation: add one (reference, prediction) pair at a time, then compute.
for ref, pred in zip([0,1,0,1], [1,0,0,1]):
    accuracy.add(references=ref, predictions=pred)
print(accuracy.compute())
# Incremental computation per batch: add whole batches, then compute.
accuracy = evaluate.load("accuracy")
for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
    accuracy.add_batch(references=refs, predictions=preds)
print(accuracy.compute())
# Computing several metrics at once.
clf_metrics = evaluate.combine(["accuracy", "f1", "recall", "precision"])
clf_metrics
print(clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1]))
# {'accuracy': 0.6666666666666666,
#  'f1': 0.6666666666666666,
#  'recall': 0.5,
#  'precision': 1.0}
# Visual comparison of evaluation results across models.
from evaluate.visualization import radar_plot

# One dict of metric values per model, in the same order as `model_names`.
data = [
   {"accuracy": 0.99, "precision": 0.8, "f1": 0.95, "latency_in_seconds": 33.6},
   {"accuracy": 0.98, "precision": 0.87, "f1": 0.91, "latency_in_seconds": 11.2},
   {"accuracy": 0.98, "precision": 0.78, "f1": 0.88, "latency_in_seconds": 87.6},
   {"accuracy": 0.88, "precision": 0.78, "f1": 0.81, "latency_in_seconds": 101.6}
   ]
model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]
plot = radar_plot(data=data, model_names=model_names)

In [None]:
#huggingface的Tasks里面有很多类任务的模型等，里面也有任务对应可用的评价指标