In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('csv', data_files="./data.csv", split='train')
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 3997
})

In [3]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3597
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 400
    })
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("../models/bert_base_chinese")

In [5]:
def process_function(examples):
    data = tokenizer(examples['text'], max_length=128, truncation=True)
    data['label'] = examples['label']
    return data

tokenizer_data = dataset.map(process_function, batched=True)
tokenizer_data

Map: 100%|██████████| 3597/3597 [00:00<00:00, 13252.26 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 10459.15 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3597
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [6]:
model = AutoModelForSequenceClassification.from_pretrained("../models/bert_base_chinese")
model

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/bert_base_chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "../models/bert_base_chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [8]:
import evaluate

acc_metric = evaluate.load('./metric_accuracy.py')
f1_metric = evaluate.load('./metric_f1.py')

In [9]:
def evaluate_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

In [10]:
train_args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    logging_steps=10,
    # eval_strategy="epoch",
    # save_strategy="epoch",
    # save_total_limit=3,
    learning_rate=2e-5,
    weight_decay=1e-2,
    metric_for_best_model="f1",
    # load_best_model_at_end=True
)
train_args

TrainingArguments(
_n_gpu=4,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=F

In [11]:
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model = model,
    args = train_args, 
    train_dataset = tokenizer_data['train'],
    eval_dataset = tokenizer_data['test'],
    data_collator = DataCollatorWithPadding(tokenizer),
    compute_metrics=evaluate_metric
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-12-19 12:15:45,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/buding666/miniconda3/envs/llm/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/buding666/miniconda3/envs/llm/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/buding666/miniconda3/envs/llm/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/buding666/miniconda3/envs/llm/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/buding666/miniconda3/envs/llm/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/buding666/miniconda3/envs/llm/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/buding

In [12]:
trainer.train()



Step,Training Loss
10,0.2888
20,0.0433
30,0.0115
40,0.0046
50,0.0025
60,0.0017
70,0.0014
80,0.0012


TrainOutput(global_step=87, training_loss=0.0409007235055511, metrics={'train_runtime': 30.7252, 'train_samples_per_second': 351.211, 'train_steps_per_second': 2.832, 'total_flos': 246704642169720.0, 'train_loss': 0.0409007235055511, 'epoch': 3.0})

In [13]:
trainer.evaluate()



{'eval_loss': 0.0007476668106392026,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 0.3353,
 'eval_samples_per_second': 1192.822,
 'eval_steps_per_second': 5.964,
 'epoch': 3.0}

In [14]:
trainer.predict(tokenizer_data['test'])



PredictionOutput(predictions=array([[-3.3225381,  3.7892299],
       [ 3.426923 , -3.838284 ],
       [ 3.3443522, -3.8460777],
       [-3.4059992,  3.8280907],
       [ 3.2926693, -3.6307192],
       [ 3.4668612, -3.832519 ],
       [-3.4214463,  3.8483446],
       [ 3.3797522, -3.833606 ],
       [-3.325368 ,  3.80441  ],
       [ 3.436747 , -3.8334434],
       [-3.3549585,  3.8502765],
       [-3.4060388,  3.7737503],
       [-3.3729205,  3.8642983],
       [ 3.4414515, -3.7449903],
       [-3.408424 ,  3.7874193],
       [-3.3775563,  3.8106518],
       [ 3.4125502, -3.777895 ],
       [ 3.4678688, -3.811379 ],
       [ 3.420768 , -3.8109813],
       [ 3.4353268, -3.8442965],
       [-3.3980489,  3.8315148],
       [-3.4347944,  3.7723746],
       [-3.4435806,  3.7428093],
       [ 3.4098465, -3.8519533],
       [ 3.4722242, -3.8243954],
       [ 3.4102666, -3.8242214],
       [ 3.4149966, -3.838958 ],
       [ 3.4845562, -3.7383256],
       [ 3.4344926, -3.8213367],
       [ 3.372

In [25]:
from transformers import pipeline

id2_label = {0: "文本理解", 1: "数据查询"}
model.config.id2_label = id2_label
pipe = pipeline("text-classification",model=model, tokenizer=tokenizer, device="cpu")

Device set to use cpu


In [21]:
pipe("股票在20200809的最高收盘价具体是多少？请保留X位小数？")

[{'label': 'LABEL_1', 'score': 0.9992446899414062}]

In [31]:
# 直接调用模型以获取原始输出
inputs = tokenizer("股票在20200809的最高收盘价具体是多少？请保留X位小数？", return_tensors="pt")
outputs = model(**inputs)

# 打印 logits 张量
print(outputs.logits)

tensor([[-3.3772,  3.8105]], grad_fn=<AddmmBackward0>)


In [37]:
import time

start = time.time()
pipe("股票在20200809的最高收盘价具体是多少？请保留X位小数？")
end = time.time() - start
end

0.03561735153198242