In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir('/content/drive/MyDrive/transformers/天池-入门NLP - 新闻文本分类')

In [None]:
#安装
!pip install transformers datasets

In [None]:
# 文件读取
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

train_df=pd.read_csv('./train_set.csv',sep='\t')
test_df=pd.read_csv('./test_a.csv', sep ='\t')
df=pd.concat((train_df,test_df))

In [None]:
#将3750/648/900改成标点符号，删除原text列，新增列重名为text列
import re
def replacepunc(x):
  x=re.sub('3750',",",x)
  x=re.sub('900',".",x)
  x=re.sub('648',"!",x)
  return x
df['words']=df['text'].map(lambda x: replacepunc(x))
df.drop('text',axis=1,inplace=True)
df.columns=['label','text']

#数据载入dataset，去除多余的列，只保留text列
data=Dataset.from_pandas(df).remove_columns(['label', '__index_level_0__'])
data

In [None]:
batch_size=1000
#all_texts=[data['text'][i:i+batch_size] for i in range(0,len(data),batch_size)]

def batch_iterator():
  for i in range(0,len(data),batch_size):
    yield data['text'][i:i+batch_size]

In [None]:
#初始化分词器、预分词器
from tokenizers import decoders,models,normalizers,pre_tokenizers,processors,trainers,Tokenizer

tokenizer=Tokenizer(models.WordPiece(unl_token="[UNK]"))

tokenizer.pre_tokenizer=pre_tokenizers.BertPreTokenizer()
special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer=trainers.WordPieceTrainer(vocab_size=7000,min_frequency=2,special_tokens=["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]"])
tokenizer.decoders=decoders.WordPiece(prefix="##")

In [None]:
#开始训练
tokenizer.train_from_iterator(batch_iterator(),trainer=trainer)

In [None]:
#进行分词后处理
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
mask_token_id = tokenizer.token_to_id("[MASK]")
pad_token_id = tokenizer.token_to_id("[PAD]")

tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]",cls_token_id),("[SEP]",sep_token_id),("[MASK]",mask_token_id)],
    )

tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(pad_token='[PAD]')


In [None]:
#测试分词结果
encoding = tokenizer.encode('2491 4109 1757 7539 648 3695 3038 4490 23 7019 3731 4109 3792 2465',' 2893 7212 5296 1667 3618 7044 1519 5413 1283 6122 4893 7495 2435 5510')
encoding.tokens

In [None]:
#保存模型并重新加载
#tokenizer.save("tokenizers.json")

from transformers import PreTrainedTokenizerFast
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizers.json",
   model_max_length=512,mask_token='[MASK]',pad_token='[PAD]',unk_token='[UNK]',
   cls_token='[CLS]',sep_token='[SEP]',padding_side='right',
   return_special_tokens_mask=True)
#PreTrainedTokenizerFast中一定要设置mask_token，pad_token等，不然报错

In [None]:
#data_collator是一个函数，负责获取样本并将它们批处理成张量
#在data_collator中可以确保每次以新的方式完成随机掩蔽。
from transformers import DataCollatorForLanguageModeling
data_collator=DataCollatorForLanguageModeling(tokenizer=fast_tokenizer,mlm=True,mlm_probability=0.15)

In [None]:
#加载训练中保存的模型，继续训练
from transformers import BertForMaskedLM
model = BertForMaskedLM.from_pretrained('./Test-Clm/checkpoint-18000')

In [None]:
#初始化bert模型
from transformers import BertConfig
config = BertConfig(
    vocab_size=7000,
    hidden_size=512,
    intermediate_size=4*512,
    max_position_embeddings=512,
    num_hidden_layers=4,
    num_attention_heads=4,
    type_vocab_size=2,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    initializer_range=0.02
)

from transformers import BertForMaskedLM
model = BertForMaskedLM(config=config)

In [None]:
#数据进行分词预处理，删除‘text'列，否则后面拼接的时候会报错。
tokenized_datasets=data.map(lambda examples:fast_tokenizer(examples['text']),batched=True).remove_columns("text")

#tokenized_datasets.save_to_disk('./tokenized_datasets')
#from datasets import load_from_disk
#tokenized_datasets=load_from_disk('./tokenized_datasets')
tokenized_datasets

  0%|          | 0/250 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'text', 'token_type_ids'],
    num_rows: 250000
})

In [None]:
"""def tokenize_function(examples):
    return fast_tokenizer (examples["text"])
tokenized_datasets = dataset.map(tokenize_function,batched=True,remove_columns=["text"])"""

In [None]:
block_size = 128
def group_texts(examples):
  # 拼接所有文本
  concatenated_examples={k:sum(examples[k],[]) for k in examples.keys()}
  total_length=len(concatenated_examples[list(examples.keys())[0]])
  # 我们将余数对应的部分去掉。但如果模型支持的话，可以添加padding，您可以根据需要定制此部件。
  total_length = (total_length//block_size)*block_size
  # 通过max_len进行分割。
  result={
      k:[t[i:i+block_size] for i in range(0,total_length,block_size)]
      for k,t in concatenated_examples.items()
  }
  result["labels"]=result["input_ids"].copy()
  return result

lm_datasets=tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
#加载和保存拼接后的文本
#lm_datasets.save_to_disk('./lm_datasets')

import pandas as pd
from datasets import load_from_disk
lm_datasets=load_from_disk('./lm_datasets')

In [None]:
lm_datasets

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
    num_rows: 1779535
})

In [None]:
import os
assert os.environ['COLAB_TPU_ADDR']
#'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

Collecting torch-xla==1.9
  Downloading https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl (149.9 MB)
[K     |████████████████████████████████| 149.9 MB 22 kB/s 
[?25hCollecting cloud-tpu-client==0.10
  Downloading cloud_tpu_client-0.10-py3-none-any.whl (7.4 kB)
Collecting google-api-python-client==1.8.0
  Downloading google_api_python_client-1.8.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 2.2 MB/s 
Installing collected packages: google-api-python-client, torch-xla, cloud-tpu-client
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.8
    Uninstalling google-api-python-client-1.12.8:
      Successfully uninstalled google-api-python-client-1.12.8
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
earthengine-api 0.1.284 requir

In [None]:
import torch_xla.core.xla_model as xm
device = xm.xla_device()
model.to(device)

In [None]:
#使用GPU训练
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    "pre-mlm",
    logging_strategy="steps",
    logging_steps=3000,
    save_strategy="steps",
    save_steps=9000,
    num_train_epochs=5,
    learning_rate=4e-4,
    per_device_train_batch_size=128,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets,
    data_collator=data_collator)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 1779535
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 69515


Step,Training Loss
3000,4.056
6000,2.6248
9000,2.4071
12000,2.2838
15000,2.201
18000,2.1362
21000,2.0843
24000,2.0385
27000,2.0013
30000,1.9653


Saving model checkpoint to pre-mlm/checkpoint-9000
Configuration saved in pre-mlm/checkpoint-9000/config.json
Model weights saved in pre-mlm/checkpoint-9000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-18000
Configuration saved in pre-mlm/checkpoint-18000/config.json
Model weights saved in pre-mlm/checkpoint-18000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-27000
Configuration saved in pre-mlm/checkpoint-27000/config.json
Model weights saved in pre-mlm/checkpoint-27000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-36000
Configuration saved in pre-mlm/checkpoint-36000/config.json
Model weights saved in pre-mlm/checkpoint-36000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-45000
Configuration saved in pre-mlm/checkpoint-45000/config.json
Model weights saved in pre-mlm/checkpoint-45000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-54000
Configuration saved in pre-mlm/checkpoint-54000/config.json
M

TrainOutput(global_step=69515, training_loss=2.0477235787619468, metrics={'train_runtime': 16620.215, 'train_samples_per_second': 535.353, 'train_steps_per_second': 4.183, 'total_flos': 8.8070684806656e+16, 'train_loss': 2.0477235787619468, 'epoch': 5.0})

In [None]:
#保存模型
trainer.save_model("./Pre_Bert")

Saving model checkpoint to ./Pre_Bert
Configuration saved in ./Pre_Bert/config.json
Model weights saved in ./Pre_Bert/pytorch_model.bin


In [None]:
#准备进行下游任务微调
from datasets import load_metric
metric=load_metric("accuracy")

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [None]:
#加载训练好的预训练模型
from transformers import AutoModelForSequenceClassification
model=AutoModelForSequenceClassification.from_pretrained("./Pre_Bert",num_labels=14)

In [None]:
#GPU训练
import torch
from transformers import  TrainingArguments, Trainer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
#TPU训练
import torch_xla.core.xla_model as xm
device = xm.xla_device()
model.to(device)

In [None]:
import numpy as np
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  
  return metric.compute(predictions=predictions, references=labels)

In [None]:
#加载数据
from datasets import Dataset
train_df=pd.read_csv('./train_set.csv',sep='\t')


#准备将text文本首尾截断，各取255tokens
def slipt2(x):
  ls=x.split(' ')
  le=len(ls)
  if le<511:
    return x
  else:
    return ' '.join(ls[:255]+ls[-255:])
    
train_df
#small_df['summary']=small_df['text'].apply(lambda x:slipt2(x))

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...
...,...,...
199995,2,307 4894 7539 4853 5330 648 6038 4409 3764 603...
199996,2,3792 2983 355 1070 4464 5050 6298 3782 3130 68...
199997,11,6811 1580 7539 1252 1899 5139 1386 3870 4124 1...
199998,2,6405 3203 6644 983 794 1913 1678 5736 1397 191...


In [None]:
#进行五折交叉的数据验证，设k=0看看数据处理的结果
k=0
val_df=train_df.iloc[k*40000:(k+1)*40000, ]
trains_df=pd.concat([train_df.iloc[:k*40000, ],train_df.iloc[(k+1)*40000:, ]])
val_df['summary']=val_df['text'].apply(lambda x:slipt2(x))
trains_df['summary']=trains_df['text'].apply(lambda x:slipt2(x))
val_df['len']=val_df['summary'].apply(lambda x:len(x.split(' ')))

#加载到dataset并预处理
trains_ds=Dataset.from_pandas(trains_df).remove_columns("text").shuffle(seed=42)
val_ds=Dataset.from_pandas(val_df).remove_columns("text").shuffle(seed=42)
print(val_ds['len'])
print(trains_ds)

[510, 510, 510, 510, 510, 409, 510, 510, 462, 490, 510, 510, 358, 240, 510, 510, 510, 510, 510, 510, 510, 510, 510, 510, 476, 510, 362, 510, 510, 208, 421, 510, 52, 510, 375, 199, 510, 510, 478, 510, 381, 318, 267, 510, 395, 188, 510, 255, 510, 510, 254, 510, 510, 479, 510, 510, 413, 490, 490, 510, 75, 137, 450, 488, 510, 481, 510, 510, 510, 510, 510, 473, 510, 510, 510, 510, 510, 240, 150, 432, 510, 163, 296, 510, 186, 510, 510, 510, 172, 510, 510, 510, 510, 510, 510, 510, 462, 117, 241, 510, 506, 510, 467, 399, 510, 510, 245, 485, 344, 510, 510, 510, 510, 510, 381, 510, 507, 510, 405, 510, 225, 510, 510, 280, 510, 510, 174, 510, 510, 281, 503, 510, 330, 335, 510, 510, 510, 445, 510, 510, 510, 286, 510, 504, 510, 510, 510, 510, 488, 510, 510, 510, 63, 167, 188, 510, 510, 440, 510, 510, 505, 510, 510, 510, 510, 432, 510, 437, 499, 510, 510, 510, 410, 510, 510, 510, 510, 510, 137, 510, 510, 510, 510, 338, 510, 510, 510, 510, 510, 510, 510, 510, 127, 510, 352, 510, 219, 510, 510, 510, 51

In [None]:
#五折交叉验证
for k in range(5):
  val_df=train_df.iloc[k*40000:(k+1)*40000, ]
  trains_df=pd.concat([train_df.iloc[:k*40000, ],train_df.iloc[(k+1)*40000:, ]])

  val_df['summary']=val_df['text'].apply(lambda x:slipt2(x))
  trains_df['summary']=trains_df['text'].apply(lambda x:slipt2(x))

#加载到dataset并预处理
  trains_ds=Dataset.from_pandas(trains_df).remove_columns("text").shuffle(seed=42)
  val_ds=Dataset.from_pandas(val_df).remove_columns("text").shuffle(seed=42)
 
  tokenized_trains_ds=trains_ds.map(lambda examples:fast_tokenizer(examples['summary'],truncation=True,padding=True),batched=True)
  tokenized_val_ds=val_ds.map(lambda examples:fast_tokenizer(examples['summary'],truncation=True,padding=True),batched=True)

  path=("news-cla"+str(k))

  #进行任务微调
  args=TrainingArguments(
    output_dir=path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    num_train_epochs=1,
    weight_decay=0.01,)
  
  trainer=Trainer(
    model,
    args,
    train_dataset=tokenized_trains_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=fast_tokenizer,
    compute_metrics=compute_metrics)
  trainer.train()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running training *****
  Num examples = 160000
  Num Epochs = 1
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 1667


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2032,0.182172,0.945925


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 96
Saving model checkpoint to news-cla0/checkpoint-1667
Configuration saved in news-cla0/checkpoint-1667/config.json
Model weights saved in news-cla0/checkpoint-1667/pytorch_model.bin
tokenizer config file saved in news-cla0/checkpoint-1667/tokenizer_config.json
Special tokens file saved in news-cla0/checkpoint-1667/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary, __index_level_0__.
***** Running training *****
  Num examples = 160000
  Num Epochs = 1
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 1667


Epoch,Training Loss,Validation Loss,Accuracy
1,0.167,0.146571,0.95525


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 96
Saving model checkpoint to news-cla1/checkpoint-1667
Configuration saved in news-cla1/checkpoint-1667/config.json
Model weights saved in news-cla1/checkpoint-1667/pytorch_model.bin
tokenizer config file saved in news-cla1/checkpoint-1667/tokenizer_config.json
Special tokens file saved in news-cla1/checkpoint-1667/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary, __index_level_0__.
***** Running training *****
  Num examples = 160000
  Num Epochs = 1
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 1667


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 96


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1507,0.129751,0.960875


Saving model checkpoint to news-cla2/checkpoint-1667
Configuration saved in news-cla2/checkpoint-1667/config.json
Model weights saved in news-cla2/checkpoint-1667/pytorch_model.bin
tokenizer config file saved in news-cla2/checkpoint-1667/tokenizer_config.json
Special tokens file saved in news-cla2/checkpoint-1667/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary, __index_level_0__.
***** Running training *****
  Num examples = 160000
  Num Epochs = 1
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 1667


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1372,0.117883,0.9642


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 96
Saving model checkpoint to news-cla3/checkpoint-1667
Configuration saved in news-cla3/checkpoint-1667/config.json
Model weights saved in news-cla3/checkpoint-1667/pytorch_model.bin
tokenizer config file saved in news-cla3/checkpoint-1667/tokenizer_config.json
Special tokens file saved in news-cla3/checkpoint-1667/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running training *****
  Num examples = 160000
  Num Epochs = 1
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 1667


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1275,0.102261,0.9684


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 96
Saving model checkpoint to news-cla4/checkpoint-1667
Configuration saved in news-cla4/checkpoint-1667/config.json
Model weights saved in news-cla4/checkpoint-1667/pytorch_model.bin
tokenizer config file saved in news-cla4/checkpoint-1667/tokenizer_config.json
Special tokens file saved in news-cla4/checkpoint-1667/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
trainer.save_model("./EsperBERTo")

Saving model checkpoint to ./EsperBERTo
Configuration saved in ./EsperBERTo/config.json
Model weights saved in ./EsperBERTo/pytorch_model.bin
tokenizer config file saved in ./EsperBERTo/tokenizer_config.json
Special tokens file saved in ./EsperBERTo/special_tokens_map.json


In [None]:
trainer.evaluate()

In [None]:
#读取测试集
from datasets import load_dataset
dataset=load_dataset('csv',data_files='./test_a.csv')

Using custom data configuration default-e11314c8484ef22e


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e11314c8484ef22e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e11314c8484ef22e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#数据预处理
def tokenize_function(examples):
    return fast_tokenizer (examples["text"],truncation=True,padding=True)
tokenized_datasets=dataset.map(tokenize_function,batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'text', 'token_type_ids'],
        num_rows: 50000
    })
})

In [None]:
predictions,metrics,loss=trainer.predict(tokenized_datasets['train'],metric_key_prefix="predict")
predictions

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 50000
  Batch size = 96


array([[ 1.8426243 ,  8.174837  , -1.2704638 , ..., -1.320911  ,
        -1.929365  , -2.486765  ],
       [-0.75752604, -1.2208161 ,  9.0089445 , ..., -1.2063974 ,
         0.7124819 , -1.1222411 ],
       [ 0.6969581 , -0.6338869 , -0.73178047, ...,  0.06477395,
        -1.4471081 , -2.1442966 ],
       ...,
       [-0.39861476,  6.7122326 , -1.4422836 , ..., -1.4204785 ,
        -1.1205809 , -2.2244375 ],
       [-0.83510345, -0.96118903,  0.43861002, ..., -0.11299364,
        -1.0912285 , -2.1063776 ],
       [ 0.75804245,  8.388309  , -1.4211463 , ..., -1.3522512 ,
        -1.7220726 , -2.6125865 ]], dtype=float32)

In [None]:
loss

{'predict_runtime': 103.1959,
 'predict_samples_per_second': 484.515,
 'predict_steps_per_second': 5.049}

In [None]:
pred=np.argmax(predictions,axis=1)
pred

array([1, 2, 8, ..., 1, 3, 1])

In [None]:
pd.DataFrame({'label':pred}).to_csv('submit1015_3.csv',index=None)