In [None]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Make the project folder the working directory; the relative paths used below resolve against it.
os.chdir('/content/drive/MyDrive/transformers/天池-入门NLP - 新闻文本分类')

In [None]:
# Install the Hugging Face libraries used by this notebook.
!pip install transformers datasets

In [None]:
# 文件读取
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

# Read the tab-separated train and test files and stack them (train rows first) into one frame.
# The original row indices are kept, so index values repeat across the two parts.
train_df=pd.read_csv('./train_set.csv',sep='\t')
test_df=pd.read_csv('./test_a.csv', sep ='\t')
df=pd.concat((train_df,test_df))

In [None]:
#将3750/648/900改成标点符号，删除原text列，新增列重名为text列
import re
def replacepunc(x):
  """Replace the three punctuation placeholder tokens in a space-separated text.

  The whole tokens ``3750``, ``900`` and ``648`` become ``,``, ``.`` and ``!``
  respectively; every other token is left untouched.

  Args:
    x: Text to process.

  Returns:
    The text with the placeholder tokens replaced.
  """
  punctuation = {'3750': ',', '900': '.', '648': '!'}
  # \b anchors restrict the match to whole tokens. A plain substring
  # substitution would also rewrite parts of longer tokens (e.g. '1900' -> '1.').
  return re.sub(r'\b(?:3750|900|648)\b', lambda m: punctuation[m.group(0)], x)

# NOTE(review): the statements below are wrapped in a string literal, so they do
# not execute. `data`, which later cells read, is only defined if this code is
# re-enabled -- confirm whether it should run.
"""
df['words']=df['text'].map(lambda x: replacepunc(x))
df.drop('text',axis=1,inplace=True)
df.columns=['label','text']

#数据载入dataset，去除多余的列，只保留text列
data=Dataset.from_pandas(df).remove_columns(['label', '__index_level_0__'])
data"""

"\ndf['words']=df['text'].map(lambda x: replacepunc(x))\ndf.drop('text',axis=1,inplace=True)\ndf.columns=['label','text']\n\n#数据载入dataset，去除多余的列，只保留text列\ndata=Dataset.from_pandas(df).remove_columns(['label', '__index_level_0__'])\ndata"

In [None]:
batch_size = 1000

def batch_iterator(dataset=None, size=None):
  """Yield the 'text' column of a dataset in consecutive batches.

  Args:
    dataset: Object supporting ``len()`` and slice indexing that returns a
      mapping with a 'text' entry. Defaults to the module-level ``data``.
    size: Number of rows per batch. Defaults to the module-level ``batch_size``.

  Yields:
    The 'text' values of each successive slice of ``size`` rows.
  """
  dataset = data if dataset is None else dataset
  size = batch_size if size is None else size
  for start in range(0, len(dataset), size):
    # Slice the rows first, then pick the column: indexing the column first
    # would load the entire 'text' column again for every batch.
    yield dataset[start:start + size]['text']

In [None]:
#初始化分词器、预分词器
from tokenizers import decoders,models,normalizers,pre_tokenizers,processors,trainers,Tokenizer

# WordPiece model; the keyword is `unk_token` (the original `unl_token` was a typo).
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

# Same order as the original trainer call, so the special tokens keep the ids
# they had before; the list is now defined once and actually used.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=7000, min_frequency=2, special_tokens=special_tokens)

# The attribute is `decoder`; assigning to `decoders` never configured the decoder.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
# Train the tokenizer on the text batches produced by batch_iterator().
tokenizer.train_from_iterator(batch_iterator(),trainer=trainer)

In [None]:
#进行分词后处理
# Look up the ids the trained vocabulary assigned to the special tokens.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
mask_token_id = tokenizer.token_to_id("[MASK]")
pad_token_id = tokenizer.token_to_id("[PAD]")

# Wrap single sequences as "[CLS] A [SEP]" and pairs as "[CLS] A [SEP] B [SEP]",
# with type id 0 for the first segment and 1 for the second.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id), ("[MASK]", mask_token_id)],
)

tokenizer.enable_truncation(max_length=512)
# Pass the real [PAD] id explicitly: without pad_id the padding positions get
# the default id 0, which is not the id looked up for "[PAD]" above.
tokenizer.enable_padding(pad_id=pad_token_id, pad_token='[PAD]')


In [None]:
# Sanity-check the tokenizer by encoding a sentence pair and showing the resulting tokens.
encoding = tokenizer.encode('2491 4109 1757 7539 648 3695 3038 4490 23 7019 3731 4109 3792 2465',' 2893 7212 5296 1667 3618 7044 1519 5413 1283 6122 4893 7495 2435 5510')
encoding.tokens

In [None]:
# Save the tokenizer and reload it as a transformers fast tokenizer.
# NOTE(review): the save call is commented out, so "tokenizers.json" must already
# exist in the working directory for the load below to succeed.
#tokenizer.save("tokenizers.json")

from transformers import PreTrainedTokenizerFast
# NOTE(review): return_special_tokens_mask is usually a call-time argument --
# confirm that passing it to the constructor has any effect.
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizers.json",
   model_max_length=512,mask_token='[MASK]',pad_token='[PAD]',unk_token='[UNK]',
   cls_token='[CLS]',sep_token='[SEP]',padding_side='right',
   return_special_tokens_mask=True)
# mask_token, pad_token, etc. must be set explicitly on PreTrainedTokenizerFast; otherwise an error is raised.

In [None]:
# data_collator is a callable that takes samples and batches them into tensors.
# Doing the masking in the collator ensures the random masking is redone in a new way each time.
from transformers import DataCollatorForLanguageModeling
# Masked-language-modelling collator using a masking probability of 0.15.
data_collator=DataCollatorForLanguageModeling(tokenizer=fast_tokenizer,mlm=True,mlm_probability=0.15)

In [None]:
# Load the model saved during training in order to continue training it.
# The directory name now matches the "./pre_Bert" path this notebook passes to
# trainer.save_model(); the original "./Pre_Bert" differed in letter case.
from transformers import BertForMaskedLM
model = BertForMaskedLM.from_pretrained("./pre_Bert")

In [None]:
# Initialise a BERT masked-LM from a fresh configuration (not from saved weights).
# Running this cell rebinds `model`, replacing any model assigned earlier.
from transformers import BertConfig
from transformers import BertForMaskedLM

config = BertConfig(
    vocab_size=7000,
    max_position_embeddings=512,
    type_vocab_size=2,
    hidden_size=512,
    intermediate_size=2048,  # 4 * hidden_size
    num_hidden_layers=4,
    num_attention_heads=4,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    initializer_range=0.02,
)

model = BertForMaskedLM(config=config)

In [None]:
# Tokenize the dataset in batches, then drop the raw 'text' column; leaving it
# in would make the later concatenation step fail.
tokenized_datasets = data.map(
    lambda examples: fast_tokenizer(examples['text']),
    batched=True,
)
tokenized_datasets = tokenized_datasets.remove_columns("text")
tokenized_datasets

  0%|          | 0/250 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'text', 'token_type_ids'],
    num_rows: 250000
})

In [None]:
block_size = 128
def group_texts(examples):
  """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

  Args:
    examples: Mapping from column name to a list of per-example lists; it must
      contain an "input_ids" column.

  Returns:
    A dict with the same columns, each split into consecutive chunks of
    ``block_size`` items, plus a "labels" column that is a copy of the
    "input_ids" chunk list. Items beyond the last full block are dropped.
  """
  # Flatten every column in a single pass. sum(lists, []) re-copies the
  # accumulated list on each addition and is quadratic in the batch length.
  concatenated_examples = {
      k: [item for seq in examples[k] for item in seq] for k in examples.keys()
  }
  total_length = len(concatenated_examples[list(examples.keys())[0]])
  # Drop the remainder so that every block has exactly block_size items.
  total_length = (total_length // block_size) * block_size
  result = {
      k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
      for k, t in concatenated_examples.items()
  }
  result["labels"] = result["input_ids"].copy()
  return result

# Apply group_texts batch-wise (1000 examples per call) with num_proc=4.
lm_datasets=tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
# Save / load the grouped (concatenated) dataset; the save call is commented out.
#lm_datasets.save_to_disk('./lm_datasets')

import pandas as pd
from datasets import load_from_disk
# Rebinds lm_datasets to the copy stored under ./lm_datasets.
lm_datasets=load_from_disk('./lm_datasets')

In [None]:
# Decode the first example of the tokenizer-preprocessed lm_datasets; the decoded text contains punctuation marks.
la=fast_tokenizer.decode(lm_datasets[0]['input_ids'])
la

'[CLS] 2967 6758 339 2021 1854 3731 4109 3792 4149 1519 2058 3912 2465 2410 1219 6654 7539 264 2456 4811 1292 2109 6905 5520 7058 6045 3634 6591 3530 6508 2465 7044 1519 3659 2073, 3731 4109 3792 6831 2614 3370 4269 3370 486 5770 4109 4125, 5445 2466 6831 6758 3743 3630 1726 2313 5906 826 4516 657. 1871 7044, 2967 3731 1757 1939! 2828 4704 7039 3706, 965 2490 7399 3743 2145 2407 7451 3775 6017 5998 1641 299 4704 2621 7029 3056 6333 433! 1667 1099. 2289 1099! 5780 220 7044 1279 7426 4269, 2967 6758 6631 3099 2205 7305 2620 5977, 3329 1793 6666 2042 3193 4149 1519 7039 3706 2446 5399'

In [None]:
import os
# .get() lets a missing variable reach the assertion message below instead of
# failing earlier with a bare KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

# Install the TPU client and the torch_xla 1.9 wheel (the wheel name targets CPython 3.7 on linux x86_64).
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

In [None]:
# Move the model to the TPU (XLA device).
import torch_xla.core.xla_model as xm
device = xm.xla_device()
model.to(device)

In [None]:
#使用GPU训练
import torch
# Prefer CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from transformers import Trainer, TrainingArguments
# Masked-LM pre-training setup: log every 3000 steps, checkpoint every 10000
# steps, and write everything under "pre-mlm".
training_args = TrainingArguments(
    output_dir="pre-mlm",
    num_train_epochs=3,
    per_device_train_batch_size=128,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=3000,
    save_strategy="steps",
    save_steps=10000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets,
)
# With lr=4e-4 the loss was 1.695 after 5 epochs. Resuming the next day with
# lr=2e-4, the loss was 1.784 at step 3000.
# This suggests that once the model has trained for a while the loss is closer to
# its minimum, and keeping the original learning rate makes it oscillate.

In [None]:
# Run masked-LM pre-training with the trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 1779535
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 41709


Step,Training Loss
3000,1.7838
6000,1.7807


Step,Training Loss
3000,1.7838
6000,1.7807
9000,1.7716
12000,1.7539
15000,1.739
18000,1.7208
21000,1.7066
24000,1.6903
27000,1.6765
30000,1.6632


Saving model checkpoint to pre-mlm/checkpoint-10000
Configuration saved in pre-mlm/checkpoint-10000/config.json
Model weights saved in pre-mlm/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-20000
Configuration saved in pre-mlm/checkpoint-20000/config.json
Model weights saved in pre-mlm/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-30000
Configuration saved in pre-mlm/checkpoint-30000/config.json
Model weights saved in pre-mlm/checkpoint-30000/pytorch_model.bin
Saving model checkpoint to pre-mlm/checkpoint-40000
Configuration saved in pre-mlm/checkpoint-40000/config.json
Model weights saved in pre-mlm/checkpoint-40000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=41709, training_loss=1.702359629179254, metrics={'train_runtime': 9891.1282, 'train_samples_per_second': 539.737, 'train_steps_per_second': 4.217, 'total_flos': 5.28424108839936e+16, 'train_loss': 1.702359629179254, 'epoch': 3.0})

In [None]:
# Save the model to ./pre_Bert.
trainer.save_model("./pre_Bert")

Saving model checkpoint to ./pre_Bert
Configuration saved in ./pre_Bert/config.json
Model weights saved in ./pre_Bert/pytorch_model.bin


In [None]:
# Prepare for downstream fine-tuning: load the accuracy metric used by compute_metrics.
from datasets import load_metric
metric=load_metric("accuracy")

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [None]:
# Load the trained model as a sequence classifier with 14 labels.
# NOTE(review): the path points at a checkpoint under ./news-classification, not at
# the ./pre_Bert directory written by trainer.save_model -- confirm this is the intended starting point.
from transformers import AutoModelForSequenceClassification
model=AutoModelForSequenceClassification.from_pretrained("./news-classification/checkpoint-5625",num_labels=14)

In [None]:
# Train on TPU: move the classification model to the XLA device.
import torch_xla.core.xla_model as xm
device = xm.xla_device()
model.to(device)

In [None]:
import numpy as np
def compute_metrics(eval_pred):
  """Score predictions with the module-level ``metric``.

  Args:
    eval_pred: A two-item sequence ``(predictions, labels)``; the predicted
      class is taken as the argmax of ``predictions`` along axis 1.

  Returns:
    Whatever ``metric.compute`` returns for the predicted classes and labels.
  """
  scores, references = eval_pred
  predicted_classes = np.argmax(scores, axis=1)
  return metric.compute(predictions=predicted_classes, references=references)

In [None]:
#加载数据

from datasets import Dataset
import pandas as pd
# Shuffle the training rows with a fixed seed so that the train/validation split
# taken from this frame is the same on every run.
train_df = pd.read_csv('./train_set.csv', sep='\t').sample(frac=1, random_state=42)

# Replace the three placeholder tokens in the training text with punctuation marks.
train_df['texts'] = train_df['text'].map(replacepunc)


# Head-and-tail truncation: keep 255 tokens from each end of a long text.
def slipt2(x):
  """Shorten a space-separated text to its first and last 255 tokens.

  Args:
    x: Text whose tokens are separated by single spaces.

  Returns:
    ``x`` unchanged when it has fewer than 511 tokens; otherwise the first 255
    and the last 255 tokens joined by single spaces.
  """
  tokens = x.split(' ')
  if len(tokens) >= 511:
    return ' '.join(tokens[:255] + tokens[-255:])
  return x

In [None]:
#划分训练集和测试集

# First 20000 shuffled rows form the validation set, the rest the training set.
# Explicit copies make the column assignments below operate on independent
# frames instead of slices of train_df (which triggered SettingWithCopyWarning).
val_df = train_df.iloc[:20000].copy()
trains_df = train_df.iloc[20000:].copy()

# Head-and-tail truncation.
val_df['summary'] = val_df['texts'].apply(slipt2)
trains_df['summary'] = trains_df['texts'].apply(slipt2)

# Load into Dataset objects and preprocess.
trains_ds = Dataset.from_pandas(trains_df).remove_columns(["texts", "text"])
val_ds = Dataset.from_pandas(val_df).remove_columns(["texts", "text"])

def _tokenize_summary(examples):
  """Tokenize the 'summary' column of a batch with truncation and padding."""
  return fast_tokenizer(examples['summary'], truncation=True, padding=True)

tokenized_trains_ds = trains_ds.map(_tokenize_summary, batched=True)
tokenized_val_ds = val_ds.map(_tokenize_summary, batched=True)
tokenized_val_ds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/180 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

Dataset({
    features: ['__index_level_0__', 'attention_mask', 'input_ids', 'label', 'summary', 'token_type_ids'],
    num_rows: 20000
})

In [None]:
#进行任务微调
from transformers import TrainingArguments,Trainer
# Fine-tuning setup: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best accuracy when training finishes.
args = TrainingArguments(
  output_dir='news-classification-2',
  num_train_epochs=3,
  learning_rate=2e-5,
  weight_decay=0.01,
  per_device_train_batch_size=96,
  per_device_eval_batch_size=96,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
)

trainer = Trainer(
  model=model,
  args=args,
  train_dataset=tokenized_trains_ds,
  eval_dataset=tokenized_val_ds,
  tokenizer=fast_tokenizer,
  compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the explicit save of the final model is commented out.
trainer.train()
#trainer.save_model("./finally_bert")

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.
***** Running training *****
  Num examples = 180000
  Num Epochs = 3
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 5625


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1129,0.097082,0.9698
2,0.0929,0.090939,0.97105


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 96
Saving model checkpoint to news-classification-2/checkpoint-1875
Configuration saved in news-classification-2/checkpoint-1875/config.json
Model weights saved in news-classification-2/checkpoint-1875/pytorch_model.bin
tokenizer config file saved in news-classification-2/checkpoint-1875/tokenizer_config.json
Special tokens file saved in news-classification-2/checkpoint-1875/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 96
Saving model checkpoint to news-classification-2/checkpoint-3750
Configuration saved in news-classification

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1129,0.097082,0.9698
2,0.0929,0.090939,0.97105
3,0.0806,0.089892,0.9719


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, summary.
***** Running Evaluation *****
  Num examples = 20000
  Batch size = 96
Saving model checkpoint to news-classification-2/checkpoint-5625
Configuration saved in news-classification-2/checkpoint-5625/config.json
Model weights saved in news-classification-2/checkpoint-5625/pytorch_model.bin
tokenizer config file saved in news-classification-2/checkpoint-5625/tokenizer_config.json
Special tokens file saved in news-classification-2/checkpoint-5625/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from news-classification-2/checkpoint-5625 (score: 0.9719).


TrainOutput(global_step=5625, training_loss=0.09479788547092013, metrics={'train_runtime': 2662.9885, 'train_samples_per_second': 202.78, 'train_steps_per_second': 2.112, 'total_flos': 2.136703463424e+16, 'train_loss': 0.09479788547092013, 'epoch': 3.0})

In [None]:
# Read the test set.
import pandas as pd
from datasets import load_dataset
test_df=pd.read_csv('./test_a.csv',sep='\t')

# Replace the three placeholder tokens in the test text with punctuation marks.
test_df['texts']=test_df['text'].map(lambda x:replacepunc(x))

In [None]:
# Preprocess the test data: apply the same head-and-tail truncation as for training.
from datasets import Dataset
test_df['summary']=test_df['texts'].apply(lambda x:slipt2(x))

# Load into a Dataset and preprocess (drop the raw text columns, then tokenize 'summary').
test_ds=Dataset.from_pandas(test_df).remove_columns(["texts","text"])

tokenized_test_ds=test_ds.map(lambda examples:fast_tokenizer(examples['summary'],truncation=True,padding=True),batched=True)

  0%|          | 0/50 [00:00<?, ?ba/s]

In [None]:
# Predict on the test set with the trainer and save the result.
# Trainer.predict returns (predictions, label_ids, metrics); the original unpack
# names bound label_ids to `metrics` and the metrics dict to `Loss`.
predictions, label_ids, metrics = trainer.predict(tokenized_test_ds, metric_key_prefix="test")
pred = np.argmax(predictions, axis=1)
# One predicted label per row, written without the index column.
pd.DataFrame({'label': pred}).to_csv('submit1022.csv', index=False)