In [11]:
import random

import datasets
import evaluate
import numpy as np
import pandas as pd
from datasets import load_dataset
from IPython.display import display, HTML
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import set_seed

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping from column name to feature
            type (presumably a ``datasets.Dataset`` -- confirm against callers).
        num_examples: Number of distinct rows to display; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique by
    # construction; no retry loop or linear membership scan is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show the class name instead of the stored integer id.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: Two-item iterable that unpacks to ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    model_outputs, gold_labels = eval_pred
    # The index of the largest value along the last axis is the predicted class.
    predicted_classes = np.argmax(model_outputs, axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=gold_labels)
    return scores

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping that provides the raw text under the ``"text"`` key.

    Returns:
        The tokenizer output, truncated and padded with ``padding="max_length"``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Seed Python, NumPy and PyTorch before the model is built. The classification
# head is not part of the bert-base-cased checkpoint and is randomly
# initialized on load, so without a seed every kernel restart would start
# fine-tuning from different weights.
set_seed(42)

# Pretrained encoder plus a new classification head with num_labels=5 outputs.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Raw dataset and the accuracy metric that compute_metrics reads.
dataset = load_dataset("yelp_review_full")
metric = evaluate.load("accuracy")


# Data preprocessing: tokenize every split in batches.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Fixed-seed shuffles so the same subsets are selected on every run.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(50_000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(2_000))
)

# Directory used as the Trainer output_dir and as the final save location.
model_dir = "model_split"

# Training configuration. Evaluation runs on a step schedule; eval_steps is
# not set, which per the Transformers docs falls back to logging_steps.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=16,
    num_train_epochs=3,
)




# Bundle the model, its configuration, both data subsets and the metric hook.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# Kept as the cell's final expression so the returned TrainOutput is displayed.
trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
100,1.6019,1.269977,0.431
200,1.1943,1.045218,0.5365
300,1.0698,1.027893,0.539
400,1.0346,0.966572,0.5695
500,1.0264,0.928358,0.5925
600,1.0117,0.956681,0.567
700,0.996,0.966426,0.5895
800,1.0014,0.951918,0.584
900,0.9984,0.922761,0.593
1000,0.932,1.115908,0.5135


Checkpoint destination directory model_split/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory model_split/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=9375, training_loss=0.7039950518798828, metrics={'train_runtime': 18440.7837, 'train_samples_per_second': 8.134, 'train_steps_per_second': 0.508, 'total_flos': 3.94677213696e+16, 'train_loss': 0.7039950518798828, 'epoch': 3.0})

In [10]:
# Evaluate on 1000 test rows picked with a different shuffle seed (32) than
# the in-training evaluation subset (42); the two subsets may still overlap.
eva_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=32)
    .select(range(1000))
)
result = trainer.evaluate(eval_dataset=eva_dataset)

# Write the model and the trainer state to disk.
trainer.save_model(output_dir=model_dir)
trainer.save_state()

print(result)
print("save mode success\n")

{'eval_loss': 1.0414470434188843, 'eval_accuracy': 0.583, 'eval_runtime': 29.2785, 'eval_samples_per_second': 34.155, 'eval_steps_per_second': 4.269, 'epoch': 3.0}
save mode success

