In [1]:
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding
import datasets

In [2]:
# Load the 'mrpc' configuration of the 'glue' dataset (every available split).
raw_datasets = datasets.load_dataset(path='glue', name='mrpc')

Found cached dataset glue (/Users/shuyuzhou/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from datasets import load_metric
def compute_metrics(eval_preds):
    """Score model outputs with the GLUE/MRPC metric.

    Args:
        eval_preds: object exposing ``predictions`` (the model logits) and
            ``label_ids`` (the reference labels).

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        compared against the reference labels.
    """
    glue_metric = load_metric("glue", "mrpc")
    # Author's note: the two attribute reads below could be shortened to
    #     logits, labels = eval_preds
    # because eval_preds behaves like a tuple.
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_metric.compute(
        predictions=predicted_classes,
        references=labels,
    )

In [4]:
# Alternative tokenizer checkpoints (uncomment exactly one to switch):
# checkpoint = 'roberta-large'
# checkpoint = 'facebook/bart-large'
# checkpoint = './models/gpt2-glue-tokenizer/'
checkpoint = 'bert-base-uncased'

# Build the tokenizer that matches the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

def tokenize_function(sample):
    """Tokenize the 'sentence1'/'sentence2' fields of ``sample`` as a pair.

    Uses the module-level ``tokenizer`` with truncation enabled and returns
    the tokenizer's output unchanged.
    """
    return tokenizer(
        text=sample['sentence1'],
        text_pair=sample['sentence2'],
        truncation=True,
    )
# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

# Collator that pads the examples of a batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)


from transformers import AutoModelForSequenceClassification

# Alternative model checkpoints (uncomment exactly one to switch):
# model_checkpoint = "./models/bart_0.1/"
# model_checkpoint = "./models/roberta_0.5/"
# model_checkpoint = "./models/gpt2-glue_0.5/"
model_checkpoint = 'bert-base-uncased'

# Load the checkpoint with a sequence-classification head of two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
    num_labels=2,
)


from transformers import Trainer, TrainingArguments

# Training configuration: a single epoch, artifacts written under 'output'.
training_args = TrainingArguments(
    output_dir='output',
    num_train_epochs=1,
)

# Wire the model, data splits, collator, tokenizer and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Loading cached processed dataset at /Users/shuyuzhou/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-31906a1957e9dbf7.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /Users/shuyuzhou/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-54149c91d00919a2.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (

In [5]:
# Run training and keep the returned result; later cells read `x.metrics`.
x = trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, idx, sentence2. If sentence1, idx, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3668
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 459
  Number of trainable parameters = 109483778


  0%|          | 0/459 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 1985.4646, 'train_samples_per_second': 1.847, 'train_steps_per_second': 0.231, 'train_loss': 0.4770612529679841, 'epoch': 1.0}


In [6]:
# Disabled debugging loop that printed each element of the training result:
# for info in x:
#     print(info)

# Display the metrics dict stored on the training result.
x.metrics

{'train_runtime': 1985.4646,
 'train_samples_per_second': 1.847,
 'train_steps_per_second': 0.231,
 'train_loss': 0.4770612529679841,
 'epoch': 1.0}

In [60]:
# Run prediction on the tokenized 'validation' split and print its metrics.
predictions = trainer.predict(
    test_dataset=tokenized_datasets['validation'],
)
print(predictions.metrics)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2. If idx, sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1725
  Batch size = 8


  0%|          | 0/216 [00:00<?, ?it/s]

{'test_loss': 0.4371364116668701, 'test_runtime': 201.1947, 'test_samples_per_second': 8.574, 'test_steps_per_second': 1.074}


In [63]:
import json
from pathlib import Path

# Merge training and prediction metrics into a NEW dict. The previous form
# (`results = x.metrics; results.update(...)`) aliased `x.metrics`, so the
# training result itself was mutated and gained the prediction keys.
# On key collisions the prediction metrics win, as they did with `update`.
results = {**x.metrics, **predictions.metrics}
print(results)

# Serialize the merged metrics as indented JSON.
json_object = json.dumps(results, indent=4)

# Make sure the target directory exists, then write output/sample.json.
output_path = Path("output") / "sample.json"
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w") as outfile:
    outfile.write(json_object)

{'train_runtime': 2186.2738, 'train_samples_per_second': 1.678, 'train_steps_per_second': 0.21, 'train_loss': 0.525709463879953, 'epoch': 1.0, 'test_loss': 0.4371364116668701, 'test_runtime': 201.1947, 'test_samples_per_second': 8.574, 'test_steps_per_second': 1.074}


In [11]:
# Load the 'wikitext-2-raw-v1' configuration of the 'wikitext' dataset.
raw_datasets2 = datasets.load_dataset(
    path='wikitext',
    name='wikitext-2-raw-v1',
)

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.90 MiB, post-processed: Unknown size, total: 17.40 MiB) to /Users/shuyuzhou/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/4.72M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /Users/shuyuzhou/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [58]:
# Display the loaded dataset: its splits with their features and row counts.
raw_datasets2

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})