In [52]:
from datasets import load_dataset, Features, Value

raw_datasets = load_dataset("glue", "sst2")
print(raw_datasets['train'][11])

{'sentence': "for those moviegoers who complain that ` they do n't make movies like they used to anymore ", 'label': 0, 'idx': 11}


In [53]:
new_datasets = raw_datasets.cast(Features({'idx': Value('int32'), 'sentence': Value('string'),'label': Value('float32')}))
print(new_datasets['train'].features)

{'idx': Value(dtype='int32', id=None), 'sentence': Value(dtype='string', id=None), 'label': Value(dtype='float32', id=None)}


In [54]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)

tokenized_datasets = new_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [55]:
samples = tokenized_datasets["train"][:8]
print(samples)

{'idx': [0, 1, 2, 3, 4, 5, 6, 7], 'sentence': ['hide new secretions from the parental units ', 'contains no wit , only labored gags ', 'that loves its characters and communicates something rather beautiful about human nature ', 'remains utterly satisfied to remain the same throughout ', 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', "that 's far too tragic to merit such superficial treatment ", 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ', 'of saucy '], 'label': [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0], 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], [101, 2008, 7459, 2049, 3494, 1998, 10639, 2015, 2242, 2738, 3376, 2055, 2529, 3267, 102], [101, 3464, 12580, 8510, 2000, 3961, 1996, 2168, 2802, 102], [101, 2006, 1996, 5409, 7195, 1011, 1997, 1011, 1996, 1011,

In [56]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = {k: v for k, v in samples.items() if k not in ['idx', 'sentence']}
batch = data_collator(samples)
print(batch)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3397,  2053, 15966,  1010,  2069,  4450,  2098, 18201,  2015,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2008,  7459,  2049,  3494,  1998, 10639,  2015,  2242,  2738,
          3376,  2055,  2529,  3267,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3464, 12580,  8510,  2000,  3961,  1996,  2168,  2802,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2006,  1996,  5409,  7195,  1011,  1997,  101

In [57]:
from transformers import TrainingArguments

training_args = TrainingArguments("sst-finetuned-model", per_device_train_batch_size=16, num_train_epochs=2)


In [58]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
from transformers import Trainer
from evaluate import load

metric = load('accuracy')

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = metric.compute(predictions=preds, references=labels)

    return {'accuracy': accuracy}

In [60]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

  0%|          | 0/4210 [00:00<?, ?it/s]

{'loss': 0.0968, 'learning_rate': 4.406175771971497e-05, 'epoch': 0.24}




{'loss': 0.0664, 'learning_rate': 3.812351543942993e-05, 'epoch': 0.48}




{'loss': 0.0557, 'learning_rate': 3.21852731591449e-05, 'epoch': 0.71}




{'loss': 0.0505, 'learning_rate': 2.6247030878859858e-05, 'epoch': 0.95}




{'loss': 0.0342, 'learning_rate': 2.0308788598574824e-05, 'epoch': 1.19}




{'loss': 0.0278, 'learning_rate': 1.4370546318289787e-05, 'epoch': 1.43}




{'loss': 0.0267, 'learning_rate': 8.432304038004752e-06, 'epoch': 1.66}




{'loss': 0.0262, 'learning_rate': 2.494061757719715e-06, 'epoch': 1.9}




{'train_runtime': 393.864, 'train_samples_per_second': 341.991, 'train_steps_per_second': 10.689, 'train_loss': 0.04707029936149398, 'epoch': 2.0}


TrainOutput(global_step=4210, training_loss=0.04707029936149398, metrics={'train_runtime': 393.864, 'train_samples_per_second': 341.991, 'train_steps_per_second': 10.689, 'train_loss': 0.04707029936149398, 'epoch': 2.0})

In [61]:
evaluation_results = trainer.evaluate(tokenized_datasets['validation'])
print(evaluation_results)

  0%|          | 0/55 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.4908256880733945}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.05505853891372681, 'eval_accuracy': {'accuracy': 0.4908256880733945}, 'eval_runtime': 2.0895, 'eval_samples_per_second': 417.331, 'eval_steps_per_second': 26.322, 'epoch': 2.0}


In [62]:
trainer.save_model("fine_tuned_sst2_model")