# Preparing environment

In [None]:
!pip install -q datasets transformers

[K     |████████████████████████████████| 194kB 18.5MB/s 
[K     |████████████████████████████████| 2.1MB 47.8MB/s 
[K     |████████████████████████████████| 112kB 61.6MB/s 
[K     |████████████████████████████████| 245kB 40.8MB/s 
[K     |████████████████████████████████| 3.3MB 56.3MB/s 
[K     |████████████████████████████████| 870kB 51.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
from sklearn.metrics import accuracy_score
from datasets import load_dataset
import transformers

# Model loading

In [None]:
MODEL = 'distilbert-base-uncased'
SEED = 42

# Seed the RNGs *before* instantiating the model: the sequence-classification
# head is not part of the checkpoint and is randomly initialised by
# `from_pretrained`, so without a seed every run starts from different weights.
transformers.set_seed(SEED)

# DistilBERT encoder with a fresh 2-way classification head on top.
bert = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

# Tokenizer matching the checkpoint's vocabulary.
bert_tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




# Dataset loading

### downloading

In [None]:
# Download (or reuse the cached copy of) the English configuration of the corpus.
dataset = load_dataset('amazon_reviews_multi', 'en')

# Pull the three splits out of the returned mapping in one go.
train_dataset, valid_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2773.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3624.0, style=ProgressStyle(description…


Downloading and preparing dataset amazon_reviews_multi/en (download: 82.11 MiB, generated: 58.69 MiB, post-processed: Unknown size, total: 140.79 MiB) to /root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/f3357bd271e187385a38574fe31b8fb10055303f67fa9fce55e84d08c4870efd...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=81989414.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2059600.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2045098.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset amazon_reviews_multi downloaded and prepared to /root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/f3357bd271e187385a38574fe31b8fb10055303f67fa9fce55e84d08c4870efd. Subsequent calls will reuse this data.


### tokenize

In [None]:
def tokenizer(data):
  """Tokenize the `review_body` field of a batch into fixed-length model inputs.

  Every review is truncated and padded to exactly 40 tokens so that all
  examples have the same length regardless of which batch they were
  tokenized in.

  Args:
    data: a batch (mapping) whose 'review_body' entry holds the review texts.

  Returns:
    Whatever `bert_tokenizer` returns for that batch.
  """
  return bert_tokenizer(
    data['review_body'],
    # 'max_length' pads every sequence up to `max_length`. The previous
    # `padding=True` only pads to the longest sequence of the current batch
    # and made the deprecated `pad_to_max_length=True` flag a no-op.
    padding='max_length',
    truncation=True,
    max_length=40,
  )

# Tokenize every split in batched mode, in train / validation / test order.
train_dataset, valid_dataset, test_dataset = (
    split.map(tokenizer, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




### Map `stars` to two classes: `bad` (1–3 stars, label 0) or `ok` (4–5 stars, label 1)

In [None]:
def limiter(data):
  """Attach a binary `label` to one example based on its star rating.

  Ratings of 3 stars or fewer become label 0; anything else becomes label 1.
  The example is modified in place and also returned.
  """
  if data['stars'] <= 3:
    sentiment = 0
  else:
    sentiment = 1
  data['label'] = sentiment
  return data

# Add the binary `label` column to every split (example by example).
train_dataset, valid_dataset, test_dataset = (
    split.map(limiter)
    for split in (train_dataset, valid_dataset, test_dataset)
)

# Have all three splits return torch tensors when indexed.
for split in (train_dataset, valid_dataset, test_dataset):
  split.set_format('torch')

HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))




# Training

In [None]:
# Per-device training batch size (passed to TrainingArguments as
# `per_device_train_batch_size`).
batch_size = 512
# Number of passes over the training split (passed as `num_train_epochs`).
num_train_epochs = 3

def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    `pred` must expose `label_ids` (reference labels) and `predictions`
    (per-class scores); the predicted class is the arg-max over the last axis.
    Returns a dict with the single key 'accuracy'.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Training configuration: checkpoints/logs go to "tmp", evaluation runs once
# per epoch, and the learning rate is warmed up over the first 10 steps.
args = transformers.TrainingArguments(
    output_dir="tmp",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    warmup_steps=10,
    num_train_epochs=num_train_epochs,
)

# Wire the model, configuration, data splits and metric function together.
trainer = transformers.Trainer(
    model=bert,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,No log,0.321024,0.8682,20.6109,242.59
2,0.352500,0.30381,0.8734,21.111,236.843
3,0.297400,0.300918,0.8742,20.6051,242.658


TrainOutput(global_step=1173, training_loss=0.3198486386662554, metrics={'train_runtime': 3203.3622, 'train_samples_per_second': 0.366, 'total_flos': 9641521440000000.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 2609684480, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 150790144, 'train_mem_gpu_alloc_delta': 807732736, 'train_mem_cpu_peaked_delta': 266240, 'train_mem_gpu_peaked_delta': 7663377408})

# Evaluating

In [None]:
# Score the fine-tuned model on the held-out test split; the metrics dict is
# displayed as the cell result.
trainer.evaluate(eval_dataset=test_dataset)

{'epoch': 3.0,
 'eval_accuracy': 0.8678,
 'eval_loss': 0.3097822368144989,
 'eval_mem_cpu_alloc_delta': -4096,
 'eval_mem_cpu_peaked_delta': 4096,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 17060352,
 'eval_runtime': 20.114,
 'eval_samples_per_second': 248.583}