In [14]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

!pip install -U adapter-transformers
!pip install datasets



In [15]:
from datasets import load_dataset, DatasetDict

# Load the "yxchar/sciie-tlm" dataset through the `datasets` library.
dataset = load_dataset("yxchar/sciie-tlm")
# Last expression of the cell: display the number of rows in each split.
dataset.num_rows

Using custom data configuration yxchar___sciie-tlm-a32f1f2c4e9b5c0d
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/yxchar___sciie-tlm-a32f1f2c4e9b5c0d/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/3 [00:00<?, ?it/s]

{'test': 974, 'train': 3219, 'validation': 455}

In [16]:
from transformers import RobertaTokenizer

# Tokenizer for the "roberta-base" checkpoint. `encode_batch` below reads this
# module-level name, so it must be defined before the dataset is mapped.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(batch):
  """Tokenize the ``"text"`` entries of ``batch`` with the module-level ``tokenizer``.

  The tokenizer is called with truncation enabled, ``padding="max_length"``
  and ``max_length=512``; whatever it returns is handed back unchanged.
  """
  texts = batch["text"]
  encoded = tokenizer(
      texts,
      truncation=True,
      padding="max_length",
      max_length=512,
  )
  return encoded

# Encode the input data
dataset = dataset.map(encode_batch, batched=True)
# The transformers model expects the target class column to be named "labels".
# Use the reassigning `rename_column`: the in-place `rename_column_` variant is
# deprecated and missing from current `datasets` releases, which the unpinned
# install at the top of the notebook would pull in.
dataset = dataset.rename_column("label", "labels")
# Transform to pytorch tensors and only output the required columns
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
import numpy as np

In [18]:
from transformers import RobertaConfig, RobertaModelWithHeads

# Configuration for "roberta-base" with the label count set to 7.
config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=7,
)
# Instantiate the model from the "roberta-base" checkpoint using that
# configuration; an adapter and a classification head are attached in the next cell.
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_

In [19]:
# Add a new adapter
model.add_adapter("sciie")
# Add a matching classification head
model.add_classification_head(
    "sciie",
    num_labels=7,
    # No id2label mapping is supplied; the head config logged for this cell shows generic LABEL_<n> names.
  )

# Activate the adapter
model.train_adapter("sciie")

Adding adapter 'sciie'.
Adding head 'sciie' with config {'head_type': 'classification', 'num_labels': 7, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2, 'LABEL_3': 3, 'LABEL_4': 4, 'LABEL_5': 5, 'LABEL_6': 6}, 'use_pooler': False, 'bias': True}.


In [20]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

# Hyperparameters for the training run; arguments not listed here are left at
# whatever TrainingArguments uses by default.
training_args = TrainingArguments(
    # NOTE(review): 1e-5 may be low for adapter-only training — the logged training
    # loss is still decreasing at step 2000. Confirm against the library's guidance.
    learning_rate=1e-5,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  """Return the fraction of rows whose arg-max prediction equals the label.

  Args:
    p: object exposing ``predictions`` (scores, arg-maxed along axis 1) and
      ``label_ids`` (the reference class ids).

  Returns:
    A dict with the single key ``"acc"`` holding the mean of the element-wise
    match between predicted and reference ids.
  """
  predicted_ids = np.argmax(p.predictions, axis=1)
  matches = predicted_ids == p.label_ids
  accuracy = matches.mean()
  return {"acc": accuracy}

# Build the trainer from the model, the training arguments, the train and
# validation splits, and the accuracy metric defined above.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_accuracy,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
# Run training with the settings in `training_args`; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 3219
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3030


Step,Training Loss
200,1.7017
400,1.5475
600,1.509
800,1.501
1000,1.491
1200,1.4687
1400,1.4385
1600,1.3901
1800,1.359
2000,1.3324


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/sciie/adapter_config.json
Module weights saved in ./training_output/checkpoint-500/sciie/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-500/sciie/head_config.json
Module weights saved in ./training_output/checkpoint-500/sciie/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/sciie/head_config.json
Module weights saved in ./training_output/checkpoint-500/sciie/pytorch_model_head.bin
Configuration saved in ./training_output/checkpoint-500/sciie/head_config.json
Module weights saved in ./training_output/checkpoint-500/sciie/pytorch_model_head.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/sciie/adapter_config.json
Module weights saved in ./training_output/checkpoint-1000/sciie/pytorch_adapter.bin
Configuration saved in ./training_output/checkpoint-

TrainOutput(global_step=3030, training_loss=1.4081218190712503, metrics={'train_runtime': 1949.1082, 'train_samples_per_second': 24.773, 'train_steps_per_second': 1.555, 'total_flos': 1.29251777497344e+16, 'train_loss': 1.4081218190712503, 'epoch': 15.0})

In [22]:
# Evaluate on the `eval_dataset` given to the trainer (the validation split); the result includes `eval_acc` from `compute_accuracy`.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 455
  Batch size = 16


{'epoch': 15.0,
 'eval_acc': 0.5230769230769231,
 'eval_loss': 1.3435348272323608,
 'eval_runtime': 8.9993,
 'eval_samples_per_second': 50.56,
 'eval_steps_per_second': 3.222}

In [23]:
from transformers import TextClassificationPipeline

# `training_args.device.index` is None when the run is on CPU, and None is not
# a valid device ordinal for the pipeline. Pass -1 (the pipeline's CPU value)
# unless the trainer is on CUDA, in which case use its ordinal (0 if unset).
trainer_device = training_args.device
pipeline_device = (trainer_device.index or 0) if trainer_device.type == "cuda" else -1

classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=pipeline_device)

# Quick smoke test of the pipeline on a single sentence.
classifier("This is awesome!")

[{'label': 'LABEL_0', 'score': 0.2477007508277893}]

In [24]:
# Save the "sciie" adapter into ./final_adapter (the log for this cell shows
# the adapter and head config/weight files being written).
model.save_adapter("./final_adapter", "sciie")

# List the saved files with their sizes.
!ls -lh final_adapter

Configuration saved in ./final_adapter/adapter_config.json
Module weights saved in ./final_adapter/pytorch_adapter.bin
Configuration saved in ./final_adapter/head_config.json
Module weights saved in ./final_adapter/pytorch_model_head.bin


total 5.8M
-rw-r--r-- 1 root root  571 Dec 15 21:35 adapter_config.json
-rw-r--r-- 1 root root  477 Dec 15 21:35 head_config.json
-rw-r--r-- 1 root root 3.5M Dec 15 21:35 pytorch_adapter.bin
-rw-r--r-- 1 root root 2.3M Dec 15 21:35 pytorch_model_head.bin


In [25]:
from google.colab import files
# Zip everything under /content and trigger a browser download of the archive.
# NOTE(review): the archive is created inside the directory being zipped, and
# the listing for this cell shows Colab's own .config logs being included —
# consider zipping only the artifacts that are needed (e.g. final_adapter).
!zip -r /content/file.zip /content
files.download("/content/file.zip")

updating: content/ (stored 0%)
updating: content/.config/ (stored 0%)
updating: content/.config/gce (stored 0%)
updating: content/.config/active_config (stored 0%)
updating: content/.config/configurations/ (stored 0%)
updating: content/.config/configurations/config_default (deflated 15%)
updating: content/.config/config_sentinel (stored 0%)
updating: content/.config/.last_survey_prompt.yaml (stored 0%)
updating: content/.config/.last_update_check.json (deflated 24%)
updating: content/.config/logs/ (stored 0%)
updating: content/.config/logs/2021.12.03/ (stored 0%)
updating: content/.config/logs/2021.12.03/14.32.30.027140.log (deflated 91%)
updating: content/.config/logs/2021.12.03/14.33.37.701606.log (deflated 53%)
updating: content/.config/logs/2021.12.03/14.33.36.903459.log (deflated 54%)
updating: content/.config/logs/2021.12.03/14.33.16.964195.log (deflated 54%)
updating: content/.config/logs/2021.12.03/14.33.09.955489.log (deflated 86%)
updating: content/.config/logs/2021.12.03/14.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
from sklearn.metrics import classification_report

# Run the trained model on the test split; the result unpacks into the raw
# scores, the reference label ids and a metrics dict.
pred, label, metrics = trainer.predict(dataset["test"])
# The predicted class of each row is the index of its highest score.
pred_label = np.argmax(pred, axis = 1)

# Some classes receive no predictions, which leaves their precision undefined.
# zero_division=0 reports those entries as 0 explicitly instead of emitting an
# undefined-metric warning for each averaged row of the report.
print(classification_report(label, pred_label, zero_division=0))

***** Running Prediction *****
  Num examples = 974
  Batch size = 16


              precision    recall  f1-score   support

           0       0.76      0.13      0.22       123
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        67
           3       0.60      0.97      0.75       533
           4       0.00      0.00      0.00        63
           5       0.00      0.00      0.00        38
           6       0.44      0.46      0.45        91

    accuracy                           0.59       974
   macro avg       0.26      0.22      0.20       974
weighted avg       0.47      0.59      0.48       974



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
