# Setup

In [38]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/dl-group-project
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/dl-group-project
[0m[01;34mcheckpoints_adapter[0m/          modelT5.py                  t5savedv4.zip
combined_test_with_preds.tsv  PT_T5_adapter.ipynb         TF_T5.ipynb
[01;34mdata[0m/                         PT_T5_adapter_local2.ipynb  tf_test.ipynb
dataprep.ipynb                PT_T5.ipynb                 train.ipynb
[01;34mdataprovider[0m/                 README.md
data.zip                      [01;34mt5savedv4[0m/


In [39]:
!pip install transformers
!pip install adapter-transformers
!pip install datasets
!pip install pytorch_lightning
!pip install sentencepiece



In [40]:
# !nvidia-smi

In [41]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# import transformers
# import datasets

# from transformers import (
#     # AdamW, 
#     T5Model, 
#     T5ForConditionalGeneration, 
#     T5AdapterModel, 
#     T5Tokenizer, 
#     get_linear_schedule_with_warmup,
#     TrainingArguments, 
#     AdapterTrainer,
#     Trainer
# )

# from dataprovider.DataProvider import DatasetProvider

In [43]:
base_path = "t5-small"
adapter_path = "paraphrase"

# Load Data

In [44]:
import pandas as pd
from datasets import Dataset

In [45]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(base_path)

loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  

In [46]:
df_test = pd.read_csv('data/COMBINED/test.tsv', sep = '\t', names=['in', 'expected'])
df_train = pd.read_csv('data/COMBINED/train.tsv', sep = '\t', names=['in', 'expected'])
df_test

Unnamed: 0,in,expected
0,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...
...,...,...
5172,Twice Sparrow sold the island twice to Thomas ...,Sparrow twice sold the island to Thomas Polloc...
5173,The name in Tupi means `` insensitive stone ''...,"The name in Tupi means '' hard stone `` , '' i..."
5174,"The company has branches in Tokyo , based in t...",The company has branches in Tokyo based in Sai...
5175,The modern coat of arms of Bavaria was designe...,The modern coat of arms of Bavaria was designe...


In [47]:
dataset_test = Dataset.from_pandas(df_test)
dataset_train = Dataset.from_pandas(df_train)

In [48]:
def preprocess_function(examples):
    prefix = 'paraphrase: '
    inputs = [prefix + doc for doc in examples["in"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["expected"], max_length=512, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_test = dataset_test.map(preprocess_function, batched=True)
tokenized_train = dataset_train.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

In [49]:
tokenized_train_small = tokenized_train.shuffle(seed=42).select(range(5000))
tokenized_test_small = tokenized_test.shuffle(seed=42).select(range(500))

In [50]:
print(len(tokenized_train[0]['input_ids']))
print(len(tokenized_train[0]['attention_mask']))
print(len(tokenized_train[0]['labels']))

512
512
512


# Adapter Training

## Setup Model

In [51]:
from transformers.adapters import T5AdapterModel, AutoAdapterModel, AutoModelWithHeads
from transformers import AdapterTrainer, TrainingArguments

In [52]:
model = AutoAdapterModel.from_pretrained(base_path)
model_houlsby = AutoAdapterModel.from_pretrained(base_path)
model_pfeiffer = AutoAdapterModel.from_pretrained(base_path)
model_parallel = AutoAdapterModel.from_pretrained(base_path)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
 

## Adapter Configuration

In [53]:
tokenized_dataset = {
    "train": tokenized_train_small.remove_columns(['in', 'expected']),
    "test": tokenized_test_small.remove_columns(['in', 'expected'])
}

training_args = TrainingArguments(
  output_dir="./checkpoints_adapter", 
  do_train=True,
  remove_unused_columns=False,
  learning_rate=1e-4,
  num_train_epochs=1,
  weight_decay=0.01,
  logging_steps=100,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [54]:
from transformers.adapters import AdapterConfig, HoulsbyConfig, PfeifferConfig, ParallelConfig

config = AdapterConfig(mh_adapter=True, output_adapter=True, reduction_factor=16, non_linearity="relu")
houlsby_config = HoulsbyConfig()
pfeiffer_config = PfeifferConfig()
parallel_config = ParallelConfig()

### Houlsby

In [55]:
### Houlsby config
model_houlsby.add_adapter(adapter_path, config=houlsby_config)
model_houlsby.add_seq2seq_lm_head(adapter_path)
model_houlsby.train_adapter(adapter_path)

trainer_houlsby = AdapterTrainer(
  model=model_houlsby,
  args=training_args,
  tokenizer=tokenizer,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer_houlsby.train()
trainer_houlsby.evaluate()

Adding adapter 'paraphrase'.
Adding head 'paraphrase' with config {'head_type': 'seq2seq_lm', 'vocab_size': 32128, 'layers': 1, 'activation_function': None, 'layer_norm': False, 'bias': False, 'shift_labels': False, 'label2id': None}.
***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625


Step,Training Loss
100,0.795
200,0.0845
300,0.0795
400,0.078
500,0.0752
600,0.0748


Saving model checkpoint to ./checkpoints_adapter/checkpoint-500
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/adapter_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_adapter.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
tokenizer config file saved in ./checkpoints_adapter/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoints_adapter/checkpoint-500/special_tokens_map.json


Training completed. Do

{'epoch': 1.0,
 'eval_loss': 0.08687897026538849,
 'eval_runtime': 27.3274,
 'eval_samples_per_second': 18.297,
 'eval_steps_per_second': 2.305}

### Pfeiffer

In [56]:
### Pfeiffer config
model_pfeiffer.add_adapter(adapter_path, config=pfeiffer_config)
model_pfeiffer.add_seq2seq_lm_head(adapter_path)
model_pfeiffer.train_adapter(adapter_path)

trainer_pfeiffer = AdapterTrainer(
  model=model_pfeiffer,
  args=training_args,
  tokenizer=tokenizer,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer_pfeiffer.train()

Adding adapter 'paraphrase'.
Adding head 'paraphrase' with config {'head_type': 'seq2seq_lm', 'vocab_size': 32128, 'layers': 1, 'activation_function': None, 'layer_norm': False, 'bias': False, 'shift_labels': False, 'label2id': None}.
***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625


Step,Training Loss
100,1.2222
200,0.0908
300,0.0838
400,0.0818
500,0.0785
600,0.0784


Saving model checkpoint to ./checkpoints_adapter/checkpoint-500
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/adapter_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_adapter.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
tokenizer config file saved in ./checkpoints_adapter/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoints_adapter/checkpoint-500/special_tokens_map.json


Training completed. Do

{'epoch': 1.0,
 'eval_loss': 0.09049589186906815,
 'eval_runtime': 26.9239,
 'eval_samples_per_second': 18.571,
 'eval_steps_per_second': 2.34}

In [57]:
trainer_pfeiffer.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'epoch': 1.0,
 'eval_loss': 0.09049589186906815,
 'eval_runtime': 26.955,
 'eval_samples_per_second': 18.549,
 'eval_steps_per_second': 2.337}

### Parallel

In [58]:
### Parallel config
model_parallel.add_adapter(adapter_path, config=parallel_config)
model_parallel.add_seq2seq_lm_head(adapter_path)
model_parallel.train_adapter(adapter_path)

trainer_parallel = AdapterTrainer(
  model=model_parallel,
  args=training_args,
  tokenizer=tokenizer,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer_parallel.train()

Adding adapter 'paraphrase'.
Adding head 'paraphrase' with config {'head_type': 'seq2seq_lm', 'vocab_size': 32128, 'layers': 1, 'activation_function': None, 'layer_norm': False, 'bias': False, 'shift_labels': False, 'label2id': None}.
***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625


Step,Training Loss
100,0.2999
200,0.061
300,0.0553
400,0.0548
500,0.0539
600,0.0534


Saving model checkpoint to ./checkpoints_adapter/checkpoint-500
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/adapter_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_adapter.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
tokenizer config file saved in ./checkpoints_adapter/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoints_adapter/checkpoint-500/special_tokens_map.json


Training completed. Do

TrainOutput(global_step=625, training_loss=0.09451336860656738, metrics={'train_runtime': 620.0547, 'train_samples_per_second': 8.064, 'train_steps_per_second': 1.008, 'total_flos': 725168947200000.0, 'train_loss': 0.09451336860656738, 'epoch': 1.0})

In [59]:
trainer_parallel.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'epoch': 1.0,
 'eval_loss': 0.08085960149765015,
 'eval_runtime': 27.7408,
 'eval_samples_per_second': 18.024,
 'eval_steps_per_second': 2.271}

# Save Adapters

In [63]:

# save adapter
model_houlsby.save_adapter('custom_adapters/houlsby-sm', 'paraphrase')

Configuration saved in custom_adapters/houlsby-sm/adapter_config.json
Module weights saved in custom_adapters/houlsby-sm/pytorch_adapter.bin
Configuration saved in custom_adapters/houlsby-sm/head_config.json
Module weights saved in custom_adapters/houlsby-sm/pytorch_model_head.bin


In [64]:
model_pfeiffer.save_adapter('custom_adapters/pfeiffer-sm', 'paraphrase')

Configuration saved in custom_adapters/pfeiffer-sm/adapter_config.json
Module weights saved in custom_adapters/pfeiffer-sm/pytorch_adapter.bin
Configuration saved in custom_adapters/pfeiffer-sm/head_config.json
Module weights saved in custom_adapters/pfeiffer-sm/pytorch_model_head.bin


In [65]:
model_parallel.save_adapter('custom_adapters/parallel-sm', 'paraphrase')

Configuration saved in custom_adapters/parallel-sm/adapter_config.json
Module weights saved in custom_adapters/parallel-sm/pytorch_adapter.bin
Configuration saved in custom_adapters/parallel-sm/head_config.json
Module weights saved in custom_adapters/parallel-sm/pytorch_model_head.bin
