# Setup

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/dl-group-project
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/dl-group-project
adapter_bleu.ipynb  model_inference.py                  README.md
[0m[01;34mcustom_adapters[0m/    modelT5.py                          [01;34mresults[0m/
[01;34mdata[0m/               ParaphrasePipeline.py               T5Adapter.ipynb
dataprep.ipynb      PT_T5_adapter_custom_configs.ipynb  [01;34mt5savedv4[0m/
[01;34mdataprovider[0m/       PT_T5_adapter.ipynb                 TF_T5.ipynb
Inference1.ipynb    PT_T5_adapter_local2.ipynb          tf_test.ipynb
meta_1500_3.tsv     PT_T5.ipynb                         train.ipynb


In [3]:
!pip install transformers
!pip install adapter-transformers
!pip install datasets
!pip install pytorch_lightning
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 30.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 80.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [None]:
# !nvidia-smi

In [4]:
%load_ext autoreload
%autoreload 2

In [None]:
# import transformers
# import datasets

# from transformers import (
#     # AdamW, 
#     T5Model, 
#     T5ForConditionalGeneration, 
#     T5AdapterModel, 
#     T5Tokenizer, 
#     get_linear_schedule_with_warmup,
#     TrainingArguments, 
#     AdapterTrainer,
#     Trainer
# )

# from dataprovider.DataProvider import DatasetProvider

In [5]:
base_path = "t5-small"
adapter_path = "paraphrase"

# Load Data

In [6]:
import pandas as pd
from datasets import Dataset

In [7]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(base_path)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [8]:
df_test = pd.read_csv('data/COMBINED/test.tsv', sep = '\t', names=['in', 'expected'])
df_train = pd.read_csv('data/COMBINED/train.tsv', sep = '\t', names=['in', 'expected'])
df_test

Unnamed: 0,in,expected
0,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...
...,...,...
5172,Twice Sparrow sold the island twice to Thomas ...,Sparrow twice sold the island to Thomas Polloc...
5173,The name in Tupi means `` insensitive stone ''...,"The name in Tupi means '' hard stone `` , '' i..."
5174,"The company has branches in Tokyo , based in t...",The company has branches in Tokyo based in Sai...
5175,The modern coat of arms of Bavaria was designe...,The modern coat of arms of Bavaria was designe...


In [9]:
dataset_test = Dataset.from_pandas(df_test)
dataset_train = Dataset.from_pandas(df_train)

In [10]:
def preprocess_function(examples):
    prefix = 'paraphrase: '
    inputs = [prefix + doc for doc in examples["in"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["expected"], max_length=512, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_test = dataset_test.map(preprocess_function, batched=True)
tokenized_train = dataset_train.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/59 [00:00<?, ?ba/s]

In [11]:
tokenized_train_small = tokenized_train.shuffle(seed=42).select(range(30000))
tokenized_test_small = tokenized_test.shuffle(seed=42).select(range(1500))

In [12]:
print(len(tokenized_train[0]['input_ids']))
print(len(tokenized_train[0]['attention_mask']))
print(len(tokenized_train[0]['labels']))

512
512
512


# Adapter Training

## Setup Model

In [13]:
from transformers.adapters import T5AdapterModel, AutoAdapterModel, AutoModelWithHeads
from transformers import AdapterTrainer, TrainingArguments

In [14]:
model = AutoAdapterModel.from_pretrained(base_path)
model_houlsby = AutoAdapterModel.from_pretrained(base_path)
model_pfeiffer = AutoAdapterModel.from_pretrained(base_path)
model_parallel = AutoAdapterModel.from_pretrained(base_path)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

Some weights of the model checkpoint at t5-small were not used when initializing T5AdapterModel: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5AdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5AdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of T5AdapterModel were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at t5-small were not used when initia

## Adapter Configuration

In [15]:
tokenized_dataset = {
    "train": tokenized_train_small.remove_columns(['in', 'expected']),
    "test": tokenized_test_small.remove_columns(['in', 'expected'])
}

training_args = TrainingArguments(
  output_dir="./checkpoints_adapter", 
  do_train=True,
  remove_unused_columns=False,
  learning_rate=1e-4,
  num_train_epochs=3,
  weight_decay=0.01,
  evaluation_strategy="epoch"
)

In [16]:
from transformers.adapters import AdapterConfig, HoulsbyConfig, PfeifferConfig, ParallelConfig

config = AdapterConfig(mh_adapter=True, output_adapter=True, reduction_factor=16, non_linearity="relu")
houlsby_config = HoulsbyConfig()
pfeiffer_config = PfeifferConfig()
parallel_config = ParallelConfig()

### Houlsby

In [None]:
### Houlsby config
model_houlsby.add_adapter(adapter_path, config=houlsby_config)
model_houlsby.add_seq2seq_lm_head(adapter_path)
model_houlsby.train_adapter(adapter_path)

trainer_houlsby = AdapterTrainer(
  model=model_houlsby,
  args=training_args,
  tokenizer=tokenizer,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer_houlsby.train()
trainer_houlsby.evaluate()

Adding adapter 'paraphrase'.
Adding head 'paraphrase' with config {'head_type': 'seq2seq_lm', 'vocab_size': 32128, 'layers': 1, 'activation_function': None, 'layer_norm': False, 'bias': False, 'shift_labels': False, 'label2id': None}.
***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625


Step,Training Loss
100,0.795
200,0.0845
300,0.0795
400,0.078
500,0.0752
600,0.0748


Saving model checkpoint to ./checkpoints_adapter/checkpoint-500
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/adapter_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_adapter.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
tokenizer config file saved in ./checkpoints_adapter/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoints_adapter/checkpoint-500/special_tokens_map.json


Training completed. Do

{'epoch': 1.0,
 'eval_loss': 0.08687897026538849,
 'eval_runtime': 27.3274,
 'eval_samples_per_second': 18.297,
 'eval_steps_per_second': 2.305}

### Pfeiffer

In [None]:
### Pfeiffer config
model_pfeiffer.add_adapter(adapter_path, config=pfeiffer_config)
model_pfeiffer.add_seq2seq_lm_head(adapter_path)
model_pfeiffer.train_adapter(adapter_path)

trainer_pfeiffer = AdapterTrainer(
  model=model_pfeiffer,
  args=training_args,
  tokenizer=tokenizer,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer_pfeiffer.train()

Adding adapter 'paraphrase'.
Adding head 'paraphrase' with config {'head_type': 'seq2seq_lm', 'vocab_size': 32128, 'layers': 1, 'activation_function': None, 'layer_norm': False, 'bias': False, 'shift_labels': False, 'label2id': None}.
***** Running training *****
  Num examples = 5000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625


Step,Training Loss
100,1.2222
200,0.0908
300,0.0838
400,0.0818
500,0.0785
600,0.0784


Saving model checkpoint to ./checkpoints_adapter/checkpoint-500
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/adapter_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_adapter.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
tokenizer config file saved in ./checkpoints_adapter/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoints_adapter/checkpoint-500/special_tokens_map.json


Training completed. Do

{'epoch': 1.0,
 'eval_loss': 0.09049589186906815,
 'eval_runtime': 26.9239,
 'eval_samples_per_second': 18.571,
 'eval_steps_per_second': 2.34}

In [None]:
trainer_pfeiffer.evaluate()

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'epoch': 1.0,
 'eval_loss': 0.09049589186906815,
 'eval_runtime': 26.955,
 'eval_samples_per_second': 18.549,
 'eval_steps_per_second': 2.337}

### Parallel

In [17]:
### Parallel config
model_parallel.add_adapter(adapter_path, config=parallel_config)
model_parallel.add_seq2seq_lm_head(adapter_path)
model_parallel.train_adapter(adapter_path)

trainer_parallel = AdapterTrainer(
  model=model_parallel,
  args=training_args,
  tokenizer=tokenizer,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test']
)

trainer_parallel.train()

***** Running training *****
  Num examples = 30000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11250


Epoch,Training Loss,Validation Loss
1,0.0476,0.058254
2,0.043,0.056394
3,0.0401,0.055654


Saving model checkpoint to ./checkpoints_adapter/checkpoint-500
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/adapter_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_adapter.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
Configuration saved in ./checkpoints_adapter/checkpoint-500/paraphrase/head_config.json
Module weights saved in ./checkpoints_adapter/checkpoint-500/paraphrase/pytorch_model_head.bin
tokenizer config file saved in ./checkpoints_adapter/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./checkpoints_adapter/checkpoint-500/special_tokens_map.json
Saving model checkpoint 

TrainOutput(global_step=11250, training_loss=0.04719882261488173, metrics={'train_runtime': 5864.0298, 'train_samples_per_second': 15.348, 'train_steps_per_second': 1.918, 'total_flos': 1.30530410496e+16, 'train_loss': 0.04719882261488173, 'epoch': 3.0})

In [None]:
trainer_parallel.evaluate()

# Save Adapters

In [None]:

# save adapter
model_houlsby.save_adapter('custom_adapters/houlsby-sm', 'paraphrase')

Configuration saved in custom_adapters/houlsby-sm/adapter_config.json
Module weights saved in custom_adapters/houlsby-sm/pytorch_adapter.bin
Configuration saved in custom_adapters/houlsby-sm/head_config.json
Module weights saved in custom_adapters/houlsby-sm/pytorch_model_head.bin


In [None]:
model_pfeiffer.save_adapter('custom_adapters/pfeiffer-sm', 'paraphrase')

Configuration saved in custom_adapters/pfeiffer-sm/adapter_config.json
Module weights saved in custom_adapters/pfeiffer-sm/pytorch_adapter.bin
Configuration saved in custom_adapters/pfeiffer-sm/head_config.json
Module weights saved in custom_adapters/pfeiffer-sm/pytorch_model_head.bin


In [18]:
model_parallel.save_adapter('custom_adapters/parallel', 'paraphrase')

Configuration saved in custom_adapters/parallel/adapter_config.json
Module weights saved in custom_adapters/parallel/pytorch_adapter.bin
Configuration saved in custom_adapters/parallel/head_config.json
Module weights saved in custom_adapters/parallel/pytorch_model_head.bin
