# Install transformers

In [1]:
!pip install git+https://github.com/huggingface/transformers
!git clone https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-475qyqut
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-3.0.2-py3-none-any.whl size=868793 sha256=593c56c93325518d6a5bc48ba7a2ef06fa784797f660aaf98c30e163673eea7f
  Stored in directory: /tmp/pip-ephem-wheel-cache-44vjeuw1/wheels/5a/0a/d0/eb8d0ea1d7d02156f8675d6e5dfa52c03601cbe377290db8dc
Successfully built transformers
Cloning into 'transformers'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 39062 (delta 6), reused 11 (delta 2), pack-reused 39042[K
Receiving objects: 100% (39062/39062), 28.08 MiB | 8.77 MiB/s, done.
Resolving deltas: 100% (27071/27071), done.
Checking connectivity... done.


In [46]:
# # If you do not use colabs...
# !pip install -r ./transformers/examples/requirements.txt

In [4]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import transformers
from transformers import (
    AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, 
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
    EvalPrediction, 
    GlueDataset)
from transformers import GlueDataTrainingArguments as DataTrainingArguments

import os
import sys
import logging
import zipfile
from dataclasses import dataclass, field
from typing import Dict, Optional
import dataclasses

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# # If you do not use colabs...
# device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
# device

# Download GLUE benchmark (QQP, QNLI)

In [2]:
!python transformers/utils/download_glue_data.py --tasks QQP
!python transformers/utils/download_glue_data.py --tasks QNLI

In [5]:
# # If you do not use colabs...
# !mkdir ./glue_data
# # Download the QQP dataset from https://gluebenchmark.com/tasks
# # Upload the zip file to ./glud_data

# def download_and_extract(task, data_dir):
#     print("Extracting %s..." % task)
#     data_file = "%s.zip" % task
#     # urllib.request.urlretrieve(TASK2PATH[task], data_file)
#     print(data_file)
#     try:
#         with zipfile.ZipFile(os.path.join(data_dir, data_file)) as zip_ref:
#             zip_ref.extractall(data_dir)
#         print("\tCompleted!")
#         # os.remove(data_file)
#     except:
#         pass


In [8]:
# # If you do not use colabs...
# task_name = "QQP"
# download_and_extract(task_name, "./glue_data")

Extracting QQP...
QQP.zip
	Completed!


# Set configuration for models and training

## Model configurations

In [9]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

In [10]:
model_args = ModelArguments(
    model_name_or_path="bert-base-uncased",
)

#   Enter the pre-trained model name  you want to train.    
#    Model          | Tokenizer          | Pretrained weights shortcut
#   (BertModel,       BertTokenizer,       'bert-base-uncased'),
#   (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
#   (GPT2Model,       GPT2Tokenizer,       'gpt2'),
#   (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
#   (CTRLModel,       CTRLTokenizer,       'ctrl'),
#   (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
#   (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
#   (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
#   (RobertaModel,    RobertaTokenizer,    'roberta-base'),
#   (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),

## Define task name

In [11]:
# define task name
task_name = "QQP"

## Training parameters

In [24]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="1"

training_args = TrainingArguments(
    output_dir=os.path.join("./outputs/glue",task_name),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=32,
    per_gpu_eval_batch_size=128,
    num_train_epochs=1,
    logging_steps=500,
    logging_first_step=True,
    save_steps=1000,
    evaluate_during_training=True,
)

In [25]:
print(training_args.device)
print(training_args.n_gpu)

INFO:transformers.training_args:PyTorch: setting up devices


cuda:0
1


In [48]:
# # If you do not use colab.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# from typing import Tuple

# @dataclass
# class TrainingArgumentsNotebook(TrainingArguments):
#     @property
#     def _setup_devices(self) -> Tuple["torch.device", int]:
#         return  device, 1
    
# training_args = TrainingArgumentsNotebook(
#     output_dir=os.path.join("./outputs/glue",task_name),
#     overwrite_output_dir=True,
#     do_train=True,
#     do_eval=True,
#     per_gpu_train_batch_size=32,
#     per_gpu_eval_batch_size=128,
#     num_train_epochs=1,
#     logging_steps=500,
#     logging_first_step=True,
#     save_steps=1000,
#     evaluate_during_training=True,
# )

In [27]:
# Set num_labels 
num_labels = glue_tasks_num_labels[task_name.lower()]
num_labels

2

# Instantiate the config, the tokenizer, and the model

In [28]:
set_seed(training_args.seed)

In [29]:
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=task_name,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    config=config,
)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/jennybae/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "QQP",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.

# Load the dataset

In [30]:
data_dir = './glue_data'
file_dir = os.path.join(data_dir, task_name)
# input_file = os.path.join(file_dir, "train.tsv")
input_file = os.path.join(file_dir, "dev.tsv")
# input_file = os.path.join(file_dir, "test.tsv")

In [31]:
with open(input_file, "r") as f:
    display(pd.read_csv(f, delimiter='\t').head())

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,201359,303345,303346,Why are African-Americans so beautiful?,Why are hispanics so beautiful?,0.0
1,263843,69383,380476,I want to pursue PhD in Computer Science about...,I handle social media for a non-profit. Should...,0.0
2,172974,266948,175089,Is there a reason why we should travel alone?,What are some reasons to travel alone?,1.0
3,15329,29298,29299,Why are people so obsessed with having a girlf...,How can a single male have a child?,0.0
4,209794,314169,314170,What are some good baby girl names starting wi...,What are some good baby girl names starting wi...,0.0


In [32]:
data_args = DataTrainingArguments(task_name=task_name, data_dir=file_dir)
data_args

GlueDataTrainingArguments(task_name='qqp', data_dir='./glue_data/QQP', max_seq_length=128, overwrite_cache=False)

In [34]:
train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode='dev')

INFO:filelock:Lock 139857324285176 acquired on ./glue_data/QQP/cached_train_BertTokenizer_128_qqp.lock
INFO:transformers.data.datasets.glue:Creating features from dataset file at ./glue_data/QQP
INFO:transformers.data.processors.glue:*** Example ***
INFO:transformers.data.processors.glue:guid: train-133273
INFO:transformers.data.processors.glue:features: InputFeatures(input_ids=[101, 2129, 2003, 1996, 2166, 1997, 1037, 8785, 3076, 1029, 2071, 2017, 6235, 2115, 2219, 6322, 1029, 102, 2029, 2504, 1997, 17463, 8156, 2003, 2438, 2005, 1996, 11360, 1046, 14277, 2102, 2629, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0

In [37]:
print("input_ids: ", train_dataset[10].input_ids[:30])
print("attention_mask: ", train_dataset[10].attention_mask[:30])
print("token_type_ids: ", train_dataset[10].token_type_ids[:30])
print("label: ",  train_dataset[1].label)

input_ids:  [101, 2054, 2003, 1996, 2212, 1997, 3968, 2290, 8059, 1998, 2129, 2515, 1996, 9580, 12826, 2000, 1996, 2414, 5538, 1997, 2003, 18722, 1029, 102, 2054, 2003, 1996, 2221, 1997, 3968]
attention_mask:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
label:  1


In [40]:
print(tokenizer.convert_ids_to_tokens(101))
print(tokenizer.convert_ids_to_tokens(102))

[CLS]
[SEP]


# Train with Trainer

## Evaluation metrics

In [41]:
def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [42]:
# In case of QQP,  eval metrics are accuracy and F1 score.
# def acc_and_f1(preds, labels):
#     acc = simple_accuracy(preds, labels)
#     f1 = f1_score(y_true=labels, y_pred=preds)
#     return {
#         "acc": acc,
#         "f1": f1,
#         "acc_and_f1": (acc + f1) / 2,
#     }

# In case of QNLI, eval metric is accuracy
# def simple_accuracy(preds, labels):
#     return (preds == labels).mean()


## Train

In [43]:
training_args.device

device(type='cuda', index=0)

In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.
INFO:transformers.trainer:To use comet_ml logging, run `pip/conda install comet_ml` see https://www.comet.ml/docs/python-sdk/huggingface/


In [50]:
%%time
if training_args.do_train:
    trainer.train()
    trainer.save_model()

INFO:transformers.trainer:***** Running training *****
INFO:transformers.trainer:  Num examples = 363849
INFO:transformers.trainer:  Num Epochs = 1
INFO:transformers.trainer:  Instantaneous batch size per device = 8
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 32
INFO:transformers.trainer:  Gradient Accumulation steps = 1
INFO:transformers.trainer:  Total optimization steps = 11371


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=11371.0, style=ProgressStyle(description_…

{'loss': 0.0010740993022918702, 'learning_rate': 4.9995602849353626e-05, 'epoch': 8.79430129276229e-05, 'step': 1}
{'loss': 0.40556571882218123, 'learning_rate': 4.780142467680943e-05, 'epoch': 0.04397150646381145, 'step': 500}


INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 40430
INFO:transformers.trainer:  Batch size = 128


{'loss': 0.36396301475167275, 'learning_rate': 4.560284935361886e-05, 'epoch': 0.0879430129276229, 'step': 1000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=316.0, style=ProgressStyle(description_w…

INFO:transformers.trainer:Saving model checkpoint to ./outputs/glue/QQP/checkpoint-1000
INFO:transformers.configuration_utils:Configuration saved in ./outputs/glue/QQP/checkpoint-1000/config.json



{'eval_loss': 0.3333613752743822, 'eval_acc': 0.8482562453623547, 'eval_f1': 0.808753390068269, 'eval_acc_and_f1': 0.8285048177153118, 'epoch': 0.0879430129276229, 'step': 1000}


INFO:transformers.modeling_utils:Model weights saved in ./outputs/glue/QQP/checkpoint-1000/pytorch_model.bin






KeyboardInterrupt: 

## Evaluate

In [51]:
print(training_args.output_dir)
print(eval_dataset.args.task_name)

./outputs/glue/QQP
qqp


In [52]:
eval_results = {}
eval_result = trainer.evaluate(eval_dataset=eval_dataset)

output_eval_file = os.path.join(
    training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
)

with open(output_eval_file, "w") as writer:
    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
    for key, value in eval_result.items():
        logger.info("  %s = %s", key, value)
        writer.write("%s = %s\n" % (key, value))

eval_results.update(eval_result)
print(eval_results)

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 40430
INFO:transformers.trainer:  Batch size = 128


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=316.0, style=ProgressStyle(description_w…

INFO:__main__:***** Eval results qqp *****
INFO:__main__:  eval_loss = 0.33840099849708005
INFO:__main__:  eval_acc = 0.8541429631461785
INFO:__main__:  eval_f1 = 0.8067381116245534
INFO:__main__:  eval_acc_and_f1 = 0.830440537385366
INFO:__main__:  epoch = 0.10940110808196289



{'eval_loss': 0.33840099849708005, 'eval_acc': 0.8541429631461785, 'eval_f1': 0.8067381116245534, 'eval_acc_and_f1': 0.830440537385366, 'epoch': 0.10940110808196289, 'step': 1244}
{'eval_loss': 0.33840099849708005, 'eval_acc': 0.8541429631461785, 'eval_f1': 0.8067381116245534, 'eval_acc_and_f1': 0.830440537385366, 'epoch': 0.10940110808196289}


# Predict

In [53]:
test_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode='test')

INFO:filelock:Lock 139854078368232 acquired on ./glue_data/QQP/cached_test_BertTokenizer_128_qqp.lock
INFO:transformers.data.datasets.glue:Creating features from dataset file at ./glue_data/QQP
INFO:transformers.data.processors.glue:*** Example ***
INFO:transformers.data.processors.glue:guid: test-0
INFO:transformers.data.processors.glue:features: InputFeatures(input_ids=[101, 2052, 1996, 2801, 1997, 8398, 1998, 22072, 1999, 2793, 2362, 12665, 2017, 1010, 2445, 1996, 20248, 18155, 26116, 13494, 1029, 102, 2079, 2017, 2228, 2008, 2065, 6221, 8398, 2020, 2700, 2343, 1010, 2002, 2052, 2022, 2583, 2000, 9239, 4262, 2007, 22072, 1998, 3607, 2004, 2002, 2056, 2002, 2071, 1010, 2241, 2006, 1996, 6857, 3276, 22072, 2018, 2007, 8112, 1998, 5747, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [54]:
eval_results = {}
predictions = trainer.predict(test_dataset=test_dataset).predictions
predictions = np.argmax(predictions, axis=1)
output_test_file = os.path.join(
    training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
)
with open(output_test_file, "w") as writer:
    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
    writer.write("index\tprediction\n")
    for index, item in enumerate(predictions):
        item = test_dataset.get_labels()[item]
        writer.write("%d\t%s\n" % (index, item))

INFO:transformers.trainer:***** Running Prediction *****
INFO:transformers.trainer:  Num examples = 390965
INFO:transformers.trainer:  Batch size = 128


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=3055.0, style=ProgressStyle(description_…




KeyboardInterrupt: 

In [40]:
with open(output_test_file, "r", encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines:
        print(line)
    

index	prediction

0	0

1	0

2	1

3	0

4	1

5	1

6	0

7	0

8	0

9	1

10	0

11	0

12	1

13	0

14	0

15	0

16	0

17	0

18	0

19	0

20	0

21	0

22	0

23	0

24	1

25	1

26	0

27	1

28	0

29	0

30	0

31	0

32	1

33	0

34	0

35	0

36	0

37	1

38	1

39	0

40	0

41	0

42	1

43	0

44	1

45	0

46	0

47	0

48	0

49	0

50	0

51	0

52	0

53	0

54	0

55	0

56	0

57	0

58	0

59	0

60	0

61	1

62	0

63	1

64	0

65	0

66	1

67	0

68	0

69	0

70	0

71	0

72	1

73	0

74	0

75	1

76	0

77	1

78	0

79	0

80	0

81	1

82	0

83	1

84	0

85	0

86	1

87	1

88	0

89	0

90	0

91	0

92	1

93	0

94	0

95	0

96	0

97	0

98	1

99	0

100	1

101	1

102	0

103	1

104	0

105	0

106	1

107	1

108	0

109	0

110	0

111	1

112	0

113	0

114	1

115	0

116	0

117	0

118	0

119	1

120	1

121	0

122	0

123	0

124	1

125	1

126	0

127	0

128	0

129	0

130	1

131	0

132	1

133	0

134	0

135	0

136	0

137	0

138	0

139	1

140	0

141	1

142	0

143	0

144	0

145	0

146	0

147	0

148	1

149	0

150	1

151	1

152	0

153	0

154	1

155	0



KeyboardInterrupt: 

# Tensorboard

In [55]:
%load_ext tensorboard

In [56]:
%tensorboard --logdir runs --bind_all