In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import csv
import numpy as np
import pandas as pd
import pickle

from sklearn.utils import shuffle
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = "retina"
import transformers
from tqdm import tqdm

def preprocess_text(input_text):
    """Normalize quotes, brackets and spacing in a piece of text.

    The input is converted with ``str`` and stripped of surrounding
    whitespace, then a fixed list of literal substring replacements is applied
    one after another. Order matters: for example two consecutive ASCII
    apostrophes are turned into a double quote before single curly quotes are
    mapped to apostrophes.

    Args:
        input_text: any object; it is passed through ``str`` first.

    Returns:
        The cleaned string. The last replacement swaps each non-overlapping
        pair of spaces for one space in a single pass, so a run of three or
        more spaces is shortened but not fully collapsed.
    """
    substitutions = (
        ("``", "''"),
        ("‘‘", '"'),
        ("’’", '"'),
        ("''", '"'),
        ("[", ""),
        ("]", ""),
        (" .", "."),
        (" ,", ","),
        ("’", "'"),
        ("“", '"'),
        ("”", '"'),
        ("  ", " "),
    )
    cleaned = str(input_text).strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [2]:
# --- Model / tokenizer ---
MODEL_NAME = "microsoft/deberta-large"  # checkpoint id used for both the tokenizer and the model
MAX_SEQUENCE_LENGTH = 128               # every encoded sentence pair is padded/truncated to this length

# --- Data ---
TRAIN_DATASET = "MRPC_COR"              # selects the training file: "PAWS", "MRPC" or "MRPC_COR"

# --- Optimisation ---
# NOTE(review): in the visible part of this notebook these three values are not
# referenced by the TrainingArguments cell, which hard-codes learning_rate=1e-5,
# per_device_train_batch_size=32 and num_train_epochs=2. Confirm which set of
# values is the intended one.
LEARNING_RATE = 5e-6
BATCH_SIZE = 128
EPOCHS = 10

In [3]:
# Tokenizer that belongs to the checkpoint named by MODEL_NAME.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_text(t1, t2):
    """Encode the text pair (t1, t2) as a single fixed-length input.

    The pair is padded to MAX_SEQUENCE_LENGTH and, when too long, truncated
    with the "longest_first" strategy. Token type ids and the attention mask
    are requested explicitly.

    Args:
        t1: first text of the pair.
        t2: second text of the pair.

    Returns:
        Whatever the tokenizer call returns; callers in this notebook read its
        "input_ids", "token_type_ids" and "attention_mask" entries.
    """
    encoding_options = dict(
        padding="max_length",
        truncation="longest_first",
        max_length=MAX_SEQUENCE_LENGTH,
        return_token_type_ids=True,
        return_attention_mask=True,
    )
    return tokenizer(t1, t2, **encoding_options)

In [4]:
def _read_sentence_pairs(path, sent_1_col, sent_2_col, label_col,
                         delimiter=",", min_sent_2_chars=None):
    """Read (sentence 1, sentence 2, label) columns from a delimited text file.

    The first line is treated as a header and skipped. Both sentences are
    passed through ``preprocess_text``; the label field is parsed with ``int``.

    Args:
        path: file to read.
        sent_1_col: column index of the first sentence.
        sent_2_col: column index of the second sentence.
        label_col: column index of the label (negative indices allowed).
        delimiter: field separator handed to ``csv.reader``.
        min_sent_2_chars: when given, assert that the raw sentence-2 field is
            longer than this many characters.

    Returns:
        Three parallel lists: first sentences, second sentences, int labels.
    """
    sent_1_list, sent_2_list, labels = [], [], []
    # newline="" is what the csv module requires for correct handling of
    # embedded newlines; the explicit encoding keeps the result independent of
    # the machine's locale.
    with open(path, "r", encoding="utf-8", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter)
        next(reader, None)  # skip the headers
        for row in reader:
            label = int(row[label_col])
            if min_sent_2_chars is not None:
                # Validate before appending so a bad row cannot leave the
                # three lists with different lengths.
                assert len(row[sent_2_col]) > min_sent_2_chars
            sent_1_list.append(preprocess_text(row[sent_1_col]))
            sent_2_list.append(preprocess_text(row[sent_2_col]))
            labels.append(label)
    return sent_1_list, sent_2_list, labels


# Exactly one branch runs; an unrecognised name fails loudly instead of
# silently producing empty training arrays.
if TRAIN_DATASET == "PAWS":
    train_sent_1_list, train_sent_2_list, train_labels_raw = _read_sentence_pairs(
        "../datasets/paws/train.tsv", 1, 2, 3, delimiter="\t")
elif TRAIN_DATASET == "MRPC":
    train_sent_1_list, train_sent_2_list, train_labels_raw = _read_sentence_pairs(
        "../datasets/mrpc_train_processed.csv", 0, 1, 2, min_sent_2_chars=5)
elif TRAIN_DATASET == "MRPC_COR":
    train_sent_1_list, train_sent_2_list, train_labels_raw = _read_sentence_pairs(
        "../datasets/mrpc_train_corrected.csv", 0, 1, -1, min_sent_2_chars=5)
else:
    raise ValueError(
        "Unknown TRAIN_DATASET {!r}; expected 'PAWS', 'MRPC' or 'MRPC_COR'".format(TRAIN_DATASET))

train_tokens = []
train_type_ids = []
train_attn_masks = []
train_labels = []

for t1, t2, l in zip(train_sent_1_list, train_sent_2_list, train_labels_raw):
    # Every pair is encoded twice, as (t1, t2) and then (t2, t1), with the same
    # label, so the encoded set is twice as long as the list of pairs.
    for first, second in ((t1, t2), (t2, t1)):
        t = tokenize_text(first, second)
        train_tokens.append(t["input_ids"])
        train_type_ids.append(t["token_type_ids"])
        train_attn_masks.append(t["attention_mask"])
        train_labels.append(l)

train_tokens = np.asarray(train_tokens)
train_type_ids = np.asarray(train_type_ids)
train_attn_masks = np.asarray(train_attn_masks)
train_labels = np.asarray(train_labels)

In [5]:
# Validation always uses the PAWS dev split, whatever TRAIN_DATASET is.
valid_sent_1_list, valid_sent_2_list, valid_labels_raw = [], [], []

# newline="" is what the csv module requires; the explicit encoding keeps the
# result independent of the machine's locale.
with open("../datasets/paws/dev.tsv", "r", encoding="utf-8", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader, None)  # skip the headers
    for row in reader:
        label = int(row[3])
        valid_sent_1_list.append(preprocess_text(row[1]))
        valid_sent_2_list.append(preprocess_text(row[2]))
        valid_labels_raw.append(label)

valid_tokens = []
valid_type_ids = []
valid_attn_masks = []
valid_labels = []

for t1, t2, l in zip(valid_sent_1_list, valid_sent_2_list, valid_labels_raw):
    # Every pair is encoded twice, as (t1, t2) and then (t2, t1), with the same
    # label, so the encoded set is twice as long as the list of pairs.
    for first, second in ((t1, t2), (t2, t1)):
        t = tokenize_text(first, second)
        valid_tokens.append(t["input_ids"])
        valid_type_ids.append(t["token_type_ids"])
        valid_attn_masks.append(t["attention_mask"])
        valid_labels.append(l)

valid_tokens = np.asarray(valid_tokens)
valid_type_ids = np.asarray(valid_type_ids)
valid_attn_masks = np.asarray(valid_attn_masks)
valid_labels = np.asarray(valid_labels)

In [6]:
# np.asarray hands back an existing ndarray unchanged, so this line only does
# work if train_labels is still a plain sequence at this point.
train_labels = np.asarray(train_labels)
print('Shape of label tensor:', np.shape(train_labels))

Shape of label tensor: (8870,)


In [7]:
# Each line prints the sum of a label array; assuming labels are 0/1, that is
# the number of positive examples in the split.
print('Number of entries in each category:')
for split_name, split_labels in (("Training:", train_labels), ("Validation:", valid_labels)):
    print(split_name, split_labels.sum(axis=0))

Number of entries in each category:
Training: 2680
Validation: 7078


In [8]:
import torch

class TextDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized examples stored as four parallel sequences.

    Indexing returns a dict with the keys "input_ids", "token_type_ids",
    "attention_mask" and "labels", each converted to a ``torch.int64`` tensor
    at access time.
    """

    def __init__(self, tokens, type_ids, attn_masks, labels):
        """Store the four index-aligned sequences without copying them."""
        self.tokens, self.type_ids = tokens, type_ids
        self.attn_masks, self.labels = attn_masks, labels

    def __len__(self):
        """Number of examples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of int64 tensors."""
        columns = (
            ("input_ids", self.tokens),
            ("token_type_ids", self.type_ids),
            ("attention_mask", self.attn_masks),
            ("labels", self.labels),
        )
        return {key: torch.tensor(values[idx], dtype=torch.int64)
                for key, values in columns}

# Wrap the encoded training and validation arrays for use by the trainer.
train_dataset = TextDataset(
    tokens=train_tokens,
    type_ids=train_type_ids,
    attn_masks=train_attn_masks,
    labels=train_labels,
)
valid_dataset = TextDataset(
    tokens=valid_tokens,
    type_ids=valid_type_ids,
    attn_masks=valid_attn_masks,
    labels=valid_labels,
)

In [9]:
# Sanity check: decode the first training example back to text and show its label.
d0 = train_dataset[0]
print(tokenizer.decode(d0["input_ids"]), d0["labels"], sep="\n")

[CLS]Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.[SEP]Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor(1)


In [10]:
# Sanity check: decode the first validation example back to text and show its label.
d0 = valid_dataset[0]
print(tokenizer.decode(d0["input_ids"]), d0["labels"], sep="\n")

[CLS]Bradd Crellin represented BARLA Cumbria on a tour of Australia with 6 other players representing Britain, also on a tour of Australia.[SEP]Bradd Crellin also represented BARLA Great Britain on a tour through Australia on a tour through Australia with 6 other players representing Cumbria.[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
tensor(0)


In [11]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [12]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth returned element (support) is not needed here.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    precision, recall, f1 = macro_scores[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [13]:
# Load the MODEL_NAME checkpoint as a sequence-classification model with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForSequenceClassification: ['config', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier.bias',

In [14]:
# One shared interval (in optimisation steps) for logging, evaluation and checkpointing.
LOGGING_STEP = 50

training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    overwrite_output_dir=True,
    # Optimisation
    num_train_epochs=2,
    learning_rate=1e-5,
    weight_decay=0.00,
    warmup_ratio=0.4,
    lr_scheduler_type="constant_with_warmup",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    fp16=True,
    # Logging, evaluation and saving all follow the same step schedule
    logging_strategy="steps",
    logging_steps=LOGGING_STEP,
    evaluation_strategy="steps",
    eval_steps=LOGGING_STEP,
    save_strategy="steps",
    save_steps=LOGGING_STEP,
    save_total_limit=10,
    # Model selection: higher eval F1 is better, best checkpoint is reloaded at the end
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    disable_tqdm=False,
)

# Early stopping configured with patience 4 and threshold 0.01.
early_stopping = transformers.EarlyStoppingCallback(
    early_stopping_patience=4,
    early_stopping_threshold=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
# Detach the TensorBoard callback from this trainer.
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

Using amp fp16 backend


In [15]:
# Start fine-tuning with the settings held in `training_args`; `trainer` was
# given `valid_dataset` for evaluation and `compute_metrics` for scoring.
trainer.train()

***** Running training *****
  Num examples = 8870
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 556


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.65,0.784724,0.557562,0.357971,0.278799,0.499944
100,0.581,0.752105,0.469375,0.381191,0.580453,0.519786
150,0.4527,1.087971,0.477875,0.392171,0.611344,0.528167


  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 16000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-50
Configuration saved in ./results/checkpoint-50/config.json
Model weights saved in ./results/checkpoint-50/pytorch_model.bin
  nn.utils.clip_grad_norm_(
***** Running Evaluation *****
  Num examples = 16000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 16000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-150
Configuration saved in ./results/checkpoint-150/config.json
Model weights saved in ./results/checkpoint-150/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 16000
  Batch size = 128
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_1815/4032920361.py", line 1, in <module>
    trainer.train()
  File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 1342, in train
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 1443, in _maybe_log_save_evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 2041, in evaluate
    output = eval_loop(
  File "/opt/conda/lib/python3.8/site-packages/transformers/trainer.py", line 2210, in evaluation_loop
    loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
  File "/opt/conda/lib/python3.8/sit

TypeError: object of type 'NoneType' has no len()

In [None]:
# Save the model to "./<TRAIN_DATASET lower-cased>_<last path component of MODEL_NAME>/".
model_name_short = MODEL_NAME.rsplit("/", 1)[-1]
save_dir = "./{}_{}/".format(TRAIN_DATASET.lower(), model_name_short)
model.save_pretrained(save_dir)