<a href="https://colab.research.google.com/github/benjaminbeilharz/ba-thesis/blob/main/DD_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
!pip install --upgrade transformers datasets accelerate torch cloud-tpu-client



In [91]:
from typing import List, Tuple

import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

import datasets
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict

HuggingfaceDataset = DatasetDict
TOKENIZER = AutoTokenizer.from_pretrained('benjaminbeilharz/distilbert-dailydialog-turn-classifier')

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


Didn't find file benjaminbeilharz/distilbert-dailydialog-turn-classifier/added_tokens.json. We won't load it.
loading file benjaminbeilharz/distilbert-dailydialog-turn-classifier/vocab.txt
loading file benjaminbeilharz/distilbert-dailydialog-turn-classifier/tokenizer.json
loading file None
loading file benjaminbeilharz/distilbert-dailydialog-turn-classifier/special_tokens_map.json
loading file benjaminbeilharz/distilbert-dailydialog-turn-classifier/tokenizer_config.json


In [82]:
data = load_dataset('benjaminbeilharz/better_daily_dialog')

Using custom data configuration benjaminbeilharz--better_daily_dialog-b98601c51fce7f6b
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/benjaminbeilharz--better_daily_dialog-b98601c51fce7f6b/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121)


  0%|          | 0/3 [00:00<?, ?it/s]

In [83]:
def tokenizer_function(sample):
    return TOKENIZER(sample['utterance'], truncation=True)

tokenized_data = data.map(tokenizer_function, batched=True)
tokenized_data = tokenized_data.remove_columns(['dialog_id', 'emotion', 'utterance'])
tokenized_data = tokenized_data.rename_column('turn_type', 'labels')
tokenized_data

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 87170
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 8069
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 7740
    })
})

In [92]:
def compute_metrics(preds):
    metric = datasets.load_metric('accuracy')
    logits, labels = preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [85]:
data_collator = DataCollatorWithPadding(tokenizer=TOKENIZER)
model = AutoModelForSequenceClassification.from_pretrained('benjaminbeilharz/distilbert-dailydialog-turn-classifier')



loading configuration file benjaminbeilharz/distilbert-dailydialog-turn-classifier/config.json
Model config DistilBertConfig {
  "_name_or_path": "benjaminbeilharz/distilbert-dailydialog-turn-classifier",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "vocab_size": 30522


In [94]:
args = TrainingArguments('model', evaluation_strategy='epoch')
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    tokenizer=TOKENIZER,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [87]:
trainer.train()

***** Running training *****
  Num examples = 87170
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 32691


Epoch,Training Loss,Validation Loss


Saving model checkpoint to model/checkpoint-500
Configuration saved in model/checkpoint-500/config.json
Model weights saved in model/checkpoint-500/pytorch_model.bin
tokenizer config file saved in model/checkpoint-500/tokenizer_config.json
Special tokens file saved in model/checkpoint-500/special_tokens_map.json
Saving model checkpoint to model/checkpoint-1000
Configuration saved in model/checkpoint-1000/config.json
Model weights saved in model/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in model/checkpoint-1000/tokenizer_config.json
Special tokens file saved in model/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to model/checkpoint-1500
Configuration saved in model/checkpoint-1500/config.json
Model weights saved in model/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in model/checkpoint-1500/tokenizer_config.json
Special tokens file saved in model/checkpoint-1500/special_tokens_map.json
Saving model checkpoint to model/checkpoint-2000

AttributeError: ignored

In [75]:
model.push_to_hub('benjaminbeilharz/distilbert-dailydialog-turn-classifier')
TOKENIZER.push_to_hub('benjaminbeilharz/distilbert-dailydialog-turn-classifier')

Cloning https://huggingface.co/benjaminbeilharz/distilbert-dailydialog-turn-classifier into local empty directory.
Configuration saved in benjaminbeilharz/distilbert-dailydialog-turn-classifier/config.json
Model weights saved in benjaminbeilharz/distilbert-dailydialog-turn-classifier/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 32.0k/255M [00:00<?, ?B/s]

To https://huggingface.co/benjaminbeilharz/distilbert-dailydialog-turn-classifier
   1bdbc6b..31f99e2  main -> main

tokenizer config file saved in benjaminbeilharz/distilbert-dailydialog-turn-classifier/tokenizer_config.json
Special tokens file saved in benjaminbeilharz/distilbert-dailydialog-turn-classifier/special_tokens_map.json
To https://huggingface.co/benjaminbeilharz/distilbert-dailydialog-turn-classifier
   31f99e2..0cf1579  main -> main



'https://huggingface.co/benjaminbeilharz/distilbert-dailydialog-turn-classifier/commit/0cf1579663312bd6cb08035ed2aef764463241e1'

In [58]:
train_loader, val_loader, test_loader = [DataLoader(data, batch_size=1, shuffle=True, collate_fn=data_collator) for data in tokenized_data.values()]

In [59]:
next(iter(train_loader))

ValueError: ignored

In [6]:
model = model.to(device)
optimizer = AdamW(model.parameters())

In [27]:
for epoch in range(1):
    running_loss = .0
    model.train()
    for i, batch in enumerate(train_loader, start=1):
        x, y = *batch
        print(x, y)
        for i in x:
            i = {k: v.to(device).squeeze(0) for k, v in x.items()}
        y = y.to(device)

        model.zero_grad()

        out = model(**x, labels=y)

        if i % 50 == 0:
            print('Running loss: ', running_loss/i)

        loss = out.loss
        loss.backward()
        running_loss += loss.item()

        optimizer.step()
        optimizer.zero_grad()


    running_eval = .0
    model.eval()
    preds = []
    labels = []

    for i, batch in enumerate(val_loader, start=1):
        with torch.no_grad():
            x, y = batch
            x = {k: v.to(device).squeeze(0) for k, v in x.items()}
            y = y.to(device)
            labels.append(y)

            out = model(**x, labels=y)
            running_eval += out.loss.item()
            logits = out.logits
            pred = torch.argmax(logits, dim=-1)
            preds.append(pred)

    acc = torch.sum(preds == labels) / len(val_loader)
    print('Val loss: ', running_eval)
    print('Avg val acc: ', acc)
            

        





AttributeError: ignored

In [None]:
hf = ''