In [1]:
from pathlib import Path
from datasets import load_dataset
from clearml import Dataset

# Fetch (or reuse) a local copy of the ClearML-managed dataset and remember
# the directory it was materialised in.
clearml_dataset = Dataset.get(
    dataset_project="sarcasm_detector",
    dataset_name="reddit_kaggle",
)
local_dataset_path = Path(clearml_dataset.get_local_copy())

# Map each split name to its CSV file; the "val" split is read from the
# file whose name ends in ".test.csv".
split_files = {
    "train": str(local_dataset_path / "train-balanced-sarcasm.train.csv"),
    "val": str(local_dataset_path / "train-balanced-sarcasm.test.csv"),
}
dataset = load_dataset("csv", data_files=split_files)

# Keep only rows whose `comment` is truthy (drops missing / empty comments).
dataset = dataset.filter(lambda example: bool(example["comment"]))

# NOTE(review): an explicit `label` -> `labels` copy was left disabled here;
# presumably the training stack accepts the `label` column as-is - confirm
# before deleting the line below for good.
# dataset = dataset.map(lambda examples: {"labels": examples["label"]}, batched=True)

# Bare last expression so the notebook renders the split summary.
dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import AutoTokenizer
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``comment`` field of a batch of examples.

    Args:
        examples: A batch as handed over by ``dataset.map(..., batched=True)``;
            only its ``"comment"`` entry is read.

    Returns:
        Whatever the tokenizer returns for those comments with
        ``truncation=True``. No padding is requested here.
    """
    comments = examples["comment"]
    return tokenizer(comments, truncation=True)


# Apply the tokenizer batch-wise to every split of the dataset.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

100%|██████████| 910/910 [01:48<00:00,  8.36ba/s]
100%|██████████| 102/102 [00:12<00:00,  8.14ba/s]


In [6]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; it is handed to the Trainer as
# `data_collator`. The preprocessing step only truncates, so batch padding is
# presumably left to this collator (as its name suggests) - confirm against
# the transformers docs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
import evaluate

# Metric object loaded by name from the `evaluate` library. It is reused by
# `compute_metrics` during training and by the task evaluator at the end of
# the notebook.
accuracy = evaluate.load("accuracy")

In [8]:
import numpy as np


def compute_metrics(eval_pred):
    """Turn raw model outputs into the accuracy metric.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``.
            ``predictions`` holds one row of per-class scores per example;
            ``labels`` holds the reference class ids.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of every
        row compared against ``labels``.
    """
    class_scores, references = eval_pred
    # Highest-scoring column of each row is the predicted class id.
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [9]:
# Class id -> human-readable name; position in the tuple is the class id.
id2label = dict(enumerate(("NORMAL", "SARCASTIC")))
# Inverse lookup, derived from the forward mapping so the two cannot drift apart.
label2id = {name: class_id for class_id, name in id2label.items()}

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes
# (`metric_for_best_model` is left at its library default).
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Seed the RNGs *before* the model is built: the classification head is newly
# (randomly) initialised by `from_pretrained`, so without this the starting
# weights differ from run to run. Reusing `training_args.seed` keeps a single
# source of truth for the seed.
set_seed(training_args.seed)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),  # stays in sync with the label mapping
    id2label=id2label,
    label2id=label2id,
)
# No explicit `model.to('cuda')`: the hard-coded device raised on machines
# without CUDA, and (per the Trainer docs) the Trainer places the model on
# `training_args.device` itself.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Last expression of the cell: runs the fine-tuning loop and lets the
# notebook display its return value.
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

ClearML Task: created new task id=89f9be4182374070bfccd90808827c71
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-02-01 15:00:53,802 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/21ae7368745a4faa800170e7b97a5772/experiments/89f9be4182374070bfccd90808827c71/output/log
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid dea

ClearML Task has been initialized.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [20]:
# Sample sentence used as a quick smoke-test input for the classifier pipeline.
text = "Arne is hungry, Nele should probably leave now."

In [21]:
from transformers import pipeline

# Build an inference pipeline from a checkpoint directory saved under the
# training output directory, running on the first CUDA device.
# NOTE(review): both the checkpoint step and the device are hard-coded -
# adjust them when re-running on another machine or after a new training run.
classifier = pipeline(
    "text-classification",
    model="./my_awesome_model/checkpoint-56828",
    device="cuda:0",
)

# Bare last expression so the notebook renders the prediction for the sample text.
classifier(text)

[{'label': 'NORMAL', 'score': 0.5706275701522827}]

In [31]:
from evaluate import evaluator
# Score the fine-tuned pipeline on the validation split with the `evaluate`
# task evaluator, using the accuracy metric loaded earlier in the notebook.
task_evaluator = evaluator("text-classification")

results = task_evaluator.compute(
    model_or_pipeline=classifier,
    data=dataset["val"],
    input_column="comment",
    metric=accuracy,
    # The pipeline reports label names ("NORMAL" / "SARCASTIC"); this mapping
    # translates them to class ids - presumably to match the dataset's
    # integer label column (confirm against the CSV schema).
    label_mapping=label2id,
)

print(results)

{'accuracy': 0.7718036855738641, 'total_time_in_seconds': 293.1384018079989, 'samples_per_second': 346.358577974718, 'latency_in_seconds': 0.00288718127279352}
