In [12]:
import random
import torch
import json
import os.path
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \
    IntervalStrategy, EarlyStoppingCallback, AutoConfig, EvalPrediction
import numpy as np
from datasets import Dataset
from sklearn.metrics import f1_score

In [13]:
def convert_labels(data, label_map):
    data['label'] = [label_map[i] for i in data['label']]
    return data

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    if len(set(preds))==2:
        f1_bin = f1_score(p.label_ids, preds, average='binary')
    else:
        f1_bin = -1
    result = {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item(),
              'f1': f1_bin,
              'f1_macro': f1_score(p.label_ids, preds, average='macro'),
              'f1_micro': f1_score(p.label_ids, preds, average='micro')}
    return result

In [14]:
# variables

label_maps = {
    'debate': {'claim': 1, 'noclaim': 0},
    'sandy': {'y': 1, 'n': 0},
    'rumours':  {'comment': 2, 'deny': 1, 'support': 3, 'query': 0},
    'clex': {'Related - but not informative': 2, 'Not related': 1,
             'Related and informative': 3, 'Not applicable': 0}
}

label_maps_inverse = {
    'debate': {1: 'claim', 0: 'noclaim'},
    'sandy': {1: 'y', 0:'n'},
    'rumours':  {0: 'query', 1: 'deny', 2:'comment', 3:'support'},
    'clex': {2: 'Related - but not informative', 1: 'Not related',
             3: 'Related and informative', 0:'Not applicable'}
}

data_name = 'sandy'
data_dir = '/ukp-storage-1/beck/Repositories/temporal-adaptation/datasets/stowe-2018/labeled/stowe-2018-labeled-all.csv'
partition = 'time_stratified_partition'
output_dir = '/ukp-storage-1/beck/Repositories/dcwe/results/jupyter'
label_map = label_maps[data_name]
inverse_label_map = label_maps_inverse[data_name]
batch_size = 16
lr = 0.0001
n_epochs = 2
warmup_ratio =0.1
weight_decay=0.01
modelname = 'bert-base-cased'
time_field = 'date'
label_field = 'tag'
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)


<torch._C.Generator at 0x7f0010c5ce28>

In [15]:
dataframe = pd.read_csv(data_dir, encoding='utf-8')
n_labels = len(set(dataframe[label_field].values))
# take the corresponding split (if it exists otherwise take random split)

# rename data columns to common format
dataframe.rename(columns={label_field: 'label', time_field: 'time'}, inplace=True)
# convert string labels to numeric
dataframe['label'] = dataframe['label'].replace(label_map)

dataframe.dropna(subset=[partition, 'label', 'time'], inplace=True)
dataframe.time = pd.to_datetime(dataframe.time)
dataframe.reset_index(inplace=True, drop=True)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
output_dir = output_dir

tokenizer = AutoTokenizer.from_pretrained(modelname)
# load split data
train_data = dataframe[dataframe[partition] == "train"]
train_data = pd.DataFrame(dataframe.iloc[train_data.index])
train_dataset = Dataset.from_pandas(train_data).map(
    lambda ex: tokenizer(ex['text'], truncation=True, padding='max_length'), batched=True)

validation_data = dataframe[dataframe[partition] == "dev"]
validation_data = pd.DataFrame(dataframe.iloc[validation_data.index])
validation_dataset = Dataset.from_pandas(validation_data).map(
    lambda ex: tokenizer(ex['text'], truncation=True, padding='max_length'), batched=True)

test_data = dataframe[dataframe[partition] == "test"]
test_data = pd.DataFrame(dataframe.iloc[test_data.index])
test_dataset = Dataset.from_pandas(test_data).map(
    lambda ex: tokenizer(ex['text'], truncation=True, padding='max_length'), batched=True)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /ukp-storage-1/beck/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/re

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [16]:
# model preparation
def model_init():
    config = AutoConfig.from_pretrained(
        modelname,
        num_labels=n_labels
    )
    model = AutoModelForSequenceClassification.from_pretrained(modelname, config=config)
    return model

training_args = TrainingArguments(
    output_dir=output_dir,          # output directory
    num_train_epochs=n_epochs,              # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    warmup_ratio=warmup_ratio,                # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,               # strength of weight decay
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    save_total_limit=1,
    seed=seed
)

trainer = Trainer(
    model=model_init(),                   # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=validation_dataset,      # evaluation dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /ukp-storage-1/beck/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "mod

In [17]:
trainer.train()

eval_results = trainer.evaluate()

test_results = trainer.predict(test_dataset=test_dataset)
preds = test_results.predictions[0] if isinstance(test_results.predictions, tuple) else test_results.predictions
preds =  [inverse_label_map[i] for i in list(np.argmax(preds, axis=1))]
truth =  [label_map[i] for i in list(test_results.label_ids)]

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: time_stratified_partition, id, tweet_id, progressive_bin, __index_level_0__, controlled_partition, text, time.
***** Running training *****
  Num examples = 10358
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1296


Epoch,Training Loss,Validation Loss,Accuracy,F1 Bin,F1 Macro,F1 Micro
1,0.2708,0.290252,0.933102,0.830022,0.894189,0.933102
2,0.1525,0.205263,0.937446,0.859375,0.909576,0.937446


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: time_stratified_partition, id, tweet_id, progressive_bin, __index_level_0__, controlled_partition, text, time.
***** Running Evaluation *****
  Num examples = 1151
  Batch size = 16
Saving model checkpoint to /ukp-storage-1/beck/Repositories/dcwe/results/jupyter/checkpoint-648
Configuration saved in /ukp-storage-1/beck/Repositories/dcwe/results/jupyter/checkpoint-648/config.json
Model weights saved in /ukp-storage-1/beck/Repositories/dcwe/results/jupyter/checkpoint-648/pytorch_model.bin
Deleting older checkpoint [/ukp-storage-1/beck/Repositories/dcwe/results/jupyter/checkpoint-1296] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: time_stratified_partition, id, tweet_id, progressive_bin, __index_level_0__, controlled

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: time_stratified_partition, id, tweet_id, progressive_bin, __index_level_0__, controlled_partition, text, time.
***** Running Prediction *****
  Num examples = 5754
  Batch size = 16


KeyError: 0

In [9]:
preds = test_results.predictions[0] if isinstance(test_results.predictions, tuple) else test_results.predictions

In [10]:
inverse_label_map = label_maps_inverse[data_name]
preds =  [inverse_label_map[i] for i in list(np.argmax(preds, axis=1))]
truth =  [inverse_label_map[i] for i in list(test_results.label_ids)]

In [11]:
f1_score(truth, preds, pos_label='y')

0.8291280740977323