In [None]:
import os
# Select which GPU to use, because Trainer offers no option for that.
# These variables are set before torch is imported so the restriction is in
# place before CUDA is initialised: devices are ordered by PCI bus id and only
# device "2" is made visible to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from torch import nn
from transformers import BertModel
from transformers import AutoTokenizer
from transformers import default_data_collator
from transformers import TrainingArguments, Trainer
from transformers import BertForSequenceClassification

In [None]:
# Columns of the raw CSV files that are dropped before training.
UNUSED_COLUMNS = ['end', 'start', 'user', 'title', 'section', 'answer', 'type', 'modanswer']


def load_split(csv_path):
    """Read one ';'-separated CSV split and shape it for sequence classification.

    The first CSV column is used as the index and NA detection is disabled
    (``na_filter=False``), so empty fields are not converted to NaN. The
    ``is_impossible`` column is cast to integers and renamed to ``label``;
    the columns listed in ``UNUSED_COLUMNS`` are dropped.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new ``pandas.DataFrame`` with an integer ``label`` column in the
        position ``is_impossible`` had, plus whatever columns remain.

    Raises:
        KeyError: If ``is_impossible`` or any of ``UNUSED_COLUMNS`` is missing.
    """
    return (
        pd.read_csv(csv_path, sep=";", index_col=0, na_filter=False)
        # Vectorised cast; assigning to an existing column keeps its position.
        .assign(is_impossible=lambda frame: frame["is_impossible"].astype("int64"))
        .drop(columns=UNUSED_COLUMNS)
        # Exact rename: only the column literally called `is_impossible` is
        # touched, never another column that merely contains that substring.
        .rename(columns={"is_impossible": "label"})
    )


path_train = "train_short_impossible.csv"
path_dev = "dev_short_impossible.csv"

train_df = load_split(path_train)
dev_df = load_split(path_dev)

train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

In [None]:
# Other checkpoint ids left here by the author (presumably alternatives to try):
#   deepset/xlm-roberta-large-squad2
#   mcsabai/huBert-fine-tuned-hungarian-squadv2
model_checkpoint = "SZTAKI-HLT/hubert-base-cc"

# Class index <-> class name mappings stored in the model config.
id2label = dict(enumerate(["False", "True"]))
label2id = {name: index for index, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = BertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Tokenisation settings consumed by prepare_train_features.
max_length = 384
doc_stride = 128
# True when the tokenizer pads on the right-hand side; decides the order in
# which question and context are passed to the tokenizer.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs to a fixed length.

    Leading whitespace is stripped from every question (written back into
    ``examples["question"]``) so that it does not use up token budget. The
    pair is then tokenized with padding to ``max_length``; only the context
    side is truncated. Question and context swap positions when the tokenizer
    pads on the left.

    NOTE(review): ``stride=doc_stride`` is passed, but
    ``return_overflowing_tokens`` is not, so presumably the overflow windows
    are discarded and each example yields exactly one feature -- confirm
    against the tokenizer documentation.

    Args:
        examples: Batch mapping that provides the ``"question"`` and
            ``"context"`` columns as lists of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # The side that must never be truncated (the question) goes first when
    # padding is on the right, second otherwise.
    if pad_on_right:
        first_key, second_key, truncation_mode = "question", "context", "only_second"
    else:
        first_key, second_key, truncation_mode = "context", "question", "only_first"

    return tokenizer(
        examples[first_key],
        examples[second_key],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        padding="max_length",
    )

In [None]:
# Columns removed after tokenisation; the same list applies to both splits.
columns_to_strip = ['token_type_ids', 'id', 'question', '__index_level_0__']

# Tokenize each split in batches, then drop the columns listed above.
train_tokenized_dataset = (
    train_dataset
    .map(prepare_train_features, batched=True)
    .remove_columns(columns_to_strip)
)
dev_tokenized_dataset = (
    dev_dataset
    .map(prepare_train_features, batched=True)
    .remove_columns(columns_to_strip)
)

In [None]:
# Accuracy metric object, loaded once and reused by compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``; the
            predictions are reduced with ``argmax`` over axis 1 to obtain one
            class index per row.

    Returns:
        The result of ``accuracy.compute`` for the predicted class indices
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Hyper-parameters of the fine-tuning run.
epochs = 3
lr = 2e-5
batch_size = 16

# Last path component of the checkpoint id; used as the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]

data_collator = default_data_collator

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_options = dict(
    output_dir=f"{model_name}",
    evaluation_strategy="epoch",   # evaluate at the end of every epoch
    save_strategy='no',            # no checkpoints are written
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    push_to_hub=False,
    remove_unused_columns=False,   # dataset columns are passed to the collator as they are
)
args = TrainingArguments(**training_options)

In [None]:
# CUDA_VISIBLE_DEVICES is restricted to a single device at the top of the
# notebook, so index 0 here refers to that device.
# NOTE(review): Trainer presumably places the model on its own device as
# well, which would make this explicit move redundant -- confirm.
model.to('cuda:0')

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset,
    eval_dataset=dev_tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured above. The call is the last
# expression of the cell, so its return value is shown as the cell output.
trainer.train()