In [None]:
import os
# Select which GPU to use (Trainer has no option for that); set before torch is imported.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import tqdm
import torch
import numpy as np
import pandas as pd

from datasets import Dataset

from huggingface_hub import notebook_login

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import default_data_collator
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

In [None]:
# from huggingface_hub import HfApi

# api = HfApi(
#     # endpoint="https://huggingface.co/ZTamas/xlm-roberta-large-squad2-qa-milqa-impossible/", # Can be a Private Hub endpoint.
#     # token=os.environ["HF_TOKEN"],  # Token is not persisted on the machine. Never paste a real token here; revoke the one that was committed.
# )

# api.upload_folder(
#     folder_path="models/xlm-roberta-large-squad2_16batch_3epoch_2e-05lr_long_impossible/",
#     repo_id="ZTamas/xlm-roberta-large-squad2_impossible_long_answer",
#     ignore_patterns=".ipynb_checkpoints",
# )

In [None]:
# Dataset variant. The same name is reused later as the suffix of the saved-model
# directory and to pick which saved models get evaluated, so the CSV path is derived
# from it instead of being spelled out a second time.
csv_n = 'short_impossible'
csv_name = f"data/{csv_n}.csv"
# na_filter=False keeps empty cells as empty strings instead of converting them to NaN.
df = pd.read_csv(csv_name, sep=";", index_col=0, na_filter=False)

In [None]:
# Discard the metadata columns; none of the later cells read them.
df = df.drop(columns=["section", "title", "type", "modanswer"])

In [None]:
# Length (in characters) of the longest gold answer.
df['answer'].astype(str).str.len().max()

In [None]:
# Shuffle once with a fixed seed, then cut at 80 % and 99 %.
# Plain positional slicing is used instead of np.split(DataFrame, ...), which goes
# through the deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.80 * len(df))
dev_end = int(.99 * len(df))

train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]

print(len(dev))
# NOTE(review): the test rows are appended to dev, so `test` is a subset of `dev`
# from here on — confirm this overlap is intended.
dev = pd.concat([dev, test], axis=0)
print(len(train))
print(len(dev))
print(len(test))

train_dataset = Dataset.from_pandas(train)
dev_dataset = Dataset.from_pandas(dev)
test_dataset = Dataset.from_pandas(test)

In [None]:
# Alternative checkpoints that can be substituted below:
#   deepset/xlm-roberta-large-squad2
#   mcsabai/huBert-fine-tuned-hungarian-squadv2
model_checkpoint = "deepset/xlm-roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Passed to the tokenizer in prepare_train_features as `max_length` (size of one feature) ...
max_length = 384
# ... and as `stride` (overlap between consecutive features of one long context).
doc_stride = 128
# True when the tokenizer pads on the right; decides whether the question or the
# context is passed as the first sequence in prepare_train_features.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    A long context is split into several overlapping features (window
    ``max_length``, overlap ``doc_stride``). Each feature receives one
    ``start_positions`` / ``end_positions`` token-index pair. Examples flagged
    ``is_impossible`` and features whose window does not contain the answer
    are labelled with the index of the CLS token.

    Relies on the module-level ``tokenizer``, ``max_length``, ``doc_stride``
    and ``pad_on_right``.

    Args:
        examples: batch (dict of lists) with the columns ``question``,
            ``context``, ``is_impossible``, ``start`` and ``end``. ``start``
            and ``end`` are compared against the tokenizer's character
            offsets, i.e. they are used as character positions of the answer
            inside ``context``. The ``question`` column is left-stripped in
            place.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` /
        ``end_positions`` added.
    """
    # Leading whitespace in a question wastes tokens and can make the truncation
    # of the context fail, so strip it.
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize with truncation and padding, keeping the overflow as extra features
    # whose contexts overlap by `doc_stride` tokens.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Token index -> (start_char, end_char) in the original text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Sequence id carried by the context tokens.
    context_sequence_id = 1 if pad_on_right else 0

    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        # Impossible / out-of-window answers are labelled with the CLS token index.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Tells question tokens, context tokens and special tokens apart.
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Index of the example this feature belongs to.
        sample_index = sample_mapping[i]

        # `== True` is kept on purpose: it also works for NumPy booleans and does
        # not treat arbitrary truthy values (e.g. non-empty strings) as a flag.
        if examples['is_impossible'][sample_index] == True:  # noqa: E712
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the answer in the context.
        start_char = examples["start"][sample_index]
        end_char = examples["end"][sample_index]

        # First context token of this feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != context_sequence_id:
            token_start_index += 1
        # Last context token of this feature.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != context_sequence_id:
            token_end_index -= 1

        # Answer not fully inside this window -> CLS label.
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance to the first token that begins *after* start_char. The token
        # containing start_char is the one just before it, hence the `- 1`.
        # (The bound check covers the edge case where the answer is the last word.)
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk left to the last token that ends at or before end_char.
        # NOTE(review): this assumes `end` is an exclusive character offset — confirm.
        while offsets[token_end_index][1] > end_char:
            token_end_index -= 1
        end_positions.append(token_end_index)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
#tokenizing

In [None]:
# Turn the raw rows into model features. The original columns are removed because
# one example can expand into several features.
train_tokenized_dataset = train_dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=train_dataset.column_names,
)
dev_tokenized_dataset = dev_dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=dev_dataset.column_names,
)

In [None]:

# model.load_state_dict(torch.load('huBert-fine-tuned-hungarian-squadv1-finetuned-squad/huBert-fine-tuned-hungarian-squadv1_3epoch_16batch_2e-5_startendint.pt'))

In [None]:


# Hyper-parameters; they are also embedded in the name of the saved-model directory.
batch_size = 16
lr = 2e-5
epochs = 3

# Last component of the checkpoint id, used as the Trainer output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=model_name,
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    push_to_hub=False,
    remove_unused_columns=False,
)
data_collator = default_data_collator

In [None]:
# 'cuda:0' is the single device exposed through CUDA_VISIBLE_DEVICES in the first cell.
model.to('cuda:0')
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=dev_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model; the dev set is evaluated after every epoch (evaluation_strategy="epoch").
trainer.train()

In [None]:
# torch.save(model.state_dict(), f"{model_name}-finetuned-squad/{model_name}_10epoch_16batch_2e-5_startendint.pt")

In [None]:
# Run-specific directory name: <model>_<batch>batch_<epochs>epoch_<lr>lr_<dataset variant>
run_name = f"{model_name}_{batch_size}batch_{epochs}epoch_{lr}lr_{csv_n}"
path = os.path.join("models", run_name)

In [None]:
# Create the run directory unless it is already there.
# os.makedirs also creates the parent "models" directory, which os.mkdir would not
# (it raises FileNotFoundError on a fresh checkout).
if os.path.exists(path):
    print("this file already exists", path)
else:
    os.makedirs(path)

In [None]:
# Write the fine-tuned model into the run-specific directory `path`.
trainer.save_model(path)

In [None]:
# NOTE(review): TrainingArguments was created with push_to_hub=False and notebook_login()
# is not called in the visible cells, so this upload presumably relies on credentials
# cached elsewhere — confirm that pushing is intended here.
trainer.push_to_hub()

In [None]:
def normalize_text(s):
    """Return *s* lower-cased, without ASCII punctuation or English articles, single-spaced.

    The steps run in this order: lower-case, drop every character found in
    ``string.punctuation``, replace the whole words "a", "an" and "the" by a
    space, then collapse all whitespace runs to one space and trim the ends.

    NOTE(review): only English articles are removed; confirm this is intended
    if the evaluated answers are Hungarian.
    """
    import string, re

    punctuation = set(string.punctuation)
    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)

    lowered = s.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = article_pattern.sub(" ", without_punctuation)
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if *prediction* and *truth* are equal after ``normalize_text``, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

In [None]:
def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a gold answer.

    Both strings are passed through ``normalize_text`` and split on whitespace.
    The overlap is counted as a multiset intersection, so a token that occurs
    twice in both answers counts twice. With a plain set intersection,
    identical answers containing a repeated word would score below 1.

    Args:
        prediction: predicted answer text ("" means "no answer").
        truth: gold answer text ("" means "no answer").

    Returns:
        1 or 0 when either side has no tokens (1 only if both are empty),
        0 when no token is shared, otherwise the harmonic mean of precision
        and recall as a float.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # If either side is "no answer", F1 is 1 when they agree and 0 otherwise.
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # No shared token -> F1 is 0 (also avoids 0/0 below).
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [None]:
def test_data(test_data, model, tokenizer):
    """Run the question-answering pipeline over every example and collect the results.

    Args:
        test_data: iterable of examples; each one is indexed with the keys
            ``context``, ``question``, ``answer`` and ``id``.
        model: question-answering model handed to ``pipeline``.
        tokenizer: tokenizer handed to ``pipeline``.

    Returns:
        dict mapping the running example index to a record with the keys
        ``id``, ``context``, ``question``, ``gold_answer``, ``prediction``
        and ``prediction_score``. The score is also stored under the legacy
        key ``'prediction score'``.
    """
    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=0,
        # Lets the pipeline return an empty answer for unanswerable questions.
        handle_impossible_answer=True,
        max_answer_len=1000,
    )

    predictions = dict()
    for i, data in enumerate(tqdm.tqdm(test_data)):
        context = data['context']
        question = data['question']
        gold_answer = data['answer']
        id_of_context = data['id']
        prediction = qa_pipeline({
            'context': context,
            'question': question
        })
        predictions[i] = {'id': id_of_context,
                          'context': context,
                          'question': question,
                          'gold_answer': gold_answer,
                          'prediction': prediction['answer'],
                          # Same key as test_data_without_duplicate uses ...
                          'prediction_score': prediction['score'],
                          # ... plus the original spelling, kept so existing readers do not break.
                          'prediction score': prediction['score']}
    return predictions
# The code below evaluates while ignoring duplicated questions, because those
# produce quite a lot of errors.
def test_data_without_duplicate(test_data, model, tokenizer):
    """Like ``test_data``, but skip every question that occurs more than once in ``df``.

    The set of duplicated questions is taken from the module-level DataFrame
    ``df``, not from ``test_data``.

    Args:
        test_data: iterable of examples; each one is indexed with the keys
            ``context``, ``question``, ``answer`` and ``id``.
        model: question-answering model handed to ``pipeline``.
        tokenizer: tokenizer handed to ``pipeline``.

    Returns:
        dict mapping the example's position in ``test_data`` to a record with
        the keys ``id``, ``context``, ``question``, ``gold_answer``,
        ``prediction`` and ``prediction_score``. Skipped examples leave gaps
        in the keys.
    """
    # A set gives constant-time lookups; testing `in` against the NumPy array
    # returned by .unique() rescans the whole array for every example.
    not_to_check = set(df.loc[df['question'].duplicated(keep=False), 'question'])
    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=0,
        # Lets the pipeline return an empty answer for unanswerable questions.
        handle_impossible_answer=True,
        max_answer_len=1000,
    )

    predictions = dict()
    for i, t_data in enumerate(tqdm.tqdm(test_data)):
        question = t_data['question']
        if question in not_to_check:
            continue
        context = t_data['context']
        gold_answer = t_data['answer']
        id_of_context = t_data['id']
        prediction = qa_pipeline({
            'context': context,
            'question': question
        })
        predictions[i] = {'id': id_of_context,
                          'context': context,
                          'question': question,
                          'gold_answer': gold_answer,
                          'prediction': prediction['answer'],
                          'prediction_score': prediction['score']}
    return predictions

In [None]:
def compute(predictions):
    """Print the mean F1 and exact-match score over *predictions*.

    Args:
        predictions: dict of records as returned by ``test_data``; each record
            must have the keys ``prediction`` and ``gold_answer``.

    Returns:
        None. The scores are only printed. For an empty ``predictions`` dict a
        notice is printed instead of dividing by zero (this happens, for
        example, when every question was filtered out as a duplicate).
    """
    if not predictions:
        print("no predictions to score")
        return

    f1_total = 0
    exact_match_total = 0
    for record in predictions.values():
        f1_total += compute_f1(record['prediction'], record['gold_answer'])
        exact_match_total += compute_exact_match(record['prediction'], record['gold_answer'])

    f1 = f1_total / len(predictions)
    exact_match = exact_match_total / len(predictions)
    print('f1 score:   ', f1, '\nexact match:', exact_match)

In [None]:
def compute_score(path,dev_dataset, model, tokenizer):
    """Evaluate *model* on *dev_dataset* and print two score reports.

    Both prediction passes run first: one over all examples and one that
    skips duplicated questions. Then *path* is printed as a header, followed
    by the scores of each pass.
    """
    all_predictions = test_data(dev_dataset, model, tokenizer)
    deduplicated_predictions = test_data_without_duplicate(dev_dataset, model, tokenizer)

    print(path)
    reports = (
        # "score when the list-type answers are included"
        ("score, ha nézzük a listás válaszokat is:", all_predictions),
        # "score when the list-type answers are excluded"
        ("score, ha nem nézzük a listás válaszokat:", deduplicated_predictions),
    )
    for heading, scored_predictions in reports:
        print("\n", heading)
        compute(scored_predictions)
    

In [None]:
# Every directory found under "models", in os.walk order.
# NOTE(review): the [2:-1] slice drops the first two and the last walked directory;
# which folders that hits depends on the directory layout — confirm it still matches.
paths = [dirpath for dirpath, _dirnames, _filenames in os.walk("models")][2:-1]

In [None]:

# Sanity check: does the third saved-model path belong to the current dataset variant?
# NOTE(review): the left operand was missing (the bare `in paths[2]` is a SyntaxError);
# `csv_n` is assumed because the evaluation loop filters with `csv_n in path` — confirm.
csv_n in paths[2]

In [None]:
# Score every saved model that was trained on the current dataset variant.
for path in paths:
    if csv_n not in path:
        continue
    tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
    model = AutoModelForQuestionAnswering.from_pretrained(path, local_files_only=True).to('cuda:0')
    compute_score(path, dev_dataset, model, tokenizer)
    # Drop the references before the next model is loaded.
    del model
    del tokenizer

In [None]:
import evaluate

In [None]:
# Accuracy metric object from the `evaluate` library.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred, labels):
    """Return the result of ``accuracy.compute`` for *eval_pred* against the reference *labels*."""
    result = accuracy.compute(predictions=eval_pred, references=labels)
    return result

In [None]:
# Load one specific saved run from disk and predict on the dev set.
# NOTE(review): this directory is a huBert run, not the checkpoint trained above;
# it must already exist under "models" — confirm.
path = os.path.join(
    "models",
    "huBert-fine-tuned-hungarian-squadv2_16batch_3epoch_2e-05lr_short_impossible",
)
tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
model = AutoModelForQuestionAnswering.from_pretrained(path, local_files_only=True)
model = model.to('cuda:0')
preds = test_data(dev_dataset, model, tokenizer)

In [None]:
# True for every example where the predicted answer text is empty.
ev_preds = [record['prediction'] == "" for record in preds.values()]

In [None]:
# Accuracy of "predicted an empty answer" against the gold is_impossible flags of the dev set.
compute_metrics(ev_preds, dev_dataset['is_impossible'])