In [3]:
import os
#specifying wich gpu to use because there is no option for that in Trainer 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import tqdm
import torch
import numpy as np
import pandas as pd

from datasets import Dataset

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import default_data_collator
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
csv_name = "short.csv"
df = pd.read_csv(csv_name, sep=";", index_col=0, na_filter=False)

In [3]:
df = df.drop(["section", "title", "type", "modanswer"], axis=1)

In [4]:
train, dev, test = \
              np.split(df.sample(frac=1, random_state=42),
                       [int(.80*len(df)), int(.99*len(df))])
print(len(train))
print(len(dev))
print(len(test))
train_dataset = Dataset.from_pandas(train)
dev_dataset = Dataset.from_pandas(dev)
test_dataset = Dataset.from_pandas(test)

12011
2852
151


In [5]:
model_checkpoint = "mcsabai/huBert-fine-tuned-hungarian-squadv1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

max_length = 384
doc_stride = 128
pad_on_right = tokenizer.padding_side == "right"

In [6]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        # answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        # print(len(examples["short_start"])
        # print(examples.keys())
        if examples['is_impossible'] == True:
            
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            
            # Start/end character index of the answer in the text.
            start_char = examples["start"][sample_index]
            end_char = examples["end"][sample_index]

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index-1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index+1)


    return tokenized_examples

In [7]:
#tokenizing

In [8]:
train_tokenized_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
train_tokenized_dataset = train_tokenized_dataset.remove_columns('token_type_ids')
dev_tokenized_dataset = dev_dataset.map(prepare_train_features, batched=True, remove_columns=dev_dataset.column_names)
dev_tokenized_dataset=dev_tokenized_dataset.remove_columns('token_type_ids')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:04<00:00,  3.18ba/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.73ba/s]


In [22]:

# model.load_state_dict(torch.load('huBert-fine-tuned-hungarian-squadv1-finetuned-squad/huBert-fine-tuned-hungarian-squadv1_3epoch_16batch_2e-5_startendint.pt'))

In [6]:


batch_size = 16
lr = 2e-5
epochs = 3

# model_name = model_checkpoint.split("/")[-1]
# args = TrainingArguments(
#     f"{model_name}-finetuned-squad",
#     evaluation_strategy = "epoch",
#     save_strategy = 'no',
#     learning_rate=lr,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=epochs,
#     weight_decay=0.01,
#     push_to_hub=False,
#     remove_unused_columns=False,
# )
# data_collator = default_data_collator

In [None]:
model.to('cuda:2')

In [10]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=dev_tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [32]:
# torch.save(model.state_dict(), f"{model_name}-finetuned-squad/{model_name}_10epoch_16batch_2e-5_startendint.pt")

In [4]:
path = os.path.join("huBert_MCsabai_"+batch_size+"_"+epochs+"_"+lr)

TypeError: can only concatenate str (not "int") to str

In [13]:
trainer.save_model(path)

Saving model checkpoint to trainer_model
Configuration saved in trainer_model/config.json
Model weights saved in trainer_model/pytorch_model.bin
tokenizer config file saved in trainer_model/tokenizer_config.json
Special tokens file saved in trainer_model/special_tokens_map.json


In [22]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

In [23]:
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [16]:
# path = os.path.join("..","trainer_model")
tokenizer = AutoTokenizer.from_pretrained("./short_16batch_3epoch/",local_files_only=True)
model = AutoModelForQuestionAnswering.from_pretrained("./short_16batch_3epoch/",local_files_only=True).to('cuda:0')

In [21]:
def test_data(test_data, model, tokenizer):
    qa_pipeline = pipeline(
    "question-answering",
    model = model,
    tokenizer = tokenizer,
    device = 0
    )
    
    # idx_to_id = dict()
    predictions = dict()
    for i, data in enumerate(tqdm.tqdm(test_data)):
        context = data['context']
        question = data['question']
        gold_answer = data['answer']
        # idx_to_id['i'] = data['id']
        id_of_context = data['id']
        prediction = qa_pipeline({
            'context': context,
            'question': question
        })
        predictions[i] = {'id':id_of_context, 
                          'context':context, 
                          'question':question, 
                          'gold_answer':gold_answer, 
                          'prediction':prediction['answer'],
                          'prediction score': prediction['score']}
    return predictions

In [22]:
preds = test_data(dev_dataset, model, tokenizer)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2852/2852 [00:23<00:00, 122.06it/s]


In [1]:
f1 = 0
compute = 0
for pred in preds.items():
    
    f1 += compute_f1(pred[1]['prediction'], pred[1]['gold_answer'])
    compute += compute_exact_match(pred[1]['prediction'], pred[1]['gold_answer'])

f1 = (f1/len(preds))
compute = (compute/len(preds))
print(f'f1 score:   ',f1, '\nexact match:',compute)

NameError: name 'preds' is not defined

In [None]:
#A lenti kód már csak úgy tesztel, hogy nem veszi figyelembe a duplikált kérdéseket, hiszen ezek elég sok hibát szülnek.

In [57]:
def test_data(test_data, model, tokenizer):
    not_to_check = df[df['question'].duplicated(keep=False)]['question'].unique()
    qa_pipeline = pipeline(
    "question-answering",
    model = model,
    tokenizer = tokenizer
    )
    
    # idx_to_id = dict()
    predictions = dict()
    for i, t_data in enumerate(tqdm.tqdm(test_data)):
        if t_data['question'] not in not_to_check:
            context = t_data['context']
            question = t_data['question']
            gold_answer = t_data['gold_answer']
            # idx_to_id['i'] = data['id']
            id_of_context = t_data['id']
            prediction = qa_pipeline({
                'context': context,
                'question': question
            })
            predictions[i] = {'id':id_of_context, 
                              'context':context, 
                              'question':question, 
                              'gold_answer':gold_answer, 
                              'prediction':prediction['answer'],
                              'prediction_score': prediction['score']}
    return predictions

In [37]:
preds = test_data(dev_dataset, model, tokenizer)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2852/2852 [02:05<00:00, 22.71it/s]


In [40]:
f1 = 0
compute = 0
for pred in preds.items():
    
    f1 += compute_f1(pred[1]['prediction'], pred[1]['gold_answer'])
    compute += compute_exact_match(pred[1]['prediction'], pred[1]['gold_answer'])

f1 = (f1/len(preds))
compute = (compute/len(preds))
print(f'f1 score:   ',f1, '\nexact match:',compute)

f1 score:    0.8434772342667081 
exact match: 0.7017543859649122


In [58]:
preds = test_data(data_asd, model, tokenizer)

114it [00:06, 18.16it/s]


In [29]:
pred_to_csv = pd.DataFrame(preds).T

In [30]:
pred_to_csv = pred_to_csv.reset_index(drop = True)
#pred_to_csv =pred_to_csv.drop(['prediction score'], axis=1)

In [33]:
pred_to_csv.to_csv('pred_test.csv', sep=";")

In [116]:
pred_unique = None
for pred in preds.items():
    
    print ('id:', pred[1]['id'],'\n','question:',pred[1]['question'], '\n', 'prediction:', pred[1]['prediction'], '\n','gold_answer:', pred[1]['gold_answer'], '\n')

id: 540 
 question: Mikor épült a budapesti Áldás utcában az általános iskola? 
 prediction: 1911–1912 
 gold_answer: 1911–1912 

id: 3221 
 question: Miért nem tudják a láb izzadságát elvezetni a zoknik? 
 prediction: hacsak nem úgy alakítják ki őket, hogy erre alkalmasak legyenek 
 gold_answer: a cipő belsejében nem vagy csak alig érintkezhetnek a környezettel 

id: 2555 
 question: Melyik magyar városokban volt megtekinthető a Seuso vándorkiállítás 2017-ben és 2018-ban? 
 prediction: Kaposvárott, Kecskeméten, Miskolcon, Nyíregyházán, Zalaegerszegen és Székesfehérvárott 
 gold_answer: Kaposvárott, Kecskeméten, Miskolcon, Nyíregyházán, Zalaegerszegen és Székesfehérvárott 

id: 2852 
 question: Miért tartották a hollót a halál madarának? 
 prediction: az emberi tetemeket is csapatostul lepték el 
 gold_answer: Mint dögevők, a kivégzések, csaták, háborúk után az emberi tetemeket is csapatostul lepték el 

id: 1359 
 question: Mikor nyitották meg a pesti magyar nyelvű színházat? 
 predic

In [35]:
df = pd.read_csv("pred_test.csv", sep=";")

In [37]:
df

Unnamed: 0.1,Unnamed: 0,id,context,question,gold_answer,prediction,prediction_score
0,0,540,"Áldás Utcai Általános Iskola, Budapest, 1911–1...",Mikor épült a budapesti Áldás utcában az által...,1911–1912,1911–1912,0.990547
1,1,3221,"Egy ruházatban akkor érezzük jól magunkat, ha ...",Miért nem tudják a láb izzadságát elvezetni a ...,a cipő belsejében nem vagy csak alig érintkezh...,"hacsak nem úgy alakítják ki őket, hogy erre al...",0.591594
2,2,2555,Az egyetemes antik kultúra egyik legnagyobb ér...,Melyik magyar városokban volt megtekinthető a ...,"Kaposvárott, Kecskeméten, Miskolcon, Nyíregyhá...","Kaposvárott, Kecskeméten, Miskolcon, Nyíregyhá...",0.071009
3,3,2852,A Holló csillagkép nevét a görög mitológia ért...,Miért tartották a hollót a halál madarának?,"Mint dögevők, a kivégzések, csaták, háborúk ut...",az emberi tetemeket is csapatostul lepték el,0.201222
4,4,1359,A 13. század óta népszerű a magyar költészetbe...,Mikor nyitották meg a pesti magyar nyelvű szín...,1812-es,1812-es,0.966016
...,...,...,...,...,...,...,...
109,109,928,A vizsgálatok ellenére számos atléta folytatta...,Tudtak-e arról a keletnémet sportolónők szülei...,a szüleik előzetes hozzájárulásával,tudomásuk nélkül,0.386311
110,110,2734,Az idő előrehaladásával párhuzamosan egyre job...,Milyen hitben volt Karl Begger és Hans Oemler ...,hogy a korábban eltervezett „katonai gyakorlat...,hogy a korábban eltervezett „katonai gyakorlat...,0.394501
111,111,954,"A 2013-as évet is jól kezdte, januárban az els...",Milyen meleg volt a Balaton 2013. június 22-én...,30 fokos,30 fokos,0.718921
112,112,295,1943 januárjában a szövetséges vezetők a casab...,Frederick E. Morgan altábornagy amerikai volt?,brit,Frederick E. Morgan altábornagy,0.390771
