In [26]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat May 28 23:52:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    34W / 250W |  15993MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import os 
import pandas as pd 
import numpy as np
import sqlite3

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 36.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
   

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score,matthews_corrcoef

In [4]:
from transformers import AutoModel, AutoTokenizer, AutoModelForQuestionAnswering
from sklearn.model_selection import train_test_split


df = pd.read_csv('/content/drive/MyDrive/SEW.NLP/qasper_df.csv', index_col=0).drop_duplicates().reset_index(drop=True)
intro_papers_df = pd.read_csv('/content/drive/MyDrive/SEW.NLP/intro_papers_df.csv', index_col=0)

df["introPaper"], intro_papers_df["introPaper"] = False, True
df = pd.concat([df, intro_papers_df], axis=0).reset_index(drop=True)

# only keep questions related to the data
search_for = ["data", "feature", "variable", "result", "preprocessing", "labels", "baseline", "metric"]
df_filtered = df.loc[df["question"].str.contains("|".join(search_for))]
# df_filtered["start-end"] = df_filtered.apply(lambda x: (x["start_index"], x["end_index"]), axis=1)
# df_filtered = df_filtered.groupby(["question", "context"])["start-end"].apply(list).reset_index()

In [5]:
count_answers = df_filtered.groupby(["question", "context"])["answer"].count().reset_index()
context_mult_answers = count_answers[count_answers["answer"] > 1]["context"]
df_filtered = df_filtered[(df_filtered["start_index"] != -1) | (~df_filtered["context"].isin(context_mult_answers))]

In [6]:
df_filtered["min_start_idx"] = df_filtered.groupby(["question", "context"])["start_index"].transform("min")
df_filtered["max_end_idx"] = df_filtered.groupby(["question", "context"])["end_index"].transform("max")

In [7]:
def narrow_context(row):
    context = row["context"]
    start_index = max(row["min_start_idx"] - 4000, 0) 
    end_index = max(row["max_end_idx"] + 4000, 8000)
    end_index = min(end_index, len(context)-1)
    return context[start_index:end_index]

In [8]:
df_filtered["narrowed_context"] = df_filtered.apply(narrow_context, axis=1)

In [9]:
def find_index(row):
    answer = row["answer"]
    start_index = row["narrowed_context"].find(answer) if row["start_index"] != -1 else -1
    end_index = start_index + len(answer) - 1 if row["start_index"] != -1 else -1
    return start_index, end_index

df_filtered["start-end"] = df_filtered.apply(find_index, axis=1)
df_filtered["narrowed_context"] = df_filtered["narrowed_context"].str.strip()
df_filtered["answer"] = df_filtered["answer"].str.strip()

In [10]:
df_filtered = df_filtered.groupby(["question", "narrowed_context"])[["start-end", "answer"]].agg(lambda x: list(x)).reset_index()

In [11]:
X = df_filtered[['question', 'narrowed_context']]
y = df_filtered[["start-end", "answer"]]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

df_train = pd.concat([X_train, y_train], axis = 1)
df_val = pd.concat([X_val, y_val], axis = 1)
df_test = pd.concat([X_test, y_test], axis = 1)

In [12]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
def tokenize_df(df, tokenizer, MAX_LEN, stride):

    return tokenizer(
        list(df['question']),
        list(df['narrowed_context']),
        max_length = MAX_LEN,
        return_overflowing_tokens = True,
        truncation = 'only_second',
        return_offsets_mapping = True,
        stride = stride,
        padding = 'max_length'
    )


In [14]:
def preprocess_data(df, tokenizer, max_len, stride):
    start_positions = []
    end_positions = []

    tokenized = tokenize_df(df, tokenizer, max_len, stride)

    offsets_mapping = tokenized["offset_mapping"]
    for i, offset in enumerate(offsets_mapping):
        sequence_ids = tokenized.sequence_ids(i)
        
        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        df_index = tokenized["overflow_to_sample_mapping"][i]
        list_start_end = df.loc[df_index, "start-end"]
        if i == 0:
          print(list_start_end, offset[context_start][0], offset[context_end][1])

        for start_char, end_char in list_start_end:
            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                continue
            else:
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)
                if i == 0:
                    print(start_positions, end_positions)
                break
        
        if len(start_positions) == i:
            start_positions.append(0)
            end_positions.append(0)
    
        if i == 0:
            print(start_positions, end_positions)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    
    return tokenized

In [15]:
tokenizer = AutoTokenizer.from_pretrained("ixa-ehu/SciBERT-SQuAD-QuAC")

In [16]:
MAX_LEN = 512
stride = 128

data_preprocessing_pipeline = lambda df: preprocess_data(df, tokenizer, MAX_LEN, stride)

df_train_tokenized = data_preprocessing_pipeline(df_train)
df_val_tokenized = data_preprocessing_pipeline(df_val)
df_test_tokenized = data_preprocessing_pipeline(df_test)

[(4000, 4037), (4150, 4209)] 0 1941
[0] [0]
[(4000, 4002), (4330, 4333), (4590, 4593)] 0 2595
[0] [0]
[(4000, 4014)] 0 2410
[0] [0]


In [17]:
class TextDataset(Dataset):
  def __init__(self, questions, starts, ends, attention_masks, df_index, offset_mapping):
    self.questions = questions
    self.starts = starts
    self.ends = ends
    self.attention_masks = attention_masks
    self.df_index = df_index
    self.offset_mapping = offset_mapping

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, item):
    question = self.questions[item]
    attention_mask = self.attention_masks[item]
    start = self.starts[item]
    end = self.ends[item]
    df_index = self.df_index[item]
    offset_mapping = self.offset_mapping[item]


    return {
      'input_ids': torch.tensor(question, dtype = torch.long),
      'attention_mask': torch.tensor(attention_mask, dtype = torch.long),
      'start_positions': torch.tensor(start, dtype=torch.long),
      'end_positions' : torch.tensor(end, dtype = torch.long),
      'df_index': torch.tensor(df_index, dtype=torch.long),
      'offset_mapping': offset_mapping
    }

In [18]:
train_dataset = TextDataset(
    questions = df_train_tokenized['input_ids'],
    starts = df_train_tokenized['start_positions'],
    ends = df_train_tokenized['end_positions'],
    attention_masks = df_train_tokenized['attention_mask'],
    df_index = df_train_tokenized["overflow_to_sample_mapping"],
    offset_mapping = df_train_tokenized["offset_mapping"]
)

val_dataset = TextDataset(
    questions = df_val_tokenized['input_ids'],
    starts = df_val_tokenized['start_positions'],
    ends = df_val_tokenized['end_positions'],
    attention_masks = df_val_tokenized['attention_mask'],
    df_index = df_val_tokenized["overflow_to_sample_mapping"],
    offset_mapping = df_val_tokenized["offset_mapping"]
)

test_dataset = TextDataset(
    questions = df_test_tokenized['input_ids'],
    starts = df_test_tokenized['start_positions'],
    ends = df_test_tokenized['end_positions'],
    attention_masks = df_test_tokenized['attention_mask'],
    df_index = df_test_tokenized["overflow_to_sample_mapping"],
    offset_mapping = df_test_tokenized["offset_mapping"]
)

In [35]:
batch_size = 8


# train_sampler = RandomSampler(train_data)
# val_sampler = RandomSampler(val_data)
# test_sampler = RandomSampler(test_data)

# train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)
# val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size = batch_size)
# test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [36]:
import gc
gc.collect()
import torch
torch.cuda.empty_cache()
import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

SEED = 19

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device == torch.device("cuda"):
    torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda")

In [49]:
from transformers import TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("ixa-ehu/SciBERT-SQuAD-QuAC").to(device)
model_checkpoint = "ixa-ehu/SciBERT-SQuAD-QuAC"

model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    logging_dir = "./logs/runs",
    do_train = True,
    do_eval = True,
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps = 25

)

loading configuration file https://huggingface.co/ixa-ehu/SciBERT-SQuAD-QuAC/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a8c52b63ffbb5c867d6270ae21e7905d08e9801af48232ff0e08e3b755da233b.7c3af56d16d03847a339f67a41d8e0c1108d55c0977344615e38b438ec54680d
Model config BertConfig {
  "_name_or_path": "ixa-ehu/SciBERT-SQuAD-QuAC",
  "_num_labels": 2,
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading weights file https://

In [50]:
from transformers import default_data_collator

data_collator = default_data_collator


trainer = Trainer(
    model,
    args,
    train_dataset= train_dataset,
    eval_dataset= val_dataset,
    data_collator=data_collator
)

In [51]:
%%time
trainer.train()

***** Running training *****
  Num examples = 5660
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 354
The following columns in the training set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, df_index. If offset_mapping, df_index are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss
1,1.233,1.308563
2,0.9586,1.375567


***** Running Evaluation *****
  Num examples = 702
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, df_index. If offset_mapping, df_index are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
Saving model checkpoint to /content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177
Configuration saved in /content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177/config.json
Model weights saved in /content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 702
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, df_index. If offset_mapping, df_index are not expected by `BertForQuestionAnswering.forward`,  you can safely 

CPU times: user 10min 27s, sys: 3.06 s, total: 10min 30s
Wall time: 10min 31s


TrainOutput(global_step=354, training_loss=1.2174749381124637, metrics={'train_runtime': 631.5325, 'train_samples_per_second': 17.925, 'train_steps_per_second': 0.561, 'total_flos': 2957879286251520.0, 'train_loss': 1.2174749381124637, 'epoch': 2.0})

In [52]:
# current best "/content/drive/MyDrive/SEW.NLP/logs/scibert_squad_15/checkpoint-500"

PATH_MODEL = "/content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177" # path of model saved at best epoch

In [53]:
best_model = AutoModelForQuestionAnswering.from_pretrained(PATH_MODEL)

loading configuration file /content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177",
  "_num_labels": 2,
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

loading weights file /content/drive/MyDrive/SEW.NLP/logs/scibert-squad_24/checkpoint-177/pytorch_model.bin
All model checkpoint weights w

In [54]:
best_trainer = Trainer(
    best_model,
    args,
    train_dataset= train_dataset,
    eval_dataset= test_dataset,
    data_collator=data_collator
)

val_predictions = best_trainer.predict(val_dataset)

***** Running Prediction *****
  Num examples = 702
  Batch size = 8
The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: offset_mapping, df_index. If offset_mapping, df_index are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.


In [55]:
def compute_best_prediction(start_indexes, end_indexes,
                            start_logits, end_logits,
                            sequence_ids, offset_mapping, context):
    """
      Computes best feasible prediction and compares it with null prediction
    """


    best_score, null_score = -np.inf, -np.inf
    best_answer = ""
    best_start, best_end = 0, 0

    
    for start_index in start_indexes:
        for end_index in end_indexes:
            score = start_logits[start_index] + end_logits[end_index]
            if start_index == 0 or end_index == 0: # null prediction
                if start_index != end_index:
                    continue
                null_score = score
            
            elif start_index <= end_index and sequence_ids[start_index] == 1 and sequence_ids[end_index] == 1:
                if score > best_score:
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    if end_char > 0:
                        best_answer = context[start_char:end_char]
                        best_score = score
                        best_start = start_index
                        best_end = end_index
    
    score_diff = null_score - best_score
    

    return {
        "score_diff": score_diff,
        "pred_start": best_start,
        "pred_end": best_end,
        "pred_answer": best_answer
    }

In [56]:
def get_preds_df(predictions, df, tokenized_df, dataset):
    predicted_answers = []
    n_best_size = 20

    test_start_logits, test_end_logits = predictions.predictions
    start_labels, end_labels = predictions.label_ids

    for i in range(len(dataset)):

        start_label, end_label = start_labels[i], end_labels[i]
        start_logits, end_logits = test_start_logits[i], test_end_logits[i]

        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

        row_idx = dataset[i]["df_index"].item()
        row_context = df.loc[row_idx, "narrowed_context"]
        offset_mapping = dataset[i]["offset_mapping"]
        sequence_ids = tokenized_df.sequence_ids(i)
        
        # find predicted answer:
        prediction = compute_best_prediction(start_indexes, end_indexes, start_logits, end_logits, sequence_ids, offset_mapping, row_context)
        
        # find correct answer:
        start_char_true = offset_mapping[start_label][0]
        end_char_true = offset_mapping[end_label][1]
        correct_answer = row_context[start_char_true:end_char_true+1] if end_label > 0 else ""
        
        prediction["pred_token"] = tokenized_df[i].tokens[prediction["pred_start"]:prediction["pred_end"]+1]
        prediction["start_label"] = start_label
        prediction["end_label"] = end_label
        prediction["correct_answer"] = correct_answer
        prediction["correct_token"] = tokenized_df[i].tokens[start_label:end_label+1]

        predicted_answers.append(prediction)

    preds_df = pd.DataFrame(predicted_answers)
    preds_df["NoAnsw"] = preds_df.apply(lambda row: row["correct_token"] == ["[CLS]"], axis=1)
    
    return preds_df

In [57]:
val_preds_df = get_preds_df(val_predictions, df_val, df_val_tokenized, val_dataset)

In [58]:
val_preds_df

Unnamed: 0,score_diff,pred_start,pred_end,pred_answer,pred_token,start_label,end_label,correct_answer,correct_token,NoAnsw
0,6.430604,103,110,"hoax, propaganda and satire","[ho, ##ax, ,, propag, ##anda, and, sati, ##re]",0,0,,[[CLS]],True
1,-3.744671,407,407,CNN,[cnn],407,407,CNN:,[cnn],False
2,-4.712158,34,105,"CNN: In this model, we apply a 1-d CNN (Convol...","[cnn, :, in, this, model, ,, we, apply, a, 1, ...",34,34,CNN:,[cnn],False
3,1.418314,132,134,graph convolutional network,"[graph, convolutional, network]",0,0,,[[CLS]],True
4,8.940039,35,40,GAT + 2 Attn Heads,"[gat, +, 2, att, ##n, heads]",0,0,,[[CLS]],True
...,...,...,...,...,...,...,...,...,...,...
697,4.413710,106,107,perplex,"[per, ##plex]",0,0,,[[CLS]],True
698,-4.167419,480,497,EM INLINEFORM0 : evaluates the overall accurac...,"[em, in, ##line, ##form, ##0, :, evaluates, th...",480,484,EM INLINEFORM0,"[em, in, ##line, ##form, ##0]",False
699,-2.804528,99,114,leftmargin=*] EM INLINEFORM0 : evaluates the o...,"[left, ##mar, ##gin, =, *, ], em, in, ##line, ...",105,109,EM INLINEFORM0,"[em, in, ##line, ##form, ##0]",False
700,9.570248,57,478,HiStGen INLINEFORM7 gives the worst performanc...,"[hist, ##gen, in, ##line, ##form, ##7, gives, ...",0,0,,[[CLS]],True


In [59]:
import collections

def compute_f1(gold_toks, pred_toks):
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [60]:
def get_f1_df(preds_df):
    f1_data = {"threshold": [], "f1":[], "f1_Answ":[], "f1_NoAnsw":[]}

    for threshold in np.arange(-3, 10, 1):
        temp = preds_df.copy()
        temp["pred_token"] = temp.apply(lambda row: row["pred_token"] if row["score_diff"] < threshold else ["[CLS]"], axis=1)
        temp["f1"] = temp.apply(lambda r: compute_f1(r["correct_token"], r["pred_token"]), axis=1)

        f1_total = 100 * np.mean(temp["f1"])
        f1_scores = list(100 * temp.groupby("NoAnsw").agg(["mean"])["f1"]["mean"])
        
        f1_data["threshold"].append(threshold)
        f1_data["f1"].append(f1_total)
        f1_data["f1_Answ"].append(f1_scores[0])
        f1_data["f1_NoAnsw"].append(f1_scores[1])

    df_scores = pd.DataFrame(f1_data)

    return df_scores
val_scores = get_f1_df(val_preds_df)

In [61]:
val_scores

Unnamed: 0,threshold,f1,f1_Answ,f1_NoAnsw
0,-3,72.627485,16.596821,98.541667
1,-2,73.650332,21.633031,97.708333
2,-1,73.477952,23.790641,96.458333
3,0,73.349747,26.988841,94.791667
4,1,72.823506,29.829286,92.708333
5,2,71.629492,34.612178,88.75
6,3,70.12598,37.515488,85.208333
7,4,66.323394,39.004605,78.958333
8,5,64.523352,42.321589,74.791667
9,6,60.555474,42.83758,68.75


In [34]:
val_scores.to_csv("/content/drive/MyDrive/SEW.NLP/Edoardo_val_scibert_21.csv")

In [None]:
## Pick a threshold with decent f1 and good f1_Answ, like 3 or 4
threshold = 4

In [None]:
best_trainer = Trainer(
    best_model,
    args,
    train_dataset= train_dataset,
    eval_dataset= test_dataset,
    data_collator=data_collator
)

test_predictions = best_trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 1724
  Batch size = 20
The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: df_index, offset_mapping. If df_index, offset_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.


In [None]:
test_preds_df = get_preds_df(test_predictions, df_test, df_test_tokenized, test_dataset)

In [None]:
test_scores = get_f1_df(test_preds_df)

In [None]:
test_scores[test_scores["threshold"] == threshold] # final score

Unnamed: 0,threshold,f1,f1_Answ,f1_NoAnsw
7,4,70.543183,33.574426,84.931507
