In [1]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import numpy as np
import pandas as pd
import plotly

from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader 

from datasets import Dataset
import evaluate
from transformers import get_scheduler, DefaultDataCollator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
2022-11-04 21:03:23.536895: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-04 21:03:23.674149: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-04 21:03:24.158873: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/alan/.cuda/lib64:/home/alan/.cuda/extras/CUPTI/lib64:/home/alan/.cuda/lib64:/home/alan/.cuda/extras/CUPTI/lib64:
2022-11-04 21:03:24.158943: W tenso

In [2]:
df_train = pd.read_csv('data/Batch_answers - train_data (no-blank).csv' )

# drop unnecessary columns
df_train = df_train.drop(['Unnamed: 6', 'total no.: 7987'], axis=1)

# remove quotes
df_train['q'] = df_train['q'].str.strip('"')
df_train['r'] = df_train['r'].str.strip('"')
df_train["q'"] = df_train["q'"].str.strip('"')
df_train["r'"] = df_train["r'"].str.strip('"')

# drop duplicated rows
df_train = df_train.drop_duplicates()

# numerical s
df_train['s_label'] = (df_train['s'] == 'AGREE').astype(int)

# some information about the dataset
print(df_train['s_label'].value_counts())
print('# of distinct data:\t', len(df_train['id'].unique()))
print('# of data:\t', len(df_train))

# save cleaned dataset
df_train.to_csv('data/train.tsv', sep='\t', index=False)

# load cleaned dataset
df_train = pd.read_csv('data/train.tsv', sep='\t')
df_train.sample(3)

0    29600
1     6415
Name: s_label, dtype: int64
# of distinct data:	 7987
# of data:	 36015


Unnamed: 0,id,q,r,s,q',r',s_label
8089,2248,Seriously ? That 's your reasoning ?,"Er , no . That is AnswersinGenesis ' reasoning...",DISAGREE,Seriously ? That 's your reasoning ?,Are you having difficulties,0
29515,8210,Let me get this straight . We ca n't force a w...,Yes because it is `` officially `` a life . Wh...,AGREE,We ca n't force a woman to give birth if she d...,Yes because it is `` officially `` a life .,1
16819,4768,Foolish Clergymen . How can this work ? How ca...,"You know , that sounds exactly what you are do...",DISAGREE,Nothing in the bible is to be beleived unless ...,"You know , that sounds exactly what you are do...",0


In [3]:
df_train['question'] = 'What is the main point?'
df_train['context'] = df_train['q'] + df_train['r']
df_train['id'] = df_train.index

# calculate the answer index in the context
df_train['answers_start'] = df_train[['q', 'q\'']].apply(lambda x: x['q'].find(x['q\'']), axis=1)
df_train['answers_text'] = df_train['q\'']
df_train['answers'] = df_train[['answers_start', 'answers_text']].apply(lambda x: {'answer_start': [x['answers_start']], 'text': [x['answers_text']]}, axis=1)

# drop unmatched answers (results from uncleaned dataset)
unmatch_idx = df_train['answers_start'] == -1
df_train = df_train[~unmatch_idx]

# FIXME: drop length > 360
df_train = df_train[(df_train['question']+df_train['context']).apply(lambda x: len(x)) < 360]

print(f'Num of unmatch data {unmatch_idx.sum()}')
df_train.sample(3)

df_train[['id', 'question', 'context', 'answers']].sample(3)

Num of unmatch data 5865


Unnamed: 0,id,question,context,answers
32666,32666,What is the main point?,C.S . Lewis also pointed out that even our abi...,"{'answer_start': [34], 'text': ['even our abil..."
23354,23354,What is the main point?,"Wait a minute , I fail to see how it not being...","{'answer_start': [0], 'text': ['Wait a minute ..."
28625,28625,What is the main point?,Why do suppose exaggeration and embellishment ...,"{'answer_start': [61], 'text': ['to facts is n..."


In [4]:
print('Length of the question + context:')
(df_train['question'] + df_train['context']).apply(lambda x: len(x)).describe()
# plotly.plot((df_train['question'] + df_train['context']).apply(lambda x: len(x)).sort_values(ascending=False), kind='hist', bins=100)

Length of the question + context:


count    10520.000000
mean       237.915875
std         74.205776
min         35.000000
25%        182.750000
50%        243.000000
75%        299.000000
max        359.000000
dtype: float64

In [5]:
dataset = Dataset.from_pandas(df_train[['id', 'question', 'context', 'answers']])
dataset = dataset.shuffle(seed=42).train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 7364
    })
    test: Dataset({
        features: ['id', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 3156
    })
})

### 下面大多是 copy-paste，可以去看 [Hugging Face Course - Question answering](https://huggingface.co/course/chapter7/7?fw=pt)

In [6]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
len(dataset["train"]), len(train_dataset)

 88%|████████▊ | 7/8 [00:01<00:00,  4.83ba/s]


(7364, 7364)

In [8]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [9]:
validation_dataset = dataset["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)
len(dataset["test"]), len(validation_dataset)

 75%|███████▌  | 3/4 [00:00<00:00,  4.31ba/s]


(3156, 3156)

In [10]:
print(train_dataset)
print(validation_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 7364
})
Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 3156
})


In [11]:
BATCH_SIZE = 8
model = AutoModelForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)

data_collator = DefaultDataCollator()

args = TrainingArguments(
  'test_trainer/',
  evaluation_strategy = "epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  num_train_epochs=3,
  weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

Epoch,Training Loss,Validation Loss
1,2.0776,No log
2,1.485,No log
3,1.3048,No log


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3156
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-1000/tokenizer_config.json
Speci

TrainOutput(global_step=2763, training_loss=1.552578572988769, metrics={'train_runtime': 262.6554, 'train_samples_per_second': 84.11, 'train_steps_per_second': 10.519, 'total_flos': 2164791214430208.0, 'train_loss': 1.552578572988769, 'epoch': 3.0})

In [12]:
small_eval_set = dataset["test"].select(range(50))

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]


In [13]:
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

with torch.no_grad():
    outputs = model(**batch)

In [14]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [15]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [16]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [17]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [18]:
for i in range(50):
    print("Context: ", small_eval_set[i]["context"])
    print("Predicted answer: ", predicted_answers[i]["prediction_text"])
    print("Correct answer: ", theoretical_answers[i]["answers"]["text"][0])
    print()

Context:  '' infer ( an unknown ) from something that is known ; conjecture. ``and a conjecture is ;
Predicted answer:  infer ( an unknown ) from something that is known ; conjecture. ``
Correct answer:  infer ( an unknown ) from something that is known ;

Context:  While I again feel that it should be assumed he would do so to begin with , I can agree that a simple answer as you say would have been easy enough .You accuse others of making an assumption to the answer . But then you admit that YOU think the answer should have been assumed . Do you see a problem here ?
Predicted answer:  I can agree that a simple answer as you say would have been easy enough .
Correct answer:  While I again feel that it should be assumed he would do so to begin with

Context:  I 'm wondering what it implies that God brought each of the animals to Adam and none of them proved to be a suitable `` helper. `` What exactly does that mean , given the activities that seem to have been carried on between Adam an