In [20]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader 

from datasets import Dataset
import evaluate
from transformers import DefaultDataCollator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [24]:
df_train = pd.read_csv('data/Batch_answers - train_data (no-blank).csv' )

# drop unnecessary columns
df_train = df_train.drop(['Unnamed: 6', 'total no.: 7987'], axis=1)

# remove quotes
df_train['q'] = df_train['q'].str.strip('"')
df_train['r'] = df_train['r'].str.strip('"')
df_train["q'"] = df_train["q'"].str.strip('"')
df_train["r'"] = df_train["r'"].str.strip('"')

# drop duplicated rows
df_train = df_train.drop_duplicates()

# numerical s
df_train['s_label'] = (df_train['s'] == 'AGREE').astype(int)

# some information about the dataset
print(df_train['s_label'].value_counts())
print('# of distinct data:\t', len(df_train['id'].unique()))
print('# of data:\t', len(df_train))

# save cleaned dataset
df_train.to_csv('data/train.tsv', sep='\t', index=False)

# load cleaned dataset
df_train = pd.read_csv('data/train.tsv', sep='\t')
df_train.sample(3)

0    29600
1     6415
Name: s_label, dtype: int64
# of distinct data:	 7987
# of data:	 36015


Unnamed: 0,id,q,r,s,q',r',s_label
31495,8778,Then how do we morally judge God ?,That 's kind of the point ... to morally judge...,DISAGREE,how do we morally judge God ?,mean they that being is on the same level does...,0
33713,9390,"Faith is the cornerstone of religion , yes . B...","Well said , Kronus . Very well indeed .",AGREE,Religion is a completely human invention,"Well said , Kronus . Very well indeed .",1
34155,9530,"That 's right , a higher view of the fetus tha...",Or in any culture . Women abort WORLDWIDE you ...,DISAGREE,higher view of the fetus typically in our cult...,Or in any culture I certainly AM bothered by t...,0


In [27]:
df_train['question'] = 'What is the main point?'
df_train['context'] = df_train['q'] + ' ' + df_train['s'] + ' ' + df_train['r']
df_train['id'] = df_train.index

# calculate the answer index in the context
df_train['answers_start'] = df_train[['q', 'q\'']].apply(lambda x: x['q'].find(x['q\'']), axis=1)
df_train['answers_text'] = df_train['q\'']
df_train['answers'] = df_train[['answers_start', 'answers_text']].apply(lambda x: {'answer_start': [x['answers_start']], 'text': [x['answers_text']]}, axis=1)

# drop unmatched answers (results from uncleaned dataset)
unmatch_idx = df_train['answers_start'] == -1
df_train = df_train[~unmatch_idx]

# FIXME: drop length > 384
# df_train = df_train[(df_train['question']+df_train['context']).apply(lambda x: len(x)) < 384]

print(f'Num of unmatch data {unmatch_idx.sum()}')
df_train.sample(3)

df_train[['id', 'question', 'context', 'answers']].sample(3)

Num of unmatch data 5865


Unnamed: 0,id,question,context,answers
35903,35903,What is the main point?,A prediction was made based on the theory of e...,"{'answer_start': [0], 'text': ['A prediction w..."
19911,19911,What is the main point?,What I do n't get is why anachronistic vitalis...,"{'answer_start': [0], 'text': ['What I do n't ..."
30970,30970,What is the main point?,And that 's when her career and popularity too...,"{'answer_start': [61], 'text': ['Now that she ..."


In [37]:
df_train['context'].apply(len).describe()

count    30150.000000
mean       698.901658
std        840.172937
min         22.000000
25%        279.000000
50%        474.000000
75%        809.000000
max      18023.000000
Name: context, dtype: float64

In [35]:
print('Length of the context:')
(df_train['context']).apply(lambda x: len(x)).describe()
px.bar((df_train['context']).apply(lambda x: len(x)).sort_values(ascending=False))

Length of the context:


In [5]:
dataset = Dataset.from_pandas(df_train[['id', 'question', 'context', 'answers']])
dataset = dataset.shuffle(seed=42).train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 7761
    })
    test: Dataset({
        features: ['id', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 3327
    })
})

### 下面大多是 copy-paste，可以去看 [Hugging Face Course - Question answering](https://huggingface.co/course/chapter7/7?fw=pt)

In [43]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [44]:
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
len(dataset["train"]), len(train_dataset)

 88%|████████▊ | 7/8 [00:01<00:00,  5.32ba/s]


(7761, 7761)

In [45]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [46]:
validation_dataset = dataset["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)
len(dataset["test"]), len(validation_dataset)

 75%|███████▌  | 3/4 [00:00<00:00,  3.36ba/s]


(3327, 3327)

In [47]:
print(train_dataset)
print(validation_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 7761
})
Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 3327
})


In [48]:
BATCH_SIZE = 8
EPOCHS = 3

model = AutoModelForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)

data_collator = DefaultDataCollator()

args = TrainingArguments(
  'test_trainer/',
  evaluation_strategy = "epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  num_train_epochs=EPOCHS,
  weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

loading configuration file config.json from cache at /home/alan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/alan/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not us

Epoch,Training Loss,Validation Loss
1,1.8791,No log
2,1.3523,No log
3,1.2485,No log


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3327
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-1000/tokenizer_config.json
Speci

TrainOutput(global_step=2913, training_loss=1.4316682615846528, metrics={'train_runtime': 275.9578, 'train_samples_per_second': 84.372, 'train_steps_per_second': 10.556, 'total_flos': 2281497096033792.0, 'train_loss': 1.4316682615846528, 'epoch': 3.0})

In [49]:
small_eval_set = dataset["test"].select(range(50))

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]


In [50]:
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

with torch.no_grad():
    outputs = model(**batch)

In [51]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [52]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [53]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [54]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [55]:
for i in range(50):
    print("Context: ", small_eval_set[i]["context"])
    print("Predicted answer: ", predicted_answers[i]["prediction_text"])
    print("Correct answer: ", theoretical_answers[i]["answers"]["text"][0])
    print()

Context:  All forms of life ? Animal life ? Plant life ? The mold growing in your bathroom ? DISAGREE I 'm sorry , but I thought it went without saying that we were discussing human life .
Predicted answer:  All forms of life ? Animal life ? Plant life ? The mold growing in your bathroom ?
Correct answer:  All forms of life ? Animal life ? Plant life ?

Context:  All of a sudden in the last 10 years they are trying to convice us that homosexuality is not `` wrong `` its a `` choice `` DISAGREE Actually the majority of homosexuals will tell you it 's not a choice . And I , being the wacky gal I am , have a tendency to believe the word of homosexuals on that particular subject over the word of straight neonazicons .
Predicted answer:  All of a sudden in the last 10 years they are trying to convice us that homosexuality is not `` wrong ``
Correct answer:  they are trying to convice us that homosexuality is not `` wrong `` its a `` choice ``

Context:  Not all of them . DISAGREE Correction