In [4]:
import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import numpy as np
import pandas as pd
import plotly

from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader 

from datasets import Dataset
import evaluate
from transformers import get_scheduler, DefaultDataCollator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [5]:
df_train = pd.read_csv('data/Batch_answers - train_data (no-blank).csv' )

# drop unnecessary columns
df_train = df_train.drop(['Unnamed: 6', 'total no.: 7987'], axis=1)

# remove quotes
df_train['q'] = df_train['q'].str.strip('"')
df_train['r'] = df_train['r'].str.strip('"')
df_train["q'"] = df_train["q'"].str.strip('"')
df_train["r'"] = df_train["r'"].str.strip('"')

# drop duplicated rows
df_train = df_train.drop_duplicates()

# numerical s
df_train['s_label'] = (df_train['s'] == 'AGREE').astype(int)

# some information about the dataset
print(df_train['s_label'].value_counts())
print('# of distinct data:\t', len(df_train['id'].unique()))
print('# of data:\t', len(df_train))

# save cleaned dataset
df_train.to_csv('data/train.tsv', sep='\t', index=False)

# load cleaned dataset
df_train = pd.read_csv('data/train.tsv', sep='\t')
df_train.sample(3)

0    29600
1     6415
Name: s_label, dtype: int64
# of distinct data:	 7987
# of data:	 36015


Unnamed: 0,id,q,r,s,q',r',s_label
16463,4634,Early Christian authorities rejected marriage ...,"Freethought Today , November 2008",DISAGREE,Early Christian authorities rejected marriage ...,"Freethought Today , November 2008",0
35687,9932,Many Christians often say that the proofs for ...,Just because the bible is over 2000 years old ...,DISAGREE,Many Christians often say that the proofs for ...,You do n't have any absolute proof and neither...,0
10486,2935,I 'm now slightly less confused about why I 'v...,"Do me a favor . Write , in your own words , tw...",DISAGREE,I 'm now slightly less confused,"Write , in your own words , two paragraphs . O...",0


In [6]:
df_train['question'] = 'What is the main point?'
df_train['context'] = df_train['q'] + df_train['r']
df_train['u_id'] = df_train['id']
df_train['id'] = df_train.index

# calculate the answer index in the context
df_train['answers_start'] = df_train[['q', 'q\'']].apply(lambda x: x['q'].find(x['q\'']), axis=1)
df_train['answers_text'] = df_train['q\'']
df_train['answers'] = df_train[['answers_start', 'answers_text']].apply(lambda x: {'answer_start': [x['answers_start']], 'text': [x['answers_text']]}, axis=1)

# drop unmatched answers (results from uncleaned dataset)
unmatch_idx = df_train['answers_start'] == -1
df_train = df_train[~unmatch_idx]

# FIXME: drop length > 360
df_train = df_train[(df_train['question']+df_train['context']).apply(lambda x: len(x)) < 360]

print(f'Num of unmatch data {unmatch_idx.sum()}')
df_train.sample(3)

df_train[['id', 'u_id', 'question', 'context', 'answers']].sample(3)

Num of unmatch data 5865


Unnamed: 0,id,u_id,question,context,answers
6534,6534,1833,What is the main point?,"No , it does n't make it right but it 's not r...","{'answer_start': [5], 'text': ['it does n't ma..."
24311,24311,6819,What is the main point?,TalkOrigins contains nothing but barely readab...,"{'answer_start': [0], 'text': ['TalkOrigins co..."
25978,25978,7225,What is the main point?,"It is so easy , so simple , so evident is not ...","{'answer_start': [0], 'text': ['It is so easy ..."


In [7]:
print('Length of the question + context:')
(df_train['question'] + df_train['context']).apply(lambda x: len(x)).describe()
# plotly.plot((df_train['question'] + df_train['context']).apply(lambda x: len(x)).sort_values(ascending=False), kind='hist', bins=100)

Length of the question + context:


count    10520.000000
mean       237.915875
std         74.205776
min         35.000000
25%        182.750000
50%        243.000000
75%        299.000000
max        359.000000
dtype: float64

In [8]:
dataset = Dataset.from_pandas(df_train[['id', 'question', 'context', 'answers']])
dataset = dataset.shuffle(seed=42).train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 7364
    })
    test: Dataset({
        features: ['id', 'question', 'context', 'answers', '__index_level_0__'],
        num_rows: 3156
    })
})

### 下面大多是 copy-paste，可以去看 [Hugging Face Course - Question answering](https://huggingface.co/course/chapter7/7?fw=pt)

In [9]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
train_dataset = dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
len(dataset["train"]), len(train_dataset)

  0%|          | 0/8 [00:00<?, ?ba/s]

(7364, 7364)

In [11]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [12]:
validation_dataset = dataset["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)
len(dataset["test"]), len(validation_dataset)

  0%|          | 0/4 [00:00<?, ?ba/s]

(3156, 3156)

In [13]:
print(train_dataset)
print(validation_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 7364
})
Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 3156
})


In [14]:
BATCH_SIZE = 8
model = AutoModelForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)

data_collator = DefaultDataCollator()

args = TrainingArguments(
  'test_trainer/',
  evaluation_strategy = "epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  num_train_epochs=3,
  weight_decay=0.01,
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

Epoch,Training Loss,Validation Loss
1,2.0604,No log
2,1.4806,No log
3,1.3222,No log


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3156
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-1000/tokenizer_config.json
Speci

TrainOutput(global_step=2763, training_loss=1.546786584070615, metrics={'train_runtime': 706.0777, 'train_samples_per_second': 31.288, 'train_steps_per_second': 3.913, 'total_flos': 2164791214430208.0, 'train_loss': 1.546786584070615, 'epoch': 3.0})

In [15]:
small_eval_set = dataset["test"].select(range(50))

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["test"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}

with torch.no_grad():
    outputs = model(**batch)

In [17]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [18]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [19]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [20]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [21]:
for i in range(50):
    print("Context: ", small_eval_set[i]["context"])
    print("Predicted answer: ", predicted_answers[i]["prediction_text"])
    print("Correct answer: ", theoretical_answers[i]["answers"]["text"][0])
    print()

Context:  So it 's not evidence vs evidence , it 's evidence vs religious belief .Well , youÂ ’ re just begging the question of what is evidence . You are saying religious belief ( others might say religious knowledge ) is not evidence . If it is scientific , it is evidence . If it is not scientific , it is not evidence . Have I got it right ?
Predicted answer:  So it 's not evidence vs evidence , it 's evidence vs religious belief .
Correct answer:  So it 's not evidence vs evidence , it 's evidence vs religious belief .

Context:  So it would be more loving to severely beat the child ... forever ?A person going to Hell is a consequence of sin . God does n't want it , but he ca n't stop it .
Predicted answer:  So it would be more loving to severely beat the child ... forever ?
Correct answer:  it would be more loving to severely beat the child ... forever ?

Context:  We 've been over this many many times before ...And you continually refuse to learn the facts of the matter .
Predicte

In [43]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
import string

In [44]:
def lcs(X, Y):
    # find the length of the strings
    X_token = word_tokenize(X)
    Y_token = word_tokenize(Y)

    X = [x for x in X_token if x not in string.punctuation]	
    Y = [x for x in Y_token if x not in string.punctuation]	

    m = len(X)
    n = len(Y)

    total_length = m + n
 
    # declaring the array for storing the dp values
    L = [[None]*(n + 1) for i in range(m + 1)]
 
    """Following steps build L[m + 1][n + 1] in bottom up fashion
    Note: L[i][j] contains length of LCS of X[0..i-1]
    and Y[0..j-1]"""
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0 :
                L[i][j] = 0
            elif X[i-1] == Y[j-1]:
                L[i][j] = L[i-1][j-1]+1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
 
    # L[m][n] contains the length of LCS of X[0..n-1] & Y[0..m-1]
    LSC = L[m][n]
    score = (LSC/(total_length - LSC))
    return score
# end of function lcs

In [48]:
df_all = pd.read_csv('data/train.tsv', sep='\t')

for i in range (50):
	print("Context: ", small_eval_set[i]["context"])
	print("Predicted answer: ", predicted_answers[i]["prediction_text"])
	u_id = df_train.loc[df_train["id"] == predicted_answers[i]["id"]]['u_id'].values[0]
	correct_answers = df_all.loc[df_all["id"] == u_id]['q\''].values
	# print("Correct answers: ", correct_answers)

	score = 0
	applied_answer = ""
	for j in range(len(correct_answers)):
		lcs_score = lcs(predicted_answers[i]["prediction_text"], correct_answers[j])
		score = max(score, lcs_score)
		applied_answer = correct_answers[j] if lcs_score == score else applied_answer
		
	print("Score: ", score)
	print("Applied answer: ", applied_answer)

	print()


	# print("Correct answers: ",  )

# df_train

Context:  So it 's not evidence vs evidence , it 's evidence vs religious belief .Well , youÂ ’ re just begging the question of what is evidence . You are saying religious belief ( others might say religious knowledge ) is not evidence . If it is scientific , it is evidence . If it is not scientific , it is not evidence . Have I got it right ?
Predicted answer:  So it 's not evidence vs evidence , it 's evidence vs religious belief .


NameError: name 'lcs_socre' is not defined