# Question Answering System

In [12]:
pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.43.2-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m831.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:02[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.24.2-py3-none-any.whl (417 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.2/417.2 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl (2.4 MB)
[2K     [90m━━━━━━━━━

## (1) Process data

Use the official dev set as the test set, and
split the original training set into a training set and a validation set (5000 samples). Prepare
the data according to the requirements of ML model training.

In [1]:
import pandas as pd

In [2]:
import json

def preprocess(file_name):
    """Flatten a SQuAD-format JSON file into one record per annotated answer.

    file_name: path to a UTF-8 JSON file whose top-level 'data' list holds
        articles, each with 'title' and 'paragraphs'; every paragraph has a
        'context' and a 'qas' list of questions.
    return: list of dicts with keys 'id', 'title', 'context', 'question' and
        'answers', where 'answers' wraps a single answer as
        {'text': [text], 'answer_start': [start]}.

    A question with several answers produces several records sharing the same
    'id'; a question whose 'answers' list is empty produces no record at all.
    """
    with open(file_name, 'r', encoding='utf-8') as handle:
        squad = json.load(handle)

    # Nested comprehension walks article -> paragraph -> question -> answer in
    # file order, so the records come out in the order they appear in the file.
    return [
        {
            'id': qa['id'],
            'title': article['title'],
            'context': paragraph['context'],
            'question': qa['question'],
            'answers': {
                'text': [answer['text']],
                'answer_start': [answer['answer_start']],
            },
        }
        for article in squad['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
        for answer in qa['answers']
    ]


In [3]:
# Flatten the dev and train JSON files (dev is read first, then train).
dev_data, train_data = (
    preprocess(squad_file) for squad_file in ('dev-v2.0.json', 'train-v2.0.json')
)

In [5]:
# Peek at the first two flattened records instead of rendering the whole list.
dev_data[:2]

[{'id': '56ddde6b9a695914005b9628',
  'title': 'Normans',
  'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
  'question': 'In what country is Normandy located?',
  'answers': {'text': ['France'], 'answer_start': [159]}},
 {'id': '56ddde6b9a695914005b9628',
  'title': 'Normans',
  'context':

In [6]:
# Number of flattened records in each split.
for heading, rows in (('length of dev:', dev_data), ('length of train:', train_data)):
    print(heading, len(rows))

length of dev: 20302
length of train: 86821


In [41]:
# Count the flattened training rows per question id, ordered by id.
(
    pd.DataFrame(train_data)
    .groupby(['id'])
    .count()
    .reset_index()
    .sort_values(by='id')
)

Unnamed: 0,id,title,context,question,answers
0,56be85543aeaaa14008c9063,1,1,1,1
1,56be85543aeaaa14008c9065,1,1,1,1
2,56be85543aeaaa14008c9066,1,1,1,1
3,56be86cf3aeaaa14008c9076,1,1,1,1
4,56be86cf3aeaaa14008c9078,1,1,1,1
...,...,...,...,...,...
86816,573636bf9c79961900ff7e06,1,1,1,1
86817,573636bf9c79961900ff7e07,1,1,1,1
86818,573636bf9c79961900ff7e08,1,1,1,1
86819,573636bf9c79961900ff7e09,1,1,1,1


In [42]:
# Tabular view of the flattened dev records; a question with several answers
# appears as several rows, one per answer.
pd.DataFrame(dev_data)

Unnamed: 0,id,title,context,question,answers
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France'], 'answer_start': [159]}"
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France'], 'answer_start': [159]}"
2,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France'], 'answer_start': [159]}"
3,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France'], 'answer_start': [159]}"
4,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries'], 'answer_..."
...,...,...,...,...,...
20297,5737aafd1c456719005744ff,Force,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,"{'text': ['sthène'], 'answer_start': [665]}"
20298,5737aafd1c456719005744ff,Force,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,"{'text': ['sthène'], 'answer_start': [665]}"
20299,5737aafd1c456719005744ff,Force,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,"{'text': ['sthène'], 'answer_start': [665]}"
20300,5737aafd1c456719005744ff,Force,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,"{'text': ['sthène'], 'answer_start': [665]}"


- train_data: 没有出现同一个context、同一个question对应不同回答的情况

- dev_data：有重复

### Drop duplicates

In [13]:
# Collapse the dev records to one row per (id, question, context); the first
# row of each group is the one that is kept.
dev_data_1 = pd.DataFrame(dev_data).drop_duplicates(
    subset=['id', 'question', 'context'],
    keep='first',
)
dev_data_1

Unnamed: 0,id,title,context,question,answers
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,"{'text': ['France'], 'answer_start': [159]}"
4,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries'], 'answer_..."
8,56ddde6b9a695914005b962a,Normans,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"{'text': ['Denmark, Iceland and Norway'], 'ans..."
12,56ddde6b9a695914005b962b,Normans,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,"{'text': ['Rollo'], 'answer_start': [308]}"
16,56ddde6b9a695914005b962c,Normans,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,"{'text': ['10th century'], 'answer_start': [671]}"
...,...,...,...,...,...
20277,5737aafd1c456719005744fb,Force,"The pound-force has a metric counterpart, less...",What is the metric term less used than the New...,"{'text': ['kilogram-force'], 'answer_start': [..."
20282,5737aafd1c456719005744fc,Force,"The pound-force has a metric counterpart, less...",What is the kilogram-force sometimes reffered ...,"{'text': ['kilopond'], 'answer_start': [114]}"
20287,5737aafd1c456719005744fd,Force,"The pound-force has a metric counterpart, less...",What is a very seldom used unit of mass in the...,"{'text': ['slug'], 'answer_start': [274]}"
20292,5737aafd1c456719005744fe,Force,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,"{'text': ['kip'], 'answer_start': [712]}"


In [14]:
# Same de-duplication for the training records: one row per
# (id, question, context), keeping the first row of each group.
train_data_1 = pd.DataFrame(train_data).drop_duplicates(
    subset=['id', 'question', 'context'],
    keep='first',
)
train_data_1

Unnamed: 0,id,title,context,question,answers
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start'..."
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': ['singing and dancing'], 'answer_star..."
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': ['2003'], 'answer_start': [526]}"
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [..."
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}"
...,...,...,...,...,...
86816,5735d259012e2f140011a09d,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,"{'text': ['Oregon'], 'answer_start': [229]}"
86817,5735d259012e2f140011a09e,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,"{'text': ['Rangoon'], 'answer_start': [414]}"
86818,5735d259012e2f140011a09f,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,"{'text': ['Minsk'], 'answer_start': [476]}"
86819,5735d259012e2f140011a0a0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,"{'text': ['1975'], 'answer_start': [199]}"


In [15]:
# The de-duplication keys on (id, question, context) only; answers are not
# part of the key, so the heading says exactly that.
print('after dropping duplicate (id, question, context) rows')
print('length of test set:', len(dev_data_1))
print('length of train set:', len(train_data_1))

after drop duplicates for same context, same question, same answer
length of test set: 5928
length of train set: 86821


### Dataset

In [16]:
import random

# Shuffle a copy with a fixed seed: the split is then reproducible, and
# re-running the cell neither reorders `train_data` in place nor changes
# which examples land in the validation set.
SPLIT_SEED = 42
validation_length = 5000

shuffled_train_data = list(train_data)
random.Random(SPLIT_SEED).shuffle(shuffled_train_data)

# The last `validation_length` shuffled examples form the validation set.
training_data = shuffled_train_data[:-validation_length]
validation_data = shuffled_train_data[-validation_length:]

# Every example must end up in exactly one of the two splits.
assert len(training_data) + len(validation_data) == len(train_data)

print(len(training_data))    # size of the training split
print(len(validation_data))  # size of the validation split


81821
5000


In [17]:
# Wrap the training and validation record lists in DataFrames.
train_data_, valid_data_ = (
    pd.DataFrame(split_rows) for split_rows in (training_data, validation_data)
)

In [32]:
# NOTE(review): the output recorded for this cell shows the pyarrow 9.0.0
# source build failing, and the next cell then fails inside pyarrow. Confirm
# that these pins are still wanted and that they agree with the packages
# installed at the top of the notebook.
!pip install pyarrow==9.0.0
!pip install datasets==2.10.0







Collecting pyarrow==9.0.0
  Downloading pyarrow-9.0.0.tar.gz (873 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m873.1/873.1 kB[0m [31m523.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: pyarrow
  Building wheel for pyarrow (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for pyarrow [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[222 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/private/var/folders/6w/218h06nd01b7jrt1y56dhm6w0000gq/T/pip-build-env-oii73r84/overlay/lib/python3.11/site-packages/setuptools_scm/_integration/pyproject_reading.py", lin

In [34]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Convert the pandas DataFrames into Hugging Face Datasets.
# preserve_index=False: after drop_duplicates `dev_data_1` no longer has a
# default index, and from_pandas would otherwise store that index as an extra
# `__index_level_0__` column carried only by the test split.
train_dataset = Dataset.from_pandas(train_data_, preserve_index=False)
validation_dataset = Dataset.from_pandas(valid_data_, preserve_index=False)
test_dataset = Dataset.from_pandas(dev_data_1, preserve_index=False)

# Bundle the three splits into a single DatasetDict.
data = DatasetDict({"train": train_dataset, 'test': test_dataset, 'validation': validation_dataset})

# Show the splits and their columns.
print(data)


AttributeError: module 'pyarrow.lib' has no attribute 'ListViewType'

In [49]:
# Inspect the first three training examples (returned as a dict of column lists).
data["train"][0:3]

{'id': ['5727e04eff5b5019007d974b',
  '570d1e05b3d812140066d43b',
  '57303ce9a23a5019007fcfdc'],
 'title': ['New_Haven,_Connecticut', 'Valencia', 'The_Blitz'],
 'context': ['New Haven Harbor is home to the Port of New Haven, a deep-water seaport with three berths capable of hosting vessels and barges as well as the facilities required to handle break bulk cargo. The port has the capacity to load 200 trucks a day from the ground or via loading docks. Rail transportation access is available, with a private switch engine for yard movements and private siding for loading and unloading. Approximately 400,000 square feet (40,000 m2) of inside storage and 50 acres (200,000 m2) of outside storage are available at the site. Five shore cranes with a 250-ton capacity and 26 forklifts, each with a 26-ton capacity, are also available.',
  "The city remained in the hands of Christian troops until 1102, when the Almoravids retook the city and restored the Muslim religion. Although the self-styled 'Em

### tokenizer

In [19]:
from transformers import AutoTokenizer

# Tokenizer used by preprocess_function. The model that is fine-tuned must be
# loaded from a checkpoint with the same vocabulary and casing as this one.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [51]:
def preprocess_function(examples):
    '''
    Tokenize a batch of question/context pairs and label each answer span.

    examples: batch mapping with "question", "context" and "answers" columns
        (lists of equal length). Each answers item holds parallel "text" and
        "answer_start" lists, of which only the first entry is used.
    return: the output of the module-level `tokenizer` without
        "offset_mapping" and with two added lists, "start_positions" and
        "end_positions", giving the token index of the first and last answer
        token for every example. Examples with no answer, or whose answer is
        not entirely inside the (possibly truncated) context, get (0, 0).
    '''

    # Strip surrounding whitespace from every question.
    questions = [q.strip() for q in examples["question"]]

    # Encode question + context pairs; only the context (second sequence) may
    # be truncated, and character offsets are requested for span labelling.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The offsets are needed only here, not as a model input.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: label the example with position 0.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Character span [start_char, end_char) of the answer in the context.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # For each token of example i: which input sequence it belongs to
        # (1 marks context tokens).
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last context token.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx

        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # The answer must lie entirely inside the kept context. Comparing the
        # context bounds against the *same* end of the answer (start with
        # start, end with end) also catches answers that are only partly cut
        # off by truncation; those previously got an end label one past the
        # last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after
            # the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [52]:
# Tokenize every split in batches and drop the training split's raw columns,
# keeping the features returned by preprocess_function.
tokenized_data = data.map(
    preprocess_function,
    remove_columns=data["train"].column_names,
    batched=True,
)

  0%|          | 0/82 [00:00<?, ?ba/s]


KeyboardInterrupt



In [None]:
# Show the tokenized validation split.
tokenized_data['validation']

In [None]:
# Show the tokenized training split.
tokenized_data["train"]

In [None]:
# Persist the tokenized splits to disk.
# NOTE(review): this absolute /kaggle/working path presumably only exists on
# Kaggle — confirm before running elsewhere.
tokenized_data.save_to_disk('/kaggle/working/tokenized_data')

In [None]:
from datasets import load_from_disk

# Reload the tokenized splits from the directory written by save_to_disk.
# NOTE(review): `reloaded_dataset` is not referenced by any later cell in view;
# the training cells keep using `tokenized_data`.
reloaded_dataset = load_from_disk("/kaggle/working/tokenized_data")

## (2) Fine-tune

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the same checkpoint the tokenizer was built from: a cased model fed
# token ids from the uncased tokenizer's vocabulary would see mismatched ids.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
from transformers import DefaultDataCollator

# Plain batching collator; no dynamic padding is needed because the features
# were already padded to max_length during tokenization.
data_collator = DefaultDataCollator()

In [None]:
from huggingface_hub import notebook_login

notebook_login()
# Enter the Hugging Face access token at the prompt; never store it in the
# notebook. Any token that was ever saved in this file should be revoked.

In [None]:
# Show the tokenized training split that will be passed to the Trainer.
tokenized_data["train"]

In [None]:
# Show the tokenized validation split that will be passed to the Trainer.
tokenized_data["validation"]

In [None]:
from datasets import load_metric
# Metric loaded from a local script file.
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the
# `evaluate` package — confirm the installed version still provides it.
metric = load_metric('/kaggle/input/eval-squad/squad.py')

In [None]:
# Fine-tuning configuration: evaluate once per epoch and push the result to
# the Hugging Face Hub under `output_dir`.
training_args = TrainingArguments(
    output_dir="qasystem_distilbert",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# `compute_metrics` is intentionally not set. It must be a callable that
# receives the evaluation predictions, whereas `metric` is a metric object,
# and the tokenized features no longer carry the ids, contexts and answer
# texts needed to turn logits into answer strings. Evaluation during training
# therefore reports the loss only; score answer text in a separate
# post-processing step.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# trainer.train()

# Supply the Weights & Biases API key through the WANDB_API_KEY environment
# variable or `wandb login`; never store it in the notebook. Any key that was
# ever saved in this file should be rotated.

In [None]:
# trainer.push_to_hub('qasystem_distilbert')

## Hyperparameters Search

In [None]:
# Baseline configuration for the hyperparameter search; the learning rate and
# train batch size set here are the values the search varies per trial.
args = TrainingArguments(
    # Where checkpoints go and whether they are uploaded.
    output_dir="qasystem_distilbert",
    push_to_hub=True,
    # Logging / evaluation cadence.
    report_to="wandb",
    evaluation_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
)

In [None]:
def model_init(trial):
    """Return a fresh question-answering model for one search trial.

    trial: trial object passed in by the Trainer; unused, because every trial
        starts from the same pretrained checkpoint.
    """
    # Same checkpoint as the tokenizer ("distilbert-base-uncased"); a cased
    # model would not match the uncased tokenizer's vocabulary.
    return AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
from datasets import load_metric
# Loads the metric script again, rebinding `metric`.
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the
# `evaluate` package — confirm the installed version still provides it.
metric = load_metric('/kaggle/input/eval-squad/squad.py')

In [None]:
# Trainer for the hyperparameter search: `model_init` builds a new model for
# every trial, so no fixed `model` is passed.
# `compute_metrics` is intentionally not set. It must be a callable that
# receives the evaluation predictions, whereas `metric` is a metric object,
# and the tokenized features lack the text needed to score answers. Each
# trial is therefore judged on evaluation loss.
trainer = Trainer(
    model=None,
    args=args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    model_init=model_init,
    data_collator=data_collator,
)

In [None]:
def wandb_hp_space(trial):
    """Build the Weights & Biases sweep configuration for the search.

    trial: accepted for the hp_space callback signature; not used.
    return: dict with the sweep "method", the "metric" to optimise and the
        "parameters" to sample (learning rate and per-device train batch size).
    """
    search_parameters = {
        # Learning rate drawn uniformly from [1e-6, 1e-4].
        "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
        # Batch size picked from a fixed set of choices.
        "per_device_train_batch_size": {"values": [16, 32]},
    }
    sweep_metric = {"name": "objective", "goal": "minimize"}

    return {
        "method": "random",
        "metric": sweep_metric,
        "parameters": search_parameters,
    }

In [None]:
# The sweep space declares a "minimize" goal, and when no task metrics are
# computed the Trainer's default objective is the evaluation loss, so the
# search has to minimize it; "maximize" would select the worst trial.
best_trial = trainer.hyperparameter_search(
    direction="minimize",
    backend="wandb",
    hp_space=wandb_hp_space,
    n_trials=20,
)

# Supply the Weights & Biases API key through the WANDB_API_KEY environment
# variable or `wandb login`; never store it in the notebook. Any key that was
# ever saved in this file should be rotated.