In [1]:
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForQuestionAnswering
from fastNLP.core.losses import CMRC2018Loss
from fastNLP.core.metrics import CMRC2018Metric
from fastNLP.io.pipe.qa import CMRC2018BertPipe
from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW


data_bundle = CMRC2018BertPipe().process_from_file()

print(data_bundle)

In total 2 datasets:
	dev has 3219 instances.
	train has 10142 instances.
In total 1 vocabs:
	chars has 5844 entries.



In [83]:
data_bundle.get_dataset('train')

+------------+--------------+---------------+--------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| title      | context      | question      | answers      | answer_starts | id        | context_len | raw_chars     | target_start | target_end | chars     |
+------------+--------------+---------------+--------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| 范廷颂...  | 范廷颂枢...  | 范廷颂是什... | ['1963年...  | [30]          | TRAIN_... | 492         | ['范', '廷... | 30           | 34         | [665, ... |
| 范廷颂...  | 范廷颂枢...  | 1990年，范... | ['1990年...  | [41]          | TRAIN_... | 491         | ['范', '廷... | 41           | 61         | [665, ... |
| 范廷颂...  | 范廷颂枢...  | 范廷颂是于... | ['范廷颂...  | [97]          | TRAIN_... | 494         | ['范', '廷... | 97           | 125        | [665, ... |
| 范廷颂...  | 范廷颂枢...  | 1994年3月...  | ['1994年...  | [548]         | TRAIN_... | 489         

In [84]:
data_bundle.get_dataset('train')[0]

+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| title     | context     | question      | answers     | answer_starts | id        | context_len | raw_chars     | target_start | target_end | chars     |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| 范廷颂... | 范廷颂枢... | 范廷颂是什... | ['1963年... | [30]          | TRAIN_... | 492         | ['范', '廷... | 30           | 34         | [665, ... |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+

In [85]:
data_bundle.get_dataset('train')[0]['title']

'范廷颂'

In [86]:
data_bundle.get_dataset('train')[0]['question']

'范廷颂是什么时候被任为主教的？'

In [113]:
data_bundle.get_dataset('train')[0]['answers']

['1963年']

In [87]:
data_bundle.get_dataset('train')[0]['answer_starts']

[30]

In [3]:
CMRC2018BertPipe

fastNLP.io.pipe.qa.CMRC2018BertPipe

In [4]:
CMRC2018BertPipe.process_from_file

<function fastNLP.io.pipe.qa.CMRC2018BertPipe.process_from_file>

In [None]:
r"""
本文件中的Pipe主要用于处理问答任务的数据。

"""


from copy import deepcopy


from .pipe import Pipe

from .. import DataBundle

from ..loader.qa import CMRC2018Loader

from .utils import get_tokenizer

from ...core import DataSet

from ...core import Vocabulary


__all__ = ['CMRC2018BertPipe']


def _concat_clip(data_bundle, max_len, concat_field_name='raw_chars'):
    r"""
    Tokenize ``context`` and ``question`` of every instance character by character, clip the
    context when the pair is too long, and join both as ``context + ['[SEP]'] + question``.

    Every DataSet in ``data_bundle`` is replaced (under the same name) by a new DataSet whose
    instances are deep copies of the originals with these additional fields:

    * ``context_len`` (int): number of context tokens kept after clipping.
    * ``concat_field_name`` (list[str]): the joined tokens, e.g. ``["a", "b", "[SEP]", "c"]``.
    * ``target_start`` / ``target_end`` (int): position of the first answer inside the (clipped)
      context, as a closed interval. Only written when the instance has both ``answer_starts``
      and ``answers``.

    Only the first entry of ``answer_starts`` / ``answers`` is used.

    :param DataBundle data_bundle: bundle whose datasets carry ``context`` and ``question`` fields.
    :param int max_len: length limit; three positions are reserved for the leading ``[CLS]``, the
        middle ``[SEP]`` and the trailing ``[SEP]``.
    :param str concat_field_name: name of the field that receives the joined token list.
    :return: the same ``data_bundle`` object, with its datasets replaced.
    """
    tokenizer = get_tokenizer('cn-char', lang='cn')
    for name in list(data_bundle.datasets.keys()):
        ds = data_bundle.get_dataset(name)
        data_bundle.delete_dataset(name)
        new_ds = DataSet()
        for ins in ds:
            new_ins = deepcopy(ins)
            context = ins['context']
            question = ins['question']

            cnt_lst = tokenizer(context)
            q_lst = tokenizer(question)

            # Number of context tokens that still fit next to the question and the
            # three reserved special tokens.
            budget = max_len - 3 - len(q_lst)
            has_answer = 'answer_starts' in ins and 'answers' in ins

            # -1 marks "no answer span available for this instance".
            answer_start = -1
            if has_answer:
                answer_start = int(ins['answer_starts'][0])
                answer_end = answer_start + len(ins['answers'][0])  # exclusive end

            if len(cnt_lst) > budget:
                if has_answer:
                    if answer_end > budget:
                        # The answer would be cut off: slide the window so that it
                        # ends exactly where the answer ends.
                        span_start = answer_end - budget
                        span_end = answer_end
                    else:
                        # The answer already lies in the head of the context.
                        span_start = 0
                        span_end = budget
                    cnt_lst = cnt_lst[span_start:span_end]
                    # Re-express the answer span relative to the clipped context.
                    answer_start -= span_start
                    answer_end -= span_start
                else:
                    cnt_lst = cnt_lst[:budget]

            tokens = cnt_lst + ['[SEP]'] + q_lst
            new_ins['context_len'] = len(cnt_lst)
            new_ins[concat_field_name] = tokens

            if answer_start != -1:
                new_ins['target_start'] = answer_start
                new_ins['target_end'] = answer_end - 1  # convert to an inclusive end

            new_ds.append(new_ins)
        data_bundle.set_dataset(new_ds, name)

    return data_bundle


class CMRC2018BertPipe(Pipe):
    r"""
    Pipe that prepares CMRC2018 question-answering data for a BERT-style model.

    After processing, every DataSet gains the following fields (the incoming fields are kept):

    .. csv-table::
        :header: "context_len", "raw_chars", "target_start", "target_end", "chars"

        492, "['范', '廷', '颂... ]", 30, 34, "[21, 25, ...]"
        491, "['范', '廷', '颂... ]", 41, 61, "[21, 25, ...]"
        ".", "...", "...", "...", "..."

    ``raw_chars`` is the context and the question joined together (with ``[SEP]`` inserted at the
    junction); ``chars`` holds the same tokens converted to vocabulary indices; ``target_start``
    and ``target_end`` are the indices of the first and last answer token (closed interval);
    ``context_len`` is the length of the context part inside ``raw_chars`` / ``chars``.

    The meta information of the columns is:

    .. code::

        +-------------+-------------+-----------+--------------+------------+-------+---------+
        | field_names | context_len | raw_chars | target_start | target_end | chars | answers |
        +-------------+-------------+-----------+--------------+------------+-------+---------+
        |   is_input  |    False    |   False   |    False     |   False    |  True |  False  |
        |  is_target  |     True    |    True   |     True     |    True    | False |  True   |
        | ignore_type |    False    |    True   |    False     |   False    | False |  True   |
        |  pad_value  |      0      |     0     |      0       |     0      |   0   |   0     |
        +-------------+-------------+-----------+--------------+------------+-------+---------+

    """
    def __init__(self, max_len=510):
        r"""
        :param int max_len: length limit forwarded to ``_concat_clip`` when clipping the context.
        """
        super().__init__()
        self.max_len = max_len

    def process(self, data_bundle: DataBundle) -> DataBundle:
        r"""
        Clip and join context/question, build the character vocabulary and flag input/target fields.

        The incoming DataSets are expected to have the following fields:

        .. csv-table::
           :header: "title", "context", "question", "answers", "answer_starts", "id"

           "范廷颂", "范廷颂枢机（，），圣名保禄·若瑟（）...", "范廷颂是什么时候被任为主教的？", "['1963年']", "[30]", "TRAIN_186_QUERY_0"
           "范廷颂", "范廷颂枢机（，），圣名保禄·若瑟（）...", "1990年，范廷颂担任什么职务？", "['1990年被擢升为天...']", "[41]", "TRAIN_186_QUERY_1"
           "...", "...", "...", "...", ".", "..."

        :param data_bundle: bundle holding the raw CMRC2018 DataSets.
        :return: the processed bundle, with a ``chars`` vocabulary attached.
        """
        data_bundle = _concat_clip(data_bundle,
                                   max_len=self.max_len,
                                   concat_field_name='raw_chars')

        # Split the datasets once: sets whose name contains 'train' populate the vocabulary,
        # all other sets are passed as no_create_entry_dataset.
        train_sets, other_sets = [], []
        for name, ds in data_bundle.iter_datasets():
            if 'train' in name:
                train_sets.append(ds)
            else:
                other_sets.append(ds)

        src_vocab = Vocabulary()
        src_vocab.from_dataset(*train_sets,
                               field_name='raw_chars',
                               no_create_entry_dataset=other_sets)
        # Convert the tokens of every dataset into indices, stored in the new 'chars' field.
        src_vocab.index_dataset(*data_bundle.datasets.values(),
                                field_name='raw_chars', new_field_name='chars')
        data_bundle.set_vocab(src_vocab, 'chars')

        data_bundle.set_ignore_type('raw_chars', 'answers', flag=True)
        data_bundle.set_input('chars')
        data_bundle.set_target('raw_chars',
                               'answers',
                               'target_start',
                               'target_end',
                               'context_len')

        return data_bundle

    # process() does the actual work; this is only a load-then-process convenience wrapper.
    def process_from_file(self, paths=None) -> DataBundle:
        r"""
        Load the data with ``CMRC2018Loader().load(paths)`` and run :meth:`process` on the result.

        :param paths: forwarded unchanged to ``CMRC2018Loader.load``.
        :return: the processed bundle.
        """
        data_bundle = CMRC2018Loader().load(paths)
        return self.process(data_bundle)

# CMRC2018Loader

In [None]:
r"""
该文件中的Loader主要用于读取问答式任务的数据

"""


from . import Loader
import json
from ...core import DataSet, Instance

__all__ = ['CMRC2018Loader']


class CMRC2018Loader(Loader):
    r"""
    Loader for the CMRC2018 reading-comprehension data. Please use the copy of the data
    downloaded through fastNLP. The dataset ships without a test set; testing has to be done by
    uploading predictions to the corresponding evaluation system.

    After loading, the training DataSet has the content below; every question has one answer.

    .. csv-table::
       :header: "title", "context", "question", "answers", "answer_starts", "id"

       "范廷颂", "范廷颂枢机（，），圣名保禄·若瑟（）...", "范廷颂是什么时候被任为主教的？", "['1963年']", "[30]", "TRAIN_186_QUERY_0"
       "范廷颂", "范廷颂枢机（，），圣名保禄·若瑟（）...", "1990年，范廷颂担任什么职务？", "['1990年被擢升为天...']", "[41]", "TRAIN_186_QUERY_1"
       "...", "...", "...", "...", ".", "..."

    ``title`` is the title of the passage (several records may share one title); ``id`` is the
    unique id of the question.

    The validation DataSet has the content below; every question may have three answers
    (sometimes the same answer repeated three times).

    .. csv-table::
       :header: "title", "context", "question", "answers", "answer_starts", "id"

       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的？", "['光荣和ω-force', '光荣和ω-force', '光荣和ω-force']", "[30, 30, 30]", "DEV_0_QUERY_0"
       "战国无双3", "《战国无双3》（）是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的？", "['村雨城', '村雨城', '任天堂游戏谜之村雨城']", "[226, 226, 219]", "DEV_0_QUERY_1"
       "...", "...", "...", "...", ".", "..."

    ``answer_starts`` are 0-based character indices. For example, in "我来自a复旦大学？" the
    start index of "复" is 4, and in "Russell评价说" the index of "说" is 9, because Latin letters
    and digits are counted one character at a time as well.
    """
    def __init__(self):
        super().__init__()

    def _load(self, path: str) -> DataSet:
        r"""
        Read one CMRC2018 JSON file and flatten it into one Instance per question.

        :param str path: path of the JSON file; its top-level ``data`` list is read.
        :return: DataSet with the fields title, context, question, answers, answer_starts and id.
        """
        with open(path, 'r', encoding='utf-8') as f:
            entries = json.load(f)['data']

        dataset = DataSet()
        for entry in entries:
            title = entry['title']
            # Only the first paragraph of each entry is used.
            paragraph = entry['paragraphs'][0]
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                raw_answers = qa['answers']
                qa_id = qa['id']
                texts, starts = [], []
                for item in raw_answers:
                    texts.append(item['text'])
                    starts.append(item['answer_start'])
                instance = Instance(title=title, context=context, question=question,
                                    answers=texts, answer_starts=starts, id=qa_id)
                dataset.append(instance)
        return dataset

    def download(self) -> str:
        r"""
        If you use this data, please cite: A Span-Extraction Dataset for Chinese Machine Reading
        Comprehension. Yiming Cui, Ting Liu, etc.

        :return: the dataset directory returned by ``self._get_dataset_path('cmrc2018')``.
        """
        return self._get_dataset_path('cmrc2018')



In [88]:
import json

In [89]:
import fastNLP

In [90]:
from fastNLP import DataSet,Instance

In [91]:
import os

# Location of the CMRC2018 dev file, resolved from the current user's home directory instead of
# a machine-specific absolute path.
# NOTE(review): assumes the data lives under ~/.fastNLP/dataset/cmrc2018 — adjust if it does not.
path = os.path.join(os.path.expanduser('~'), '.fastNLP', 'dataset', 'cmrc2018', 'dev.json')

In [92]:
# Read the raw CMRC2018 JSON file and flatten it into one Instance per question.
# The file is only kept open while the JSON is parsed.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

ds = DataSet()
for entry in data:
    title = entry['title']
    # Only the first paragraph of each entry is read.
    para = entry['paragraphs'][0]
    context = para['context']
    for qa in para['qas']:
        answers = [an['text'] for an in qa['answers']]
        answer_starts = [an['answer_start'] for an in qa['answers']]
        # The question id is passed straight through so the builtin `id` is not shadowed
        # at notebook scope.
        ds.append(Instance(title=title, context=context, question=qa['question'],
                           answers=answers, answer_starts=answer_starts, id=qa['id']))
#                     print(len(answers))

In [93]:
ds.shape

In [94]:
ds[1]

+-----------+----------------+-----------------+----------------+-------------------+----------------+
| title     | context        | question        | answers        | answer_starts     | id             |
+-----------+----------------+-----------------+----------------+-------------------+----------------+
| 战国无双3 | 《战国无双3... | 男女主角亦有... | ['村雨城', ... | [226, 226, 219... | DEV_0_QUERY... |
+-----------+----------------+-----------------+----------------+-------------------+----------------+

In [95]:
# 这个算法估计是做机器阅读倒是很不错

In [96]:
ds[1]['title']

'战国无双3'

In [97]:
ds[1]['context']

'《战国无双3》（）是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴，分别是以武田信玄等人为主的《关东三国志》，织田信长等人为主的《战国三杰》，石田三成等人为主的《关原的年轻武者》，丰富游戏内的剧情。此部份专门介绍角色，欲知武器情报、奥义字或擅长攻击类型等，请至战国无双系列1.由于乡里大辅先生因故去世，不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图（不含村雨城），后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多，部分地图会有兼用的状况，战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主，以下是相关介绍。（注：前方加☆者为猛将传新增关卡及地图。）合并本篇和猛将传的内容，村雨城模式剔除，战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品'

In [98]:
ds[1]['question']

'男女主角亦有专属声优这一模式是由谁改编的？'

In [101]:
ds[1]['answers']

['村雨城', '村雨城', '任天堂游戏谜之村雨城']

In [103]:
ds[1]['answer_starts']

[226, 226, 219]

In [104]:
type(ds[1]['answer_starts'])

list

In [105]:
ds[1]['id']

'DEV_0_QUERY_1'

In [106]:
print(type(ds))

<class 'fastNLP.core.dataset.DataSet'>


In [107]:
data_bundle=ds

In [108]:
from fastNLP.io import CMRC2018Loader

In [109]:
CMRC2018Loader

fastNLP.io.loader.qa.CMRC2018Loader

In [17]:
tokenize_method='cn-char'
# tokenize_method='spacy'
lang='cn'

In [18]:
def _raw_split(sent):
    return sent.split()

In [19]:
def _cn_char_split(sent):
    return [chars for chars in sent]

In [20]:
r"""

:param str tokenize_method: 获取tokenzier方法
:param str lang: 语言，当前仅支持en
:return: 返回tokenize函数
"""
# Bind `tokenizer` to the callable selected by `tokenize_method`.
# 'spacy' has no entry function here because its tokenizer is built on demand below.
tokenizer_dict = {
    'spacy': None,
    'raw': _raw_split,
    'cn-char': _cn_char_split,
}
if tokenize_method == 'spacy':
    import spacy
    spacy.prefer_gpu()
    if lang != 'en':
        raise RuntimeError("Spacy only supports en right now.")
    # Original author's note: spaCy usage is probably not supported for other languages at the
    # moment, hence the check above; the parameters can be adjusted later if needed.
    # NOTE(review): newer spaCy releases may require a full model name instead of 'en' — confirm.
    en = spacy.load(lang)

    def tokenizer(x):
        # Keep only the surface text of every spaCy token.
        return [w.text for w in en.tokenizer(x)]
elif tokenize_method in tokenizer_dict:
    tokenizer = tokenizer_dict[tokenize_method]
else:
    raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")

In [21]:
max_len=510

In [22]:
from fastNLP.io import DataBundle

In [24]:
from copy import deepcopy

In [34]:
# Step through the context-clipping logic by hand, printing the intermediate values.
# Only the first instance of `ds` is processed: the loop stops after one iteration.
new_ds = DataSet()
for ins in ds:
    new_ins = deepcopy(ins)
    context = ins['context']
    question = ins['question']
    print(context)
    print(question)
    cnt_lst = tokenizer(context)
    print(cnt_lst)
    q_lst = tokenizer(question)
    print(q_lst)
    # -1 marks "no answer span available for this instance".
    answer_start = -1
    print('=='*20)
    print(len(cnt_lst))
    print(len(q_lst))
    print(len(cnt_lst) + len(q_lst) + 3)

    # Context tokens that still fit once the leading [CLS], the middle [SEP] and the
    # trailing [SEP] are reserved.
    budget = max_len - 3 - len(q_lst)
    has_answer = 'answer_starts' in ins and 'answers' in ins

    if len(cnt_lst) > budget:
        if has_answer:
            answer_start = int(ins['answer_starts'][0])
            print('answer_start',answer_start)
            answer_end = answer_start + len(ins['answers'][0])  # exclusive end
            if answer_end > budget:
                # The answer would be cut off: end the window exactly at the answer end.
                span_start = answer_end - budget
                span_end = answer_end
            else:
                span_start = 0
                span_end = budget
            cnt_lst = cnt_lst[span_start:span_end]
            # Re-express the answer span relative to the clipped context.
            answer_start -= span_start
            answer_end -= span_start
        else:
            cnt_lst = cnt_lst[:budget]
    elif has_answer:
        answer_start = int(ins['answer_starts'][0])
        print('<'*20)
        print(answer_start)
        answer_end = answer_start + len(ins['answers'][0])
        print(ins['answers'])
        print(answer_end)

    tokens = cnt_lst + ['[SEP]'] + q_lst
    new_ins['context_len'] = len(cnt_lst)
    new_ins['raw_chars'] = tokens
    if answer_start != -1:
        new_ins['target_start'] = answer_start
        new_ins['target_end'] = answer_end - 1  # convert to an inclusive end

    new_ds.append(new_ins)
    # Debugging aid: inspect the first instance only. Remove this break to process all of `ds`.
    break

# Register the result once, after the loop, under the dataset name 'dev'.
name = 'dev'
data_bundle = DataBundle()
data_bundle.set_dataset(new_ds, name)

《战国无双3》（）是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴，分别是以武田信玄等人为主的《关东三国志》，织田信长等人为主的《战国三杰》，石田三成等人为主的《关原的年轻武者》，丰富游戏内的剧情。此部份专门介绍角色，欲知武器情报、奥义字或擅长攻击类型等，请至战国无双系列1.由于乡里大辅先生因故去世，不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图（不含村雨城），后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多，部分地图会有兼用的状况，战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主，以下是相关介绍。（注：前方加☆者为猛将传新增关卡及地图。）合并本篇和猛将传的内容，村雨城模式剔除，战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品
《战国无双3》是由哪两个公司合作开发的？
['《', '战', '国', '无', '双', '3', '》', '（', '）', '是', '由', '光', '荣', '和', 'ω', '-', 'f', 'o', 'r', 'c', 'e', '开', '发', '的', '战', '国', '无', '双', '系', '列', '的', '正', '统', '第', '三', '续', '作', '。', '本', '作', '以', '三', '大', '故', '事', '为', '主', '轴', '，', '分', '别', '是', '以', '武', '田', '信', '玄', '等', '人', '为', '主', '的', '《', '关', '东', '三', '国', '志', '》', '，', '织', '田', '信', '长', '等', '人', '为', '主', '的', '《', '战', '国', '三', '杰', '》', '，', '石', '田', '三', '成', '等', '人', '为', '主', '的', '《', '关', '原', '的', '年', '轻', '武', '者', '》', '，', '丰', '富', '游', '戏', '内', '的', '剧', 

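# answer_starts
原来指的是答案在 context 中的起始位置（从 0 开始的字符下标）。
# answer_end
原来指的是答案的结束位置：代码中 answer_end = answer_start + len(answer)，是答案末尾之后的下标；target_end = answer_end - 1 才是答案最后一个字符的下标。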

In [66]:
data_bundle.get_dataset('dev')

+-------------+-------------+---------------+-------------+---------------+------------+-------------+---------------+--------------+------------+
| title       | context     | question      | answers     | answer_starts | id         | context_len | raw_chars     | target_start | target_end |
+-------------+-------------+---------------+-------------+---------------+------------+-------------+---------------+--------------+------------+
| 战国无双... | 《战国无... | 《战国无双... | ['光荣和... | [11, 11, 11]  | DEV_0_Q... | 417         | ['《', '战... | 11           | 20         |
+-------------+-------------+---------------+-------------+---------------+------------+-------------+---------------+--------------+------------+

In [68]:
from fastNLP import Vocabulary

In [69]:
src_vocab = Vocabulary()

In [70]:
src_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() 
                                 if 'train' in name],
                               field_name='raw_chars',
                               no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if 'train' not in name]
                               )

Vocabulary(['《', '战', '国', '无', '双']...)

In [71]:
len(src_vocab)

213

In [72]:
src_vocab.index_dataset(*data_bundle.datasets.values()
                                , field_name='raw_chars', new_field_name='chars')

Vocabulary(['《', '战', '国', '无', '双']...)

In [73]:
len(src_vocab)

213

In [74]:
data_bundle.set_vocab(src_vocab, 'chars')

In total 1 datasets:
	dev has 1 instances.
In total 1 vocabs:
	chars has 213 entries.

In [75]:
data_bundle.set_ignore_type('raw_chars', 'answers', flag=True)

In total 1 datasets:
	dev has 1 instances.
In total 1 vocabs:
	chars has 213 entries.

In [78]:
# data_bundle.datasets

In [79]:
data_bundle.set_input('chars')

In total 1 datasets:
	dev has 1 instances.
In total 1 vocabs:
	chars has 213 entries.

In [85]:
data_bundle.get_dataset('dev')

+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| title     | context     | question      | answers     | answer_starts | id        | context_len | raw_chars     | target_start | target_end | chars     |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| 战国无... | 《战国无... | 《战国无双... | ['光荣和... | [11, 11, 11]  | DEV_0_... | 417         | ['《', '战... | 11           | 20         | [13, 3... |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+

In [86]:
data_bundle.set_target('raw_chars', 
                               'answers',
                               'target_start',
                               'target_end',
                               'context_len')

In total 1 datasets:
	dev has 1 instances.
In total 1 vocabs:
	chars has 213 entries.

In [94]:
data_bundle.rename_field('chars', 'words')

In total 1 datasets:
	dev has 1 instances.
In total 1 vocabs:
	words has 213 entries.

In [87]:
data_bundle.get_dataset('dev')

+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| title     | context     | question      | answers     | answer_starts | id        | context_len | raw_chars     | target_start | target_end | chars     |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| 战国无... | 《战国无... | 《战国无双... | ['光荣和... | [11, 11, 11]  | DEV_0_... | 417         | ['《', '战... | 11           | 20         | [13, 3... |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+

In [88]:
data_bundle.datasets.keys()

dict_keys(['dev'])

In [97]:
ds = data_bundle.get_dataset('dev')

In [98]:
# Print every instance of `ds`; afterwards `j` holds the number of instances printed.
j = 0
for j, ins in enumerate(ds, start=1):
    print(ins)

+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| title     | context     | question      | answers     | answer_starts | id        | context_len | raw_chars     | target_start | target_end | words     |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+
| 战国无... | 《战国无... | 《战国无双... | ['光荣和... | [11, 11, 11]  | DEV_0_... | 417         | ['《', '战... | 11           | 20         | [13, 3... |
+-----------+-------------+---------------+-------------+---------------+-----------+-------------+---------------+--------------+------------+-----------+


In [100]:
ds['title'][0]

'战国无双3'

In [102]:
ds['context'][0]

'《战国无双3》（）是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴，分别是以武田信玄等人为主的《关东三国志》，织田信长等人为主的《战国三杰》，石田三成等人为主的《关原的年轻武者》，丰富游戏内的剧情。此部份专门介绍角色，欲知武器情报、奥义字或擅长攻击类型等，请至战国无双系列1.由于乡里大辅先生因故去世，不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图（不含村雨城），后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多，部分地图会有兼用的状况，战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主，以下是相关介绍。（注：前方加☆者为猛将传新增关卡及地图。）合并本篇和猛将传的内容，村雨城模式剔除，战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品'

In [103]:
ds['question'][0]

'《战国无双3》是由哪两个公司合作开发的？'

In [104]:
ds['answers'][0]

['光荣和ω-force', '光荣和ω-force', '光荣和ω-force']

In [106]:
print(ds['answer_starts'][0])
print(ds['context_len'][0])
print(ds['raw_chars'][0])
print(ds['target_start'][0])
print(ds['target_end'][0])
print(ds['words'][0])

[11, 11, 11]
417
['《', '战', '国', '无', '双', '3', '》', '（', '）', '是', '由', '光', '荣', '和', 'ω', '-', 'f', 'o', 'r', 'c', 'e', '开', '发', '的', '战', '国', '无', '双', '系', '列', '的', '正', '统', '第', '三', '续', '作', '。', '本', '作', '以', '三', '大', '故', '事', '为', '主', '轴', '，', '分', '别', '是', '以', '武', '田', '信', '玄', '等', '人', '为', '主', '的', '《', '关', '东', '三', '国', '志', '》', '，', '织', '田', '信', '长', '等', '人', '为', '主', '的', '《', '战', '国', '三', '杰', '》', '，', '石', '田', '三', '成', '等', '人', '为', '主', '的', '《', '关', '原', '的', '年', '轻', '武', '者', '》', '，', '丰', '富', '游', '戏', '内', '的', '剧', '情', '。', '此', '部', '份', '专', '门', '介', '绍', '角', '色', '，', '欲', '知', '武', '器', '情', '报', '、', '奥', '义', '字', '或', '擅', '长', '攻', '击', '类', '型', '等', '，', '请', '至', '战', '国', '无', '双', '系', '列', '1', '.', '由', '于', '乡', '里', '大', '辅', '先', '生', '因', '故', '去', '世', '，', '不', '得', '不', '寻', '找', '其', '他', '声', '优', '接', '手', '。', '从', '猛', '将', '传', ' ', 'a', 'n', 'd', ' ', 'Z', '开', '始', '。', '2', '.', '战', '国', '无', '双

In [65]:
data_bundle

In total 1 datasets:
	dev has 1 instances.

In [60]:
print(new_ds)

+-------------+-------------+---------------+-------------+---------------+------------+-------------+---------------+--------------+------------+
| title       | context     | question      | answers     | answer_starts | id         | context_len | raw_chars     | target_start | target_end |
+-------------+-------------+---------------+-------------+---------------+------------+-------------+---------------+--------------+------------+
| 战国无双... | 《战国无... | 《战国无双... | ['光荣和... | [11, 11, 11]  | DEV_0_Q... | 417         | ['《', '战... | 11           | 20         |
+-------------+-------------+---------------+-------------+---------------+------------+-------------+---------------+--------------+------------+


In [61]:
print(ds)

+----------------+-----------------+-----------------+-----------------+-------------------+----------------+
| title          | context         | question        | answers         | answer_starts     | id             |
+----------------+-----------------+-----------------+-----------------+-------------------+----------------+
| 战国无双3      | 《战国无双3...  | 《战国无双3...  | ['光荣和ω-...   | [11, 11, 11]      | DEV_0_QUERY... |
| 战国无双3      | 《战国无双3...  | 男女主角亦有... | ['村雨城', ...  | [226, 226, 219... | DEV_0_QUERY... |
| 战国无双3      | 《战国无双3...  | 战国史模式主... | ['「战史演武... | [395, 395, 395... | DEV_0_QUERY... |
| 锣鼓经         | 锣鼓经是大陆... | 锣鼓经是什么... | ['大陆传统器... | [4, 4, 4]         | DEV_1_QUERY... |
| 锣鼓经         | 锣鼓经是大陆... | 锣鼓经常用的... | ['锣鼓点', ...  | [67, 67, 67]      | DEV_1_QUERY... |
| 锣鼓经         | 锣鼓经是大陆... | 锣鼓经运用的... | ['依照角色行... | [167, 167, 167... | DEV_1_QUERY... |
| 锣鼓经         | 锣鼓经是大陆... | 戏曲锣鼓所运... | ['鼓、锣、钹... | [237, 237, 237... | DEV_1_QUERY... |
| 广茂铁路       | 广茂铁路是中... | 广茂铁路全长

In [None]:
# NOTE(review): this repeats the rename already executed in an earlier cell (In [94]);
# presumably it fails once 'chars' has been renamed to 'words' — confirm before re-running.
data_bundle.rename_field('chars', 'words')
