### Homework：加载本地保存的模型，先进行评估，再继续训练以获得更高的 F1 Score

In [None]:
# Checkpoint name passed to AutoTokenizer.from_pretrained and used to build
# the local model directory names further down.
model_checkpoint = "distilbert-base-uncased"
# Used for both per_device_train_batch_size and per_device_eval_batch_size.
batch_size = 192
# When True the "squad_v2" dataset is loaded instead of "squad".
squad_v2 = False

In [44]:
# process datasets
from datasets import load_dataset
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of integer
            row positions, and a ``features`` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to show (default 10).

    Raises:
        AssertionError: If ``num_examples`` exceeds ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by
            ``random.sample``).

    Columns whose feature type is a ``ClassLabel`` (or a ``Sequence`` of
    ``ClassLabel``) are shown with their label names instead of integer ids.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are unique without
    # a retry loop or repeated list-membership scans.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            names = typ.names
            df[column] = df[column].transform(lambda i: names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            names = typ.feature.names
            df[column] = df[column].transform(lambda x: [names[i] for i in x])
    display(HTML(df.to_html()))
    
# Load the "squad_v2" dataset when `squad_v2` is set, otherwise "squad".
datasets = load_dataset("squad_v2" if squad_v2 else "squad")
# Print the loaded dataset object and the first training example for inspection.
print(datasets)
print(datasets["train"][0])
# Render a random sample of training rows (default count: 10) as an HTML table.
show_random_elements(datasets["train"])

from transformers import AutoTokenizer
import transformers
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The following assertion ensures the tokenizer is a "fast" tokenizer (Rust
# implementation, with advantages in speed and functionality).
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# The maximum length of a feature (question and context)
max_length = 384
# The authorized overlap between two parts of the context when splitting it is needed.
doc_stride = 128

# Find the first training example whose tokenized question + context is longer
# than max_length. The threshold is max_length itself (rather than a repeated
# literal) so the probe cannot drift from the value used for truncation below.
# If no example is that long, the loop runs to the end and `i` is the last index.
for i, example in enumerate(datasets["train"]):
    if len(tokenizer(example["question"], example["context"])["input_ids"]) > max_length:
        break
example = datasets["train"][i]

# Inspection expressions: token count without truncation, then with the
# context (second sequence) truncated to max_length. Their values are not stored.
len(tokenizer(example["question"], example["context"])["input_ids"])
len(tokenizer(example["question"],
              example["context"],
              max_length=max_length,
              truncation="only_second")["input_ids"])

# Same example, but keeping the overflowing tokens as additional features that
# overlap by doc_stride tokens.
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    stride=doc_stride
)

# True when the tokenizer pads on the right; prepare_train_features uses this
# to decide whether the question or the context is passed first.
pad_on_right = tokenizer.padding_side == "right"

def prepare_train_features(examples):
    """Tokenize a batch of QA examples and label each feature with answer token positions.

    Reads the file-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``.

    Args:
        examples: Batch with "question", "context" and "answers" entries; each
            answers item provides "answer_start" and "text" lists. The
            "question" entry is replaced in place by left-stripped questions.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions"
        added (one entry per feature). Features that have no answer, or whose
        span does not contain the whole answer, are labelled with the index
        of the CLS token.
    """
    # Leading whitespace in a question is useless and could make truncation of
    # the context fail (the tokenized question would take up a lot of room),
    # so it is stripped.
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # Argument order and truncation side depend on the padding side; the
    # context is always the sequence that gets truncated.
    if pad_on_right:
        first_key, second_key = "question", "context"
        truncation_mode = "only_second"
        context_id = 1
    else:
        first_key, second_key = "context", "question"
        truncation_mode = "only_first"
        context_id = 0

    # Tokenize with truncation and padding while keeping the overflow using a
    # stride: one long example can yield several features whose contexts
    # overlap with the previous feature's context.
    tokenized_examples = tokenizer(
        examples[first_key],
        examples[second_key],
        truncation=truncation_mode,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps each feature back to the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Maps each token to its character span in the original text; used to
    # compute the start and end positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(offset_mapping):
        # The CLS token index labels features with no usable answer.
        input_ids = tokenized_examples["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Sequence ids tell which tokens belong to the question and which to
        # the context.
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        # One example can produce several spans; look up the source example.
        answers = examples["answers"][sample_mapping[feature_idx]]

        # No answer given: label with the CLS index.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the (first) answer in the text.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First token of the context within this feature.
        context_first = 0
        while sequence_ids[context_first] != context_id:
            context_first += 1

        # Last token of the context within this feature.
        context_last = len(input_ids) - 1
        while sequence_ids[context_last] != context_id:
            context_last -= 1

        # Answer not fully inside this span: label with the CLS index.
        answer_in_span = (
            offsets[context_first][0] <= start_char
            and offsets[context_last][1] >= end_char
        )
        if not answer_in_span:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Otherwise move both indices to the two ends of the answer.
        # Note: the start scan may run past the last offset when the answer is
        # the last word (edge case), hence the bounds check.
        while context_first < len(offsets) and offsets[context_first][0] <= start_char:
            context_first += 1
        while offsets[context_last][1] >= end_char:
            context_last -= 1
        start_positions.append(context_first - 1)
        end_positions.append(context_last + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Run prepare_train_features over the loaded data in batches. remove_columns
# is given the train split's column names so the original columns are dropped
# from the result.
tokenized_datasets = datasets.map(prepare_train_features,
                                  batched=True,
                                  remove_columns=datasets["train"].column_names)

from transformers import default_data_collator

# The library's default collator is used unchanged; it is passed to Trainer below.
data_collator = default_data_collator

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome

Unnamed: 0,id,title,context,question,answers
0,5731ab21b9d445190005e44f,Religion_in_ancient_Rome,"The meaning and origin of many archaic festivals baffled even Rome's intellectual elite, but the more obscure they were, the greater the opportunity for reinvention and reinterpretation — a fact lost neither on Augustus in his program of religious reform, which often cloaked autocratic innovation, nor on his only rival as mythmaker of the era, Ovid. In his Fasti, a long-form poem covering Roman holidays from January to June, Ovid presents a unique look at Roman antiquarian lore, popular customs, and religious practice that is by turns imaginative, entertaining, high-minded, and scurrilous; not a priestly account, despite the speaker's pose as a vates or inspired poet-prophet, but a work of description, imagination and poetic etymology that reflects the broad humor and burlesque spirit of such venerable festivals as the Saturnalia, Consualia, and feast of Anna Perenna on the Ides of March, where Ovid treats the assassination of the newly deified Julius Caesar as utterly incidental to the festivities among the Roman people. But official calendars preserved from different times and places also show a flexibility in omitting or expanding events, indicating that there was no single static and authoritative calendar of required observances. In the later Empire under Christian rule, the new Christian festivals were incorporated into the existing framework of the Roman calendar, alongside at least some of the traditional festivals.",What poet wrote a long poem describing Roman religious holidays?,"{'text': ['Ovid'], 'answer_start': [346]}"
1,56e08b457aa994140058e5e3,Hydrogen,"Hydrogen forms a vast array of compounds with carbon called the hydrocarbons, and an even vaster array with heteroatoms that, because of their general association with living things, are called organic compounds. The study of their properties is known as organic chemistry and their study in the context of living organisms is known as biochemistry. By some definitions, ""organic"" compounds are only required to contain carbon. However, most of them also contain hydrogen, and because it is the carbon-hydrogen bond which gives this class of compounds most of its particular chemical characteristics, carbon-hydrogen bonds are required in some definitions of the word ""organic"" in chemistry. Millions of hydrocarbons are known, and they are usually formed by complicated synthetic pathways, which seldom involve elementary hydrogen.",What is the form of hydrogen and carbon called?,"{'text': ['hydrocarbons'], 'answer_start': [64]}"
2,56cef65baab44d1400b88d36,Spectre_(2015_film),"Christopher Orr, writing in The Atlantic, also criticised the film, saying that Spectre ""backslides on virtually every [aspect]"". Lawrence Toppman of The Charlotte Observer called Craig's performance ""Bored, James Bored."" Alyssa Rosenberg, writing for The Washington Post, stated that the film turned into ""a disappointingly conventional Bond film.""",What adjective did Lawrence Toppman use to describe Craig's portrayal of James Bond?,"{'text': ['Bored'], 'answer_start': [201]}"
3,571a30bb10f8ca1400304f53,Seattle,"King County Metro provides frequent stop bus service within the city and surrounding county, as well as a South Lake Union Streetcar line between the South Lake Union neighborhood and Westlake Center in downtown. Seattle is one of the few cities in North America whose bus fleet includes electric trolleybuses. Sound Transit currently provides an express bus service within the metropolitan area; two Sounder commuter rail lines between the suburbs and downtown; its Central Link light rail line, which opened in 2009, between downtown and Sea-Tac Airport gives the city its first rapid transit line that has intermediate stops within the city limits. Washington State Ferries, which manages the largest network of ferries in the United States and third largest in the world, connects Seattle to Bainbridge and Vashon Islands in Puget Sound and to Bremerton and Southworth on the Kitsap Peninsula.",To what two islands does the ferry service connect?,"{'text': ['Bainbridge and Vashon'], 'answer_start': [796]}"
4,570d2cb4fed7b91900d45cb5,Macintosh,"In 1998, after the return of Steve Jobs, Apple consolidated its multiple consumer-level desktop models into the all-in-one iMac G3, which became a commercial success and revitalized the brand. Since their transition to Intel processors in 2006, the complete lineup is entirely based on said processors and associated systems. Its current lineup comprises three desktops (the all-in-one iMac, entry-level Mac mini, and the Mac Pro tower graphics workstation), and four laptops (the MacBook, MacBook Air, MacBook Pro, and MacBook Pro with Retina display). Its Xserve server was discontinued in 2011 in favor of the Mac Mini and Mac Pro.",What took the place of Mac's Xserve server?,"{'text': ['Mac Mini and Mac Pro'], 'answer_start': [613]}"
5,570af6876b8089140040f646,Videoconferencing,"Technological developments by videoconferencing developers in the 2010s have extended the capabilities of video conferencing systems beyond the boardroom for use with hand-held mobile devices that combine the use of video, audio and on-screen drawing capabilities broadcasting in real-time over secure networks, independent of location. Mobile collaboration systems now allow multiple people in previously unreachable locations, such as workers on an off-shore oil rig, the ability to view and discuss issues with colleagues thousands of miles away. Traditional videoconferencing system manufacturers have begun providing mobile applications as well, such as those that allow for live and still image streaming.",What is one example of an application that videoconferencing manufacturers have begun to offer?,"{'text': ['still image streaming'], 'answer_start': [689]}"
6,56e82d0100c9c71400d775eb,Dialect,"Italy is home to a vast array of native regional minority languages, most of which are Romance-based and have their own local variants. These regional languages are often referred to colloquially or in non-linguistic circles as Italian ""dialects,"" or dialetti (standard Italian for ""dialects""). However, the majority of the regional languages in Italy are in fact not actually ""dialects"" of standard Italian in the strict linguistic sense, as they are not derived from modern standard Italian but instead evolved locally from Vulgar Latin independent of standard Italian, with little to no influence from what is now known as ""standard Italian."" They are therefore better classified as individual languages rather than ""dialects.""",What are Italian dialects termed in the Italian language?,"{'text': ['dialetti'], 'answer_start': [251]}"
7,56e147e6cd28a01900c6772b,Universal_Studios,"The Universal Film Manufacturing Company was incorporated in New York on April 30, 1912. Laemmle, who emerged as president in July 1912, was the primary figure in the partnership with Dintenfass, Baumann, Kessel, Powers, Swanson, Horsley, and Brulatour. Eventually all would be bought out by Laemmle. The new Universal studio was a vertically integrated company, with movie production, distribution and exhibition venues all linked in the same corporate entity, the central element of the Studio system era.","Along with exhibition and distribution, what business did the Universal Film Manufacturing Company engage in?","{'text': ['movie production'], 'answer_start': [368]}"
8,5731933a05b4da19006bd2d0,Steven_Spielberg,"Spielberg's next film, Schindler's List, was based on the true story of Oskar Schindler, a man who risked his life to save 1,100 Jews from the Holocaust. Schindler's List earned Spielberg his first Academy Award for Best Director (it also won Best Picture). With the film a huge success at the box office, Spielberg used the profits to set up the Shoah Foundation, a non-profit organization that archives filmed testimony of Holocaust survivors. In 1997, the American Film Institute listed it among the 10 Greatest American Films ever Made (#9) which moved up to (#8) when the list was remade in 2007.",Whose life was 'Schindler's List' based on?,"{'text': ['Oskar Schindler'], 'answer_start': [72]}"
9,56de93f94396321400ee2a36,Arnold_Schwarzenegger,"In 1985, Schwarzenegger appeared in ""Stop the Madness"", an anti-drug music video sponsored by the Reagan administration. He first came to wide public notice as a Republican during the 1988 presidential election, accompanying then-Vice President George H.W. Bush at a campaign rally.",In what presidential election year did Schwarzenegger make a name for himself as a prominent Republican?,"{'text': ['1988'], 'answer_start': [184]}"


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.3",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [45]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import os
# Weights & Biases settings, supplied through environment variables.
os.environ.update(
    {
        "WANDB_PROJECT": "fine-tune-quickstart",
        "WANDB_LOG_MODEL": "true",
        "WANDB_WATCH": "false",
    }
)

# Local directory layout: models/<checkpoint-name>-finetuned-squad[...]
model_dir = "models"
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_dir}/{model_name}-finetuned-squad",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    tf32=True,
    # Evaluation and checkpointing
    evaluation_strategy="epoch",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    hub_strategy="checkpoint",
    ignore_data_skip=False,
    # Logging
    report_to="wandb",
    logging_steps=5,
)

# Load the previously fine-tuned model that was saved locally.
trained_model_path = f"{model_dir}/{model_name}-finetuned-squad-trained"
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_model_path)

# Trainer wrapping the loaded model with the tokenized train/validation splits.
trained_trainer = Trainer(
    model=trained_model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

PyTorch: setting up devices
loading configuration file models/distilbert-base-uncased-finetuned-squad-trained/config.json
Model config DistilBertConfig {
  "_name_or_path": "models/distilbert-base-uncased-finetuned-squad-trained",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "vocab_size": 30522
}

loading weights file models/distilbert-base-uncased-finetuned-squad-trained/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForQuestionAnswering.

All the weights of DistilBertForQuestionAnswering were initialized from