# 练习/调试

## 工作目录

In [1]:
# Move the kernel's working directory up one level; the recorded output shows
# it ends up in /home/Public/Megatron-LM. The relative paths used later
# (data/..., checkpoints/...) are resolved from there.
# NOTE: this is relative to the current directory, so running the cell a
# second time moves up one more level.
%cd ..

/home/Public/Megatron-LM


## 环境变量

- 预测时，不需要 `CUDA` 设备:

In [2]:
# NOTE(review): a value of 0 leaves GPU 0 visible to this kernel, whereas the
# note above says no CUDA device is needed. If CPU-only execution is intended
# the value would have to be empty instead -- confirm which one is meant.
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


## 导入模组

In [3]:
import argparse
import glob
import json
import logging
import os
import pickle
import random
import re
import shutil
import sys

import numpy as np
import torch
from torch.nn import functional as F
from torch.utils.data import (DataLoader, Dataset, RandomSampler,
                              SequentialSampler)
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm, trange
from transformers import (WEIGHTS_NAME, AdamW, GPT2Config, GPT2LMHeadModel,
                          GPT2Tokenizer, WarmupLinearSchedule)

from data_utils.tokenization import SentencePieceTokenizer, make_tokenizer


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


## 常量定义

- 假的参数

In [4]:
from types import SimpleNamespace

# Hand-written stand-in for parsed command-line arguments, so the cells below
# can read their settings as attributes (args.mlm, args.train_batch_size, ...).
args = SimpleNamespace(**{
    # Paths, relative to the working directory.
    'model_name_or_path': 'checkpoints/hfgpt2-117m-emotion',
    'train_data_file': 'data/baike10k.json',
    'tokenizer_model_path': 'data/spm/gpt2_huamei_corpus_bpe_32k_v2.model',
    # Masked-LM switches and the per-token masking probability.
    'mlm': True,
    'fp16': True,
    'mlm_probability': 0.15,
    # Loop sizes.
    'num_train_epochs': 1,
    'train_batch_size': 4,
})

- 最大输出长度

In [5]:
# Maximum output length. Presumably counted in tokens -- TODO confirm; no cell
# visible in this notebook reads this constant.
MAX_OUTPUT_LENGTH = 64

## 变量初始化

- SentencePiece tokenizer

In [6]:
# Build the SentencePiece tokenizer from the model file configured in `args`.
# The second positional argument is deliberately None (presumably the corpus
# slot, unused when a trained model file is supplied -- TODO confirm).
tokenizer = make_tokenizer(
    SentencePieceTokenizer,
    None,
    model_path=args.tokenizer_model_path,
)

- GPT2 Model

In [7]:
# Load only the GPT-2 configuration from the checkpoint directory; the dataset
# below reads `config.n_ctx` from it to fix the sequence length.
config = GPT2Config.from_pretrained(args.model_name_or_path)
# Model weights are intentionally not loaded here (line left disabled):
# model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path, config=config).eval()

## 全局类型定义

In [40]:
def mask_batch(inputs, tokenizer, args):
    """Build masked-language-modeling inputs and labels from a batch of token ids.

    Each non-padding position is selected for prediction with probability
    ``args.mlm_probability``. Of the selected positions, 80% are replaced by
    the tokenizer's ``MASK`` command id, 10% by a random id drawn from
    ``[0, len(tokenizer))`` and the remaining 10% keep their original id.

    Args:
        inputs: integer tensor of token ids. It is left untouched; the masked
            version is returned as a new tensor.
        tokenizer: object exposing ``get_command(name).Id`` for ``'MASK'`` and
            ``'pad'`` and supporting ``len()``.
        args: namespace providing a float ``mlm_probability``.

    Returns:
        A ``(masked_inputs, labels)`` pair with the same shape as ``inputs``.
        ``labels`` holds the original id at selected positions and ``-1``
        everywhere else, so the loss is only computed on selected tokens.
    """
    labels = inputs.clone()
    # Work on a copy so the caller's batch is not silently overwritten.
    masked_inputs = inputs.clone()
    device = labels.device

    # Per-position selection probability; padding must never be selected,
    # otherwise the model is trained to predict pad tokens.
    probability_matrix = torch.full(
        labels.shape, args.mlm_probability, device=device)
    padding_mask = labels.eq(tokenizer.get_command('pad').Id)
    probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1  # loss is only computed on selected tokens

    # 80% of the selected positions are replaced by the MASK command id.
    indices_replaced = torch.bernoulli(torch.full(
        labels.shape, 0.8, device=device)).bool() & masked_indices
    masked_inputs[indices_replaced] = tokenizer.get_command('MASK').Id

    # Half of the remaining 20% (i.e. 10% overall) receive a random token id.
    indices_random = torch.bernoulli(torch.full(
        labels.shape, 0.5, device=device)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(
        len(tokenizer), labels.shape, dtype=torch.long, device=device)
    masked_inputs[indices_random] = random_words[indices_random]

    # The last 10% of the selected positions keep their original token.
    return masked_inputs, labels


In [41]:
class JsonLinesDataset(Dataset):
    """Map-style dataset over a JSON-lines file with one ``{"text": ...}`` object per line.

    The whole file is read eagerly in ``__init__``; blank lines are skipped.
    Each item is the tokenized text as a 1-D ``torch.long`` tensor of exactly
    ``config.n_ctx`` ids: longer texts are truncated, shorter ones are
    right-padded with the tokenizer's ``pad`` command id.

    Args:
        path: path of the UTF-8 encoded JSON-lines file.
        tokenizer: object exposing ``EncodeAsIds(text)`` and
            ``get_command('pad').Id``.
        config: object exposing the integer attribute ``n_ctx``.
        args: stored on the instance as ``_args``; not otherwise used here.

    Raises:
        KeyError: if a non-blank line has no ``"text"`` key.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """

    def __init__(self, path, tokenizer, config, args):
        self._tokenizer = tokenizer
        self._config = config
        self._args = args
        self._data_list = []
        with open(path, encoding='utf8') as fp:
            for line in fp:
                line = line.strip()
                if line:
                    d = json.loads(line)
                    self._data_list.append({'text': d['text'].strip()})

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self._data_list)

    def __getitem__(self, idx):
        """Return record ``idx`` as a fixed-length tensor of token ids.

        The result always has shape ``(config.n_ctx,)`` and dtype
        ``torch.long``, including for an empty text (which becomes all
        padding), so the default batch collation can stack every item.
        """
        tokenizer = self._tokenizer
        n_ctx = self._config.n_ctx
        # The text was already stripped when the file was loaded.
        text = self._data_list[idx]['text']
        ids = [int(id_) for id_ in tokenizer.EncodeAsIds(text)][:n_ctx]
        # Explicit dtype: torch.tensor([]) would otherwise default to float32.
        ids = torch.tensor(ids, dtype=torch.long)
        pad_sz = n_ctx - ids.numel()
        if pad_sz > 0:
            ids = F.pad(ids, (0, pad_sz), value=tokenizer.get_command('pad').Id)
        return ids

In [42]:
# Load the whole training file into an in-memory dataset.
train_dataset = JsonLinesDataset(args.train_data_file, tokenizer, config, args)

# Visit the records in file order; RandomSampler(train_dataset) is the
# shuffled alternative.
train_sampler = SequentialSampler(train_dataset)

train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.train_batch_size,
)

In [43]:
train_iterator = trange(int(args.num_train_epochs), desc="Epoch")

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for batch in epoch_iterator:
        if args.mlm:
            # Masked-LM mode: derive masked inputs and their labels.
            inputs, labels = mask_batch(batch, tokenizer, args)
        else:
            # Otherwise the batch serves as both inputs and labels.
            inputs, labels = batch, batch
        # Only the first batch of each epoch is needed for inspection.
        break

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=2500, style=ProgressStyle(description_width='…




In [49]:
def t2s(tokens):
    """Decode a tensor of token ids to text with the notebook-level ``tokenizer``.

    Only strictly positive ids are decoded; zero and negative entries (such as
    the -1 label placeholder) are dropped first.
    """
    kept_ids = []
    for raw_id in tokens.tolist():
        token_id = int(raw_id)
        if token_id > 0:
            kept_ids.append(token_id)
    return tokenizer.DecodeIds(kept_ids)
    

# Show every sequence of the batch next to its labels, decoded back to text.
for input_, label in zip(inputs, labels):
    for caption, tokens in (('INPUT:', input_), ('LABEL:', label)):
        print(caption)
        print(t2s(tokens))
    print()

INPUT:
中国少年先锋队,简称中国少 <MASK> 队、少先 <MASK> ,是中国共产党的先锋组织,是中国共产主义青年团受权 <MASK> 的群众性儿童组织荂受苏联先锋运动 <MASK> ,中国共产党成立早期曾设立 <MASK>  <MASK> 共产主义 <MASK> ,如1931年在苏维埃区成立的少先队和后期的儿童团 <MASK> 。中华人民共和国成立 <MASK> ,中国新民主主义 <MASK>  <MASK> 中央委员会于1949年10月13日宣布成立 <MASK> 中国少年儿童队”, <MASK> 原少年先 <MASK> 队和儿童团合并而成,1953誕6月改为现 <MASK> 。中国少先队的组织架构主要基于民主겹制,而主要架 <MASK> 则源于 <MASK> 少先队。中国少先队最高机构是 <MASK> 5年召开 <MASK> 的全国代表 <MASK> ,而当全国代表大会 <MASK> 会期间则由 <MASK> 少年先锋 <MASK> 全国工作 <MASK> 作为核心权力机构。但因为全国少工委通常1年只会召开1次会议,使得多数职权和工作都由设在共青团中央少年部内的全国少工 <MASK> 办公室掌握,全国糍工委主任则成为主要负责人。中国少先队申请人的加入年龄必须介于6岁至14 <MASK> 间,几乎所有小 <MASK> 都加入了少先队 <MASK> 截至2002年,少 <MASK> 队员1.3亿人,成为 <MASK> 规模最大的先锋组织 <MASK> 中国少先畠始终宣称其理想与目标是 <MASK> 共产主义,将自身视为代表最广大中国 <MASK> 年的先锋组织。其中《中国少年 <MASK> 锋队章程》 <MASK> 中国 <MASK> 先队坚持中国特色社会主义理论,并将中国 <MASK> 先 <MASK> 表述为:「是中国少年 <MASK> 的群众组织,是少年儿童学习中国特色社会主义和共产主义的 <MASK> ,是建设社会主义和共产 <MASK> 预备 <MASK> 」。队旗为 <MASK> , <MASK> 革命胜利 <MASK> 队旗中央的同盟角星 <MASK> 代表中国共产党的领导,火炬象征光明。队旗寓意着:在中国共产党的领导下,向着光明的未来 <MASK> 。大队旗高为90厘米, <MASK> 为120厘米 <MASK> 旗中心有黄色五 <MAS

In [1]:
def fx():
    """Yield the integers 0 through 9, then finish with the return value 'ok'.

    A generator's return value is not yielded; it is attached to the
    StopIteration raised once the generator is exhausted.
    """
    current = 0
    while current < 10:
        yield current
        current += 1
    return 'ok'

In [2]:
# A for-loop consumes only the yielded values: as the output below shows,
# 0..9 are printed and the generator's return value 'ok' never appears.
for x in fx():
    print(x)

0
1
2
3
4
5
6
7
8
9
