# Megatron 的 Dataset 机制

## Working Directory

In [1]:
# Switch to the parent directory (the repository root, per the path echoed
# below) so the project-local imports and the relative data/checkpoint paths
# used later resolve.
# NOTE: this is not idempotent -- re-running the cell moves up one more level.
%cd ..

/home/Public/Megatron-LM


## Environment Variables

- 预测时，可以使用的 `CUDA` 设备:

In [2]:
# Optional: uncomment to restrict this kernel to a single GPU (device 0).
# Left disabled, so every visible CUDA device stays available.
# %env CUDA_VISIBLE_DEVICES=0

## Imports

In [21]:
import copy
import csv
import json
import os
import random
import sys
import time
from contextlib import closing
from types import SimpleNamespace

import numpy as np
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm

import mpu
from data_utils.tokenization import SentencePieceTokenizer, make_tokenizer
from utils import print_args
from pretrain_gpt2 import initialize_distributed, set_random_seed, get_train_val_test_data, setup_model_and_optimizer, get_masks_and_position_ids

## Args

In [4]:
# Hand-built replacement for the command-line argument namespace that the
# training script would normally parse. Keyword order is kept stable because
# `print_args` lists the attributes in insertion order.
args = SimpleNamespace(
    # ---- Model ----
    num_layers=12,
    hidden_size=768,
    num_attention_heads=12,
    max_position_embeddings=1024,
    vocab_size=None,  # filled in later from get_train_val_test_data
    make_vocab_size_divisible_by=128,
    attention_dropout=0.1,
    hidden_dropout=0.1,

    # ---- Train / valid / test data ----
    seq_length=512,
    model_parallel_size=1,
    tokenizer_model_type='bert-large-uncased',
    tokenizer_type=SentencePieceTokenizer,  # passed as a class, not a string
    tokenizer_path="./data/spm/gpt2_huamei_corpus_bpe_32k_v2.model",
    train_data="data/xinli001_qa-dev.json",
    lazy_loader=None,
    valid_data=None,
    test_data=None,
    use_tfrecords=None,
    presplit_sentences=None,
    num_workers=2,
    split='949,50,1',
    delim=',',
    cache_dir=None,
    text_key='text',
    eval_text_key=None,
    loose_json=True,
    max_preds_per_seq=None,

    # ---- Training ----
    load='./checkpoints/gpt2-117m-emotion',
    save='./checkpoints/gpt2-117m-xinli001_qa.finetune',
    save_interval=5000,
    batch_size=4,
    checkpoint_activations=None,
    checkpoint_num_layers=1,
    finetune=True,
    release=None,
    resume_dataloader=None,
    no_save_optim=None,
    no_save_rng=None,
    no_load_optim=None,
    no_load_rng=None,
    fp16=True,
    hysteresis=2,
    loss_scale=None,  # None switches on dynamic loss scaling in the next cell
    loss_scale_window=1000,
    min_scale=1,
    shuffle=None,
    distributed_backend='nccl',
    DDP_impl='local',
    local_rank=None,
    reset_position_ids=None,
    reset_attention_mask=None,
    eod_mask_loss=None,
    adlr_autoresume=None,
    weight_decay=0.01,
    clip_grad=1.0,
    train_iters=1000000,
    log_interval=100,
    exit_interval=None,
    seed=1234,
    lr_decay_iters=None,
    lr_decay_style='cosine',
    lr=0.00015,
    min_lr=0.0,
    warmup=0.01,
    override_lr_scheduler=None,
    use_checkpoint_lr_scheduler=None,
    use_npy_data_loader=False,

    # ---- Text generation ----
    recompute=None,
    greedy=False,
    top_p=0.0,
    top_k=0,
    temperature=1.0,
    out_seq_length=128,

    # ---- Evaluation ----
    eval_batch_size=None,
    eval_iters=100,
    eval_interval=1000,
    eval_seq_length=None,
    eval_max_preds_per_seq=None,
    overlapping_eval=32,
    cloze_eval=None,
    strict_lambada=None,
    eval_hf=None,
    load_openai=None,
)

In [5]:
# Derive the runtime-dependent settings that the argument parser would
# normally compute: device availability, process rank and world size.
args.cuda = torch.cuda.is_available()
args.rank = int(os.getenv('RANK', '0'))
args.world_size = int(os.getenv("WORLD_SIZE", '1'))

if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
    # We are using (OpenMPI) mpirun for launching distributed data parallel processes
    local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
    local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

    # Possibly running with Slurm
    num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
    nodeid = int(os.getenv('SLURM_NODEID', '0'))

    args.local_rank = local_rank
    args.rank = nodeid*local_size + local_rank
    args.world_size = num_nodes*local_size

# The model-parallel group can never be larger than the whole job.
args.model_parallel_size = min(args.model_parallel_size, args.world_size)
if args.rank == 0:
    print('using world size: {} and model-parallel size: {} '.format(
        args.world_size, args.model_parallel_size))

# No fixed loss scale configured -> use dynamic loss scaling.
args.dynamic_loss_scale = False
if args.loss_scale is None:
    args.dynamic_loss_scale = True
    if args.rank == 0:
        print(' > using dynamic loss scaling')

# The fp32_* flags are only meant to be active when fp16 is set, so their
# default is False. They are forced to False without fp16 (as before) and,
# because the hand-built namespace above never declares them, they are also
# created as False when missing. Otherwise they would stay undefined with
# fp16=True and any later read would raise AttributeError.
for _flag in ('fp32_embedding', 'fp32_tokentypes', 'fp32_layernorm'):
    if not args.fp16 or not hasattr(args, _flag):
        setattr(args, _flag, False)


using world size: 1 and model-parallel size: 1 
 > using dynamic loss scaling


In [6]:
# Count the lines of the training file with the shell's `wc`; IPython
# substitutes {args.train_data} with the configured path before running it.
! wc -l {args.train_data}

32 data/xinli001_qa-dev.json


## Tokenizer

In [7]:
%%time

tokenizer = make_tokenizer(SentencePieceTokenizer, None, model_path=args.tokenizer_path)

CPU times: user 73.8 ms, sys: 11.5 ms, total: 85.3 ms
Wall time: 83.7 ms


## Init

In [8]:
%%time

# Pytorch distributed.
initialize_distributed(args)
if torch.distributed.get_rank() == 0:
    print('Pretrain GPT2 model')
    print_args(args)

# Autoresume.
torch.distributed.barrier()
if args.adlr_autoresume:
    enable_adlr_autoresume(args)

# Random seeds for reproducibility.
set_random_seed(args.seed)


> initializing model parallel with size 1
Pretrain GPT2 model
arguments:
  num_layers ................... 12
  hidden_size .................. 768
  num_attention_heads .......... 12
  max_position_embeddings ...... 1024
  vocab_size ................... None
  make_vocab_size_divisible_by . 128
  attention_dropout ............ 0.1
  hidden_dropout ............... 0.1
  seq_length ................... 512
  model_parallel_size .......... 1
  tokenizer_model_type ......... bert-large-uncased
  tokenizer_type ............... <class 'data_utils.tokenization.SentencePieceTokenizer'>
  tokenizer_path ............... ./data/spm/gpt2_huamei_corpus_bpe_32k_v2.model
  train_data ................... data/xinli001_qa-dev.json
  lazy_loader .................. None
  valid_data ................... None
  test_data .................... None
  use_tfrecords ................ None
  presplit_sentences ........... None
  num_workers .................. 2
  split ........................ 949,50,1
  delim ...

## Model

In [9]:
# Model, optimizer, and learning rate.
# Left disabled: none of the cells below use `model`, `optimizer` or
# `lr_scheduler`. Uncomment to build them.
# model, optimizer, lr_scheduler = setup_model_and_optimizer(args)

## Data

In [10]:
# Build the three data splits. The same call also reports the vocabulary size
# and the end-of-document token id, which are stored straight onto `args`.
train_data, val_data, test_data, args.vocab_size, args.eod_token = \
    get_train_val_test_data(args)

configuring data
> padded vocab (size: 32008) with 120 dummy tokens (new size: 32128)
> found end-of-document token: 1


In [16]:
# Length of each split, one per line (train, validation, test) ...
print(*(len(split) for split in (train_data, val_data, test_data)), sep='\n')

# ... followed by the vocabulary size and the end-of-document token id.
print(args.vocab_size, args.eod_token, sep='\n')

7500
250
250
32128
1


## Get Batch

In [18]:
# Peek at one raw batch through a throwaway iterator. The original cell
# displayed `data`, which is only assigned in the next cell, so it raised
# NameError on a fresh kernel (Restart & Run All).
sample_batch = next(iter(train_data))

# The iterator actually consumed by the following cells.
data_iterator = iter(train_data)

sample_batch

{'text': tensor([[ 8878,  7992,  4793,  ...,  8852,  8880,  8785],
         [ 8785,   187, 10440,  ...,  8785,   494,   535],
         [ 8962,  9408,   853,  ...,  3044,  9817,  9779],
         [ 8993,  9787,  8856,  ...,  9831,  8878,  8785]])}

In [19]:

# Which entries of the batch dict to broadcast, and the dtype they carry.
keys = ['text']
datatype = torch.long  # same dtype object as torch.int64

# Pull the next batch and broadcast it via mpu.
data = next(data_iterator)
data_b = mpu.broadcast_data(keys, data, datatype)

# Shift by one position: inputs drop the last token, targets drop the first.
tokens_ = data_b['text'].long()
tokens = tokens_[:, :-1].contiguous()
labels = tokens_[:, 1:].contiguous()

tokens

tensor([[ 880, 5547, 2081,  ..., 2431, 1571, 8878],
        [9025, 8913, 8872,  ..., 8836, 1595, 2652],
        [8785, 2428, 1701,  ...,  297,   57,  686],
        [ 115, 2379, 8830,  ...,   58, 9635, 8818]], device='cuda:0')

In [22]:
# Get the attention mask, loss mask and position ids for the input tokens.
# The three trailing flags come from `args` and are all None in this notebook.
attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
    tokens, args.eod_token,
    args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss,
)

In [23]:
# Display all three results together as one tuple.
(attention_mask, loss_mask, position_ids)

(tensor([[[[1., 0., 0.,  ..., 0., 0., 0.],
           [1., 1., 0.,  ..., 0., 0., 0.],
           [1., 1., 1.,  ..., 0., 0., 0.],
           ...,
           [1., 1., 1.,  ..., 1., 0., 0.],
           [1., 1., 1.,  ..., 1., 1., 0.],
           [1., 1., 1.,  ..., 1., 1., 1.]]]], device='cuda:0'),
 tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0'),
 tensor([[  0,   1,   2,  ..., 509, 510, 511],
         [  0,   1,   2,  ..., 509, 510, 511],
         [  0,   1,   2,  ..., 509, 510, 511],
         [  0,   1,   2,  ..., 509, 510, 511]], device='cuda:0'))

In [35]:
# Drop the two leading singleton dimensions (see the shape printed at the end
# of the notebook) to view the 2-D lower-triangular mask on its own.
attention_mask[0, 0]

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.]], device='cuda:0')

In [28]:
# Shape of the attention mask as a torch.Size.
attention_mask.size()

torch.Size([1, 1, 512, 512])