# Megatron 直接使用预训练模型进行预测 (BPE v2)

## 环境准备

准备运行这个笔记本的 Jupyter kernel(**如果已经准备就绪，不要重复执行！**)：


1. 配置一个 Conda 环境作为 Jupyter Kernel

In [1]:
# %conda env update -f environments/environment-ipy.yml

安装完毕后，为该 Notebook 选择这个 Kernel (名为`Megatron_LM-ipy`)

2. 在Kernel所在 Conda 环境中安装 Apex

需要通过 pip 从 github 下载源代码安装：

In [2]:
# %pip install -v -r requirements/apex.txt

## CD

切换到项目工作目录（即 Megatron-LM 的根目录）。请根据实际情况调整，不一定是下面这条命令。

In [3]:
%cd ..

/home/Public/Megatron-LM


## 指定 Checkpoints 目录

### 从 S3 下载

文件比较大，根据实际情况选择下载，**不要重复下载**

如果需要下载，修改路径等，然后执行

In [4]:
%%time

import os

HPARAMS_NAME = '345m'
MODEL_NAME = '345m-hmwebmix-bpe-v2' # '345m-hmwebmix-bpe-v2' # '345m-xinliqa-hunyin' # '117m-hmwebmix_191128-bpe_v2' #  
TOKENIZER_TYPE = 'GPT2BPETokenizer_CN'

AWSS3_CKPTS_DIR = os.path.join('s3://huamei/hmgpt2-checkpoints', MODEL_NAME)
LOCAL_CKPTS_DIR = os.path.join('./checkpoints', MODEL_NAME)

# 复制 latest_checkpointed_iteration.txt
!aws s3 cp \
    {AWSS3_CKPTS_DIR} \
    {LOCAL_CKPTS_DIR} \
    --recursive \
    --exclude "*" \
    --include "latest_checkpointed_iteration.txt"

# 下载后读取最新的 checkpoint iter 名称
iteration = open(f'{LOCAL_CKPTS_DIR}/latest_checkpointed_iteration.txt').read()
iteration = int(iteration)
iteration_dir = 'iter_{:07d}'.format(iteration)

awss3_ckpt_dir = os.path.join(AWSS3_CKPTS_DIR, iteration_dir)
local_ckpt_dir = os.path.join(LOCAL_CKPTS_DIR, iteration_dir)

print(f'{awss3_ckpt_dir} ==> {local_ckpt_dir}')
    
# 同步最新的 Checkpiont
!aws s3 sync {awss3_ckpt_dir} {local_ckpt_dir}

#
load_model_dir = LOCAL_CKPTS_DIR
print('load: ', load_model_dir)
print('iteration: ', iteration)

download: s3://huamei/hmgpt2-checkpoints/345m-hmwebmix-bpe-v2/latest_checkpointed_iteration.txt to checkpoints/345m-hmwebmix-bpe-v2/latest_checkpointed_iteration.txt
s3://huamei/hmgpt2-checkpoints/345m-hmwebmix-bpe-v2/iter_0390000 ==> ./checkpoints/345m-hmwebmix-bpe-v2/iter_0390000
load:  ./checkpoints/345m-hmwebmix-bpe-v2
iteration:  390000
CPU times: user 55.7 ms, sys: 20.1 ms, total: 75.8 ms
Wall time: 2.59 s


### 直接使用本地

- 另外一个选择：直接使用本地的已有模型

修改超参数, 路径等：

In [3]:
%%time

import os

HPARAMS_NAME = '117m'
MODEL_NAME = '117m.spm-xinli_qa-hunyin'
TOKENIZER_TYPE = 'SentencePieceTokenizer'

AWSS3_CKPTS_DIR = os.path.join('s3://huamei/hmgpt2-checkpoints', MODEL_NAME)
LOCAL_CKPTS_DIR = os.path.join('./checkpoints', MODEL_NAME)


# 读取最新的 checkpoint iter 名称
iteration = open(f'{LOCAL_CKPTS_DIR}/latest_checkpointed_iteration.txt').read()
iteration_dir = 'iter_{:07d}'.format(int(iteration))

local_ckpt_dir = os.path.join(LOCAL_CKPTS_DIR, iteration_dir)

load_model_dir = LOCAL_CKPTS_DIR
print('load: ', load_model_dir)
print('iteration: ', iteration)

load:  ./checkpoints/117m.spm-xinli_qa-hunyin
iteration:  60000
CPU times: user 1.92 ms, sys: 350 µs, total: 2.26 ms
Wall time: 9.16 ms


## Environment Variables

- 用哪个/些 GPU?

In [5]:
%env CUDA_VISIBLE_DEVICES 0

env: CUDA_VISIBLE_DEVICES=0


## Imports

In [5]:
import copy
import csv
import json
import math
import os
import random
import sys
import time
from copy import copy, deepcopy
from contextlib import closing
from datetime import datetime, timedelta
from itertools import chain, compress
from functools import partial
from multiprocessing import Pool
from types import SimpleNamespace

import numpy as np
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm, trange

import mpu
from data_utils.tokenization import SentencePieceTokenizer, make_tokenizer
from pretrain_gpt2 import initialize_distributed, get_masks_and_position_ids, set_random_seed
from generate_samples import prepare_tokenizer, setup_model, get_token_stream
# from predict_gpt2 import initialize_distributed, prepare_tokenizer, set_random_seed, setup_model, get_token_stream

## Args

In [6]:
# Hand-built stand-in for a parsed command-line namespace: every setting the
# rest of this notebook reads from `args` starts out here. Further attributes
# (model-size preset, rank/world size, device, batch size) are added by later
# cells.
args = SimpleNamespace(
    # Model arguments
    # To be updated ...
    vocab_size=None,
    make_vocab_size_divisible_by=128,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    # Train/valid/test data arguments.
    seq_length=1024,  # the inference helpers stop once prompt + generated tokens reach this
    model_parallel_size=1,  # clamped to world_size in a later cell
    tokenizer_model_type='bert-large-uncased',
    tokenizer_type=TOKENIZER_TYPE,  # chosen in the checkpoint-selection cell
    tokenizer_path="./data/spm/gpt2_huamei_corpus_bpe_32k_v2.model",
    cache_dir=None,
    # Training arguments.
    load=load_model_dir,  # checkpoints root chosen in the checkpoint-selection cell
    seed=1234,
    checkpoint_activations=None,
    checkpoint_num_layers=1,
    finetune=None,
    no_load_optim=None,
    no_load_rng=None,
    resume_dataloader=None,
    fp16=True,
    hysteresis=2,
    loss_scale=None,  # None makes a later cell turn on dynamic_loss_scale
    loss_scale_window=1000,
    min_scale=1,
    distributed_backend='nccl',
    DDP_impl='local',
    local_rank=None,  # overwritten in a later cell when launched through OpenMPI
    reset_position_ids=None,
    reset_attention_mask=None,
    eod_mask_loss=None, 
    # Text generate arguments.
    # recompute / top_p / top_k / temperature are overridden per sample by the
    # test cell and put back by reset_generating_args().
    recompute=None,
    greedy=False,
    top_p=0.0,
    top_k=0,
    temperature=1.0,
    out_seq_length=128,
)


# restore generating args
def reset_generating_args(args):
    """Put the sampling-related fields of ``args`` back to their neutral values.

    Mutates ``args`` in place and returns ``None``. Only ``recompute``,
    ``top_p``, ``top_k`` and ``temperature`` are touched; every other
    attribute is left as it is.
    """
    neutral_values = (
        ('recompute', False),
        ('top_p', 0.0),
        ('top_k', 0),
        ('temperature', 1),
    )
    for field_name, field_value in neutral_values:
        setattr(args, field_name, field_value)


In [7]:
# Model-size presets, keyed by the name stored in HPARAMS_NAME.
HPARAMS_SCHEMA = {
    '117m': {
        'num_layers': 12,
        'hidden_size': 768,
        'num_attention_heads': 12,
        'max_position_embeddings': 1024,
    },
    '345m': {
        'num_layers': 24,
        'hidden_size': 1024,
        'num_attention_heads': 16,
        'max_position_embeddings': 1024,
    },
}

# Copy the selected GPT-2 preset onto args, one attribute per entry
selected_hparams = HPARAMS_SCHEMA[HPARAMS_NAME]
for hparam_name, hparam_value in selected_hparams.items():
    setattr(args, hparam_name, hparam_value)


In [8]:
# Runtime facts: CUDA availability, plus rank and world size taken from the
# environment (defaulting to a single process).
args.cuda = torch.cuda.is_available()
args.rank = int(os.getenv('RANK', '0'))
args.world_size = int(os.getenv("WORLD_SIZE", '1'))

ompi_local_rank = os.getenv('OMPI_COMM_WORLD_LOCAL_RANK')
if ompi_local_rank:
    # We are using (OpenMPI) mpirun for launching distributed data parallel processes
    local_rank = int(ompi_local_rank)
    local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))

    # Possibly running with Slurm
    num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
    nodeid = int(os.getenv('SLURM_NODEID', '0'))

    args.local_rank = local_rank
    args.rank = nodeid * local_size + local_rank
    args.world_size = num_nodes * local_size

# Model parallelism cannot exceed the number of processes
args.model_parallel_size = min(args.model_parallel_size, args.world_size)
if args.rank == 0:
    print('using world size: {} and model-parallel size: {} '.format(
        args.world_size, args.model_parallel_size))

# Dynamic loss scaling is used exactly when no fixed loss scale was given
args.dynamic_loss_scale = args.loss_scale is None
if args.dynamic_loss_scale and args.rank == 0:
    print(' > using dynamic loss scaling')

# The args fp32_* or fp16_* meant to be active when the
# args fp16 is set. So the default behavior should all
# be false.
if not args.fp16:
    args.fp32_embedding = args.fp32_tokentypes = args.fp32_layernorm = False


using world size: 1 and model-parallel size: 1 
 > using dynamic loss scaling


In [9]:
display(args)

namespace(DDP_impl='local', attention_dropout=0.1, cache_dir=None, checkpoint_activations=None, checkpoint_num_layers=1, cuda=True, distributed_backend='nccl', dynamic_loss_scale=True, eod_mask_loss=None, finetune=None, fp16=True, greedy=False, hidden_dropout=0.1, hidden_size=1024, hysteresis=2, load='./checkpoints/345m-hmwebmix-bpe-v2', local_rank=None, loss_scale=None, loss_scale_window=1000, make_vocab_size_divisible_by=128, max_position_embeddings=1024, min_scale=1, model_parallel_size=1, no_load_optim=None, no_load_rng=None, num_attention_heads=16, num_layers=24, out_seq_length=128, rank=0, recompute=None, reset_attention_mask=None, reset_position_ids=None, resume_dataloader=None, seed=1234, seq_length=1024, temperature=1.0, tokenizer_model_type='bert-large-uncased', tokenizer_path='./data/spm/gpt2_huamei_corpus_bpe_32k_v2.model', tokenizer_type='GPT2BPETokenizer_CN', top_k=0, top_p=0.0, vocab_size=None, world_size=1)

## Init

### 初始化函数/全局变量

In [10]:
# Module-level handles filled in by initialize(); None until it has run.
tokenizer = None
model = None

def initialize():
    """Set up the distributed state, tokenizer and model used by this notebook.

    Reads the module-level ``args`` namespace and stores the results in the
    module-level ``tokenizer`` and ``model`` globals. Also records the current
    CUDA device in ``args.device`` and sets ``args.batch_size`` to 1.

    Raises:
        AssertionError: if this process is not model-parallel rank 0.
    """
    global model, tokenizer

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # get the tokenizer
    tokenizer = prepare_tokenizer(args)

    # Model, optimizer, and learning rate.
    model = setup_model(args)

    args.device = torch.cuda.current_device()

    # setting default batch size to 1
    args.batch_size = 1

    assert mpu.get_model_parallel_rank() == 0

### 主进程初始化

In [11]:
%%time

initialize()

> initializing model parallel with size 1
> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
prepare tokenizer done
building GPT2 model ...
 > number of parameters on model parallel rank 0: 336128000
global rank 0 is loading checkpoint ./checkpoints/345m-hmwebmix-bpe-v2/iter_0390000/mp_rank_00/model_optim_rng.pt
  successfully loaded ./checkpoints/345m-hmwebmix-bpe-v2/iter_0390000/mp_rank_00/model_optim_rng.pt
CPU times: user 11.6 s, sys: 4.4 s, total: 16 s
Wall time: 16 s


## Inference functions

In [12]:
def infer_tokens_generative(context_tokens, model, tokenizer):
    """Lazily yield generated token ids that continue ``context_tokens``.

    Wraps ``get_token_stream`` (called with a one-element batch and the
    module-level ``args``) and, for every item the stream produces, yields the
    last id of the first row. Generation stops when the stream ends, or as
    soon as prompt length plus the number of steps taken reaches
    ``args.seq_length``.

    Args:
        context_tokens: list of token ids used as the prompt.
        model: model object handed straight to ``get_token_stream``.
        tokenizer: tokenizer object handed straight to ``get_token_stream``.

    Yields:
        One token id per generation step.
    """
    prompt_len = len(context_tokens)
    with torch.no_grad():
        stream = get_token_stream(model, [context_tokens], tokenizer, args)
        for step, (batch_tokens, _) in enumerate(stream):
            # Budget exhausted: prompt + generated tokens would hit seq_length
            if prompt_len + step >= args.seq_length:
                break
            first_row = batch_tokens.cpu().numpy().tolist()[0]
            yield first_row[-1]


def infer_text_generative(contex_text, model, tokenizer):
    """Lazily yield decoded text pieces that continue ``contex_text``.

    The prompt is stripped of surrounding whitespace and encoded with
    ``tokenizer.EncodeAsIds``. Token generation, including the stop condition
    on ``args.seq_length``, is delegated to ``infer_tokens_generative`` so the
    streaming loop lives in one place. Each generated id is decoded on its own
    with ``tokenizer.DecodeIds``.

    Args:
        contex_text: prompt text.
        model: model object forwarded to ``infer_tokens_generative``.
        tokenizer: object providing ``EncodeAsIds`` and ``DecodeIds``.

    Yields:
        The decoded string for one generated token per step.
    """
    context_tokens = tokenizer.EncodeAsIds(contex_text.strip()).tokenization
    for token_id in infer_tokens_generative(context_tokens, model, tokenizer):
        yield tokenizer.DecodeIds([token_id])


## 预测试试看

In [13]:
# Prompt for the demo; the commented lines below are alternative prompts.
contex_text = '想和男人谈钱不伤感情，前提只有一个，做到了钱和感情就都有了 '
# '一个疯子进去学校把凳子打烂了，保安不敢动手，报警了，警察现在来了'
# '我喜欢'

# Echo the prompt, a separator line and one blank line
print(contex_text, '==========', '', sep='\n')

# Generate N independent continuations, printing each piece as it arrives
N = 3
for round_no in range(1, N + 1):
    print(f'{round_no}/{N}:\t', end='')
    for piece in infer_text_generative(contex_text.strip(), model, tokenizer):
        print(piece, end='')
    # End the continuation's line and leave one blank line after it
    print('\n')

想和男人谈钱不伤感情，前提只有一个，做到了钱和感情就都有了 

1/3:	。现在扯到谈感情,就硬扯到钱,让女人退回她的地盘,让男人谈钱后悔。我实在不知道这波操作算不算淫荡,所有的文字里,你绝对想不到自己是深爱这个女人的,她和你谈钱,是对你的爱吗?男人要有钱,至少50万。你要是甩女人一个大嘴巴子,脸皮厚到真能连脸带鼻子来复仇了,说不定你老婆会对你另娶此女,因为她知道你还剩5万年薪;如果你不要脸,哪留着呢,到时有3个人抢着要,

2/3:	。大蛇丸信笔生花,幅度之大,自不潇洒:既然不想不懂感谢,算了,就把我列入你毛条钱的范畴内吧:“首先,在被告知获得官晶之后,才能拿发票到日本,这是为了让日本方面将发票送回我的单位。”这一点,句句交代,事情的来龙去脉都清楚了。即便是有笔者经验丰富的“女秘书”栗原率先撕闪了,“我还是想亲口问问,一拍卖前,吉本老师会让一亿只贵宾室里摆满贵宾席,吉本

3/3:	。在谈资金周转的时候不得注意揣摩着眼前的小钱。别把你的钱象影片中的仆人一样让男人养,区别是嫁入豪门不等于嫁入了国,而爱情比钱更重要。除非你要求钱,那就请明确地告诉他:如今的票子,重要到能与生命的价值相比,甚至是生命的容颜。好了,爱情和物质,在某些男人眼里过过嘴瘾来,过过嘴瘾而已样资不一定要丰盛,精神上富足不一定是车房,只是罗纳德所向往的,年轻的时候向往着一切



## Test

使用 test 语料，从中随机打断，并预测下文，比较原文与预测结果！

随机选 N 个

In [13]:
from datetime import datetime
# Timestamp of this run; it becomes part of the output file name
ts = datetime.now().strftime('%y%m%d%H%M%S')

N = 100  # number of test records to sample
C = 3  # number of continuations generated for each sampled record
sample_random = 1  # truthy: pick N random lines; falsy: take the first N lines
sample_shuffle = 1  # truthy: shuffle the sampled records before inference

input_file = './data/hmwebmix_191203/test.json'
# The output name encodes timestamp, model preset, checkpoint iteration and sampling settings
output_file = f'./data/hmwebmix_191203/predict.{ts}-{HPARAMS_NAME}.{iteration_dir}-{N}x{C}x{args.out_seq_length}-r{sample_random}s{sample_shuffle}.tsv'

print(f'output_file={output_file}')

output_file=./data/hmwebmix_191203/predict.191205170349-345m.iter_0390000-100x3x128-r1s1.tsv


In [14]:
# Count the lines of the test file (one JSON record per line). The context
# manager closes the handle once counting is done.
with open(input_file, encoding='utf-8') as fp:
    total = sum(1 for _ in tqdm(fp))
print(f'Test 数据总数: {total:,d}')

assert total >= N

print(f'Test 采样数: {N:,d}')

# 0/1 selector over line numbers holding exactly N ones: the first N lines by
# default, or N random lines once the selector has been shuffled.
mask = np.zeros(total, dtype=int)
mask[:N] = 1
if sample_random:
    np.random.shuffle(mask)

samples = []
with open(input_file, encoding='utf-8') as fp:
    reader = compress(fp, mask)
    for line in tqdm(reader, 'sample', total=N):
        line = line.strip()
        if not line:
            # A selected blank line yields no sample, so fewer than N may remain
            continue
        text = json.loads(line)['text']
        samples.append(text)

if sample_shuffle:
    random.shuffle(samples)

# Each TSV row: prompt text, reference continuation, then C generated
# continuations. newline='' is the mode the csv module asks for, so the
# writer's own line terminator is not translated again.
with open(output_file, 'w', newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp, delimiter='\t')
    for context_txt in tqdm(samples, 'infer'):
        context_ids = tokenizer.EncodeAsIds(context_txt).tokenization
        context_len = len(context_ids)
        if context_len < 2:
            # Too short to split into a non-empty prompt and a reference part
            continue
        # Cut near the middle (Gaussian around 50% of the length), then clamp:
        # an unclamped draw can be <= 0 (empty prompt, or a negative index
        # that slices from the end) or >= context_len (empty reference).
        idx = round(random.gauss(context_len*0.5, context_len*0.1))
        idx = min(max(idx, 1), context_len - 1)
        input_ids = context_ids[:idx]
        label_ids = context_ids[idx:]
        row = [
            tokenizer.DecodeIds(ids)
            for ids in (input_ids, label_ids)
        ]
        for _ in range(C):
            reset_generating_args(args)
            try:
                args.recompute=True
                # NOTE(review): this draw can fall outside [0, 1]; left as-is
                # because how the sampler treats such top_p values is not
                # visible from this notebook - confirm before clamping.
                args.top_p=random.gauss(0.5, 0.5)
                args.temperature=random.gauss(1, 0.05)
                # A copy is passed so the generator cannot alter input_ids
                infer_ids = list(infer_tokens_generative(copy(input_ids), model, tokenizer))
                row.append(tokenizer.DecodeIds(infer_ids))
            finally:
                # Always restore the neutral sampling settings, even on failure
                reset_generating_args(args)
        writer.writerow(row)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Test 数据总数: 18,894
Test 采样数: 100


HBox(children=(IntProgress(value=0, description='sample', style=ProgressStyle(description_width='initial')), H…




HBox(children=(IntProgress(value=0, description='infer', style=ProgressStyle(description_width='initial')), HT…


