# Megatron ForumQA

## CD

切换到工作目录（仓库根目录）。请根据实际情况调整，不一定是下面这条命令。

In [1]:
# Move the kernel's working directory up one level; later cells use paths
# relative to it (e.g. environments/, requirements/, checkpoints/).
%cd ..

/home/Public/Megatron-LM


## 环境准备

准备运行这个笔记本的 Jupyter kernel(**如果已经准备就绪，不要重复执行！**)：


1. 配置一个 Conda 环境作为 Jupyter Kernel

In [None]:
# Create or update the Conda environment described by environments/environment-ipy.yml.
%conda env update -f environments/environment-ipy.yml

安装完毕后，为该 Notebook 选择这个 Kernel (名为`Megatron_LM-ipy`)

2. 在Kernel所在 Conda 环境中安装 Apex

需要通过 pip 从 GitHub 下载源代码并编译安装：

In [None]:
# Install the packages listed in requirements/apex.txt into the kernel's
# environment, with verbose pip output (-v).
%pip install -v -r requirements/apex.txt

## 下载 Checkpoints

文件比较大，根据实际情况选择下载，**不要重复下载**

In [2]:
import os

# Bucket name, and the checkpoints directory. CKPTS_DIR is used both as the
# key prefix inside the bucket and as the local path relative to the working
# directory.
S3_BUCKET = 'huamei'
CKPTS_DIR = 'checkpoints/345m-mildil'

# S3 URIs always use '/' as separator, so build the URI with plain string
# formatting rather than os.path.join, whose separator depends on the platform.
S3_CKPTS_DIR = f's3://{S3_BUCKET}/{CKPTS_DIR}'

In [3]:
%%time

# 复制 latest_checkpointed_iteration.txt
!aws s3 cp \
    {S3_CKPTS_DIR} \
    {CKPTS_DIR} \
    --recursive \
    --exclude "*" \
    --include "latest_checkpointed_iteration.txt"

# 下载后读取最新的 checkpoint iter 名称
iter_step = open(f'{CKPTS_DIR}/latest_checkpointed_iteration.txt').read().strip()
ckpt_dir = f'iter_{iter_step}'

print(f'checkpoint {ckpt_dir}')

s3_ckpt_dir = os.path.join(S3_CKPTS_DIR, ckpt_dir)
local_ckpt_dir = os.path.join(CKPTS_DIR, ckpt_dir)

print(f'sync: {s3_ckpt_dir} -> {local_ckpt_dir}')
    
# 同步最新的 Checkpiont
!aws s3 sync \
    s3://huamei/hmgpt2-checkpoints/345m-hmwebmix-bpe-v2/iter_0230000 \
    ./checkpoints/345m-hmwebmix-bpe-v2/iter_0230000


checkpoint iter_230000
sync: s3://huamei/checkpoints/345m-hmwebmix-bpe-v2/iter_230000 -> checkpoints/345m-hmwebmix-bpe-v2/iter_230000
CPU times: user 27.4 ms, sys: 21.1 ms, total: 48.5 ms
Wall time: 2.26 s


## Environment Variables

- 用哪个/些 GPU?

In [3]:
# Set CUDA_VISIBLE_DEVICES to 0 for this kernel process.
%env CUDA_VISIBLE_DEVICES 0

env: CUDA_VISIBLE_DEVICES=0


## Importings

In [2]:
import copy
import csv
import json
import math
import os
import random
import sys
import time
from contextlib import closing
from itertools import chain, compress
from functools import partial
from multiprocessing import Pool
from types import SimpleNamespace

import numpy as np
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm, trange

import mpu
from data_utils.tokenization import SentencePieceTokenizer, make_tokenizer
from pretrain_gpt2 import get_masks_and_position_ids
from predict_gpt2 import initialize_distributed, prepare_tokenizer, set_random_seed, setup_model, get_token_stream

## Args

In [3]:
# Architecture hyper-parameters per model size; the key is the size name
# selected through PARAMETERS and each value is copied onto `args`.
MODELS_HPARAMS_DICT = {
    '117m': {
        'num_layers': 12,
        'hidden_size': 768,
        'num_attention_heads': 12,
        'max_position_embeddings': 1024,
    },
    '345m': {
        'num_layers': 24,
        'hidden_size': 1024,
        'num_attention_heads': 16,
        'max_position_embeddings': 1024,
    },
}

In [4]:
# Model size to use; must be one of the keys of MODELS_HPARAMS_DICT.
PARAMETERS = '117m'

In [6]:
# Stand-in for the command-line arguments: a plain namespace built from one
# mapping, in the same attribute order as the grouped sections below.
args = SimpleNamespace(**{
    # --- Model ---
    'vocab_size': None,
    'make_vocab_size_divisible_by': 128,
    'attention_dropout': 0.1,
    'hidden_dropout': 0.1,
    # --- Train/valid/test data ---
    'seq_length': 1024,
    'model_parallel_size': 1,
    'tokenizer_model_type': 'bert-large-uncased',
    # Alternative tokenizer type: 'GPT2BPETokenizer_CN'
    'tokenizer_type': 'SentencePieceTokenizer',
    'tokenizer_path': "./data/spm/gpt2_huamei_corpus_bpe_32k_v2.model",
    'cache_dir': None,
    # --- Training ---
    'load': './checkpoints/xinliqa.117m/',
    'seed': 1234,
    'checkpoint_activations': None,
    'checkpoint_num_layers': 1,
    'finetune': None,
    'no_load_optim': None,
    'no_load_rng': None,
    'resume_dataloader': None,
    'fp16': True,
    'hysteresis': 2,
    'loss_scale': None,
    'loss_scale_window': 1000,
    'min_scale': 1,
    'distributed_backend': 'nccl',
    'DDP_impl': 'local',
    'local_rank': None,
    'reset_position_ids': None,
    'reset_attention_mask': None,
    'eod_mask_loss': None,
    # --- Text generation ---
    'recompute': None,
    'greedy': False,
    'top_p': 0.0,
    'top_k': 0,
    'temperature': 1.0,
    'out_seq_length': 1024,
})

# Copy the architecture hyper-parameters of the selected model size onto args
# (overwriting any attribute of the same name).
vars(args).update(MODELS_HPARAMS_DICT[PARAMETERS])

In [7]:
# The checkpoint directory given by args.load holds a small text file with the
# number of the most recently saved iteration; parse it as an integer.
latest_iter_path = os.path.join(args.load, 'latest_checkpointed_iteration.txt')
with open(latest_iter_path) as iter_file:
    latest_checkpointed_iteration = int(iter_file.read().strip())
print('latest_checkpointed_iteration:', latest_checkpointed_iteration)

latest_checkpointed_iteration: 130000


In [8]:
# Runtime facts: CUDA availability, plus rank and world size taken from the
# environment (defaulting to a single process).
args.cuda = torch.cuda.is_available()
args.rank = int(os.environ.get('RANK', '0'))
args.world_size = int(os.environ.get("WORLD_SIZE", '1'))

if os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK'):
    # Launched through (OpenMPI) mpirun as distributed data-parallel processes:
    # take the per-node rank and size from the OpenMPI variables.
    local_rank = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK'))
    local_size = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_SIZE'))

    # Node count and node id come from Slurm when present, else one node.
    num_nodes = int(os.environ.get('SLURM_JOB_NUM_NODES', '1'))
    nodeid = int(os.environ.get('SLURM_NODEID', '0'))

    args.local_rank = local_rank
    args.rank = nodeid * local_size + local_rank
    args.world_size = num_nodes * local_size

# Model parallelism cannot exceed the number of processes.
args.model_parallel_size = min(args.model_parallel_size, args.world_size)
if args.rank == 0:
    print('using world size: {} and model-parallel size: {} '.format(
        args.world_size, args.model_parallel_size))

# Without an explicit loss scale, fall back to dynamic loss scaling.
args.dynamic_loss_scale = args.loss_scale is None
if args.dynamic_loss_scale and args.rank == 0:
    print(' > using dynamic loss scaling')

# The fp32_* switches are only meant to be active together with fp16,
# so they are all forced off when fp16 is disabled.
if not args.fp16:
    args.fp32_embedding = args.fp32_tokentypes = args.fp32_layernorm = False


using world size: 1 and model-parallel size: 1 
 > using dynamic loss scaling


## Init

### 初始化函数/全局变量

In [9]:
# Module-level handles filled in by initialize(); both stay None until it runs.
tokenizer = None
model = None

def initialize():
    """Prepare the process-wide tokenizer and model used by the prediction helpers.

    Calls, in order, ``initialize_distributed``, ``set_random_seed``,
    ``prepare_tokenizer`` and ``setup_model`` with the shared ``args``, and
    stores the results in the module-level ``tokenizer`` and ``model``.
    Also records the current CUDA device and a batch size of 1 on ``args``.

    Raises:
        AssertionError: if this process is not model-parallel rank 0.
    """
    global model, tokenizer

    # Disable cuDNN.
    torch.backends.cudnn.enabled = False

    # PyTorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Get the tokenizer.
    tokenizer = prepare_tokenizer(args)

    # Build the model via setup_model (the log output of this notebook shows
    # it also loading the checkpoint under args.load).
    model = setup_model(args)

    # Remember which CUDA device is current for this process.
    args.device = torch.cuda.current_device()

    # Set the default batch size to 1.
    args.batch_size = 1

    # This notebook only runs as model-parallel rank 0.
    assert mpu.get_model_parallel_rank() == 0

### 主进程初始化

In [10]:
%%time

initialize()


> initializing model parallel with size 1
> initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
prepare tokenizer done
building GPT2 model ...
 > number of parameters on model parallel rank 0: 110516736
global rank 0 is loading checkpoint ./checkpoints/xinliqa.117m/iter_0130000/mp_rank_00/model_optim_rng.pt
  successfully loaded ./checkpoints/xinliqa.117m/iter_0130000/mp_rank_00/model_optim_rng.pt
CPU times: user 5.22 s, sys: 1.59 s, total: 6.81 s
Wall time: 6.84 s


## 预测用的函数

In [11]:
def infer_tokens_generative(context_tokens, model, tokenizer):
    """Generate token ids that continue ``context_tokens``, one id at a time.

    Args:
        context_tokens: list of prompt token ids.
        model: model passed through to ``get_token_stream``.
        tokenizer: tokenizer passed through to ``get_token_stream``.

    Yields:
        The last token id of the first sequence in each step's output.
        Generation stops once prompt length plus the number of yielded
        tokens reaches ``args.seq_length``, or when the stream ends.
    """
    prompt_len = len(context_tokens)
    stream = get_token_stream(model, [context_tokens], tokenizer, args)
    for step, item in enumerate(stream):
        # Stop as soon as the total length would hit the sequence limit.
        if prompt_len + step >= args.seq_length:
            return
        step_tokens, _ = item
        # Only one prompt is submitted, so row 0 is the sequence of interest.
        first_row = step_tokens.cpu().numpy().tolist()[0]
        yield first_row[-1]


def infer_text_generative(contex_text, model, tokenizer):
    """Generate a continuation of a text prompt, one decoded piece per token.

    Args:
        contex_text: prompt text; surrounding whitespace is stripped before
            it is encoded.
        model: model passed through to ``infer_tokens_generative``.
        tokenizer: object providing ``EncodeAsIds(text).tokenization`` and
            ``DecodeIds(ids)``.

    Yields:
        ``tokenizer.DecodeIds([token_id])`` for each generated token id.
    """
    context_tokens = tokenizer.EncodeAsIds(contex_text.strip()).tokenization
    # Reuse the id-level generator so the stream handling and the sequence
    # length limit live in exactly one place.
    for token_id in infer_tokens_generative(context_tokens, model, tokenizer):
        yield tokenizer.DecodeIds([token_id])


def question_to_ids(data, end_with_bos=True):
    """Encode a question record into the prompt token ids for generation.

    Layout: title ``<sep>`` text ``<sep>`` [one randomly chosen tag] ``<sep>``
    ``<sep>`` [``<bos>``]. Uses the module-level ``tokenizer``.

    Args:
        data: mapping with string values under ``'title'`` and ``'text'`` and
            an optional list of strings under ``'tags'``.
        end_with_bos: when true, append the ``<bos>`` id so the prompt ends at
            the start of the answer.

    Returns:
        A list of token ids.
    """
    sep_id = tokenizer.TokenToId('<sep>')
    token_ids = []

    # Title and question body, each closed by a separator.
    for field in ('title', 'text'):
        token_ids += tokenizer.EncodeAsIds(data[field].strip()).tokenization
        token_ids.append(sep_id)

    # At most one tag, picked at random when tags are present.
    tag_choices = data.get('tags')
    if tag_choices:
        picked = random.choice(tag_choices)
        token_ids += tokenizer.EncodeAsIds(picked.strip()).tokenization

    # One separator closes the tag slot and a second one separates question
    # from answer, so the prompt always contains two consecutive `<sep>`.
    token_ids += [sep_id, sep_id]

    # Start-of-answer marker.
    if end_with_bos:
        token_ids.append(tokenizer.TokenToId('<bos>'))
    return token_ids


## 验证是否可运行

In [8]:
# Hand-written sample questions for the smoke test. Each record has the
# 'title', 'text' and 'tags' fields that question_to_ids() reads.
question_list = [
    {
        "title": "22岁，做直销",
        "text": "父母都很支持，可是我性格内向，而且不喜欢说话，圈内有很多成功了的大神，我没自信能不能成功，直销靠谱不",
        "tags": ["职业", "职业管理", "工作压力"]
    },
    {
        "title": "真理都是相通的?",
        "text": "从洞穴出来看见理性之光的形而上人开始拼凑真理的碎片，到达二点零的人拼凑起来啦一部分碎片形成自己一套看世界的体系。\n这时候的人都会用自己的一套概念体系去描述他研究得那一部分真理，可是你会发现他们说得都是一回事——真理都是相通的只是表述不同罢啦\n人性是人文科学的研究对象，物性是自然科学的研究对象，前者合乎情理（符合哲理），后者合乎逻辑（符合形式逻辑那一套），宗教的研究对象是信仰，哲学是哲学家，美学是艺术家。\n真理的高山一直在那里，我们已经探索啦很多很多，我们学习学科历史是要把山头探索清楚然后摸清楚魔方块的套路，最后构建自己的魔方块",
        "tags": ["科普", "书籍", "热点话题", "心理咨询"]
    },
]

In [11]:
# Sampling settings (args.recompute, args.top_p, args.top_k, args.temperature)
# can be overridden here before generating.

n_gen = 3    # answers to generate per question
max_try = 3  # attempts per answer when an attempt produces nothing

# Look the end-of-sequence id up once instead of on every first token.
eos_id = tokenizer.TokenToId('<eos>')

for question in question_list:
    for i in range(n_gen):
        print(f'Answer {i+1}: ', end='')
        for _ in range(max_try):
            # Count emitted tokens instead of checking the last decoded piece:
            # a piece can decode to '' even after a full answer was printed,
            # which used to trigger a spurious retry on the same line.
            n_emitted = 0
            # Rebuilt on every attempt because the tag is chosen at random.
            input_ids = question_to_ids(question)
            for j, id_ in enumerate(infer_tokens_generative(input_ids, model, tokenizer)):
                # An immediate <eos> means an empty answer: drop this attempt.
                if j == 0 and id_ == eos_id:
                    break
                print(tokenizer.DecodeIds([id_]), end='')
                n_emitted += 1
            if n_emitted:
                break
        print()
    print()
    print('=' * 100)
    print()
    

Answer 1: 难,领导请著我离家等,可能跟其他人讲说话又会不好使,气氛尴尬内向 <eos>
Answer 2: 社区都说我是金金立难变。上班暑假过后面试对象,最近,家人对我越来冷淡,踢自行车我也踢了好多要求我去做ay不支持心理学将来,我没有成年学习伙伴,他们就更纯涉之中的,我还大学如何他们都帮助我,后来我都想从一年半前做成为母女,现在他们怕了我朋友跟她妈残害了她有很严重的疾病,我都觉得自己很g很可怕,我很是为别人而活 <eos>
Answer 3: ,可我却吃得是挺辛苦的标志真的很不舒服,但就是我越不想在家表面上大看上去很好看,支持我做郭黑车没出息,而我却2号得了,越多鬱,女儿现在才11一再的信任学校,就更不想来了,怎么调解那种他们?我刚开始发现绯闻有人在窥视我隐私特别的活跃的想法!自从女儿玩,看见女孩子染上自拍而且,脸大了,歟26调都丑多了!这是怎么能判断点儿反应?这我一直在儿子轮流,今年若干年裤結囿于即,2015年打算一骄傲点,感受点浓厚和功利,使聪明反思甚至在他们面前下摇。原来我一直所在的院子里,包括医药管也没人能做出来的,但是我妈,那个项目全丢了。我们钱也是全贴补十八万1.2万可以换回,甚至如果换起来方便更多从头再来一次的资金投诉等,我必须一口气把这个还完。其实我也想过向朋友倾诉,可是因为在她们面前谁都没说,等她们哭的那久终于垮了,只有我自己在电话里跟自己说,现在的社会好黑暗,好想改善现状,想挣脱现状,好想赚钱。与丈夫此刻的相处更糟糕。 <eos>


Answer 1: 像我,让瓣也也被似乎与外师美的面孔哗众取笑,辄主义,不知公主病流不止于夸张,不服输,看法的女孩子和生物之间的竞争我对这方面真不好吧, <eos>
Answer 2: 人心都是个神经元,不过宣传酷伦后来我所在,暗恋过度解读,因为中国式思想和思想们不断步死亡,我被感染但是我就和中国系统学号说如果人中医学不好也提高了。这是我想的?怎样自我探究?有时候那个人说的我很新奇想的一些人,其实有些不完全,但是这是他非的心理学依据???我怕他们注视不是医学专业。。。他们老让我自言自语啦,我该怎么办?求解释 <eos>
Answer 3: 人心目,我的样样蛇女判断力极好,否则不敢得罪人...夜深人,和机器人曾遭遇过強的磨合,学生都很多..而且亲口说活在世,但身边的人便一直隐忍着...这十年,幻听而八面阅

## Test

使用 test 语料：从中抽取若干条问题，为每条问题生成多个回答，并将生成结果与原始数据一并写入输出文件，便于比较原文与预测结果。

随机选 N 个

In [None]:
n_samples = 1000  # number of test records to use; <= 0 means all of them
n_infer = 5       # answers generated per record
b_random = True   # pick the records at random instead of taking the first ones
b_shuffle = True  # shuffle the order of the picked records

input_file = './data/xinliqa/test.json'
output_file = f'./data/xinliqa/{PARAMETERS}.predict-{latest_checkpointed_iteration}-{n_samples}_{args.seq_length}_{n_infer}'
if b_random:
    output_file += '.random'
if b_shuffle:
    output_file += '.shuffle'
output_file += '.json'

print(f'output_file={output_file}')

# Count the lines of the input (one JSON record per line). The file is opened
# in a `with` block so the handle is closed, and with an explicit encoding
# because the data is non-ASCII and the default encoding is platform-dependent.
with open(input_file, encoding='utf-8') as fp:
    total = sum(1 for _ in tqdm(fp))
print(f'Test 数据总数: {total:,d}')

if n_samples > 0:
    assert total >= n_samples, f'requested {n_samples} samples but only {total} lines available'
else:
    n_samples = total

print(f'Test 采样数: {n_samples:,d}')

# Line-selection mask: the first n_samples lines, or a random subset of the
# same size when b_random is set.
mask = np.zeros(total, dtype=int)
mask[:n_samples] = 1
if b_random:
    np.random.shuffle(mask)

samples = []
with open(input_file, encoding='utf-8') as fp:
    reader = compress(fp, mask)
    for line in tqdm(reader, 'sample', total=n_samples):
        line = line.strip()
        if not line:
            # Blank lines were counted above but carry no record.
            continue
        data = json.loads(line)
        samples.append(data)

if b_shuffle:
    random.shuffle(samples)

# One JSON object per output line: the original record plus an 'inferred'
# list holding n_infer generated answers.
with open(output_file, 'w', encoding='utf-8') as fp:
    for data in tqdm(samples, 'infer'):
        inferred = data['inferred'] = []
        for i in range(n_infer):
            # Rebuilt per answer because the tag is chosen at random.
            input_ids = question_to_ids(data)
            output_ids = list(infer_tokens_generative(input_ids, model, tokenizer))
            inferred.append(tokenizer.DecodeIds(output_ids))
        print(json.dumps(data, ensure_ascii=False), file=fp)
        # Inference is slow; flush so finished records survive an interruption.
        fp.flush()


output_file=./data/xinliqa/117m.predicted-130000-1000_1024_5.random.shuffle.json


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Test 数据总数: 4,000
Test 采样数: 1,000


HBox(children=(IntProgress(value=0, description='sample', max=1000, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='infer', max=1000, style=ProgressStyle(description_width='init…