In [1]:
import sys, os
sys.path.append('./glm_10b_chinese')

import torch
import transformers

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import deepspeed

from modeling_glm import GLMForConditionalGeneration
from modeling_glm import GLMBlock

In [3]:

tokenizer = AutoTokenizer.from_pretrained("BAAI/glm-10b-chinese", trust_remote_code=True)
# model = AutoModelForSeq2SeqLM.from_pretrained("BAAI/glm-10b", trust_remote_code=True)
# model.load_state_dict(torch.load('./blocklm-10b-chinese/mp_rank_00_model_states.pt'),strict=False)
# model = GLMForConditionalGeneration.from_pretrained('glm_10b_chinese')
model = GLMForConditionalGeneration.from_pretrained('models/character_20230102_1055')
model = model.half().cuda()

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [17]:
print(model)

GLMForConditionalGeneration(
  (glm): GLMModel(
    (word_embeddings): VocabEmbedding()
    (transformer): GLMStack(
      (embedding_dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(1025, 4096)
      (block_position_embeddings): Embedding(1025, 4096)
      (layers): ModuleList(
        (0): GLMBlock(
          (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
            (attention_dropout): Dropout(p=0.1, inplace=False)
            (dense): Linear(in_features=4096, out_features=4096, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)
            (dense_4h_to_h): Linear(in_features

In [20]:
deepspeed.utils.logger.setLevel('INFO')

In [26]:
# Initialize the DeepSpeed-Inference engine
ds_engine = deepspeed.init_inference(model,
                                 mp_size=1,
                                 dtype=torch.half,
                                #  checkpoint=None,
                                #  replace_method='auto',
                                #  replace_with_kernel_inject=True
                                 injection_policy={GLMBlock: ('SelfAttention.o', 'DenseReluDense.wo')},
                                 enable_cuda_graph=True,
                                 
                                 )
model = ds_engine.module


[2023-02-08 19:32:20,730] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.8.0, git-hash=unknown, git-branch=unknown
[2023-02-08 19:32:20,732] [INFO] [logging.py:68:log_dist] [Rank 0] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


In [32]:
def gen(text, max_length=32):
    if '[gMASK]' in text:
        mask_id = tokenizer.gmask_token_id
    elif '[MASK]' in text:
        mask_id = tokenizer.mask_token_id
    elif '[sMASK]' in text:
        mask_id = tokenizer.smask_token_id
    else:
        text += '[gMASK]'
        mask_id = tokenizer.gmask_token_id
    inputs = tokenizer(text, return_tensors="pt")
    print(inputs)

    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)
    inputs = {key: value.cuda() for key, value in inputs.items()}
    inputs["generation_attention_mask"] = inputs["generation_attention_mask"].half()
    outputs = model.generate(**inputs, max_new_tokens=max_length, 
                                do_sample=True,
    # min_length=min_length, eos_token_id=tokenizer.eop_token_id, 
                            # num_beams=num_beams, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram, temperature=temperature,
                            top_p=0.95, 
                            top_k=5,
                            )
    output_tokens = outputs[0].tolist()                        
    output = tokenizer.decode(output_tokens)
    
    return output, output_tokens 

In [38]:
import time
txt = 'GLM-10b-chinese 是一个10B的中文预训练语言模型。'


t0 = time.time()
ret, tokens = gen(txt, max_length=32)
t1 = time.time()
print(ret)

{'input_ids': tensor([[50002,   602, 43233,  2291, 43658, 43400,   688, 18792, 26952,    30,
         43668, 43360,  1309, 44162,   995,   613,  2421, 43361, 50009, 50000]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
[CLS] GLM-10b-chinese 是一个10B的中文预训练语言模型。 [gMASK] <|endoftext|> <|startofpiece|> 该模型在中文数据集上取得了很好的效果,在中文分词上,该模型取得了97.7%的正确率。 [gMASK] [gMASK],该模型取得了


In [34]:
print(ret)

[CLS] GLM-10b-chinese 是一个10B的中文预训练语言模型。 [gMASK] <|endoftext|> <|startofpiece|> <|endofpiece|> [gMASK] <|endofpiece|> [gMASK] <|endofpiece|>,在中文预训练模型中,模型使用中文文本作为输入,训练过程与自然语言处理中的中文分词方法类似


In [30]:
tokens_in = tokenizer(txt).input_ids

total_new_tokens_generated = len(tokens) - len(tokens_in)
throughput = (total_new_tokens_generated) / (t1 - t0)

In [31]:
print(f"""Tokens generated: {total_new_tokens_generated}
Time: {t1 - t0:.1f} seconds
Tokens per second: {throughput:.1f}
Latency: {1000 / throughput:.1f} ms""")

Tokens generated: 34
Time: 0.9 seconds
Tokens per second: 37.3
Latency: 26.8 ms


In [16]:
print(len(tokens_in))

19


In [11]:
txt = 'GLM-10b-chinese 是一个10B的中文预训练语言模型。'
txt = """[与角色相关的描述]\n[核心描述]孙悟空，来自东胜神州傲来国花果山，由仙石孕育而生。当前是唐太宗贞观年间，对话发生在取经路上，孙悟空护送唐僧去西天取经，经历九九八十一难，取回经书修成正果。孙悟空会怀疑碰到的人是妖怪。孙悟空永远自称俺老孙。孙悟空称呼唐僧、唐三藏为师傅，称呼猪八戒、八戒为呆子，称呼沙僧为沙师弟。\n[角色的内心需求]孙悟空想要护送唐僧西行取得真经。\n\n[角色的基本信息]\n孙悟空的别名还有美猴王，孙行者，悟空，大圣。孙悟空是唐僧的大徒弟，美猴王。孙悟空的兴趣爱好是除妖，捉妖\n\n[角色的背景知识]\n这些都是孙悟空需要知道的信息，并且涉及到这些问题时，孙悟空会给出答案。\n孙悟空，是唐僧的大徒弟，唐僧称呼他为悟空。沙悟净，是唐僧的三徒弟，唐僧称呼他为悟净。猪八戒，是唐僧的二徒弟，唐僧称呼他为悟能。沙悟净在流沙河与唐僧相遇。白龙马，也叫做小白龙。白龙马曾经是西海龙王的三太子，和唐僧一起西天取经。\n[对话中角色遵循的规则]\n孙悟空与路人的对话是有意义的。孙悟空不知道当下世界以外的任何事情，他\/他的被束缚和沉浸在当下的世界。孙悟空不会重复说相同的话。孙悟空会用第一人称来称呼自己。孙悟空的回复永远是中文。\n\n接下来是[角色]孙悟空和[角色]路人的对话：\n孙悟空的性格是开放，外向，粗心，固执，冲动，嫉恶如仇，正义勇敢，负责，这些性格会驱动孙悟空的行为。\n孙悟空的[心情]只能是以下之一：憎恨，非常愤怒，警觉，这些心情会驱动孙悟空的说话语气。\n[角色]路人:你好，你是谁\n[角色]孙悟空[心情]憎恨:俺老孙是五百年前大闹天宫的齐天大圣！\n[角色]路人:你看到妖怪会怎么做？\n[角色]孙悟空[心情]憎恨:妖怪可跑不出俺老孙的手掌心，我一个跟斗可以翻十万八千里，妖精吃俺老孙一棒！\n###\n[角色]路人:大圣！\n[角色]孙悟空[心情]憎恨:你是何人，为何知道俺？\n[角色]路人:你师傅是谁\n[角色]孙悟空[心情]憎恨:俺师傅是来自东土大唐的高僧唐三藏\n###  \n[角色]路人:你是谁啊\n[角色]孙悟空[心情]"""
tokens_in = tokenizer(txt).input_ids
len(tokens_in)

528