In [1]:
import sys, os
sys.path.append('./glm_10b_chinese')

import torch
import transformers

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import deepspeed

from modeling_glm import GLMForConditionalGeneration
from modeling_glm import GLMBlock

In [3]:

tokenizer = AutoTokenizer.from_pretrained("BAAI/glm-10b-chinese", trust_remote_code=True)
# model = AutoModelForSeq2SeqLM.from_pretrained("BAAI/glm-10b", trust_remote_code=True)
# model.load_state_dict(torch.load('./blocklm-10b-chinese/mp_rank_00_model_states.pt'),strict=False)
# model = GLMForConditionalGeneration.from_pretrained('glm_10b_chinese')
model = GLMForConditionalGeneration.from_pretrained('models/character_20230102_1055')
model = model.half().cuda()

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [4]:
# # Initialize the DeepSpeed-Inference engine
# ds_engine = deepspeed.init_inference(model,
#                                  mp_size=1,
#                                  dtype=torch.half,
#                                  checkpoint=None,
#                                  replace_method='auto',
#                                  #eplace_with_kernel_inject=True
#                                  injection_policy={GLMBlock: ('SelfAttention.o',)}
#                                  )
# model = ds_engine.module


In [5]:
def gen(text, max_length=500):
    if '[gMASK]' in text:
        mask_id = tokenizer.gmask_token_id
    elif '[MASK]' in text:
        mask_id = tokenizer.mask_token_id
    elif '[sMASK]' in text:
        mask_id = tokenizer.smask_token_id
    else:
        text += '[gMASK]'
        mask_id = tokenizer.gmask_token_id
    inputs = tokenizer(text, return_tensors="pt")
    print(inputs)

    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=512)
    inputs = {key: value.cuda() for key, value in inputs.items()}
    inputs["generation_attention_mask"] = inputs["generation_attention_mask"].half()
    outputs = model.generate(**inputs, max_new_tokens=max_length, 
    # min_length=min_length, eos_token_id=tokenizer.eop_token_id, 
                            # num_beams=num_beams, length_penalty=length_penalty, no_repeat_ngram_size=no_repeat_ngram, temperature=temperature,
                            # top_p=top_p, top_k=top_k
                            )
    output_tokens = outputs[0].tolist()                        
    output = tokenizer.decode(output_tokens)
    
    return output, output_tokens 

In [6]:
import time
txt = 'GLM-10b-chinese 是一个10B的中文预训练语言模型。'


t0 = time.time()
ret, tokens = gen(txt, max_length=16)
t1 = time.time()

{'input_ids': tensor([[50002,   602, 43233,  2291, 43658, 43400,   688, 18792, 26952,    30,
         43668, 43360,  1309, 44162,   995,   613,  2421, 43361, 50009, 50000]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
prepare...
is sep:  False
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states size:  torch.Size([1, 21, 4096])
hidden states 

In [12]:
print(ret)

[CLS] GLM-10b-chinese 是一个10B的中文预训练语言模型。 [gMASK] <|endoftext|> <|startofpiece|> 它使用一个预训练好的中文句子,通过一个简单的学习过程,来


In [8]:
tokens_in = tokenizer(txt)

total_new_tokens_generated = len(tokens) - len(tokens_in)
throughput = (total_new_tokens_generated) / (t1 - t0)

In [9]:
print(f"""Tokens generated: {total_new_tokens_generated}
Time: {t1 - t0:.1f} seconds
Tokens per second: {throughput:.1f}
Latency: {1000 / throughput:.1f} ms""")

Tokens generated: 34
Time: 1.1 seconds
Tokens per second: 30.8
Latency: 32.5 ms


In [10]:
a = torch.empty((1,0,3))
b = torch.ones((1,2,3))
a.size(), b.size()

(torch.Size([1, 0, 3]), torch.Size([1, 2, 3]))

In [11]:
torch.cat([a,b], dim=1).size()

torch.Size([1, 2, 3])