Skip to content

Commit

Permalink
refactor tokenizer.
Browse files (browse the repository at this point in the history)
  • Loading branch information
wangzhaode committed May 27, 2024
1 parent 72b7209 commit 54b37b2
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 524 deletions.
2 changes: 0 additions & 2 deletions demo/cli_demo.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,14 +25,12 @@ void benchmark(Llm* llm, std::string prompt_file) {
int decode_len = 0;
int64_t prefill_time = 0;
int64_t decode_time = 0;
llm->warmup();
for (int i = 0; i < prompts.size(); i++) {
llm->response(prompts[i]);
prompt_len += llm->prompt_len_;
decode_len += llm->gen_seq_len_;
prefill_time += llm->prefill_us_;
decode_time += llm->decode_us_;
llm->reset();
}
float prefill_s = prefill_time / 1e6;
float decode_s = decode_time / 1e6;
Expand Down
8 changes: 4 additions & 4 deletions demo/tokenizer_demo.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,20 +13,20 @@ int main(int argc, const char* argv[]) {
return 0;
}
std::string tokenizer_path = argv[1];
std::unique_ptr<Tokenizer> tokenizer_(new Tiktoken);
tokenizer_->load(tokenizer_path);
std::unique_ptr<Tokenizer> tokenizer(Tokenizer::createTokenizer(tokenizer_path));
const std::string system_str = "Youare a helpful assistant.";
const std::string user_str = "Hello";
// const std::string query = "\n<|im_start|>system\n" + system_str + "<|im_end|>\n<|im_start|>\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n";
const std::string query = "\n<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n";
// const std::string query = "<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n";
// const std::string query = system_str + "\n" + user_str;
auto tokens = tokenizer_->encode(query);
auto tokens = tokenizer->encode(query);

std::string decode_str;
printf("encode tokens = [ ");
for (auto token : tokens) {
printf("%d, ", token);
decode_str += tokenizer_->decode(token);
decode_str += tokenizer->decode(token);
}
printf("]\n");
printf("decode str = %s\n", decode_str.c_str());
Expand Down
Loading

0 comments on commit 54b37b2

Please sign in to comment.