Commit

refactor tokenizer.
wangzhaode committed May 13, 2024
1 parent 72b7209 commit eb8e94e
Showing 6 changed files with 346 additions and 518 deletions.
2 changes: 0 additions & 2 deletions demo/cli_demo.cpp
@@ -25,14 +25,12 @@ void benchmark(Llm* llm, std::string prompt_file) {
int decode_len = 0;
int64_t prefill_time = 0;
int64_t decode_time = 0;
llm->warmup();
for (int i = 0; i < prompts.size(); i++) {
llm->response(prompts[i]);
prompt_len += llm->prompt_len_;
decode_len += llm->gen_seq_len_;
prefill_time += llm->prefill_us_;
decode_time += llm->decode_us_;
llm->reset();
}
float prefill_s = prefill_time / 1e6;
float decode_s = decode_time / 1e6;
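The hunk above is cut off right after the microsecond-to-second conversion. For context, here is a minimal sketch of how the remainder of benchmark() presumably reports throughput from these counters; the variable names come from the hunk, but the print format and zero guards are assumptions.

    // Hypothetical continuation of benchmark(); only prompt_len, decode_len,
    // prefill_s and decode_s are taken from the hunk above.
    printf("prompt tokens: %d, prefill time: %.2f s, speed: %.2f tok/s\n",
           prompt_len, prefill_s, prefill_s > 0 ? prompt_len / prefill_s : 0.f);
    printf("decode tokens: %d, decode time: %.2f s, speed: %.2f tok/s\n",
           decode_len, decode_s, decode_s > 0 ? decode_len / decode_s : 0.f);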
8 changes: 4 additions & 4 deletions demo/tokenizer_demo.cpp
@@ -13,20 +13,20 @@ int main(int argc, const char* argv[]) {
return 0;
}
std::string tokenizer_path = argv[1];
std::unique_ptr<Tokenizer> tokenizer_(new Tiktoken);
tokenizer_->load(tokenizer_path);
std::unique_ptr<Tokenizer> tokenizer(Tokenizer::createTokenizer(tokenizer_path));
const std::string system_str = "You are a helpful assistant.";
const std::string user_str = "Hello";
// const std::string query = "\n<|im_start|>system\n" + system_str + "<|im_end|>\n<|im_start|>\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n";
const std::string query = "\n<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n";
// const std::string query = "<|im_start|>user\n" + user_str + "<|im_end|>\n<|im_start|>assistant\n";
// const std::string query = system_str + "\n" + user_str;
auto tokens = tokenizer_->encode(query);
auto tokens = tokenizer->encode(query);

std::string decode_str;
printf("encode tokens = [ ");
for (auto token : tokens) {
printf("%d, ", token);
decode_str += tokenizer_->decode(token);
decode_str += tokenizer->decode(token);
}
printf("]\n");
printf("decode str = %s\n", decode_str.c_str());
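The demo no longer hard-codes a Tiktoken instance; it asks the Tokenizer base class to pick the right implementation from the tokenizer file. A plausible shape for such a factory is sketched below: the type-tag dispatch and the exact tag values are assumptions, while the subclass names (Sentencepiece, Tiktoken, HuggingfaceTokenizer) all appear elsewhere in this commit.

    #include <fstream>
    #include <string>

    // Hypothetical sketch of the new factory. The real selection logic is not
    // shown in this diff; a leading type tag in tokenizer.txt is an assumption.
    Tokenizer* Tokenizer::createTokenizer(const std::string& filename) {
        std::ifstream file(filename);
        int type = -1;
        file >> type;                      // assumed: type tag on the first line
        Tokenizer* tokenizer = nullptr;
        switch (type) {
            case 0:  tokenizer = new Sentencepiece(); break;
            case 1:  tokenizer = new Tiktoken(); break;
            case 2:  tokenizer = new HuggingfaceTokenizer(); break;
            default: tokenizer = new Tiktoken(); break;
        }
        tokenizer->load(filename);         // assumed: loading stays inside the factory
        return tokenizer;
    }

Whatever the real dispatch is, callers now see only the base-class interface, which is what lets the per-model subclasses in llm.hpp drop their tokenizer-specific setup.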
212 changes: 47 additions & 165 deletions include/llm.hpp
@@ -81,10 +81,6 @@ class LlmConfig {
return config_.value("model_type", "unknow");
}

std::string tokenizer_type() const {
return config_.value("tokenizer_type", "tiktoken");
}

std::string llm_model() const {
return base_dir_ + config_.value("llm_model", "llm.mnn");
}
@@ -93,6 +89,18 @@
return base_dir_ + config_.value("llm_weight", "llm.mnn.weight");
}

std::string block_model(int index) const {
return base_dir_ + config_.value("block_model", "block_") + std::to_string(index) + ".mnn";
}

std::string lm_model() const {
return base_dir_ + config_.value("lm_model", "lm.mnn");
}

std::string embedding_model() const {
return base_dir_ + config_.value("embedding_model", "embedding.mnn");
}

std::string embedding_file() const {
return base_dir_ + config_.value("embedding_file", "embeddings_bf16.bin");
}
@@ -101,16 +109,28 @@
return base_dir_ + config_.value("tokenizer_file", "tokenizer.txt");
}

bool is_single() const {
return config_.value("is_single", true);
}

int max_new_tokens() const {
return config_.value("max_new_tokens", 512);
}

int hidden_size() const {
return config_.value("hidden_size", 4096);
}

int layer_nums() const {
return config_.value("layer_nums", 32);
}

std::vector<int> key_value_shape() const {
return config_.value("key_value_shape", std::vector<int>{});
}

std::vector<int> stop_ids() const {
return config_.value("stop_ids", std::vector<int>{});
std::string attention_mask() const {
return config_.value("attention_mask", "int");
}

std::string prompt_template() const {
@@ -139,10 +159,7 @@ class LlmConfig {

class Llm {
public:
Llm() {
// default tokenizer is sentencepiece
tokenizer_.reset(new Sentencepiece);
}
Llm(std::shared_ptr<LlmConfig> config) : config_(config) {}
virtual ~Llm() {
modules_.clear();
visual_module_.reset();
@@ -151,68 +168,53 @@ class Llm {
static Llm* createLLM(const std::string& path, std::string model_type = "auto");
void load();
void chat();
void warmup();
int forward(const std::vector<int>& input_ids);
std::string apply_chat_template(const std::string& input_str) const;
std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
void generate_init();
std::string generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
std::vector<int> generate(const std::vector<int>& input_ids);
int forward(const std::vector<int>& input_ids);
float load_progress() { return load_progress_; }
void reset();
std::vector<int> generate(const std::vector<int>& input_ids, int max_new_tokens = -1);
void print_speed();
friend class Pipeline;
public:
std::vector<int> history_;
// TODO
std::string model_name_ = "";
bool is_single_ = true;
bool is_disk_embedding_ = true;
bool is_visual_ = false;
int layer_nums_ = 0;
int hidden_size_ = 4096;
// config
int max_new_tokens_ = 1024;
int backend_type_ = 0;
int thread_num_ = 4;
bool low_precision_ = true;
bool chatml_ = true;
// forward info
int prompt_len_ = 0;
int gen_seq_len_ = 0;
int all_seq_len_ = 0;
// time
int64_t prefill_us_ = 0;
int64_t decode_us_ = 0;
LlmConfig config_;
std::shared_ptr<LlmConfig> config_;
std::unique_ptr<Tokenizer> tokenizer_;
protected:
VARP embedding(const std::vector<int>& input_ids);
VARP txt_embedding(const std::vector<int>& input_ids);
std::vector<int> tokenizer_encode(const std::string& input_str);
std::string decode(int id);
protected:
VARP inputs_embeds_, attention_mask_, position_ids_;
// model configs
bool is_single_ = true;
bool is_disk_embedding_ = true;
bool is_visual_ = false;
int layer_nums_ = 0;
int hidden_size_ = 4096;
std::vector<int> key_value_shape_ = {};
// gen info
float load_progress_ = 0.f;
// tokenizer
std::unique_ptr<Tokenizer> tokenizer_;
VARP inputs_embeds_, attention_mask_, position_ids_;
std::shared_ptr<Module> visual_module_;
std::shared_ptr<Executor::RuntimeManager> runtime_manager_;
std::vector<std::shared_ptr<Module>> modules_;
std::vector<VARP> past_key_values_;
private:
virtual VARP visual_embedding(const std::vector<int>& input_ids) { return nullptr; }
virtual std::vector<int> tokenizer(const std::string& query);
virtual VARP gen_attention_mask(int seq_len);
virtual VARP gen_position_ids(int seq_len);
virtual bool is_stop(int token_id);
private:
// MNN Modules
std::shared_ptr<Executor::RuntimeManager> runtime_manager_;
std::vector<std::shared_ptr<Module>> modules_;
std::vector<VARP> past_key_values_;
// model dir
std::string model_dir_;
};

#if 0
// some llm models
class Chatglm_6b : public Llm {
public:
@@ -222,27 +224,12 @@ class Chatglm_6b : public Llm {
key_value_shape_ = {2, 0, 1, 32, 128};
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
virtual VARP gen_position_ids(int seq_len) override;
virtual bool is_stop(int token_id) override;
int context_len_ = 0;
};

class Chatglm2_6b : public Llm {
public:
Chatglm2_6b() {
model_name_ = "Chatglm2_6b";
layer_nums_ = 28;
key_value_shape_ = {2, 0, 1, 2, 128};
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
virtual VARP gen_position_ids(int seq_len) override;
virtual bool is_stop(int token_id) override;
};

/*
class Phi_2 : public Chatglm2_6b {
public:
Phi_2() {
@@ -256,24 +243,9 @@ class Phi_2 : public Chatglm2_6b {
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual bool is_stop(int token_id) override;
};
*/

class Qwen_7b : public Llm {
public:
Qwen_7b() {
model_name_ = "Qwen_7b";
layer_nums_ = 32;
key_value_shape_ = {2, 1, 0, 32, 128};
hidden_size_ = 4096;
tokenizer_.reset(new Tiktoken);
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
virtual VARP gen_position_ids(int seq_len) override;
virtual bool is_stop(int token_id) override;
};

class Qwen_vl : public Qwen_7b {
class Qwen_vl : public Llm {
public:
Qwen_vl() {
model_name_ = "Qwen_vl";
@@ -292,21 +264,9 @@ class Qwen_vl : public Qwen_7b {
private:
std::vector<int> url_encode(const std::string& url);
virtual VARP visual_embedding(const std::vector<int>& input_ids) override;
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
};

class Qwen_1_8b : public Qwen_7b {
public:
Qwen_1_8b() {
model_name_ = "Qwen_1.8b";
layer_nums_ = 24;
key_value_shape_ = {2, 1, 0, 16, 128};
hidden_size_ = 2048;
tokenizer_.reset(new Tiktoken);
}
};

class Llama2_7b : public Llm {
public:
Llama2_7b() {
@@ -315,97 +275,21 @@ class Llama2_7b : public Llm {
key_value_shape_ = {2, 1, 32, 0, 128};
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
virtual VARP gen_position_ids(int seq_len) override;
virtual bool is_stop(int token_id) override;
};

class Qwen2 : public Llama2_7b {
public:
Qwen2() {
model_name_ = "Qwen2";
tokenizer_.reset(new HuggingfaceTokenizer);
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual bool is_stop(int token_id) override;
};

class Qwen2_0_5b : public Qwen2 {
public:
Qwen2_0_5b() {
model_name_ = "Qwen2_0.5b";
layer_nums_ = 24;
key_value_shape_ = {2, 1, 16, 0, 64};
hidden_size_ = 1024;
}
};

class Qwen2_1_8b : public Qwen2 {
public:
Qwen2_1_8b() {
model_name_ = "Qwen2_1.8b";
layer_nums_ = 24;
key_value_shape_ = {2, 1, 16, 0, 128};
hidden_size_ = 2048;
}
};

class Qwen2_4b : public Qwen2 {
public:
Qwen2_4b() {
model_name_ = "Qwen2_4b";
layer_nums_ = 40;
key_value_shape_ = {2, 1, 20, 0, 128};
hidden_size_ = 2560;
}
};

class Qwen2_7b : public Qwen2 {
public:
Qwen2_7b() {
model_name_ = "Qwen2_7b";
layer_nums_ = 32;
key_value_shape_ = {2, 1, 32, 0, 128};
hidden_size_ = 4096;
}
};

class TinyLlama : public Llama2_7b {
public:
TinyLlama() {
model_name_ = "TinyLlama";
layer_nums_ = 22;
key_value_shape_ = {2, 1, 4, 0, 64};
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
};

class Yi_6b : public Llama2_7b {
public:
Yi_6b() {
model_name_ = "Yi_6b";
key_value_shape_ = {2, 1, 4, 0, 128};
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual bool is_stop(int token_id) override;
};

class Llama3_8b : public Llama2_7b {
public:
Llama3_8b() {
model_name_ = "Llama3_8b";
layer_nums_ = 32;
key_value_shape_ = {2, 1, 8, 0, 128};
hidden_size_ = 4096;
}
private:
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual bool is_stop(int token_id) override;
};
#endif
// Llm end

// Embedding start
@@ -429,8 +313,6 @@ class Embedding {
// time
int64_t embedding_us_ = 0;
int prompt_len_ = 0;
protected:
std::vector<int> tokenizer_encode(const std::string& input_str);
protected:
// model configs
int layer_nums_ = 0;
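With the per-model subclasses fenced off behind #if 0, all model metadata now flows through LlmConfig. A minimal usage sketch of the refactored interface follows; the config.json path convention and the model directory name are assumptions inferred from the new LlmConfig getters, while createLLM, load, response, prefill_us_ and decode_us_ are all declared in the header above.

    #include <iostream>
    #include <memory>
    #include "llm.hpp"

    int main() {
        // Assumed: the path passed to createLLM points at the model directory's
        // config.json, which LlmConfig reads for llm_model, tokenizer_file, etc.
        std::unique_ptr<Llm> llm(Llm::createLLM("./Qwen-1_8B-Chat-MNN/config.json"));
        llm->load();

        // response() streams to std::cout by default and returns the full answer.
        std::string answer = llm->response("Hello");
        std::cout << answer << std::endl;

        // prefill_us_ / decode_us_ are the public timing counters from the header.
        std::cout << "prefill: " << llm->prefill_us_ / 1e6 << " s, "
                  << "decode: "  << llm->decode_us_  / 1e6 << " s" << std::endl;
        return 0;
    }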
