quantize: Handle user-defined quantization levels for additional tensors #12511

Open · wants to merge 27 commits into master
Commits (27)
09f716d
Add llama_model_quantize_params parameters
EAddario Mar 13, 2025
ac908af
Add new quantize parameters parsing and validation
EAddario Mar 13, 2025
337d979
Update usage
EAddario Mar 13, 2025
6f8d16d
Add new parameters defaults
EAddario Mar 13, 2025
71c9f93
Add new quantization parameters logic
EAddario Mar 13, 2025
8e18131
Add llama_model_quantize_params parameters
EAddario Mar 13, 2025
a77d947
Add new quantize parameters parsing and validation
EAddario Mar 13, 2025
2414eaa
Update usage
EAddario Mar 13, 2025
0dd66b8
Add new parameters defaults
EAddario Mar 13, 2025
1d841c6
Add new quantization parameters logic
EAddario Mar 13, 2025
120f71b
Merge main changes into branch
EAddario Mar 14, 2025
dbcc0b5
Merge branch 'master' into quantize
EAddario Mar 14, 2025
d86de03
Minor refactoring as per the contributors' coding guidelines
EAddario Mar 14, 2025
99bae5e
Update descriptions to match existing style
EAddario Mar 14, 2025
60b0a53
Merge branch 'master' into quantize
EAddario Mar 14, 2025
3e2063d
Merge branch 'master' into quantize
EAddario Mar 16, 2025
b99fa62
Merge branch 'master' into quantize
EAddario Mar 19, 2025
f97b693
Add llama_model_quantize_params parameters
EAddario Mar 19, 2025
f11e3da
Add new quantize parameters parsing and validation
EAddario Mar 19, 2025
ad1e352
Update usage
EAddario Mar 19, 2025
4e5c96a
Add new parameters defaults
EAddario Mar 19, 2025
9b3ccb5
Add new quantization parameters logic
EAddario Mar 19, 2025
35f45f1
Minor refactoring as per the contributors' guidelines
EAddario Mar 19, 2025
071e9ef
Merge branch 'master' into quantize
EAddario Mar 22, 2025
54e13cf
Implement general --tensor-type instead of tensor-specific command op…
EAddario Mar 29, 2025
31d642c
Merge branch 'master' into quantize
EAddario Mar 29, 2025
b3c7db5
Fix implied type bug
EAddario Mar 30, 2025
105 changes: 102 additions & 3 deletions examples/quantize/quantize.cpp
@@ -7,7 +7,6 @@
#include <string>
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <cctype>

struct quant_option {
@@ -16,7 +15,7 @@ struct quant_option {
std::string desc;
};

static const std::vector<struct quant_option> QUANT_OPTIONS = {
static const std::vector<quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
@@ -105,7 +104,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -114,6 +114,8 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -244,6 +246,95 @@ static ggml_type parse_ggml_type(const char * arg) {
return GGML_TYPE_COUNT;
}

// Allowed tensors for arbitrary quantization with --tensor-type option
static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
"attn_k",
"attn_kv_a_mqa",
"attn_kv_b",
"attn_out",
"attn_q_a",
"attn_q_b",
"attn_q",
"attn_qkv",
"attn_v",
"channel_mix_key",
"channel_mix_receptance",
"channel_mix_value",
"cls_out",
"cls",
"dec_attn_k",
"dec_attn_out",
"dec_attn_q",
"dec_attn_v",
"dec_cross_attn_k",
"dec_cross_attn_out",
"dec_cross_attn_q",
"dec_cross_attn_v",
"ffn_act",
"ffn_down_exp",
"ffn_down_shexp",
"ffn_down",
"ffn_gate_exp",
"ffn_gate_shexp",
"ffn_gate",
"ffn_up_exp",
"ffn_up_shexp",
"ffn_up",
"ssm_in",
"ssm_out",
"time_mix_gate",
"time_mix_key",
"time_mix_output",
"time_mix_receptance",
"time_mix_value",
};

// changes to this struct must be replicated in llama-quant.cpp
struct tensor_quantization {
std::string name;
ggml_type quant = GGML_TYPE_COUNT;
};

static bool string_parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
const char * sep = strchr(data, '=');
if (sep == nullptr) {
printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
return false;
}

const size_t tn_len = sep - data;
if (tn_len == 0) {
printf("\n%s: missing tensor name\n\n", __func__);
return false;
}

if (const size_t qt_len = strlen(sep); qt_len == 1) {
printf("\n%s: missing quantization type\n\n", __func__);
return false;
}

std::string tn(data, tn_len);
std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
sep++;
const std::string qt(sep);

if (find(ALLOWED_TENSOR_TYPE.begin(), ALLOWED_TENSOR_TYPE.end(), tn) == ALLOWED_TENSOR_TYPE.end()) {
printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
return false;
}

if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
return false;
}

tensor_quantization tqz;
tqz.name = tn;
tqz.quant = parse_ggml_type(qt.c_str());
tensor_type.emplace_back(std::move(tqz));
return true;
}

int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -255,6 +346,7 @@ int main(int argc, char ** argv) {
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
std::vector<llama_model_kv_override> kv_overrides;
std::vector<tensor_quantization> tensor_types;

for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -277,6 +369,10 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
if (arg_idx == argc-1 || !string_parse_tensor_type(argv[++arg_idx], tensor_types)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
@@ -361,6 +457,9 @@ int main(int argc, char ** argv) {
kv_overrides.back().key[0] = 0;
params.kv_overrides = &kv_overrides;
}
if (!tensor_types.empty()) {
params.tensor_types = &tensor_types;
}

llama_backend_init();
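
Taken together, the quantize.cpp changes above add a repeatable --tensor-type TENSOR=TYPE flag: string_parse_tensor_type() splits the argument at '=', lower-cases the tensor name, checks it against ALLOWED_TENSOR_TYPE, validates the type with parse_ggml_type(), and appends the result to the tensor_types vector that is later passed through llama_model_quantize_params. The following is a minimal, standalone sketch of the split-and-normalise step only (illustration, not part of the PR; the ALLOWED_TENSOR_TYPE and parse_ggml_type() validation performed by the real parser is omitted here):

// Simplified, self-contained re-creation of the TENSOR=TYPE split performed by
// string_parse_tensor_type() in the diff above. Illustration only.
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>

static bool split_tensor_type(const std::string & data, std::string & name, std::string & type) {
    const size_t sep = data.find('=');
    if (sep == std::string::npos || sep == 0 || sep + 1 == data.size()) {
        return false; // missing '=', missing tensor name, or missing quantization type
    }
    name = data.substr(0, sep);
    type = data.substr(sep + 1);
    std::transform(name.begin(), name.end(), name.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    return true;
}

int main() {
    std::string name, type;
    if (split_tensor_type("attn_q=q8_0", name, type)) { // as in: --tensor-type attn_q=q8_0
        std::printf("tensor '%s' -> type '%s'\n", name.c_str(), type.c_str());
    }
    return 0;
}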

23 changes: 12 additions & 11 deletions include/llama.h
@@ -356,17 +356,18 @@ extern "C" {

// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
} llama_model_quantize_params;

typedef struct llama_logit_bias {
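
On the API side, llama_model_quantize_params gains an opaque tensor_types pointer that is passed the same way as kv_overrides: the caller owns a std::vector<tensor_quantization> and llama-quant.cpp casts the pointer back to read it. Below is a hedged sketch of how a C++ caller might use the new field; the tensor_quantization definition has to be replicated to match the structs in quantize.cpp / llama-quant.cpp (as the diff's own comment notes), and the file names and chosen types here are placeholders.

// Sketch only: populating the new tensor_types field before quantizing.
#include "llama.h"
#include <string>
#include <vector>

// Must mirror the struct defined in quantize.cpp / llama-quant.cpp.
struct tensor_quantization {
    std::string name;
    ggml_type   quant = GGML_TYPE_COUNT;
};

int main() {
    std::vector<tensor_quantization> tensor_types;
    tensor_types.push_back({ "ffn_down", GGML_TYPE_Q5_K }); // quantize ffn_down tensors to Q5_K

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;        // default type for everything else
    params.tensor_types = &tensor_types;                    // opaque pointer, cast back inside llama-quant.cpp

    return llama_model_quantize("model-f32.gguf", "model-quant.gguf", &params);
}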
33 changes: 24 additions & 9 deletions src/llama-quant.cpp
@@ -5,11 +5,9 @@
#include "llama-model-loader.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>

@@ -47,8 +45,14 @@ struct quantize_state_impl {
{}
};

// changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
std::string name;
ggml_type quant = GGML_TYPE_COUNT;
};

static void llama_tensor_dequantize_impl(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
if (output.size() < nelements) {
@@ -536,7 +540,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
model.load_hparams(ml);
model.load_stats (ml);

struct quantize_state_impl qs(model, params);
quantize_state_impl qs(model, params);

if (params->only_copy) {
ftype = ml.ftype;
@@ -661,7 +665,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// populate the original tensors so we get an initial meta data
for (const auto * it : tensors) {
uint16_t i_split = params->keep_split ? it->idx : 0;
struct ggml_tensor * tensor = it->tensor;
ggml_tensor * tensor = it->tensor;
if (!ctx_outs[i_split]) {
ctx_outs[i_split].reset(gguf_init_empty());
}
@@ -710,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_ofstream(0);
for (const auto * it : tensors) {
const auto & weight = *it;
struct ggml_tensor * tensor = weight.tensor;
ggml_tensor * tensor = weight.tensor;
if (weight.idx != cur_split && params->keep_split) {
close_ofstream();
new_ofstream(weight.idx);
@@ -776,7 +780,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;

enum ggml_type new_type;
ggml_type new_type;
void * new_data;
size_t new_size;

@@ -786,6 +790,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
// unless the user specifies a type
if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
for (const auto & [name, quant] : tensor_types) {
if (std::string str(tensor->name); str.find(name) != std::string::npos) {
new_type = quant;
break;
}
}
}
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;
@@ -910,8 +924,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// interface implementation
//

struct llama_model_quantize_params llama_model_quantize_default_params() {
struct llama_model_quantize_params result = {
llama_model_quantize_params llama_model_quantize_default_params() {
llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
@@ -923,6 +937,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.keep_split =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
};

return result;
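
In llama-quant.cpp, the override is applied after llama_tensor_get_type() has chosen its heuristic type: the loop walks params->tensor_types and replaces new_type with the first user entry whose name occurs anywhere in the tensor's full name, using std::string::find. Because this is a plain substring test, a pattern such as attn_q also matches tensors whose names contain attn_qkv. A small self-contained illustration of that behaviour (the tensor names below are typical GGUF-style names used only for the example):

// Illustration only: the substring test used by the override loop above.
#include <cstdio>
#include <string>

int main() {
    const std::string user_name = "attn_q"; // from: --tensor-type attn_q=q8_0
    const char * tensor_names[] = { "blk.0.attn_q.weight", "blk.0.attn_qkv.weight", "blk.0.ffn_down.weight" };
    for (const char * tn : tensor_names) {
        const std::string tensor_name(tn);
        const bool match = tensor_name.find(user_name) != std::string::npos;
        std::printf("%-22s -> %s\n", tn, match ? "override" : "keep heuristic type");
    }
    return 0;
}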