diff --git a/examples/reranker/src/main.rs b/examples/reranker/src/main.rs
index 73b0fb30..73918adb 100644
--- a/examples/reranker/src/main.rs
+++ b/examples/reranker/src/main.rs
@@ -12,7 +12,6 @@
 use std::time::Duration;
 
 use anyhow::{bail, Context, Result};
 use clap::Parser;
-use hf_hub::api::sync::ApiBuilder;
 use llama_cpp_2::context::params::{LlamaContextParams, LlamaPoolingType};
 use llama_cpp_2::context::LlamaContext;
@@ -92,13 +91,11 @@ fn main() -> Result<()> {
         .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?)
         .with_embeddings(true)
         .with_pooling_type(pooling_type);
-    println!("ctx_params: {:?}", ctx_params);
+    println!("ctx_params: {ctx_params:?}");
     let mut ctx = model
         .new_context(&backend, ctx_params)
         .with_context(|| "unable to create the llama_context")?;
 
-    let n_embd = model.n_embd();
-
     let prompt_lines = {
         let mut lines = Vec::new();
         for doc in documents {
@@ -108,13 +105,13 @@ fn main() -> Result<()> {
         lines
     };
-    println!("prompt_lines: {:?}", prompt_lines);
+    println!("prompt_lines: {prompt_lines:?}");
 
     // tokenize the prompt
     let tokens_lines_list = prompt_lines
         .iter()
         .map(|line| model.str_to_token(line, AddBos::Always))
         .collect::<Result<Vec<_>, _>>()
-        .with_context(|| format!("failed to tokenize {:?}", prompt_lines))?;
+        .with_context(|| format!("failed to tokenize {prompt_lines:?}"))?;
 
     let n_ctx = ctx.n_ctx() as usize;
     let n_ctx_train = model.n_ctx_train();
@@ -156,7 +153,6 @@ fn main() -> Result<()> {
     // } else {
     //     tokens_lines_list.len()
     // };
-    let mut embeddings_stored = 0;
     let mut max_seq_id_batch = 0;
     let mut output = Vec::with_capacity(tokens_lines_list.len());
 
@@ -169,16 +165,10 @@ fn main() -> Result<()> {
                 &mut ctx,
                 &mut batch,
                 max_seq_id_batch,
-                n_embd,
                 &mut output,
                 normalise,
-                pooling.clone(),
+                &pooling,
             )?;
-            embeddings_stored += if pooling == "none" {
-                batch.n_tokens()
-            } else {
-                max_seq_id_batch
-            };
             max_seq_id_batch = 0;
             batch.clear();
         }
@@ -191,34 +181,23 @@ fn main() -> Result<()> {
         &mut ctx,
         &mut batch,
         max_seq_id_batch,
-        n_embd,
         &mut output,
         normalise,
-        pooling.clone(),
+        &pooling,
     )?;
 
     let t_main_end = ggml_time_us();
 
     for (j, embeddings) in output.iter().enumerate() {
-        if pooling == "none" {
-            eprintln!("embedding {j}: ");
-            for i in 0..n_embd as usize {
-                if !normalise {
-                    eprint!("{:6.5} ", embeddings[i]);
-                } else {
-                    eprint!("{:9.6} ", embeddings[i]);
-                }
-            }
-            eprintln!();
-        } else if pooling == "rank" {
+        if pooling == "rank" {
             eprintln!("rerank score {j}: {:8.3}", embeddings[0]);
         } else {
             eprintln!("embedding {j}: ");
-            for i in 0..n_embd as usize {
-                if !normalise {
-                    eprint!("{:6.5} ", embeddings[i]);
+            for embedding in embeddings {
+                if normalise {
+                    eprint!("{embedding:9.6} ");
                 } else {
-                    eprint!("{:9.6} ", embeddings[i]);
+                    eprint!("{embedding:6.5} ");
                 }
             }
             eprintln!();
@@ -243,10 +222,9 @@ fn batch_decode(
     ctx: &mut LlamaContext,
     batch: &mut LlamaBatch,
     s_batch: i32,
-    n_embd: i32,
     output: &mut Vec<Vec<f32>>,
     normalise: bool,
-    pooling: String,
+    pooling: &str,
 ) -> Result<()> {
     eprintln!(
         "{}: n_tokens = {}, n_seq = {}",
@@ -266,9 +244,9 @@
             .with_context(|| "Failed to get sequence embeddings")?;
         let normalized = if normalise {
             if pooling == "rank" {
-                normalize_embeddings(&embeddings, -1)
+                normalize_embeddings(embeddings, -1)
             } else {
-                normalize_embeddings(&embeddings, 2)
+                normalize_embeddings(embeddings, 2)
             }
         } else {
             embeddings.to_vec()
@@ -291,27 +269,30 @@ fn normalize_embeddings(input: &[f32], embd_norm: i32) -> Vec<f32> {
         0 => {
             // max absolute
             let max_abs = input.iter().map(|x| x.abs()).fold(0.0f32, f32::max) / 32760.0;
-            max_abs as f64
+            f64::from(max_abs)
         }
         2 => {
             // euclidean norm
             input
                 .iter()
-                .map(|x| (*x as f64).powi(2))
+                .map(|x| f64::from(*x).powi(2))
                 .sum::<f64>()
                 .sqrt()
         }
         p => {
             // p-norm
-            let sum = input.iter().map(|x| (x.abs() as f64).powi(p)).sum::<f64>();
-            sum.powf(1.0 / p as f64)
+            let sum = input
+                .iter()
+                .map(|x| f64::from(x.abs()).powi(p))
+                .sum::<f64>();
+            sum.powf(1.0 / f64::from(p))
         }
     };
 
     let norm = if sum > 0.0 { 1.0 / sum } else { 0.0 };
 
     for i in 0..n {
-        output[i] = (input[i] as f64 * norm) as f32;
+        output[i] = (f64::from(input[i]) * norm) as f32;
     }
 
     output
diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index e7299938..13809a00 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -97,7 +97,7 @@ struct Args {
 fn parse_key_val(s: &str) -> Result<(String, ParamOverrideValue)> {
     let pos = s
         .find('=')
-        .ok_or_else(|| anyhow!("invalid KEY=value: no `=` found in `{}`", s))?;
+        .ok_or_else(|| anyhow!("invalid KEY=value: no `=` found in `{s}`"))?;
     let key = s[..pos].parse()?;
     let value: String = s[pos + 1..].parse()?;
     let value = i64::from_str(&value)
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
index 16f05e31..8b298402 100644
--- a/llama-cpp-2/src/lib.rs
+++ b/llama-cpp-2/src/lib.rs
@@ -449,6 +449,7 @@ pub struct LogOptions {
 impl LogOptions {
     /// If enabled, logs are sent to tracing. If disabled, all logs are suppressed. Default is for
     /// logs to be sent to tracing.
+    #[must_use]
     pub fn with_logs_enabled(mut self, enabled: bool) -> Self {
         self.disabled = !enabled;
         self
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index d8625b8d..d9f3e4f2 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -36,9 +36,9 @@ pub struct LlamaLoraAdapter {
     pub(crate) lora_adapter: NonNull,
 }
 
-/// A performance-friendly wrapper around [LlamaModel::chat_template] which is then
-/// fed into [LlamaModel::apply_chat_template] to convert a list of messages into an LLM
-/// prompt. Internally the template is stored as a CString to avoid round-trip conversions
+/// A performance-friendly wrapper around [`LlamaModel::chat_template`] which is then
+/// fed into [`LlamaModel::apply_chat_template`] to convert a list of messages into an LLM
+/// prompt. Internally the template is stored as a `CString` to avoid round-trip conversions
 /// within the FFI.
 #[derive(Eq, PartialEq, Clone, PartialOrd, Ord, Hash)]
 pub struct LlamaChatTemplate(CString);
@@ -55,7 +55,7 @@ impl LlamaChatTemplate {
         &self.0
     }
 
-    /// Attempts to convert the CString into a Rust str reference.
+    /// Attempts to convert the `CString` into a Rust str reference.
     pub fn to_str(&self) -> Result<&str, Utf8Error> {
         self.0.to_str()
     }
@@ -569,7 +569,7 @@ impl LlamaModel {
     /// Get chat template from model by name. If the name parameter is None, the default chat template will be returned.
     ///
-    /// You supply this into [Self::apply_chat_template] to get back a string with the appropriate template
+    /// You supply this into [`Self::apply_chat_template`] to get back a string with the appropriate template
     /// substitution applied to convert a list of messages into a prompt the LLM can use to complete
     /// the chat.
     ///
@@ -666,11 +666,11 @@ impl LlamaModel {
     /// There is many ways this can fail. See [`LlamaContextLoadError`] for more information.
     // we intentionally do not derive Copy on `LlamaContextParams` to allow llama.cpp to change the type to be non-trivially copyable.
     #[allow(clippy::needless_pass_by_value)]
-    pub fn new_context(
-        &self,
+    pub fn new_context<'a>(
+        &'a self,
         _: &LlamaBackend,
         params: LlamaContextParams,
-    ) -> Result<LlamaContext, LlamaContextLoadError> {
+    ) -> Result<LlamaContext<'a>, LlamaContextLoadError> {
         let context_params = params.context_params;
         let context = unsafe {
             llama_cpp_sys_2::llama_new_context_with_model(self.model.as_ptr(), context_params)
@@ -681,14 +681,14 @@ impl LlamaModel {
     }
 
     /// Apply the models chat template to some messages.
-    /// See https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// See <https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template>
     ///
-    /// Unlike the llama.cpp apply_chat_template which just randomly uses the ChatML template when given
+    /// Unlike the llama.cpp `apply_chat_template` which just randomly uses the ChatML template when given
     /// a null pointer for the template, this requires an explicit template to be specified. If you want to
     /// use "chatml", then just do `LlamaChatTemplate::new("chatml")` or any other model name or template
     /// string.
     ///
-    /// Use [Self::chat_template] to retrieve the template baked into the model (this is the preferred
+    /// Use [`Self::chat_template`] to retrieve the template baked into the model (this is the preferred
     /// mechanism as using the wrong chat template can result in really unexpected responses from the LLM).
     ///
     /// You probably want to set `add_ass` to true so that the generated template string ends with a the
@@ -764,7 +764,7 @@ where
     let mut buffer = vec![0u8; capacity];
 
     // call the foreign function
-    let result = c_function(buffer.as_mut_ptr() as *mut c_char, buffer.len());
+    let result = c_function(buffer.as_mut_ptr().cast::<c_char>(), buffer.len());
     if result < 0 {
         return Err(MetaValError::NegativeReturn(result));
     }
diff --git a/llama-cpp-2/src/model/params.rs b/llama-cpp-2/src/model/params.rs
index 47f4c257..e07488f8 100644
--- a/llama-cpp-2/src/model/params.rs
+++ b/llama-cpp-2/src/model/params.rs
@@ -149,7 +149,7 @@ impl LlamaModelParams {
     /// assert_eq!(count, 0);
     /// ```
     #[must_use]
-    pub fn kv_overrides(&self) -> KvOverrides {
+    pub fn kv_overrides<'a>(&'a self) -> KvOverrides<'a> {
         KvOverrides::new(self)
     }
 
@@ -235,7 +235,7 @@
         );
 
         // There should be some way to do this without iterating over everything.
-        for (_i, &c) in key.to_bytes_with_nul().iter().enumerate() {
+        for &c in key.to_bytes_with_nul().iter() {
             c_char::try_from(c).expect("invalid character in key");
         }
diff --git a/llama-cpp-2/src/model/params/kv_overrides.rs b/llama-cpp-2/src/model/params/kv_overrides.rs
index b17516a1..6a20714b 100644
--- a/llama-cpp-2/src/model/params/kv_overrides.rs
+++ b/llama-cpp-2/src/model/params/kv_overrides.rs
@@ -78,7 +78,7 @@ pub struct KvOverrides<'a> {
 }
 
 impl KvOverrides<'_> {
-    pub(super) fn new(model_params: &LlamaModelParams) -> KvOverrides {
+    pub(super) fn new<'a>(model_params: &'a LlamaModelParams) -> KvOverrides<'a> {
         KvOverrides { model_params }
     }
 }
diff --git a/llama-cpp-2/src/sampling.rs b/llama-cpp-2/src/sampling.rs
index d1275aec..10509423 100644
--- a/llama-cpp-2/src/sampling.rs
+++ b/llama-cpp-2/src/sampling.rs
@@ -385,7 +385,7 @@ impl LlamaSampler {
 
     /// Penalizes tokens for being present in the context.
     ///
-    /// Parameters:  
+    /// Parameters:
     /// - ``penalty_last_n``: last n tokens to penalize (0 = disable penalty, -1 = context size)
     /// - ``penalty_repeat``: 1.0 = disabled
     /// - ``penalty_freq``: 0.0 = disabled
@@ -415,15 +415,15 @@
     /// - ``n_vocab``: [`LlamaModel::n_vocab`]
     /// - ``seed``: Seed to initialize random generation with.
     /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the
-    /// generated text. A higher value corresponds to more surprising or less predictable text,
-    /// while a lower value corresponds to less surprising or more predictable text.
+    ///   generated text. A higher value corresponds to more surprising or less predictable text,
+    ///   while a lower value corresponds to less surprising or more predictable text.
     /// - ``eta``: The learning rate used to update `mu` based on the error between the target and
-    /// observed surprisal of the sampled word. A larger learning rate will cause `mu` to be
-    /// updated more quickly, while a smaller learning rate will result in slower updates.
+    ///   observed surprisal of the sampled word. A larger learning rate will cause `mu` to be
+    ///   updated more quickly, while a smaller learning rate will result in slower updates.
     /// - ``m``: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary
-    /// value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`.
-    /// In the paper, they use `m = 100`, but you can experiment with different values to see how
-    /// it affects the performance of the algorithm.
+    ///   value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`.
+    ///   In the paper, they use `m = 100`, but you can experiment with different values to see how
+    ///   it affects the performance of the algorithm.
     #[must_use]
     pub fn mirostat(n_vocab: i32, seed: u32, tau: f32, eta: f32, m: i32) -> Self {
         let sampler =
@@ -436,11 +436,11 @@ impl LlamaSampler {
     /// # Parameters:
     /// - ``seed``: Seed to initialize random generation with.
     /// - ``tau``: The target cross-entropy (or surprise) value you want to achieve for the
-    /// generated text. A higher value corresponds to more surprising or less predictable text,
-    /// while a lower value corresponds to less surprising or more predictable text.
+    ///   generated text. A higher value corresponds to more surprising or less predictable text,
+    ///   while a lower value corresponds to less surprising or more predictable text.
     /// - ``eta``: The learning rate used to update `mu` based on the error between the target and
-    /// observed surprisal of the sampled word. A larger learning rate will cause `mu` to be
-    /// updated more quickly, while a smaller learning rate will result in slower updates.
+    ///   observed surprisal of the sampled word. A larger learning rate will cause `mu` to be
+    ///   updated more quickly, while a smaller learning rate will result in slower updates.
     #[must_use]
     pub fn mirostat_v2(seed: u32, tau: f32, eta: f32) -> Self {
         let sampler = unsafe { llama_cpp_sys_2::llama_sampler_init_mirostat_v2(seed, tau, eta) };
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 1cea4a2c..aa70bee5 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -627,7 +627,7 @@ fn main() {
 
     if matches!(target_os, TargetOs::Linux)
         && target_triple.contains("aarch64")
-        && !env::var(format!("CARGO_FEATURE_{}", "native".to_uppercase())).is_ok()
+        && env::var(format!("CARGO_FEATURE_{}", "native".to_uppercase())).is_err()
     {
         // If the native feature is not enabled, we take off the native ARM64 support.
         // It is useful in docker environments where the native feature is not enabled.
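
Note on the build.rs hunk above: the change only replaces `!(...).is_ok()` with `(...).is_err()`, so the feature probe itself is untouched. Cargo exposes an enabled feature `foo` to build scripts as the environment variable `CARGO_FEATURE_FOO`, which is why an absent variable means the feature is off. A minimal standalone sketch of that pattern, using a hypothetical `feature_enabled` helper that is not part of this patch:

use std::env;

// An enabled cargo feature `foo` is visible to build scripts as CARGO_FEATURE_FOO,
// so a missing variable means the feature is disabled.
fn feature_enabled(name: &str) -> bool {
    env::var(format!("CARGO_FEATURE_{}", name.to_uppercase())).is_ok()
}

fn main() {
    // Mirrors the patched condition: only act when `native` is NOT enabled.
    if !feature_enabled("native") {
        println!("cargo:warning=native feature disabled; dropping ARM64-native flags");
    }
}
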
diff --git a/llama-cpp-sys-2/src/lib.rs b/llama-cpp-sys-2/src/lib.rs
index f91bb3a7..2dd834fa 100644
--- a/llama-cpp-sys-2/src/lib.rs
+++ b/llama-cpp-sys-2/src/lib.rs
@@ -3,5 +3,6 @@
 #![allow(non_upper_case_globals)]
 #![allow(non_camel_case_types)]
 #![allow(non_snake_case)]
+#![allow(unpredictable_function_pointer_comparisons)]
 
 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
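
Note on the reranker changes: the `as f64` to `f64::from` rewrites in `normalize_embeddings` are lossless widening conversions, so the numbers are unchanged, and the `-1` norm used for the "rank" pooling path still passes scores through unscaled. A standalone sketch of the simplified logic, using a hypothetical `normalize` helper that keeps only the pass-through and p-norm arms (the max-absolute arm of the real function is omitted); not part of this patch:

// Simplified mirror of normalize_embeddings: -1 passes values through, any
// positive p is treated as a p-norm (p = 2 is the euclidean case).
fn normalize(input: &[f32], embd_norm: i32) -> Vec<f32> {
    let sum = match embd_norm {
        -1 => 1.0, // rerank scores are returned as-is
        p => input
            .iter()
            .map(|x| f64::from(x.abs()).powi(p))
            .sum::<f64>()
            .powf(1.0 / f64::from(p)),
    };
    let norm = if sum > 0.0 { 1.0 / sum } else { 0.0 };
    input.iter().map(|&x| (f64::from(x) * norm) as f32).collect()
}

fn main() {
    // p = 2 (euclidean): [3, 4] is scaled to the unit vector [0.6, 0.8].
    let v = normalize(&[3.0, 4.0], 2);
    assert!((v[0] - 0.6).abs() < 1e-6 && (v[1] - 0.8).abs() < 1e-6);
    // embd_norm == -1 leaves rerank scores untouched.
    assert_eq!(normalize(&[0.42], -1), vec![0.42]);
}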