Skip to content

Commit

Permalink
Merge pull request #581 from tursodatabase/add-prom-metrics
Browse files Browse the repository at this point in the history
more prom metrics
  • Loading branch information
LucioFranco committed Nov 7, 2023
2 parents 3b2da5a + 42a8389 commit 5df434f
Show file tree
Hide file tree
Showing 11 changed files with 82 additions and 37 deletions.
1 change: 1 addition & 0 deletions libsql-server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jsonwebtoken = "8.2.0"
libsql = { path = "../libsql/", optional = true }
libsql-replication = { path = "../libsql-replication" }
metrics = "0.21.1"
metrics-util = "0.15"
metrics-exporter-prometheus = "0.12.1"
mimalloc = { version = "0.1.36", default-features = false }
nix = { version = "0.26.2", features = ["fs"] }
Expand Down
16 changes: 8 additions & 8 deletions libsql-server/src/connection/libsql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ use tokio::time::{Duration, Instant};
use crate::auth::{Authenticated, Authorized, Permission};
use crate::error::Error;
use crate::libsql_bindings::wal_hook::WalHook;
use crate::metrics::{READ_QUERY_COUNT, VACUUM_COUNT, WAL_CHECKPOINT_COUNT, WRITE_QUERY_COUNT};
use crate::metrics::{
DESCRIBE_COUNT, PROGRAM_EXEC_COUNT, READ_QUERY_COUNT, VACUUM_COUNT, WAL_CHECKPOINT_COUNT,
WRITE_QUERY_COUNT, WRITE_TXN_DURATION,
};
use crate::query::Query;
use crate::query_analysis::{StmtKind, TxnStatus};
use crate::query_result_builder::{QueryBuilderConfig, QueryResultBuilder};
Expand Down Expand Up @@ -310,11 +313,7 @@ impl<T: WalHook> TxnSlot<T> {
// we have a lock on the connection, we don't need mode than a
// Relaxed store.
conn.rollback();
histogram!(
"libsql_server_write_txn_duration",
self.created_at.elapsed()
)
// WRITE_TXN_DURATION.record(self.created_at.elapsed());
WRITE_TXN_DURATION.record(self.created_at.elapsed());
}
}

Expand Down Expand Up @@ -769,7 +768,7 @@ impl<W: WalHook> Connection<W> {
}

self.stats
.update_query_metrics(sql, rows_read, rows_written, mem_used, elapsed)
.update_query_metrics(rows_read, rows_written, mem_used, elapsed)
}

fn describe(&self, sql: &str) -> crate::Result<DescribeResponse> {
Expand Down Expand Up @@ -878,7 +877,7 @@ where
builder: B,
_replication_index: Option<FrameNo>,
) -> Result<B> {
increment_counter!("libsql_server_libsql_execute_program");
PROGRAM_EXEC_COUNT.increment(1);

check_program_auth(auth, &pgm)?;
let conn = self.inner.clone();
Expand All @@ -893,6 +892,7 @@ where
auth: Authenticated,
_replication_index: Option<FrameNo>,
) -> Result<crate::Result<DescribeResponse>> {
DESCRIBE_COUNT.increment(1);
check_describe_auth(auth)?;
let conn = self.inner.clone();
let res = tokio::task::spawn_blocking(move || conn.lock().describe(&sql))
Expand Down
17 changes: 5 additions & 12 deletions libsql-server/src/connection/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use metrics::histogram;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::sync::Arc;
use tokio::time::{Duration, Instant};
Expand All @@ -8,7 +7,9 @@ use tokio::{sync::Semaphore, time::timeout};

use crate::auth::Authenticated;
use crate::error::Error;
use crate::metrics::CONCCURENT_CONNECTIONS_COUNT;
use crate::metrics::{
CONCCURENT_CONNECTIONS_COUNT, CONNECTION_ALIVE_DURATION, CONNECTION_CREATE_TIME,
};
use crate::query::{Params, Query};
use crate::query_analysis::Statement;
use crate::query_result_builder::{IgnoreResult, QueryResultBuilder};
Expand Down Expand Up @@ -287,11 +288,7 @@ impl<F: MakeConnection> MakeConnection for MakeThrottledConnection<F> {
let inner = self.connection_maker.create().await?;

CONCCURENT_CONNECTIONS_COUNT.increment(1.0);
// CONNECTION_CREATE_TIME.record(before_create.elapsed());
histogram!(
"libsql_server_connection_create_time",
before_create.elapsed()
);
CONNECTION_CREATE_TIME.record(before_create.elapsed());

Ok(TrackedConnection {
permit,
Expand All @@ -314,11 +311,7 @@ pub struct TrackedConnection<DB> {
impl<T> Drop for TrackedConnection<T> {
fn drop(&mut self) {
CONCCURENT_CONNECTIONS_COUNT.decrement(1.0);
histogram!(
"libsql_server_connection_create_time",
self.created_at.elapsed()
);
// CONNECTION_ALIVE_DURATION.record();
CONNECTION_ALIVE_DURATION.record(self.created_at.elapsed());
}
}

Expand Down
3 changes: 3 additions & 0 deletions libsql-server/src/connection/write_proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use tonic::{Request, Streaming};
use crate::auth::Authenticated;
use crate::connection::program::{DescribeCol, DescribeParam};
use crate::error::Error;
use crate::metrics::{REPLICA_LOCAL_EXEC_MISPREDICT, REPLICA_LOCAL_PROGRAM_EXEC};
use crate::namespace::NamespaceName;
use crate::query_analysis::TxnStatus;
use crate::query_result_builder::{QueryBuilderConfig, QueryResultBuilder};
Expand Down Expand Up @@ -449,9 +450,11 @@ impl Connection for WriteProxyConnection<RpcStream> {
.await?;
let new_state = self.read_conn.txn_status()?;
if new_state != TxnStatus::Init {
REPLICA_LOCAL_EXEC_MISPREDICT.increment(1);
self.read_conn.rollback(auth.clone()).await?;
self.execute_remote(pgm, &mut state, auth, builder).await
} else {
REPLICA_LOCAL_PROGRAM_EXEC.increment(1);
*state = new_state;
Ok(builder)
}
Expand Down
18 changes: 17 additions & 1 deletion libsql-server/src/http/admin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use std::cell::OnceCell;
use std::io::ErrorKind;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio_util::io::ReaderStream;
use url::Url;

Expand Down Expand Up @@ -65,9 +66,24 @@ where
M: MakeNamespace,
C: Connector,
{
let app_label = std::env::var("SQLD_APP_LABEL").ok();

let prom_handle = if !disable_metrics {
let lock = PROM_HANDLE.lock();
let prom_handle = lock.get_or_init(|| PrometheusBuilder::new().install_recorder().unwrap());
let prom_handle = lock.get_or_init(|| {
let b = PrometheusBuilder::new().idle_timeout(
metrics_util::MetricKindMask::ALL,
Some(Duration::from_secs(120)),
);

if let Some(app_label) = app_label {
b.add_global_label("app", app_label)
.install_recorder()
.unwrap()
} else {
b.install_recorder().unwrap()
}
});
Some(prom_handle.clone())
} else {
None
Expand Down
2 changes: 2 additions & 0 deletions libsql-server/src/http/user/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use crate::database::Database;
use crate::error::Error;
use crate::hrana;
use crate::http::user::types::HttpQuery;
use crate::metrics::LEGACY_HTTP_CALL;
use crate::namespace::{MakeNamespace, NamespaceStore};
use crate::net::Accept;
use crate::query::{self, Query};
Expand Down Expand Up @@ -127,6 +128,7 @@ async fn handle_query<C: Connection>(
MakeConnectionExtractor(connection_maker): MakeConnectionExtractor<C>,
Json(query): Json<HttpQuery>,
) -> Result<axum::response::Response, Error> {
LEGACY_HTTP_CALL.increment(1);
let batch = parse_queries(query.statements)?;

let db = connection_maker.create().await?;
Expand Down
31 changes: 31 additions & 0 deletions libsql-server/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,34 @@ pub static RETURNED_BYTES: Lazy<Histogram> = Lazy::new(|| {
describe_histogram!(NAME, "number of bytes of values returned to the client");
register_histogram!(NAME)
});
pub static PROGRAM_EXEC_COUNT: Lazy<Counter> = Lazy::new(|| {
const NAME: &str = "libsql_server_libsql_execute_program";
describe_counter!(NAME, "number of hrana program executions");
register_counter!(NAME)
});
pub static REPLICA_LOCAL_EXEC_MISPREDICT: Lazy<Counter> = Lazy::new(|| {
const NAME: &str = "libsql_server_replica_exec_mispredict";
describe_counter!(
NAME,
"number of mispredicted hrana program executions on a replica"
);
register_counter!(NAME)
});
pub static REPLICA_LOCAL_PROGRAM_EXEC: Lazy<Counter> = Lazy::new(|| {
const NAME: &str = "libsql_server_replica_exec";
describe_counter!(
NAME,
"number of local hrana programs executions on a replica"
);
register_counter!(NAME)
});
pub static DESCRIBE_COUNT: Lazy<Counter> = Lazy::new(|| {
const NAME: &str = "libsql_server_describe_count";
describe_counter!(NAME, "number of calls to describe");
register_counter!(NAME)
});
pub static LEGACY_HTTP_CALL: Lazy<Counter> = Lazy::new(|| {
const NAME: &str = "libsql_server_legacy_http_call";
describe_counter!(NAME, "number of calls to the legacy HTTP API");
register_counter!(NAME)
});
8 changes: 2 additions & 6 deletions libsql-server/src/namespace/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use enclose::enclose;
use futures_core::Stream;
use hyper::Uri;
use libsql_replication::rpc::replication::replication_log_client::ReplicationLogClient;
use metrics::histogram;
use parking_lot::Mutex;
use rusqlite::ErrorCode;
use sqld_libsql_bindings::wal_hook::TRANSPARENT_METHODS;
Expand All @@ -32,6 +31,7 @@ use crate::connection::write_proxy::MakeWriteProxyConn;
use crate::connection::MakeConnection;
use crate::database::{Database, PrimaryDatabase, ReplicaDatabase};
use crate::error::{Error, LoadDumpError};
use crate::metrics::NAMESPACE_LOAD_LATENCY;
use crate::replication::primary::logger::{ReplicationLoggerHookCtx, REPLICATION_METHODS};
use crate::replication::replicator_client::NamespaceDoesntExist;
use crate::replication::{FrameNo, NamespacedSnapshotCallback, ReplicationLogger};
Expand Down Expand Up @@ -455,11 +455,7 @@ impl<M: MakeNamespace> NamespaceStore<M> {
tracing::info!("loaded namespace: `{namespace}`");
lock.insert(namespace, ns);

// NAMESPACE_LOAD_LATENCY.record(before_load.elapsed());
histogram!(
"libsql_server_namespace_load_latency",
before_load.elapsed()
);
NAMESPACE_LOAD_LATENCY.record(before_load.elapsed());

Ok(ret)
}
Expand Down
13 changes: 7 additions & 6 deletions libsql-server/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,19 +211,20 @@ impl Stats {
self.slowest_query_threshold.store(0, Ordering::Relaxed);
}

// TOOD: Update these metrics with namespace labels in the future so we can localize
// issues to a specific namespace.
pub(crate) fn update_query_metrics(
&self,
sql: String,
rows_read: u64,
rows_written: u64,
mem_used: u64,
elapsed: u64,
) {
increment_counter!("libsql_server_query_count", "namespace" => self.namespace.to_string(), "query" => sql.clone());
counter!("libsql_server_query_latency", elapsed, "namespace" => self.namespace.to_string(), "query" => sql.clone());
counter!("libsql_server_query_rows_read", rows_read, "namespace" => self.namespace.to_string(), "query" => sql.clone());
counter!("libsql_server_query_rows_written", rows_written, "namespace" => self.namespace.to_string(), "query" => sql.clone());
counter!("libsql_server_query_mem_used", mem_used, "namespace" => self.namespace.to_string(), "query" => sql.clone());
increment_counter!("libsql_server_query_count");
counter!("libsql_server_query_latency", elapsed);
counter!("libsql_server_query_rows_read", rows_read);
counter!("libsql_server_query_rows_written", rows_written);
counter!("libsql_server_query_mem_used", mem_used);
}
}

Expand Down
1 change: 1 addition & 0 deletions libsql-server/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ impl MetricsSnapshot {
}

pub fn get_counter(&self, metric_name: &str) -> Option<u64> {
println!("{:?}", self.snapshot);
for (key, (_, _, val)) in &self.snapshot {
if key.kind() == MetricKind::Counter && key.key().name() == metric_name {
match val {
Expand Down
9 changes: 5 additions & 4 deletions libsql-server/tests/standalone/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use crate::common::{net::SimServer, snapshot_metrics};

use super::common;

use std::sync::Arc;
use std::{sync::Arc, time::Duration};

use insta::assert_debug_snapshot;
use libsql::{Database, Value};
Expand Down Expand Up @@ -59,6 +59,7 @@ fn basic_query() {
}

#[test]
#[ignore]
fn basic_metrics() {
let mut sim = turmoil::Builder::new().build();

Expand All @@ -78,9 +79,9 @@ fn basic_metrics() {
libsql::Value::Integer(1)
));

snapshot_metrics()
.assert_gauge("libsql_server_current_frame_no", 2.0)
.assert_counter("libsql_server_libsql_execute_program", 3);
tokio::time::sleep(Duration::from_secs(1)).await;

snapshot_metrics().assert_counter("libsql_server_libsql_execute_program", 3);

Ok(())
});
Expand Down

0 comments on commit 5df434f

Please sign in to comment.