Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: report region flow #1517

Merged
merged 10 commits into from Jan 16, 2017
5 changes: 5 additions & 0 deletions src/raftstore/store/config.rs
Expand Up @@ -50,6 +50,8 @@ const DEFAULT_SNAPSHOT_APPLY_BATCH_SIZE: usize = 1024 * 1024 * 10; // 10m
// We should turn on this only in our tests.
const DEFAULT_CONSISTENCY_CHECK_INTERVAL: u64 = 0;

const DEFAULT_REPORT_REGION_FLOW_INTERVAL: u64 = 30000; // 30 seconds

#[derive(Debug, Clone)]
pub struct Config {
// store capacity.
Expand Down Expand Up @@ -113,6 +115,8 @@ pub struct Config {

// Interval (ms) to check region whether the data is consistent.
pub consistency_check_tick_interval: u64,

pub report_region_flow_interval: u64,
}

impl Default for Config {
Expand Down Expand Up @@ -146,6 +150,7 @@ impl Default for Config {
snap_apply_batch_size: DEFAULT_SNAPSHOT_APPLY_BATCH_SIZE,
lock_cf_compact_interval: DEFAULT_LOCK_CF_COMPACT_INTERVAL,
consistency_check_tick_interval: DEFAULT_CONSISTENCY_CHECK_INTERVAL,
report_region_flow_interval: DEFAULT_REPORT_REGION_FLOW_INTERVAL,
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/raftstore/store/local_metrics.rs
Expand Up @@ -23,7 +23,7 @@ pub struct RaftReadyMetrics {
pub commit: u64,
pub append: u64,
pub snapshot: u64,
pub keys_written: u64,
pub store_written_keys: u64,
}

impl RaftReadyMetrics {
Expand Down Expand Up @@ -54,9 +54,9 @@ impl RaftReadyMetrics {
.unwrap();
self.snapshot = 0;
}
if self.keys_written > 0 {
STORE_KEYS_WRITTEN_COUNTER.inc_by(self.keys_written as f64).unwrap();
self.keys_written = 0;
if self.store_written_keys > 0 {
STORE_WRITTEN_KEYS_COUNTER.inc_by(self.store_written_keys as f64).unwrap();
self.store_written_keys = 0;
}
}
}
Expand Down
18 changes: 16 additions & 2 deletions src/raftstore/store/metrics.rs
Expand Up @@ -143,9 +143,23 @@ lazy_static! {
&["cf"]
).unwrap();

pub static ref STORE_KEYS_WRITTEN_COUNTER: Counter =
pub static ref STORE_WRITTEN_KEYS_COUNTER: Counter =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think just REGION_WRITTEN_KEY_HISTOGRAM is enough.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, here we just want to know the written-keys rate; if it is too high, the store may be under heavy load, so a Counter is fine.

register_counter!(
"tikv_engine_keys_written_count",
"tikv_engine_written_keys_count",
"Count of keys has been written for this interval"
).unwrap();

pub static ref REGION_WRITTEN_BYTES_HISTOGRAM: Histogram =
register_histogram!(
"tikv_region_written_bytes",
"Histogram of bytes written for regions",
exponential_buckets(256.0, 2.0, 20).unwrap()
).unwrap();

pub static ref REGION_WRITTEN_KEYS_HISTOGRAM: Histogram =
register_histogram!(
"tikv_region_written_keys",
"Histogram of keys written for regions",
exponential_buckets(1.0, 2.0, 20).unwrap()
).unwrap();
}
1 change: 1 addition & 0 deletions src/raftstore/store/msg.rs
Expand Up @@ -34,6 +34,7 @@ pub enum Tick {
SnapGc,
CompactLockCf,
ConsistencyCheck,
ReportRegionFlow,
}

pub struct SnapshotStatusMsg {
Expand Down
9 changes: 8 additions & 1 deletion src/raftstore/store/peer.rs
Expand Up @@ -246,6 +246,9 @@ pub struct Peer {

leader_lease_expired_time: Option<Timespec>,
election_timeout: TimeDuration,

pub written_bytes: u64,
pub written_keys: u64,
}

impl Peer {
Expand Down Expand Up @@ -342,6 +345,8 @@ impl Peer {
leader_lease_expired_time: None,
election_timeout: TimeDuration::milliseconds(cfg.raft_base_tick_interval as i64) *
cfg.raft_election_timeout_ticks as i32,
written_bytes: 0,
written_keys: 0,
};

peer.load_all_coprocessors();
Expand Down Expand Up @@ -1371,7 +1376,9 @@ impl Peer {
.unwrap_or_else(|e| panic!("{} failed to save apply context: {:?}", self.tag, e));
}

metrics.keys_written += ctx.wb.count() as u64;
metrics.store_written_keys += ctx.wb.count() as u64;
self.written_bytes += ctx.wb.data_size() as u64;
self.written_keys += ctx.wb.count() as u64;

// Commit write and change storage fields atomically.
self.mut_store()
Expand Down
35 changes: 35 additions & 0 deletions src/raftstore/store/store.rs
Expand Up @@ -59,6 +59,7 @@ use super::cmd_resp::{bind_uuid, bind_term, bind_error};
use super::transport::Transport;
use super::metrics::*;
use super::local_metrics::RaftMetrics;
use prometheus::local::LocalHistogram;

type Key = Vec<u8>;

Expand Down Expand Up @@ -109,6 +110,9 @@ pub struct Store<T: Transport, C: PdClient + 'static> {

start_time: Timespec,
is_busy: bool,

region_written_bytes: LocalHistogram,
region_written_keys: LocalHistogram,
}

pub fn create_event_loop<T, C>(cfg: &Config) -> Result<EventLoop<Store<T, C>>>
Expand Down Expand Up @@ -178,6 +182,8 @@ impl<T: Transport, C: PdClient> Store<T, C> {
tag: tag,
start_time: time::get_time(),
is_busy: false,
region_written_bytes: REGION_WRITTEN_BYTES_HISTOGRAM.local(),
region_written_keys: REGION_WRITTEN_KEYS_HISTOGRAM.local(),
};
try!(s.init());
Ok(s)
Expand Down Expand Up @@ -279,6 +285,7 @@ impl<T: Transport, C: PdClient> Store<T, C> {
self.register_snap_mgr_gc_tick(event_loop);
self.register_compact_lock_cf_tick(event_loop);
self.register_consistency_check_tick(event_loop);
self.register_report_region_flow_tick(event_loop);

let split_check_runner = SplitCheckRunner::new(self.sendch.clone(),
self.cfg.region_max_size,
Expand Down Expand Up @@ -1144,6 +1151,33 @@ impl<T: Transport, C: PdClient> Store<T, C> {
};
}

fn register_report_region_flow_tick(&self, event_loop: &mut EventLoop<Self>) {
if let Err(e) = register_timer(event_loop,
Tick::ReportRegionFlow,
self.cfg.report_region_flow_interval) {
error!("{} register raft gc log tick err: {:?}", self.tag, e);
};
}

/// Handles `Tick::ReportRegionFlow`: samples each region's accumulated
/// write flow into the local Prometheus histograms, resets the per-peer
/// counters, and re-arms the tick.
///
/// Counters are zeroed every interval, so each histogram observation
/// covers exactly one reporting period.
fn on_report_region_flow(&mut self, event_loop: &mut EventLoop<Self>) {
    for (_, peer) in &mut self.region_peers {
        // Only the leader reports flow for its region; followers just
        // drop their counters. Records lost on a leadership change are
        // acceptable — an exact result is not required here.
        if peer.is_leader() {
            self.region_written_bytes.observe(peer.written_bytes as f64);
            self.region_written_keys.observe(peer.written_keys as f64);
        }
        peer.written_bytes = 0;
        peer.written_keys = 0;
    }
    // Push the locally buffered samples to the shared histograms.
    self.region_written_bytes.flush();
    self.region_written_keys.flush();

    self.register_report_region_flow_tick(event_loop);
}

#[allow(if_same_then_else)]
fn on_raft_gc_log_tick(&mut self, event_loop: &mut EventLoop<Self>) {
for (&region_id, peer) in &mut self.region_peers {
Expand Down Expand Up @@ -1834,6 +1868,7 @@ impl<T: Transport, C: PdClient> mio::Handler for Store<T, C> {
Tick::SnapGc => self.on_snap_mgr_gc(event_loop),
Tick::CompactLockCf => self.on_compact_lock_cf(event_loop),
Tick::ConsistencyCheck => self.on_consistency_check_tick(event_loop),
Tick::ReportRegionFlow => self.on_report_region_flow(event_loop),
}
slow_log!(t, "{} handle timeout {:?}", self.tag, timeout);
}
Expand Down
1 change: 1 addition & 0 deletions tests/raftstore/util.rs
Expand Up @@ -86,6 +86,7 @@ pub fn new_store_cfg() -> Config {
// In production environment, the value of max_leader_missing_duration
// should be configured far beyond the election timeout.
max_leader_missing_duration: Duration::from_secs(3),
report_region_flow_interval: 100, // 100ms
..Config::default()
}
}
Expand Down