From a78439edfefce422439a2581ba10405d8842cec8 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Mon, 8 Feb 2021 16:35:31 -0800 Subject: [PATCH 01/51] Creating new UT, want to dedupe next --- src/lib.rs | 152 +++++++++++++++++- src/protos/mod.rs | 6 +- .../{poller_test.rs => simple_wf_tests.rs} | 44 +++++ tests/main.rs | 2 +- 4 files changed, 194 insertions(+), 10 deletions(-) rename tests/integ_tests/{poller_test.rs => simple_wf_tests.rs} (58%) diff --git a/src/lib.rs b/src/lib.rs index 8a5aaa36e..4f3a5a658 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -285,6 +285,7 @@ pub enum CoreError { #[cfg(test)] mod test { use super::*; + use crate::protos::coresdk::TimerFiredTaskAttributes; use crate::{ machines::test_help::TestHistoryBuilder, pollers::MockServerGateway, @@ -304,7 +305,7 @@ mod test { use tracing::Level; #[test] - fn workflow_bridge() { + fn timer_test_accross_wf_bridge() { let s = span!(Level::DEBUG, "Test start"); let _enter = s.enter(); @@ -389,15 +390,14 @@ mod test { let task_tok = res.task_token; core.complete_task(CompleteTaskReq::ok_from_api_attrs( - StartTimerCommandAttributes { + vec![StartTimerCommandAttributes { timer_id: timer_id.to_string(), ..Default::default() } - .into(), + .into()], task_tok, )) .unwrap(); - dbg!("sent completion w/ start timer"); let res = dbg!(core.poll_task(task_queue).unwrap()); // TODO: uggo @@ -409,10 +409,150 @@ mod test { ); let task_tok = res.task_token; core.complete_task(CompleteTaskReq::ok_from_api_attrs( - CompleteWorkflowExecutionCommandAttributes { result: None }.into(), + vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], + task_tok, + )) + .unwrap(); + } + + #[test] + fn parallel_timer_test_accross_wf_bridge() { + let s = span!(Level::DEBUG, "Test start"); + let _enter = s.enter(); + + let wfid = "fake_wf_id"; + let run_id = "fake_run_id"; + let timer_1_id = "timer1".to_string(); + let timer_2_id = "timer2".to_string(); + let task_queue = "test-task-queue"; + + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_workflow_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + let timer_2_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_started_event_id, + timer_id: timer_1_id.clone(), + }), + ); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_2_started_event_id, + timer_id: timer_2_id.clone(), + }), + ); + t.add_workflow_task_scheduled_and_started(); + /* + 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED + 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED + 3: EVENT_TYPE_WORKFLOW_TASK_STARTED + --- + 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED + 5: EVENT_TYPE_TIMER_STARTED + 6: EVENT_TYPE_TIMER_STARTED + 7: EVENT_TYPE_TIMER_FIRED + 8: EVENT_TYPE_TIMER_FIRED + 9: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED + 10: EVENT_TYPE_WORKFLOW_TASK_STARTED + --- + */ + let events_first_batch = t.get_history_info(1).unwrap().events; + let wf = Some(WorkflowExecution { + workflow_id: wfid.to_string(), + run_id: run_id.to_string(), + }); + let first_response = PollWorkflowTaskQueueResponse { + history: Some(History { + events: events_first_batch, + }), + workflow_execution: wf.clone(), + ..Default::default() + }; + let events_second_batch = t.get_history_info(2).unwrap().events; + let second_response = 
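+ // (Second fake response: history through the second workflow task, so a
+ // replay will see both TimerFired events.)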
PollWorkflowTaskQueueResponse { + history: Some(History { + events: events_second_batch, + }), + workflow_execution: wf, + ..Default::default() + }; + let responses = vec![first_response, second_response]; + + let mut tasks = VecDeque::from(responses); + let mut mock_gateway = MockServerGateway::new(); + mock_gateway + .expect_poll_workflow_task() + .returning(move |_| Ok(tasks.pop_front().unwrap())); + // Response not really important here + mock_gateway + .expect_complete_workflow_task() + .returning(|_, _| Ok(RespondWorkflowTaskCompletedResponse::default())); + + let runtime = Runtime::new().unwrap(); + let core = CoreSDK { + runtime, + server_gateway: Arc::new(mock_gateway), + workflow_machines: DashMap::new(), + workflow_task_tokens: DashMap::new(), + }; + + let res = core.poll_task(task_queue).unwrap(); + // TODO: uggo + assert_matches!( + res.get_wf_jobs().as_slice(), + [WfActivationJob { + attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)), + }] + ); + assert!(core.workflow_machines.get(run_id).is_some()); + + let task_tok = res.task_token; + core.complete_task(CompleteTaskReq::ok_from_api_attrs( + vec![ + StartTimerCommandAttributes { + timer_id: timer_1_id.clone(), + ..Default::default() + } + .into(), + StartTimerCommandAttributes { + timer_id: timer_2_id.clone(), + ..Default::default() + } + .into(), + ], + task_tok, + )) + .unwrap(); + + let res = core.poll_task(task_queue).unwrap(); + // TODO: uggo + assert_matches!( + res.get_wf_jobs().as_slice(), + [ + WfActivationJob { + attributes: Some(wf_activation_job::Attributes::TimerFired( + TimerFiredTaskAttributes { timer_id: t1_id } + )), + }, + WfActivationJob { + attributes: Some(wf_activation_job::Attributes::TimerFired( + TimerFiredTaskAttributes { timer_id: t2_id } + )), + } + ] => { + assert_eq!(t1_id, &timer_1_id); + assert_eq!(t2_id, &timer_2_id); + } + ); + let task_tok = res.task_token; + core.complete_task(CompleteTaskReq::ok_from_api_attrs( + vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], task_tok, )) .unwrap(); - dbg!("sent workflow done"); } } diff --git a/src/protos/mod.rs b/src/protos/mod.rs index 47232bd12..57650f312 100644 --- a/src/protos/mod.rs +++ b/src/protos/mod.rs @@ -60,11 +60,11 @@ pub mod coresdk { impl CompleteTaskReq { /// Build a successful completion from some api command attributes and a task token pub fn ok_from_api_attrs( - cmd: api_command::command::Attributes, + cmds: Vec, task_token: Vec, ) -> Self { - let cmd: ApiCommand = cmd.into(); - let success: WfActivationSuccess = vec![cmd].into(); + let cmds: Vec = cmds.into_iter().map(Into::into).collect(); + let success: WfActivationSuccess = cmds.into(); CompleteTaskReq { task_token, completion: Some(Completion::Workflow(WfActivationCompletion { diff --git a/tests/integ_tests/poller_test.rs b/tests/integ_tests/simple_wf_tests.rs similarity index 58% rename from tests/integ_tests/poller_test.rs rename to tests/integ_tests/simple_wf_tests.rs index d0d5ef1a4..e89033859 100644 --- a/tests/integ_tests/poller_test.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -66,3 +66,47 @@ fn timer_workflow() { run_id ); } + +#[test] +fn parallel_timer_workflow() { + let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { + Ok(addr) => addr, + Err(_) => "http://localhost:7233".to_owned(), + }; + let url = Url::try_from(&*temporal_server_address).unwrap(); + let gateway_opts = ServerGatewayOptions { + namespace: NAMESPACE.to_string(), + identity: "none".to_string(), + worker_binary_id: "".to_string(), + 
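+ // Assumption: 60s comfortably covers the server's task-queue long-poll
+ // window, so an idle poll returns normally rather than timing out early.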
long_poll_timeout: Duration::from_secs(60), + target_url: url, + }; + let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap(); + let mut rng = rand::thread_rng(); + let workflow_id: u32 = rng.gen(); + let run_id = dbg!(create_workflow(&core, &workflow_id.to_string())); + let timer_id: String = rng.gen::().to_string(); + let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); + core.complete_task(CompleteTaskReq::ok_from_api_attrs( + StartTimerCommandAttributes { + timer_id: timer_id.to_string(), + start_to_fire_timeout: Some(Duration::from_secs(1).into()), + ..Default::default() + } + .into(), + task.task_token, + )) + .unwrap(); + dbg!("sent completion w/ start timer"); + let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); + core.complete_task(CompleteTaskReq::ok_from_api_attrs( + CompleteWorkflowExecutionCommandAttributes { result: None }.into(), + task.task_token, + )) + .unwrap(); + dbg!( + "sent workflow done, completed workflow", + workflow_id, + run_id + ); +} diff --git a/tests/main.rs b/tests/main.rs index 976cf28e4..887c013d1 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -1,4 +1,4 @@ #[cfg(test)] mod integ_tests { - mod poller_test; + mod simple_wf_tests; } From 79495e63149de603c585d7433ed7f01bdfd66477 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Mon, 8 Feb 2021 16:53:39 -0800 Subject: [PATCH 02/51] Dedupe core response setup --- src/lib.rs | 104 ++++------------------------ src/machines/test_help/mod.rs | 62 +++++++++++++++++ src/machines/timer_state_machine.rs | 1 - 3 files changed, 74 insertions(+), 93 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4f3a5a658..6b112b0ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -285,23 +285,19 @@ pub enum CoreError { #[cfg(test)] mod test { use super::*; - use crate::protos::coresdk::TimerFiredTaskAttributes; use crate::{ - machines::test_help::TestHistoryBuilder, - pollers::MockServerGateway, + machines::test_help::{build_fake_core, TestHistoryBuilder}, protos::{ - coresdk::{wf_activation_job, WfActivationJob}, + coresdk::{wf_activation_job, TimerFiredTaskAttributes, WfActivationJob}, temporal::api::{ command::v1::{ CompleteWorkflowExecutionCommandAttributes, StartTimerCommandAttributes, }, enums::v1::EventType, - history::v1::{history_event, History, TimerFiredEventAttributes}, - workflowservice::v1::RespondWorkflowTaskCompletedResponse, + history::v1::{history_event, TimerFiredEventAttributes}, }, }, }; - use std::collections::VecDeque; use tracing::Level; #[test] @@ -311,7 +307,7 @@ mod test { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; - let timer_id = "fake_timer"; + let timer_id = "fake_timer".to_string(); let task_queue = "test-task-queue"; let mut t = TestHistoryBuilder::default(); @@ -322,7 +318,7 @@ mod test { EventType::TimerFired, history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { started_event_id: timer_started_event_id, - timer_id: "timer1".to_string(), + timer_id: timer_id.clone(), }), ); t.add_workflow_task_scheduled_and_started(); @@ -338,47 +334,9 @@ mod test { 8: EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let events_first_batch = t.get_history_info(1).unwrap().events; - let wf = Some(WorkflowExecution { - workflow_id: wfid.to_string(), - run_id: run_id.to_string(), - }); - let first_response = PollWorkflowTaskQueueResponse { - history: Some(History { - events: events_first_batch, - }), - workflow_execution: wf.clone(), - ..Default::default() - }; - let events_second_batch = t.get_history_info(2).unwrap().events; - let second_response = 
PollWorkflowTaskQueueResponse { - history: Some(History { - events: events_second_batch, - }), - workflow_execution: wf, - ..Default::default() - }; - let responses = vec![first_response, second_response]; - - let mut tasks = VecDeque::from(responses); - let mut mock_gateway = MockServerGateway::new(); - mock_gateway - .expect_poll_workflow_task() - .returning(move |_| Ok(tasks.pop_front().unwrap())); - // Response not really important here - mock_gateway - .expect_complete_workflow_task() - .returning(|_, _| Ok(RespondWorkflowTaskCompletedResponse::default())); - - let runtime = Runtime::new().unwrap(); - let core = CoreSDK { - runtime, - server_gateway: Arc::new(mock_gateway), - workflow_machines: DashMap::new(), - workflow_task_tokens: DashMap::new(), - }; + let core = build_fake_core(wfid, run_id, &mut t); - let res = dbg!(core.poll_task(task_queue).unwrap()); + let res = core.poll_task(task_queue).unwrap(); // TODO: uggo assert_matches!( res.get_wf_jobs().as_slice(), @@ -391,7 +349,7 @@ mod test { let task_tok = res.task_token; core.complete_task(CompleteTaskReq::ok_from_api_attrs( vec![StartTimerCommandAttributes { - timer_id: timer_id.to_string(), + timer_id, ..Default::default() } .into()], @@ -399,7 +357,7 @@ mod test { )) .unwrap(); - let res = dbg!(core.poll_task(task_queue).unwrap()); + let res = core.poll_task(task_queue).unwrap(); // TODO: uggo assert_matches!( res.get_wf_jobs().as_slice(), @@ -460,45 +418,7 @@ mod test { 10: EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let events_first_batch = t.get_history_info(1).unwrap().events; - let wf = Some(WorkflowExecution { - workflow_id: wfid.to_string(), - run_id: run_id.to_string(), - }); - let first_response = PollWorkflowTaskQueueResponse { - history: Some(History { - events: events_first_batch, - }), - workflow_execution: wf.clone(), - ..Default::default() - }; - let events_second_batch = t.get_history_info(2).unwrap().events; - let second_response = PollWorkflowTaskQueueResponse { - history: Some(History { - events: events_second_batch, - }), - workflow_execution: wf, - ..Default::default() - }; - let responses = vec![first_response, second_response]; - - let mut tasks = VecDeque::from(responses); - let mut mock_gateway = MockServerGateway::new(); - mock_gateway - .expect_poll_workflow_task() - .returning(move |_| Ok(tasks.pop_front().unwrap())); - // Response not really important here - mock_gateway - .expect_complete_workflow_task() - .returning(|_, _| Ok(RespondWorkflowTaskCompletedResponse::default())); - - let runtime = Runtime::new().unwrap(); - let core = CoreSDK { - runtime, - server_gateway: Arc::new(mock_gateway), - workflow_machines: DashMap::new(), - workflow_task_tokens: DashMap::new(), - }; + let core = build_fake_core(wfid, run_id, &mut t); let res = core.poll_task(task_queue).unwrap(); // TODO: uggo @@ -544,8 +464,8 @@ mod test { )), } ] => { - assert_eq!(t1_id, &timer_1_id); - assert_eq!(t2_id, &timer_2_id); + assert_eq!(t1_id, &timer_1_id); + assert_eq!(t2_id, &timer_2_id); } ); let task_tok = res.task_token; diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs index b7ab7509b..e1c0233eb 100644 --- a/src/machines/test_help/mod.rs +++ b/src/machines/test_help/mod.rs @@ -5,3 +5,65 @@ mod workflow_driver; pub(crate) use history_builder::TestHistoryBuilder; pub(super) use workflow_driver::{CommandSender, TestWFCommand, TestWorkflowDriver}; + +use crate::{ + pollers::MockServerGateway, + protos::temporal::api::common::v1::WorkflowExecution, + protos::temporal::api::history::v1::History, + 
protos::temporal::api::workflowservice::v1::{ + PollWorkflowTaskQueueResponse, RespondWorkflowTaskCompletedResponse, + }, + CoreSDK, +}; +use dashmap::DashMap; +use std::{collections::VecDeque, sync::Arc}; +use tokio::runtime::Runtime; + +/// Given identifiers for a workflow/run, and a test history builder, construct an instance of +/// the core SDK with a mock server gateway that will produce the responses as appropriate. +pub(crate) fn build_fake_core( + wfid: &str, + run_id: &str, + t: &mut TestHistoryBuilder, +) -> CoreSDK { + let events_first_batch = t.get_history_info(1).unwrap().events; + let wf = Some(WorkflowExecution { + workflow_id: wfid.to_string(), + run_id: run_id.to_string(), + }); + let first_response = PollWorkflowTaskQueueResponse { + history: Some(History { + events: events_first_batch, + }), + workflow_execution: wf.clone(), + ..Default::default() + }; + let events_second_batch = t.get_history_info(2).unwrap().events; + let second_response = PollWorkflowTaskQueueResponse { + history: Some(History { + events: events_second_batch, + }), + workflow_execution: wf, + ..Default::default() + }; + let responses = vec![first_response, second_response]; + + let mut tasks = VecDeque::from(responses); + let mut mock_gateway = MockServerGateway::new(); + mock_gateway + .expect_poll_workflow_task() + .returning(move |_| Ok(tasks.pop_front().unwrap())); + // Response not really important here + mock_gateway + .expect_complete_workflow_task() + .returning(|_, _| Ok(RespondWorkflowTaskCompletedResponse::default())); + + let runtime = Runtime::new().unwrap(); + let core = CoreSDK { + runtime, + server_gateway: Arc::new(mock_gateway), + workflow_machines: DashMap::new(), + workflow_task_tokens: DashMap::new(), + }; + core +} diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 79c932a0b..c315043e7 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -343,7 +343,6 @@ mod test { let commands = t .handle_workflow_task_take_cmds(&mut state_machines, Some(2)) .unwrap(); - dbg!(&commands); assert_eq!(commands.len(), 1); assert_eq!( commands[0].command_type, From 070f863d1ae672294efacf64ca45b2e383ff1d87 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Mon, 8 Feb 2021 17:12:19 -0800 Subject: [PATCH 03/51] Response batch control --- src/lib.rs | 15 +++----- src/machines/test_help/history_builder.rs | 3 +- src/machines/test_help/mod.rs | 43 +++++++++++------------ 3 files changed, 27 insertions(+), 34 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6b112b0ed..3d344b100 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -298,13 +298,9 @@ mod test { }, }, }; - use tracing::Level; #[test] - fn timer_test_accross_wf_bridge() { - let s = span!(Level::DEBUG, "Test start"); - let _enter = s.enter(); - + fn timer_test_across_wf_bridge() { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; let timer_id = "fake_timer".to_string(); @@ -334,7 +330,7 @@ mod test { 8: EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let core = build_fake_core(wfid, run_id, &mut t); + let core = build_fake_core(wfid, run_id, &mut t, &[1, 2]); let res = core.poll_task(task_queue).unwrap(); // TODO: uggo @@ -374,10 +370,7 @@ mod test { } #[test] - fn parallel_timer_test_accross_wf_bridge() { - let s = span!(Level::DEBUG, "Test start"); - let _enter = s.enter(); - + fn parallel_timer_test_across_wf_bridge() { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; let timer_1_id = "timer1".to_string(); @@ -418,7 +411,7 @@ mod test { 10: 
EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let core = build_fake_core(wfid, run_id, &mut t); + let core = build_fake_core(wfid, run_id, &mut t, &[1, 2]); let res = core.poll_task(task_queue).unwrap(); // TODO: uggo diff --git a/src/machines/test_help/history_builder.rs b/src/machines/test_help/history_builder.rs index 2f4dac37f..b9fc0a3dc 100644 --- a/src/machines/test_help/history_builder.rs +++ b/src/machines/test_help/history_builder.rs @@ -148,7 +148,8 @@ impl TestHistoryBuilder { Ok(()) } - /// Iterates over the events in this builder to return a [HistoryInfo] of the n-th workflow task. + /// Iterates over the events in this builder to return a [HistoryInfo] including events up to + /// the provided `to_wf_task_num` pub(crate) fn get_history_info( &self, to_wf_task_num: usize, diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs index e1c0233eb..ba216ab7e 100644 --- a/src/machines/test_help/mod.rs +++ b/src/machines/test_help/mod.rs @@ -21,32 +21,32 @@ use tokio::runtime::Runtime; /// Given identifiers for a workflow/run, and a test history builder, construct an instance of /// the core SDK with a mock server gateway that will produce the responses as appropriate. +/// +/// `response_batches` is used to control the fake [PollWorkflowTaskQueueResponse]s returned. +/// For each number in the input list, a fake response will be prepared which includes history +/// up to the workflow task with that number, as in [TestHistoryBuilder::get_history_info]. pub(crate) fn build_fake_core( - wfid: &str, + wf_id: &str, run_id: &str, t: &mut TestHistoryBuilder, + response_batches: &[usize], ) -> CoreSDK { - let events_first_batch = t.get_history_info(1).unwrap().events; let wf = Some(WorkflowExecution { - workflow_id: wfid.to_string(), + workflow_id: wf_id.to_string(), run_id: run_id.to_string(), }); - let first_response = PollWorkflowTaskQueueResponse { - history: Some(History { - events: events_first_batch, - }), - workflow_execution: wf.clone(), - ..Default::default() - }; - let events_second_batch = t.get_history_info(2).unwrap().events; - let second_response = PollWorkflowTaskQueueResponse { - history: Some(History { - events: events_second_batch, - }), - workflow_execution: wf, - ..Default::default() - }; - let responses = vec![first_response, second_response]; + + let responses: Vec<_> = response_batches + .iter() + .map(|to_task_num| { + let batch = t.get_history_info(*to_task_num).unwrap().events; + PollWorkflowTaskQueueResponse { + history: Some(History { events: batch }), + workflow_execution: wf.clone(), + ..Default::default() + } + }) + .collect(); let mut tasks = VecDeque::from(responses); let mut mock_gateway = MockServerGateway::new(); @@ -59,11 +59,10 @@ pub(crate) fn build_fake_core( .returning(|_, _| Ok(RespondWorkflowTaskCompletedResponse::default())); let runtime = Runtime::new().unwrap(); - let core = CoreSDK { + CoreSDK { runtime, server_gateway: Arc::new(mock_gateway), workflow_machines: DashMap::new(), workflow_task_tokens: DashMap::new(), - }; - core + } } From ab685cc22543693da3a0c2ddc710e5bed59bd0e3 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 9 Feb 2021 13:21:06 -0800 Subject: [PATCH 04/51] Saving here while waiting for some feedback --- Cargo.toml | 4 +- src/lib.rs | 54 +++++++++++++++++++++++ src/machines/test_help/workflow_driver.rs | 6 +-- src/machines/timer_state_machine.rs | 10 +++++ src/machines/workflow_machines.rs | 1 + tests/integ_tests/simple_wf_tests.rs | 1 + 6 files changed, 72 insertions(+), 4 deletions(-) diff --git 
a/Cargo.toml b/Cargo.toml index 63703200b..c0b880f38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,16 +17,18 @@ env_logger = "0.8" futures = "0.3" log = "0.4" opentelemetry-jaeger = "0.10" +opentelemetry = "0.11.2" prost = "0.7" prost-types = "0.7" thiserror = "1.0" tokio = { version = "1.1", features = ["rt", "rt-multi-thread"] } tracing = { version = "0.1", features = ["log"] } -tracing-opentelemetry = "0.11" +tracing-opentelemetry = "0.10" tracing-subscriber = "0.2" url = "2.2" rand = "0.8.3" uuid = { version = "0.8.2", features = ["v4"] } + [dependencies.tonic] version = "0.4" #path = "../tonic/tonic" diff --git a/src/lib.rs b/src/lib.rs index 3d344b100..be5fd900a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -298,6 +298,8 @@ mod test { }, }, }; + use tracing_subscriber::layer::SubscriberExt; + use tracing_subscriber::util::SubscriberInitExt; #[test] fn timer_test_across_wf_bridge() { @@ -468,4 +470,56 @@ mod test { )) .unwrap(); } + + #[test] + fn single_timer_whole_replay_test_across_wf_bridge() { + let (tracer, _uninstall) = opentelemetry_jaeger::new_pipeline() + .with_service_name("report_example") + .install() + .unwrap(); + let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); + tracing_subscriber::registry() + .with(opentelemetry) + .try_init() + .unwrap(); + + let s = span!(Level::DEBUG, "Test start", t = "bridge"); + let _enter = s.enter(); + + let wfid = "fake_wf_id"; + let run_id = "fake_run_id"; + let timer_1_id = "timer1".to_string(); + let task_queue = "test-task-queue"; + + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_workflow_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_started_event_id, + timer_id: timer_1_id.clone(), + }), + ); + t.add_workflow_task_scheduled_and_started(); + let core = build_fake_core(wfid, run_id, &mut t, &[2]); + + let res = core.poll_task(task_queue).unwrap(); + // TODO: Not the right expectation -- is timer fired? 
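+ // (On replay, lang is still expected to see StartWorkflow first, with a
+ // TimerFired job on a subsequent activation, mirroring the non-replay flow.)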
+ assert_matches!( + res.get_wf_jobs().as_slice(), + [WfActivationJob { + attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)), + }] + ); + assert!(core.workflow_machines.get(run_id).is_some()); + + let task_tok = res.task_token; + core.complete_task(CompleteTaskReq::ok_from_api_attrs( + vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], + task_tok, + )) + .unwrap(); + } } diff --git a/src/machines/test_help/workflow_driver.rs b/src/machines/test_help/workflow_driver.rs index 56d9843ef..670408595 100644 --- a/src/machines/test_help/workflow_driver.rs +++ b/src/machines/test_help/workflow_driver.rs @@ -72,12 +72,11 @@ where F: Fn(CommandSender) -> Fut + Send + Sync, Fut: Future, { - #[instrument(skip(self))] fn start(&mut self, _attribs: WorkflowExecutionStartedEventAttributes) -> Vec { + event!(Level::DEBUG, msg = "Test WF driver start called"); vec![] } - #[instrument(skip(self))] fn fetch_workflow_iteration_output(&mut self) -> Vec { let (sender, receiver) = CommandSender::new(self.cache.clone()); // Call the closure that produces the workflow future @@ -88,7 +87,6 @@ where rt.block_on(wf_future); let cmds = receiver.into_iter(); - event!(Level::DEBUG, msg = "Test wf driver emitting", ?cmds); let mut last_cmd = None; for cmd in cmds { @@ -101,6 +99,8 @@ where } } + event!(Level::DEBUG, msg = "Test wf driver emitting", ?last_cmd); + // Return only the last command, since that's what would've been yielded in a real wf if let Some(c) = last_cmd { vec![c] diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index c315043e7..2c9950be8 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -336,6 +336,16 @@ mod test { #[rstest] fn test_fire_happy_path_full(fire_happy_hist: (TestHistoryBuilder, WorkflowMachines)) { + let (tracer, _uninstall) = opentelemetry_jaeger::new_pipeline() + .with_service_name("report_example") + .install() + .unwrap(); + let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); + tracing_subscriber::registry() + .with(opentelemetry) + .try_init() + .unwrap(); + let s = span!(Level::DEBUG, "Test start", t = "full"); let _enter = s.enter(); diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index a6a87548d..cb0efef0e 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -439,6 +439,7 @@ impl WorkflowMachines { fn prepare_commands(&mut self) { while let Some(c) = self.current_wf_task_commands.pop_front() { + dbg!(&c); // TODO - some special case stuff that can maybe be managed differently? // handleCommand should be called even on canceled ones to support mutableSideEffect // command.handleCommand(command.getCommandType()); diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index e89033859..9375ac203 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -67,6 +67,7 @@ fn timer_workflow() { ); } +// TODO: Actually make this different #[test] fn parallel_timer_workflow() { let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { From 72327b9f89b3dc3a937cfb43c175aa9be43c83cd Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 10 Feb 2021 10:05:06 -0800 Subject: [PATCH 05/51] Replay test works. 
Just need to do a bit of cleanup --- Cargo.toml | 1 + src/lib.rs | 75 ++++++++++++++++------- src/machines/test_help/history_builder.rs | 38 ++---------- src/machines/test_help/mod.rs | 2 + src/machines/timer_state_machine.rs | 12 +--- src/protos/mod.rs | 43 +++++++++++++ src/workflow/mod.rs | 59 +++++++++++++++++- tests/integ_tests/simple_wf_tests.rs | 2 +- 8 files changed, 162 insertions(+), 70 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0b880f38..fc54cecd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ edition = "2018" [dependencies] anyhow = "1.0" async-trait = "0.1" +crossbeam = "0.8" dashmap = "4.0" derive_more = "0.99" displaydoc = "0.1" diff --git a/src/lib.rs b/src/lib.rs index be5fd900a..51de5b948 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,9 +30,10 @@ use crate::{ common::v1::WorkflowExecution, workflowservice::v1::PollWorkflowTaskQueueResponse, }, }, - protosext::{HistoryInfo, HistoryInfoError}, + protosext::HistoryInfoError, workflow::{WfManagerProtected, WorkflowManager}, }; +use crossbeam::queue::SegQueue; use dashmap::DashMap; use std::{convert::TryInto, sync::mpsc::SendError, sync::Arc}; use tokio::runtime::Runtime; @@ -85,6 +86,7 @@ pub fn init(opts: CoreInitOptions) -> Result { server_gateway: Arc::new(work_provider), workflow_machines: Default::default(), workflow_task_tokens: Default::default(), + pending_activations: Default::default(), }) } @@ -107,6 +109,10 @@ where workflow_machines: DashMap, /// Maps task tokens to workflow run ids workflow_task_tokens: DashMap, String>, + + /// Workflows that are currently under replay will queue their run ID here, indicating that + /// there are more workflow tasks / activations to be performed. + pending_activations: SegQueue, } impl Core for CoreSDK @@ -115,6 +121,22 @@ where { #[instrument(skip(self))] fn poll_task(&self, task_queue: &str) -> Result { + // We must first check if there are pending workflow tasks for workflows that are currently + // replaying + if let Some(run_id) = self.pending_activations.pop() { + dbg!(&run_id); + let (activation, more_tasks) = + self.access_machine(&run_id, |mgr| mgr.get_next_activation())?; + if more_tasks { + self.pending_activations.push(run_id); + } + return Ok(Task { + // TODO: Set this properly + task_token: vec![], + variant: activation.map(Into::into), + }); + } + // This will block forever in the event there is no work from the server let work = self .runtime @@ -134,7 +156,7 @@ where event!( Level::DEBUG, - msg = "Received workflow task", + msg = "Received workflow task from server", ?work.task_token ); @@ -142,14 +164,13 @@ where self.workflow_task_tokens .insert(work.task_token.clone(), run_id.clone()); - // We pass none since we want to apply all the history we just got. - // Will need to change a bit once we impl caching. 
- let hist_info = HistoryInfo::new_from_history(&history, None)?; - let activation = self.access_machine(&run_id, |mgr| { - let machines = &mut mgr.machines; - hist_info.apply_history_events(machines)?; - Ok(machines.get_wf_activation()) - })?; + let (activation, more_tasks) = + self.access_machine(&run_id, |mgr| mgr.feed_history_from_server(history))?; + + if more_tasks { + dbg!("More tasks!"); + self.pending_activations.push(run_id); + } Ok(Task { task_token: work.task_token, @@ -298,8 +319,6 @@ mod test { }, }, }; - use tracing_subscriber::layer::SubscriberExt; - use tracing_subscriber::util::SubscriberInitExt; #[test] fn timer_test_across_wf_bridge() { @@ -355,6 +374,7 @@ mod test { )) .unwrap(); + dbg!("Second poll"); let res = core.poll_task(task_queue).unwrap(); // TODO: uggo assert_matches!( @@ -473,16 +493,6 @@ mod test { #[test] fn single_timer_whole_replay_test_across_wf_bridge() { - let (tracer, _uninstall) = opentelemetry_jaeger::new_pipeline() - .with_service_name("report_example") - .install() - .unwrap(); - let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); - tracing_subscriber::registry() - .with(opentelemetry) - .try_init() - .unwrap(); - let s = span!(Level::DEBUG, "Test start", t = "bridge"); let _enter = s.enter(); @@ -503,6 +513,8 @@ mod test { }), ); t.add_workflow_task_scheduled_and_started(); + // NOTE! What makes this a replay test is the server only responds with *one* batch here. + // So, server is polled once, but lang->core interactions look just like non-replay test. let core = build_fake_core(wfid, run_id, &mut t, &[2]); let res = core.poll_task(task_queue).unwrap(); @@ -515,6 +527,25 @@ mod test { ); assert!(core.workflow_machines.get(run_id).is_some()); + let task_tok = res.task_token; + core.complete_task(CompleteTaskReq::ok_from_api_attrs( + vec![StartTimerCommandAttributes { + timer_id: timer_1_id, + ..Default::default() + } + .into()], + task_tok, + )) + .unwrap(); + + let res = core.poll_task(task_queue).unwrap(); + // TODO: uggo + assert_matches!( + res.get_wf_jobs().as_slice(), + [WfActivationJob { + attributes: Some(wf_activation_job::Attributes::TimerFired(_)), + }] + ); let task_tok = res.task_token; core.complete_task(CompleteTaskReq::ok_from_api_attrs( vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], diff --git a/src/machines/test_help/history_builder.rs b/src/machines/test_help/history_builder.rs index b9fc0a3dc..8a3230790 100644 --- a/src/machines/test_help/history_builder.rs +++ b/src/machines/test_help/history_builder.rs @@ -1,4 +1,5 @@ use super::Result; +use crate::protos::temporal::api::history::v1::History; use crate::{ machines::{workflow_machines::WorkflowMachines, ProtoCommand}, protos::temporal::api::{ @@ -87,41 +88,10 @@ impl TestHistoryBuilder { self.build_and_push_event(EventType::WorkflowTaskCompleted, attrs.into()); } - /// Counts the number of whole workflow tasks. Looks for WFTaskStarted followed by - /// WFTaskCompleted, adding one to the count for every match. It will additionally count - /// a WFTaskStarted at the end of the event list. - /// - /// If `up_to_event_id` is provided, the count will be returned as soon as processing advances - /// past that id. 
- pub fn get_workflow_task_count(&self, up_to_event_id: Option) -> Result { - let mut last_wf_started_id = 0; - let mut count = 0; - let mut history = self.events.iter().peekable(); - while let Some(event) = history.next() { - let next_event = history.peek(); - if let Some(upto) = up_to_event_id { - if event.event_id > upto { - return Ok(count); - } - } - let next_is_completed = next_event.map_or(false, |ne| { - ne.event_type == EventType::WorkflowTaskCompleted as i32 - }); - if event.event_type == EventType::WorkflowTaskStarted as i32 - && (next_event.is_none() || next_is_completed) - { - last_wf_started_id = event.event_id; - count += 1; - } - - if next_event.is_none() { - if last_wf_started_id != event.event_id { - bail!("Last item in history wasn't WorkflowTaskStarted") - } - return Ok(count); - } + pub fn as_history(&self) -> History { + History { + events: self.events.clone(), } - Ok(count) } /// Handle workflow task(s) using the provided [WorkflowMachines]. Will process as many workflow diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs index ba216ab7e..7f18520d0 100644 --- a/src/machines/test_help/mod.rs +++ b/src/machines/test_help/mod.rs @@ -52,6 +52,7 @@ pub(crate) fn build_fake_core( let mut mock_gateway = MockServerGateway::new(); mock_gateway .expect_poll_workflow_task() + .times(response_batches.len()) .returning(move |_| Ok(tasks.pop_front().unwrap())); // Response not really important here mock_gateway @@ -64,5 +65,6 @@ pub(crate) fn build_fake_core( server_gateway: Arc::new(mock_gateway), workflow_machines: DashMap::new(), workflow_task_tokens: DashMap::new(), + pending_activations: Default::default(), } } diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 2c9950be8..e77b1d7c3 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -305,7 +305,7 @@ mod test { }), ); t.add_workflow_task_scheduled_and_started(); - assert_eq!(2, t.get_workflow_task_count(None).unwrap()); + assert_eq!(2, t.as_history().get_workflow_task_count(None).unwrap()); (t, state_machines) } @@ -336,16 +336,6 @@ mod test { #[rstest] fn test_fire_happy_path_full(fire_happy_hist: (TestHistoryBuilder, WorkflowMachines)) { - let (tracer, _uninstall) = opentelemetry_jaeger::new_pipeline() - .with_service_name("report_example") - .install() - .unwrap(); - let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); - tracing_subscriber::registry() - .with(opentelemetry) - .try_init() - .unwrap(); - let s = span!(Level::DEBUG, "Test start", t = "full"); let _enter = s.enter(); diff --git a/src/protos/mod.rs b/src/protos/mod.rs index 57650f312..f4c71801e 100644 --- a/src/protos/mod.rs +++ b/src/protos/mod.rs @@ -129,11 +129,54 @@ pub mod temporal { use crate::protos::temporal::api::{ enums::v1::EventType, history::v1::history_event::Attributes, }; + use crate::protosext::HistoryInfoError; use prost::alloc::fmt::Formatter; use std::fmt::Display; include!("temporal.api.history.v1.rs"); + impl History { + /// Counts the number of whole workflow tasks. Looks for WFTaskStarted followed + /// by WFTaskCompleted, adding one to the count for every match. It will + /// additionally count a WFTaskStarted at the end of the event list. + /// + /// If `up_to_event_id` is provided, the count will be returned as soon as + /// processing advances past that id. 
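+ ///
+ /// For example, a history ending in WorkflowTaskScheduled followed by
+ /// WorkflowTaskStarted, with no matching Completed yet, still counts
+ /// that trailing task.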
+ pub fn get_workflow_task_count( + &self, + up_to_event_id: Option, + ) -> Result { + let mut last_wf_started_id = 0; + let mut count = 0; + let mut history = self.events.iter().peekable(); + while let Some(event) = history.next() { + let next_event = history.peek(); + if let Some(upto) = up_to_event_id { + if event.event_id > upto { + return Ok(count); + } + } + let next_is_completed = next_event.map_or(false, |ne| { + ne.event_type == EventType::WorkflowTaskCompleted as i32 + }); + if event.event_type == EventType::WorkflowTaskStarted as i32 + && (next_event.is_none() || next_is_completed) + { + last_wf_started_id = event.event_id; + count += 1; + } + + if next_event.is_none() { + if last_wf_started_id != event.event_id { + return Err(HistoryInfoError::HistoryEndsUnexpectedly); + } + return Ok(count); + } + } + Ok(count) + } + } + impl HistoryEvent { /// Returns true if this is an event created to mirror a command pub fn is_command_event(&self) -> bool { diff --git a/src/workflow/mod.rs b/src/workflow/mod.rs index 08c894f3b..da699c69d 100644 --- a/src/workflow/mod.rs +++ b/src/workflow/mod.rs @@ -2,6 +2,9 @@ mod bridge; pub(crate) use bridge::WorkflowBridge; +use crate::protos::coresdk::WfActivation; +use crate::protos::temporal::api::history::v1::History; +use crate::protosext::HistoryInfo; use crate::{ machines::{ProtoCommand, WFCommand, WorkflowMachines}, protos::temporal::api::workflowservice::v1::StartWorkflowExecutionResponse, @@ -56,7 +59,8 @@ pub trait StartWorkflowExecutionApi { ) -> Result; } -/// Manages concurrent access to an instance of a [WorkflowMachines], which is not thread-safe. +/// Manages concurrent access to an instance of a [WorkflowMachines], which is not thread-safe, +/// as well as other data associated with that specific workflow run. pub(crate) struct WorkflowManager { data: Arc>, } @@ -64,6 +68,13 @@ pub(crate) struct WorkflowManager { pub(crate) struct WfManagerProtected { pub machines: WorkflowMachines, pub command_sink: Sender>, + /// The last recorded history we received from the server for this workflow run. This must be + /// kept because the lang side polls & completes for every workflow task, but we do not need + /// to poll the server that often during replay. + pub last_history_from_server: Option, + pub last_history_task_count: Option, + /// The current workflow task number this run is on. Starts at one and monotonically increases. + pub current_wf_task_num: usize, } impl WorkflowManager { @@ -74,6 +85,9 @@ impl WorkflowManager { let protected = WfManagerProtected { machines: state_machines, command_sink: cmd_sink, + last_history_from_server: None, + last_history_task_count: None, + current_wf_task_num: 1, }; Self { data: Arc::new(Mutex::new(protected)), @@ -91,13 +105,54 @@ impl WorkflowManager { } } +impl WfManagerProtected { + /// Given history that was just obtained from the server, pipe it into this workflow's machines. + /// + /// Should only be called when a workflow has caught up on replay. It will return a workflow + /// activation if one is needed, as well as a bool indicating if there are more workflow tasks + /// that need to be performed to replay the remaining history. 
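+ ///
+ /// When the bool is true, the caller should queue a pending activation
+ /// and drain it via `get_next_activation` instead of polling the server
+ /// again.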
+ pub fn feed_history_from_server( + &mut self, + hist: History, + ) -> Result<(Option, bool)> { + let task_hist = HistoryInfo::new_from_history(&hist, Some(self.current_wf_task_num))?; + let task_ct = hist.get_workflow_task_count(None)?; + self.last_history_task_count = Some(task_ct); + self.last_history_from_server = Some(hist); + task_hist.apply_history_events(&mut self.machines)?; + let activation = self.machines.get_wf_activation(); + let more_activations_needed = task_ct > self.current_wf_task_num; + + self.current_wf_task_num += 1; + Ok((activation, more_activations_needed)) + } + + pub fn get_next_activation(&mut self) -> Result<(Option, bool)> { + self.current_wf_task_num += 1; + // TODO: Proper errors + let hist = self + .last_history_from_server + .as_ref() + .ok_or_else(|| CoreError::Unknown)?; + let task_hist = HistoryInfo::new_from_history(hist, Some(self.current_wf_task_num))?; + task_hist.apply_history_events(&mut self.machines)?; + let activation = self.machines.get_wf_activation(); + let more_activations_needed = self.current_wf_task_num + <= self + .last_history_task_count + .ok_or_else(|| CoreError::Unknown)?; + + Ok((activation, more_activations_needed)) + } +} + #[cfg(test)] mod tests { use super::*; - // Enforce thread-safeness of wf manager fn enforcer(_: W) {} + // Enforce thread-safeness of wf manager #[test] fn is_threadsafe() { enforcer(WorkflowManager::new(&Default::default())); diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 9375ac203..4c1616665 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -67,7 +67,7 @@ fn timer_workflow() { ); } -// TODO: Actually make this different +// TODO: Actually make this different from serial #[test] fn parallel_timer_workflow() { let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { From a29489ec77a8b2e0e5d9dd86bed782d720fb5900 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 10 Feb 2021 10:41:13 -0800 Subject: [PATCH 06/51] Fix task token stuff --- src/lib.rs | 28 +++++++++++++++++----------- src/machines/test_help/mod.rs | 3 +++ src/machines/workflow_machines.rs | 1 - src/workflow/mod.rs | 24 ++++++++++++++++-------- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 51de5b948..2785427a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -112,7 +112,13 @@ where /// Workflows that are currently under replay will queue their run ID here, indicating that /// there are more workflow tasks / activations to be performed. 
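/// (SegQueue is crossbeam's unbounded MPMC queue, so concurrent pollers can
/// push and pop entries without an extra lock.)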
- pending_activations: SegQueue, + pending_activations: SegQueue, +} + +#[derive(Debug)] +struct PendingActivation { + run_id: String, + task_token: Vec, } impl Core for CoreSDK @@ -123,16 +129,16 @@ where fn poll_task(&self, task_queue: &str) -> Result { // We must first check if there are pending workflow tasks for workflows that are currently // replaying - if let Some(run_id) = self.pending_activations.pop() { - dbg!(&run_id); + if let Some(pa) = self.pending_activations.pop() { + event!(Level::DEBUG, msg = "Applying pending activations", ?pa); let (activation, more_tasks) = - self.access_machine(&run_id, |mgr| mgr.get_next_activation())?; + self.access_machine(&pa.run_id, |mgr| mgr.get_next_activation())?; + let task_token = pa.task_token.clone(); if more_tasks { - self.pending_activations.push(run_id); + self.pending_activations.push(pa); } return Ok(Task { - // TODO: Set this properly - task_token: vec![], + task_token, variant: activation.map(Into::into), }); } @@ -168,8 +174,10 @@ where self.access_machine(&run_id, |mgr| mgr.feed_history_from_server(history))?; if more_tasks { - dbg!("More tasks!"); - self.pending_activations.push(run_id); + self.pending_activations.push(PendingActivation { + run_id, + task_token: work.task_token.clone(), + }); } Ok(Task { @@ -374,7 +382,6 @@ mod test { )) .unwrap(); - dbg!("Second poll"); let res = core.poll_task(task_queue).unwrap(); // TODO: uggo assert_matches!( @@ -518,7 +525,6 @@ mod test { let core = build_fake_core(wfid, run_id, &mut t, &[2]); let res = core.poll_task(task_queue).unwrap(); - // TODO: Not the right expectation -- is timer fired? assert_matches!( res.get_wf_jobs().as_slice(), [WfActivationJob { diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs index 7f18520d0..9c4a73df8 100644 --- a/src/machines/test_help/mod.rs +++ b/src/machines/test_help/mod.rs @@ -16,6 +16,7 @@ use crate::{ CoreSDK, }; use dashmap::DashMap; +use rand::{thread_rng, Rng}; use std::{collections::VecDeque, sync::Arc}; use tokio::runtime::Runtime; @@ -40,9 +41,11 @@ pub(crate) fn build_fake_core( .iter() .map(|to_task_num| { let batch = t.get_history_info(*to_task_num).unwrap().events; + let task_token: [u8; 16] = thread_rng().gen(); PollWorkflowTaskQueueResponse { history: Some(History { events: batch }), workflow_execution: wf.clone(), + task_token: task_token.to_vec(), ..Default::default() } }) diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index cb0efef0e..a6a87548d 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -439,7 +439,6 @@ impl WorkflowMachines { fn prepare_commands(&mut self) { while let Some(c) = self.current_wf_task_commands.pop_front() { - dbg!(&c); // TODO - some special case stuff that can maybe be managed differently? 
// handleCommand should be called even on canceled ones to support mutableSideEffect // command.handleCommand(command.getCommandType()); diff --git a/src/workflow/mod.rs b/src/workflow/mod.rs index da699c69d..8b638045c 100644 --- a/src/workflow/mod.rs +++ b/src/workflow/mod.rs @@ -2,24 +2,27 @@ mod bridge; pub(crate) use bridge::WorkflowBridge; -use crate::protos::coresdk::WfActivation; -use crate::protos::temporal::api::history::v1::History; -use crate::protosext::HistoryInfo; use crate::{ machines::{ProtoCommand, WFCommand, WorkflowMachines}, - protos::temporal::api::workflowservice::v1::StartWorkflowExecutionResponse, - protos::temporal::api::{ - common::v1::WorkflowExecution, - workflowservice::v1::{ - PollWorkflowTaskQueueResponse, RespondWorkflowTaskCompletedResponse, + protos::{ + coresdk::WfActivation, + temporal::api::{ + common::v1::WorkflowExecution, + history::v1::History, + workflowservice::v1::{ + PollWorkflowTaskQueueResponse, RespondWorkflowTaskCompletedResponse, + StartWorkflowExecutionResponse, + }, }, }, + protosext::HistoryInfo, CoreError, Result, }; use std::{ ops::DerefMut, sync::{mpsc::Sender, Arc, Mutex}, }; +use tracing::Level; /// Implementors can provide new workflow tasks to the SDK. The connection to the server is the real /// implementor. @@ -111,6 +114,7 @@ impl WfManagerProtected { /// Should only be called when a workflow has caught up on replay. It will return a workflow /// activation if one is needed, as well as a bool indicating if there are more workflow tasks /// that need to be performed to replay the remaining history. + #[instrument(skip(self))] pub fn feed_history_from_server( &mut self, hist: History, @@ -123,6 +127,10 @@ impl WfManagerProtected { let activation = self.machines.get_wf_activation(); let more_activations_needed = task_ct > self.current_wf_task_num; + if more_activations_needed { + event!(Level::DEBUG, msg = "More activations needed"); + } + self.current_wf_task_num += 1; Ok((activation, more_activations_needed)) } From 4bf1703030e5d192d723e459665ab1c67b86d13b Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 10 Feb 2021 17:16:57 -0800 Subject: [PATCH 07/51] Some refactoring to eliminate invalid states in workflow representation --- src/lib.rs | 102 +++++++++++++++------------ src/machines/workflow_machines.rs | 4 +- src/workflow/mod.rs | 73 +++++++++++-------- tests/integ_tests/simple_wf_tests.rs | 12 ++-- 4 files changed, 108 insertions(+), 83 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2785427a3..4c5f02f8f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,15 +26,13 @@ use crate::{ complete_task_req::Completion, wf_activation_completion::Status, CompleteTaskReq, Task, WfActivationCompletion, WfActivationSuccess, }, - temporal::api::{ - common::v1::WorkflowExecution, workflowservice::v1::PollWorkflowTaskQueueResponse, - }, + temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse, }, protosext::HistoryInfoError, - workflow::{WfManagerProtected, WorkflowManager}, + workflow::{NextWfActivation, WfManagerProtected, WorkflowManager}, }; use crossbeam::queue::SegQueue; -use dashmap::DashMap; +use dashmap::{mapref::entry::Entry, DashMap}; use std::{convert::TryInto, sync::mpsc::SendError, sync::Arc}; use tokio::runtime::Runtime; use tonic::codegen::http::uri::InvalidUri; @@ -128,18 +126,18 @@ where #[instrument(skip(self))] fn poll_task(&self, task_queue: &str) -> Result { // We must first check if there are pending workflow tasks for workflows that are currently - // replaying + // replaying, and issue those 
tasks before bothering the server. if let Some(pa) = self.pending_activations.pop() { event!(Level::DEBUG, msg = "Applying pending activations", ?pa); - let (activation, more_tasks) = + let next_activation = self.access_machine(&pa.run_id, |mgr| mgr.get_next_activation())?; let task_token = pa.task_token.clone(); - if more_tasks { + if next_activation.more_activations_needed { self.pending_activations.push(pa); } return Ok(Task { task_token, - variant: activation.map(Into::into), + variant: next_activation.activation.map(Into::into), }); } @@ -147,42 +145,25 @@ where let work = self .runtime .block_on(self.server_gateway.poll_workflow_task(task_queue))?; - let run_id = match &work.workflow_execution { - Some(we) => { - self.instantiate_workflow_if_needed(we); - we.run_id.clone() - } - None => return Err(CoreError::BadDataFromWorkProvider(work)), - }; - let history = if let Some(hist) = work.history { - hist - } else { - return Err(CoreError::BadDataFromWorkProvider(work)); - }; - + let task_token = work.task_token.clone(); event!( Level::DEBUG, msg = "Received workflow task from server", - ?work.task_token + ?task_token ); - // Correlate task token w/ run ID - self.workflow_task_tokens - .insert(work.task_token.clone(), run_id.clone()); + let (next_activation, run_id) = self.instantiate_or_update_workflow(work)?; - let (activation, more_tasks) = - self.access_machine(&run_id, |mgr| mgr.feed_history_from_server(history))?; - - if more_tasks { + if next_activation.more_activations_needed { self.pending_activations.push(PendingActivation { run_id, - task_token: work.task_token.clone(), + task_token: task_token.clone(), }); } Ok(Task { - task_token: work.task_token, - variant: activation.map(Into::into), + task_token, + variant: next_activation.activation.map(Into::into), }) } @@ -232,17 +213,50 @@ where } impl CoreSDK { - fn instantiate_workflow_if_needed(&self, workflow_execution: &WorkflowExecution) { - if self - .workflow_machines - .contains_key(&workflow_execution.run_id) + /// Will create a new workflow manager if needed for the workflow task, if not, it will + /// feed the existing manager the updated history we received from the server. + /// + /// Also updates [CoreSDK::workflow_task_tokens] and validates the + /// [PollWorkflowTaskQueueResponse] + /// + /// Returns the next workflow activation and the workflow's run id + fn instantiate_or_update_workflow( + &self, + poll_wf_resp: PollWorkflowTaskQueueResponse, + ) -> Result<(NextWfActivation, String)> { + if let PollWorkflowTaskQueueResponse { + task_token, + workflow_execution: Some(workflow_execution), + .. + } = &poll_wf_resp { - return; + let run_id = workflow_execution.run_id.clone(); + // Correlate task token w/ run ID + self.workflow_task_tokens + .insert(task_token.clone(), run_id.clone()); + + match self.workflow_machines.entry(run_id.clone()) { + Entry::Occupied(mut existing) => { + if let Some(history) = poll_wf_resp.history { + let activation = existing + .get_mut() + .lock()? 
+ .feed_history_from_server(history)?; + Ok((activation, run_id)) + } else { + Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) + } + } + Entry::Vacant(vacant) => { + let wfm = WorkflowManager::new(poll_wf_resp)?; + let activation = wfm.lock()?.get_next_activation()?; + vacant.insert(wfm); + Ok((activation, run_id)) + } + } + } else { + Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) } - self.workflow_machines.insert( - workflow_execution.run_id.clone(), - WorkflowManager::new(workflow_execution), - ); } /// Feed commands from the lang sdk into the appropriate workflow bridge @@ -281,8 +295,6 @@ impl CoreSDK { #[allow(clippy::large_enum_variant)] // NOTE: Docstrings take the place of #[error("xxxx")] here b/c of displaydoc pub enum CoreError { - /// Unknown service error - Unknown, /// No tasks to perform for now NoWork, /// Poll response from server was malformed: {0:?} @@ -292,7 +304,7 @@ pub enum CoreError { /// Error buffering commands CantSendCommands(#[from] SendError>), /// Couldn't interpret command from - UninterprableCommand(#[from] InconvertibleCommandError), + UninterpretableCommand(#[from] InconvertibleCommandError), /// Underlying error in history processing UnderlyingHistError(#[from] HistoryInfoError), /// Task token had nothing associated with it: {0:?} diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index a6a87548d..abad68299 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -43,9 +43,9 @@ pub(crate) struct WorkflowMachines { /// True if the workflow is replaying from history replaying: bool, /// Workflow identifier - workflow_id: String, + pub workflow_id: String, /// Identifies the current run and is used as a seed for faux-randomness. - run_id: String, + pub run_id: String, /// The current workflow time if it has been established current_wf_time: Option, diff --git a/src/workflow/mod.rs b/src/workflow/mod.rs index 8b638045c..8b03c5588 100644 --- a/src/workflow/mod.rs +++ b/src/workflow/mod.rs @@ -7,7 +7,6 @@ use crate::{ protos::{ coresdk::WfActivation, temporal::api::{ - common::v1::WorkflowExecution, history::v1::History, workflowservice::v1::{ PollWorkflowTaskQueueResponse, RespondWorkflowTaskCompletedResponse, @@ -74,27 +73,38 @@ pub(crate) struct WfManagerProtected { /// The last recorded history we received from the server for this workflow run. This must be /// kept because the lang side polls & completes for every workflow task, but we do not need /// to poll the server that often during replay. - pub last_history_from_server: Option, - pub last_history_task_count: Option, + pub last_history_from_server: History, + pub last_history_task_count: usize, /// The current workflow task number this run is on. Starts at one and monotonically increases. pub current_wf_task_num: usize, } impl WorkflowManager { - pub fn new(we: &WorkflowExecution) -> Self { + /// Create a new workflow manager from a server workflow task queue response. + pub fn new(poll_resp: PollWorkflowTaskQueueResponse) -> Result { + let (history, we) = if let PollWorkflowTaskQueueResponse { + workflow_execution: Some(we), + history: Some(hist), + .. 
+ } = poll_resp + { + (hist, we) + } else { + return Err(CoreError::BadDataFromWorkProvider(poll_resp.clone())); + }; + let (wfb, cmd_sink) = WorkflowBridge::new(); - let state_machines = - WorkflowMachines::new(we.workflow_id.clone(), we.run_id.clone(), Box::new(wfb)); + let state_machines = WorkflowMachines::new(we.workflow_id, we.run_id, Box::new(wfb)); let protected = WfManagerProtected { machines: state_machines, command_sink: cmd_sink, - last_history_from_server: None, - last_history_task_count: None, + last_history_task_count: history.get_workflow_task_count(None)?, + last_history_from_server: history, current_wf_task_num: 1, }; - Self { + Ok(Self { data: Arc::new(Mutex::new(protected)), - } + }) } pub fn lock(&self) -> Result + '_> { @@ -108,6 +118,11 @@ impl WorkflowManager { } } +pub(crate) struct NextWfActivation { + pub activation: Option, + pub more_activations_needed: bool, +} + impl WfManagerProtected { /// Given history that was just obtained from the server, pipe it into this workflow's machines. /// @@ -115,14 +130,11 @@ impl WfManagerProtected { /// activation if one is needed, as well as a bool indicating if there are more workflow tasks /// that need to be performed to replay the remaining history. #[instrument(skip(self))] - pub fn feed_history_from_server( - &mut self, - hist: History, - ) -> Result<(Option, bool)> { + pub fn feed_history_from_server(&mut self, hist: History) -> Result { let task_hist = HistoryInfo::new_from_history(&hist, Some(self.current_wf_task_num))?; let task_ct = hist.get_workflow_task_count(None)?; - self.last_history_task_count = Some(task_ct); - self.last_history_from_server = Some(hist); + self.last_history_task_count = task_ct; + self.last_history_from_server = hist; task_hist.apply_history_events(&mut self.machines)?; let activation = self.machines.get_wf_activation(); let more_activations_needed = task_ct > self.current_wf_task_num; @@ -132,25 +144,26 @@ impl WfManagerProtected { } self.current_wf_task_num += 1; - Ok((activation, more_activations_needed)) + + Ok(NextWfActivation { + activation, + more_activations_needed, + }) } - pub fn get_next_activation(&mut self) -> Result<(Option, bool)> { - self.current_wf_task_num += 1; - // TODO: Proper errors - let hist = self - .last_history_from_server - .as_ref() - .ok_or_else(|| CoreError::Unknown)?; + pub fn get_next_activation(&mut self) -> Result { + let hist = &self.last_history_from_server; let task_hist = HistoryInfo::new_from_history(hist, Some(self.current_wf_task_num))?; task_hist.apply_history_events(&mut self.machines)?; let activation = self.machines.get_wf_activation(); - let more_activations_needed = self.current_wf_task_num - <= self - .last_history_task_count - .ok_or_else(|| CoreError::Unknown)?; - Ok((activation, more_activations_needed)) + self.current_wf_task_num += 1; + let more_activations_needed = self.current_wf_task_num <= self.last_history_task_count; + + Ok(NextWfActivation { + activation, + more_activations_needed, + }) } } @@ -163,6 +176,6 @@ mod tests { // Enforce thread-safeness of wf manager #[test] fn is_threadsafe() { - enforcer(WorkflowManager::new(&Default::default())); + enforcer(WorkflowManager::new(Default::default())); } } diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 4c1616665..9e3ca7291 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -44,19 +44,19 @@ fn timer_workflow() { let timer_id: String = rng.gen::().to_string(); let task = 
dbg!(core.poll_task(TASK_QUEUE).unwrap());
 core.complete_task(CompleteTaskReq::ok_from_api_attrs(
- StartTimerCommandAttributes {
+ vec![StartTimerCommandAttributes {
 timer_id: timer_id.to_string(),
 start_to_fire_timeout: Some(Duration::from_secs(1).into()),
 ..Default::default()
 }
- .into(),
+ .into()],
 task.task_token,
 ))
 .unwrap();
 dbg!("sent completion w/ start timer");
 let task = dbg!(core.poll_task(TASK_QUEUE).unwrap());
 core.complete_task(CompleteTaskReq::ok_from_api_attrs(
- CompleteWorkflowExecutionCommandAttributes { result: None }.into(),
+ vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()],
 task.task_token,
 ))
 .unwrap();
@@ -89,19 +89,19 @@ fn parallel_timer_workflow() {
 let timer_id: String = rng.gen::<u32>().to_string();
 let task = dbg!(core.poll_task(TASK_QUEUE).unwrap());
 core.complete_task(CompleteTaskReq::ok_from_api_attrs(
- StartTimerCommandAttributes {
+ vec![StartTimerCommandAttributes {
 timer_id: timer_id.to_string(),
 start_to_fire_timeout: Some(Duration::from_secs(1).into()),
 ..Default::default()
 }
- .into(),
+ .into()],
 task.task_token,
 ))
 .unwrap();
 dbg!("sent completion w/ start timer");
 let task = dbg!(core.poll_task(TASK_QUEUE).unwrap());
 core.complete_task(CompleteTaskReq::ok_from_api_attrs(
- CompleteWorkflowExecutionCommandAttributes { result: None }.into(),
+ vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()],
 task.task_token,
 ))
 .unwrap();
From fed36fb3410d4324a83c79011563ddf5a5ca7661 Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Thu, 11 Feb 2021 13:31:29 -0800
Subject: [PATCH 08/51] Make parallel timer integ test actually test two
 timers

---
 src/protos/mod.rs | 1 -
 tests/integ_tests/simple_wf_tests.rs | 65 ++++++++++++++++++----------
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/src/protos/mod.rs b/src/protos/mod.rs
index f4c71801e..6f93e48ad 100644
--- a/src/protos/mod.rs
+++ b/src/protos/mod.rs
@@ -26,7 +26,6 @@ pub mod coresdk {
 }

 /// Returns any contained jobs if this task was a wf activation and it had some
- #[cfg(test)]
 pub fn get_wf_jobs(&self) -> Vec<WfActivationJob> {
 if let Some(task::Variant::Workflow(a)) = &self.variant {
 a.jobs.clone()
diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs
index 9e3ca7291..4ab4246c7 100644
--- a/tests/integ_tests/simple_wf_tests.rs
+++ b/tests/integ_tests/simple_wf_tests.rs
@@ -1,8 +1,9 @@
+use assert_matches::assert_matches;
 use rand::{self, Rng};
 use std::{convert::TryFrom, env, time::Duration};
 use temporal_sdk_core::{
 protos::{
- coresdk::CompleteTaskReq,
+ coresdk::{wf_activation_job, CompleteTaskReq, TimerFiredTaskAttributes, WfActivationJob},
 temporal::api::command::v1::{
 CompleteWorkflowExecutionCommandAttributes, StartTimerCommandAttributes,
 },
@@ -40,9 +41,9 @@ fn timer_workflow() {
 let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap();
 let mut rng = rand::thread_rng();
 let workflow_id: u32 = rng.gen();
- let run_id = dbg!(create_workflow(&core, &workflow_id.to_string()));
+ dbg!(create_workflow(&core, &workflow_id.to_string()));
 let timer_id: String = rng.gen::<u32>().to_string();
- let task = dbg!(core.poll_task(TASK_QUEUE).unwrap());
+ let task = core.poll_task(TASK_QUEUE).unwrap();
- dbg!("sent completion w/ start timer");
 let task = dbg!(core.poll_task(TASK_QUEUE).unwrap());
core.complete_task(CompleteTaskReq::ok_from_api_attrs( vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], task.task_token, )) .unwrap(); - dbg!( - "sent workflow done, completed workflow", - workflow_id, - run_id - ); } -// TODO: Actually make this different from serial #[test] fn parallel_timer_workflow() { let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { @@ -86,28 +80,51 @@ fn parallel_timer_workflow() { let mut rng = rand::thread_rng(); let workflow_id: u32 = rng.gen(); let run_id = dbg!(create_workflow(&core, &workflow_id.to_string())); - let timer_id: String = rng.gen::().to_string(); + let timer_id = "timer 1".to_string(); + let timer_2_id = "timer 2".to_string(); let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); core.complete_task(CompleteTaskReq::ok_from_api_attrs( - vec![StartTimerCommandAttributes { - timer_id: timer_id.to_string(), - start_to_fire_timeout: Some(Duration::from_secs(1).into()), - ..Default::default() - } - .into()], + vec![ + StartTimerCommandAttributes { + timer_id: timer_id.clone(), + start_to_fire_timeout: Some(Duration::from_millis(100).into()), + ..Default::default() + } + .into(), + StartTimerCommandAttributes { + timer_id: timer_2_id.clone(), + start_to_fire_timeout: Some(Duration::from_millis(200).into()), + ..Default::default() + } + .into(), + ], task.task_token, )) .unwrap(); - dbg!("sent completion w/ start timer"); - let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); + // Wait long enough for both timers to complete + std::thread::sleep(Duration::from_millis(400)); + let task = core.poll_task(TASK_QUEUE).unwrap(); + assert_matches!( + task.get_wf_jobs().as_slice(), + [ + WfActivationJob { + attributes: Some(wf_activation_job::Attributes::TimerFired( + TimerFiredTaskAttributes { timer_id: t1_id } + )), + }, + WfActivationJob { + attributes: Some(wf_activation_job::Attributes::TimerFired( + TimerFiredTaskAttributes { timer_id: t2_id } + )), + } + ] => { + assert_eq!(t1_id, &timer_id); + assert_eq!(t2_id, &timer_2_id); + } + ); core.complete_task(CompleteTaskReq::ok_from_api_attrs( vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], task.task_token, )) .unwrap(); - dbg!( - "sent workflow done, completed workflow", - workflow_id, - run_id - ); } From 121977afe35ea6bc34a88f732c4d96c1601bfc7b Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 11 Feb 2021 13:43:11 -0800 Subject: [PATCH 09/51] Clippy lints --- src/lib.rs | 12 +++++++----- src/machines/mod.rs | 23 ++++++++++------------- src/machines/workflow_machines.rs | 9 ++------- src/pollers/mod.rs | 1 + tests/integ_tests/poller_test.rs | 1 + 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e0b76f065..e5e31e940 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -329,7 +329,9 @@ mod test { use crate::{ machines::test_help::{build_fake_core, TestHistoryBuilder}, protos::{ - coresdk::{wf_activation_job, TimerFiredTaskAttributes, WfActivationJob}, + coresdk::{ + wf_activation_job, TaskCompletion, TimerFiredTaskAttributes, WfActivationJob, + }, temporal::api::{ command::v1::{ CompleteWorkflowExecutionCommandAttributes, StartTimerCommandAttributes, @@ -384,7 +386,7 @@ mod test { assert!(core.workflow_machines.get(run_id).is_some()); let task_tok = res.task_token; - core.complete_task(CompleteTaskReq::ok_from_api_attrs( + core.complete_task(TaskCompletion::ok_from_api_attrs( vec![StartTimerCommandAttributes { timer_id, ..Default::default() @@ -403,7 +405,7 @@ mod test { }] ); let task_tok = 
res.task_token;
- core.complete_task(CompleteTaskReq::ok_from_api_attrs(
+ core.complete_task(TaskCompletion::ok_from_api_attrs(
 vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()],
 task_tok,
 ))
 .unwrap();
@@ -503,7 +505,7 @@
 }
 );
 let task_tok = res.task_token;
- core.complete_task(CompleteTaskReq::ok_from_api_attrs(
+ core.complete_task(TaskCompletion::ok_from_api_attrs(
 vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()],
 task_tok,
 ))
 .unwrap();
@@ -546,7 +548,7 @@
 assert!(core.workflow_machines.get(run_id).is_some());

 let task_tok = res.task_token;
- core.complete_task(CompleteTaskReq::ok_from_api_attrs(
+ core.complete_task(TaskCompletion::ok_from_api_attrs(
 vec![StartTimerCommandAttributes {
 timer_id: timer_1_id,
 ..Default::default()
 }
diff --git a/src/machines/mod.rs b/src/machines/mod.rs
index 0c9b261e2..57f771939 100644
--- a/src/machines/mod.rs
+++ b/src/machines/mod.rs
@@ -120,20 +120,17 @@ impl TryFrom<coresdk::Command> for WFCommand
 fn try_from(c: coresdk::Command) -> Result<Self, Self::Error> {
 // TODO: Return error without cloning
 match c.variant.clone() {
- Some(a) => match a {
- Variant::Api(Command {
- attributes: Some(attrs),
- ..
- }) => match attrs {
- Attributes::StartTimerCommandAttributes(s) => Ok(WFCommand::AddTimer(s)),
- Attributes::CompleteWorkflowExecutionCommandAttributes(c) => {
- Ok(WFCommand::CompleteWorkflow(c))
- }
- _ => unimplemented!(),
- },
- _ => Err(c.into()),
+ Some(Variant::Api(Command {
+ attributes: Some(attrs),
+ ..
+ })) => match attrs {
+ Attributes::StartTimerCommandAttributes(s) => Ok(WFCommand::AddTimer(s)),
+ Attributes::CompleteWorkflowExecutionCommandAttributes(c) => {
+ Ok(WFCommand::CompleteWorkflow(c))
+ }
+ _ => unimplemented!(),
 },
- None => Err(c.into()),
+ _ => Err(c.into()),
 }
 }
 }
diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs
index abad68299..0a34c6332 100644
--- a/src/machines/workflow_machines.rs
+++ b/src/machines/workflow_machines.rs
@@ -185,11 +185,7 @@ impl WorkflowMachines {
 /// Called when we want to run the event loop because a workflow task started event has
 /// triggered
- pub(super) fn task_started(
- &mut self,
- task_started_event_id: i64,
- time: SystemTime,
- ) -> Result<()> {
+ pub(super) fn task_started(&mut self, task_started_event_id: i64, time: SystemTime) {
 let s = span!(Level::DEBUG, "Task started trigger");
 let _enter = s.enter();

@@ -215,7 +211,6 @@
 self.current_started_event_id = task_started_event_id;
 self.set_current_time(time);
 self.event_loop();
- Ok(())
 }

 /// A command event is an event which is generated from a command emitted by a past decision.
@@ -413,7 +408,7 @@
 task_started_event_id,
 time,
 } => {
- self.task_started(task_started_event_id, time)?;
+ self.task_started(task_started_event_id, time);
 }
 }
 }
diff --git a/src/pollers/mod.rs b/src/pollers/mod.rs
index a03ff8281..f0a46b5e6 100644
--- a/src/pollers/mod.rs
+++ b/src/pollers/mod.rs
@@ -60,6 +60,7 @@ impl ServerGatewayOptions {
 /// This function will get called on each outbound request. Returning a
 /// `Status` here will cancel the request and have that status returned to
 /// the caller.
+#[allow(clippy::unnecessary_wraps)] // Clippy lies because we need to pass to `with_interceptor`
 fn intercept(mut req: Request<()>) -> Result<Request<()>, Status> {
 let metadata = req.metadata_mut();
 // TODO: Only apply this to long poll requests
diff --git a/tests/integ_tests/poller_test.rs b/tests/integ_tests/poller_test.rs
index e69de29bb..8b1378917 100644
--- a/tests/integ_tests/poller_test.rs
+++ b/tests/integ_tests/poller_test.rs
@@ -0,0 +1 @@
+
From c03b30486efd93163d626f5116995f4e4f12fb0a Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Thu, 11 Feb 2021 14:01:41 -0800
Subject: [PATCH 10/51] Fix a todo

---
 src/machines/mod.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/machines/mod.rs b/src/machines/mod.rs
index 57f771939..35473e12c 100644
--- a/src/machines/mod.rs
+++ b/src/machines/mod.rs
@@ -118,8 +118,7 @@ impl TryFrom<coresdk::Command> for WFCommand
 type Error = InconvertibleCommandError;

 fn try_from(c: coresdk::Command) -> Result<Self, Self::Error> {
- // TODO: Return error without cloning
- match c.variant.clone() {
+ match c.variant {
 Some(Variant::Api(Command {
 attributes: Some(attrs),
 ..
From 7300cd6936e94c1b447a7fdd968d2ced553d5641 Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Thu, 11 Feb 2021 14:03:19 -0800
Subject: [PATCH 11/51] Clean up uggo todos, it's clear we want to reduce test
 verbosity

---
 src/lib.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index e5e31e940..d0379593e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -376,7 +376,6 @@ mod test {
 let core = build_fake_core(wfid, run_id, &mut t, &[1, 2]);

 let res = core.poll_task(task_queue).unwrap();
- // TODO: uggo
 assert_matches!(
 res.get_wf_jobs().as_slice(),
 [WfActivationJob {
@@ -397,7 +396,6 @@ mod test {
 .unwrap();

 let res = core.poll_task(task_queue).unwrap();
- // TODO: uggo
 assert_matches!(
 res.get_wf_jobs().as_slice(),
 [WfActivationJob {
@@ -457,7 +455,6 @@ mod test {
 let core = build_fake_core(wfid, run_id, &mut t, &[1, 2]);

 let res = core.poll_task(task_queue).unwrap();
- // TODO: uggo
 assert_matches!(
 res.get_wf_jobs().as_slice(),
 [WfActivationJob {
@@ -485,7 +482,6 @@ mod test {
 .unwrap();

 let res = core.poll_task(task_queue).unwrap();
- // TODO: uggo
 assert_matches!(
 res.get_wf_jobs().as_slice(),
 [
@@ -559,7 +555,6 @@ mod test {
 .unwrap();

 let res = core.poll_task(task_queue).unwrap();
- // TODO: uggo
 assert_matches!(
 res.get_wf_jobs().as_slice(),
 [WfActivationJob {
From f722f3e58eb80729237cabd1643a14a1263b1685 Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Thu, 11 Feb 2021 14:28:43 -0800
Subject: [PATCH 12/51] Fix merge problem in integ tests

---
 tests/integ_tests/simple_wf_tests.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs
index 4ab4246c7..4aac503dc 100644
--- a/tests/integ_tests/simple_wf_tests.rs
+++ b/tests/integ_tests/simple_wf_tests.rs
@@ -3,7 +3,7 @@ use rand::{self, Rng};
 use std::{convert::TryFrom, env, time::Duration};
 use temporal_sdk_core::{
 protos::{
- coresdk::{wf_activation_job, CompleteTaskReq, TimerFiredTaskAttributes, WfActivationJob},
+ coresdk::{wf_activation_job, TaskCompletion, TimerFiredTaskAttributes, WfActivationJob},
 temporal::api::command::v1::{
 CompleteWorkflowExecutionCommandAttributes, StartTimerCommandAttributes,
 },
@@ -44,7 +44,7 @@ fn timer_workflow() {
 dbg!(create_workflow(&core, &workflow_id.to_string()));
 let timer_id: String = rng.gen::<u32>().to_string();
 let task = core.poll_task(TASK_QUEUE).unwrap();
- 
core.complete_task(CompleteTaskReq::ok_from_api_attrs( + core.complete_task(TaskCompletion::ok_from_api_attrs( vec![StartTimerCommandAttributes { timer_id: timer_id.to_string(), start_to_fire_timeout: Some(Duration::from_secs(1).into()), @@ -55,7 +55,7 @@ fn timer_workflow() { )) .unwrap(); let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); - core.complete_task(CompleteTaskReq::ok_from_api_attrs( + core.complete_task(TaskCompletion::ok_from_api_attrs( vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], task.task_token, )) @@ -79,11 +79,11 @@ fn parallel_timer_workflow() { let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap(); let mut rng = rand::thread_rng(); let workflow_id: u32 = rng.gen(); - let run_id = dbg!(create_workflow(&core, &workflow_id.to_string())); + dbg!(create_workflow(&core, &workflow_id.to_string())); let timer_id = "timer 1".to_string(); let timer_2_id = "timer 2".to_string(); let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); - core.complete_task(CompleteTaskReq::ok_from_api_attrs( + core.complete_task(TaskCompletion::ok_from_api_attrs( vec![ StartTimerCommandAttributes { timer_id: timer_id.clone(), @@ -122,7 +122,7 @@ fn parallel_timer_workflow() { assert_eq!(t2_id, &timer_2_id); } ); - core.complete_task(CompleteTaskReq::ok_from_api_attrs( + core.complete_task(TaskCompletion::ok_from_api_attrs( vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], task.task_token, )) From d59c394c9cab47750047828862b1f8e911bbf3c9 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 11 Feb 2021 14:43:28 -0800 Subject: [PATCH 13/51] Use separate task queues to enable running integ tests in parallel --- tests/integ_tests/simple_wf_tests.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 4aac503dc..e7d9f3f47 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -11,14 +11,17 @@ use temporal_sdk_core::{ Core, CoreInitOptions, ServerGatewayOptions, Url, }; -const TASK_QUEUE: &str = "test-tq"; +// TODO: These tests can get broken permanently if they break one time and the server is not +// restarted, because pulling from the same task queue produces tasks for the previous failed +// workflows. Fix that. 
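// A sketch of one possible fix (not part of this patch; the helper name is
// hypothetical): derive a unique task queue name per test invocation, so a
// previously failed run can never leave stale tasks behind for the next one.
// `rand::Rng` is already imported in this file.
fn unique_task_queue(base: &str) -> String {
    let suffix: u32 = rand::thread_rng().gen();
    format!("{}-{}", base, suffix)
}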
+ const NAMESPACE: &str = "default"; #[tokio::main] -async fn create_workflow(core: &dyn Core, workflow_id: &str) -> String { +async fn create_workflow(core: &dyn Core, task_q: &str, workflow_id: &str) -> String { core.server_gateway() .unwrap() - .start_workflow(NAMESPACE, TASK_QUEUE, workflow_id, "test-workflow") + .start_workflow(NAMESPACE, task_q, workflow_id, "test-workflow") .await .unwrap() .run_id @@ -26,6 +29,7 @@ async fn create_workflow(core: &dyn Core, workflow_id: &str) -> String { #[test] fn timer_workflow() { + let task_q = "timer_workflow"; let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { Ok(addr) => addr, Err(_) => "http://localhost:7233".to_owned(), @@ -41,9 +45,9 @@ fn timer_workflow() { let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap(); let mut rng = rand::thread_rng(); let workflow_id: u32 = rng.gen(); - dbg!(create_workflow(&core, &workflow_id.to_string())); + dbg!(create_workflow(&core, task_q, &workflow_id.to_string())); let timer_id: String = rng.gen::().to_string(); - let task = core.poll_task(TASK_QUEUE).unwrap(); + let task = core.poll_task(task_q).unwrap(); core.complete_task(TaskCompletion::ok_from_api_attrs( vec![StartTimerCommandAttributes { timer_id: timer_id.to_string(), @@ -54,7 +58,7 @@ fn timer_workflow() { task.task_token, )) .unwrap(); - let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); + let task = dbg!(core.poll_task(task_q).unwrap()); core.complete_task(TaskCompletion::ok_from_api_attrs( vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], task.task_token, @@ -64,6 +68,7 @@ fn timer_workflow() { #[test] fn parallel_timer_workflow() { + let task_q = "parallel_timer_workflow"; let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { Ok(addr) => addr, Err(_) => "http://localhost:7233".to_owned(), @@ -79,21 +84,21 @@ fn parallel_timer_workflow() { let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap(); let mut rng = rand::thread_rng(); let workflow_id: u32 = rng.gen(); - dbg!(create_workflow(&core, &workflow_id.to_string())); + dbg!(create_workflow(&core, task_q, &workflow_id.to_string())); let timer_id = "timer 1".to_string(); let timer_2_id = "timer 2".to_string(); - let task = dbg!(core.poll_task(TASK_QUEUE).unwrap()); + let task = dbg!(core.poll_task(task_q).unwrap()); core.complete_task(TaskCompletion::ok_from_api_attrs( vec![ StartTimerCommandAttributes { timer_id: timer_id.clone(), - start_to_fire_timeout: Some(Duration::from_millis(100).into()), + start_to_fire_timeout: Some(Duration::from_millis(50).into()), ..Default::default() } .into(), StartTimerCommandAttributes { timer_id: timer_2_id.clone(), - start_to_fire_timeout: Some(Duration::from_millis(200).into()), + start_to_fire_timeout: Some(Duration::from_millis(100).into()), ..Default::default() } .into(), @@ -103,7 +108,7 @@ fn parallel_timer_workflow() { .unwrap(); // Wait long enough for both timers to complete std::thread::sleep(Duration::from_millis(400)); - let task = core.poll_task(TASK_QUEUE).unwrap(); + let task = core.poll_task(task_q).unwrap(); assert_matches!( task.get_wf_jobs().as_slice(), [ From 600e82bac990ea351b8660b4e102dbb9e6c43b04 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 11 Feb 2021 16:18:27 -0800 Subject: [PATCH 14/51] Use explicit rust image version to avoid clippy inconsistencies --- .buildkite/docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/docker/Dockerfile b/.buildkite/docker/Dockerfile 
index f32421f6e..4fe468ca8 100644 --- a/.buildkite/docker/Dockerfile +++ b/.buildkite/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:latest +FROM rust:1.50 RUN rustup component add rustfmt && \ rustup component add clippy From e472fb3d2ec701549bcc592a51943349126fbb2f Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Fri, 12 Feb 2021 09:01:18 -0800 Subject: [PATCH 15/51] CI server needs a bit more time to fire timers under test --- tests/integ_tests/simple_wf_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index e7d9f3f47..2196140f7 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -107,7 +107,7 @@ fn parallel_timer_workflow() { )) .unwrap(); // Wait long enough for both timers to complete - std::thread::sleep(Duration::from_millis(400)); + std::thread::sleep(Duration::from_millis(800)); let task = core.poll_task(task_q).unwrap(); assert_matches!( task.get_wf_jobs().as_slice(), From edc325447df2a0f69fd363b8d8ff90c515cfb37c Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Fri, 12 Feb 2021 10:11:11 -0800 Subject: [PATCH 16/51] Simplify timer fsm code before proceeding --- protos/local/core_interface.proto | 8 +- src/machines/test_help/workflow_driver.rs | 18 ++- src/machines/timer_state_machine.rs | 136 ++++++++++++++-------- src/machines/workflow_machines.rs | 4 + 4 files changed, 111 insertions(+), 55 deletions(-) diff --git a/protos/local/core_interface.proto b/protos/local/core_interface.proto index f81dc63bd..f2ea50026 100644 --- a/protos/local/core_interface.proto +++ b/protos/local/core_interface.proto @@ -63,8 +63,10 @@ message WFActivationJob { StartWorkflowTaskAttributes start_workflow = 1; // A timer has fired, allowing whatever was waiting on it (if anything) to proceed TimerFiredTaskAttributes timer_fired = 2; + // A timer was canceled + TimerCanceledTaskAttributes timer_canceled = 3; - QueryWorkflowJob query_workflow = 3; + QueryWorkflowJob query_workflow = 4; } } @@ -84,6 +86,10 @@ message TimerFiredTaskAttributes { string timer_id = 1; } +message TimerCanceledTaskAttributes { + string timer_id = 1; +} + message QueryWorkflowJob { temporal.api.query.v1.WorkflowQuery query = 1; } diff --git a/src/machines/test_help/workflow_driver.rs b/src/machines/test_help/workflow_driver.rs index 670408595..b369b1fb4 100644 --- a/src/machines/test_help/workflow_driver.rs +++ b/src/machines/test_help/workflow_driver.rs @@ -23,6 +23,11 @@ use tracing::Level; /// over when commands are returned than a normal workflow would. /// /// It replaces "TestEnitityTestListenerBase" in java which is pretty hard to follow. +/// +/// It is important to understand that this driver doesn't work like a real workflow in the sense +/// that nothing in it ever blocks, or ever should block. Every workflow task will run through the +/// *entire* workflow, but any commands given to the sink after a `Waiting` command are simply +/// ignored, allowing you to simulate blocking without ever actually blocking. 
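/// A hypothetical usage sketch (the timer id and timeout are invented for
/// illustration): on the first pass through the closure, `timer(..., true)`
/// returns `false` and emits a `Waiting` marker; on the pass after the fire
/// event has been fed in, it returns `true` and the closure falls through to
/// workflow completion.
///
/// ```ignore
/// let twd = TestWorkflowDriver::new(|mut sink: CommandSender| async move {
///     let fired = sink.timer(
///         StartTimerCommandAttributes {
///             timer_id: "example_timer".to_string(),
///             start_to_fire_timeout: Some(Duration::from_secs(5).into()),
///         },
///         true, // issue a Waiting command if the timer hasn't fired yet
///     );
///     if fired {
///         sink.send(CompleteWorkflowExecutionCommandAttributes::default().into());
///     }
/// });
/// ```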
pub(in crate::machines) struct TestWorkflowDriver<F> {
 wf_function: F,
 cache: Arc<TestWfDriverCache>,
@@ -59,6 +64,7 @@ where
 impl<F> ActivationListener for TestWorkflowDriver<F> {
 fn on_activation_job(&mut self, activation: &Attributes) {
 if let Attributes::TimerFired(TimerFiredTaskAttributes { timer_id }) = activation {
+ dbg!(&timer_id);
 Arc::get_mut(&mut self.cache)
 .unwrap()
 .unblocked_timers
@@ -82,6 +88,8 @@ where
 // Call the closure that produces the workflow future
 let wf_future = (self.wf_function)(sender);

+ // TODO: This is pointless right now -- either actually use async and suspend on awaits
+ // or just remove it.
 // Create a tokio runtime to block on the future
 let rt = tokio::runtime::Runtime::new().unwrap();
 rt.block_on(wf_future);
@@ -134,8 +142,10 @@ impl CommandSender {
 (Self { chan, twd_cache }, rx)
 }

- /// Request to create a timer, returning future which resolves when the timer completes
- pub fn timer(&mut self, a: StartTimerCommandAttributes) -> impl Future<Output = ()> + '_ {
+ /// Request to create a timer. Returns true if the timer has fired, false if it hasn't yet.
+ ///
+ /// If `do_wait` is true, issue a waiting command if the timer is not finished.
+ pub fn timer(&mut self, a: StartTimerCommandAttributes, do_wait: bool) -> bool {
 let finished = match self.twd_cache.unblocked_timers.entry(a.timer_id.clone()) {
 dashmap::mapref::entry::Entry::Occupied(existing) => *existing.get(),
 dashmap::mapref::entry::Entry::Vacant(v) => {
@@ -145,10 +155,10 @@ impl CommandSender {
 false
 }
 };
- if !finished {
+ if !finished && do_wait {
 self.chan.send(TestWFCommand::Waiting).unwrap();
 }
- futures::future::ready(())
+ finished
 }

 pub fn send(&mut self, c: WFCommand) {
diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs
index e77b1d7c3..e2c434172 100644
--- a/src/machines/timer_state_machine.rs
+++ b/src/machines/timer_state_machine.rs
@@ -1,5 +1,6 @@
 #![allow(clippy::large_enum_variant)]

+use crate::protos::coresdk::TimerCanceledTaskAttributes;
 use crate::{
 machines::{
 workflow_machines::{WFMachinesError, WorkflowMachines, WorkflowTrigger},
 AddCommand, CancellableCommand, WFCommand, WFMachinesAdapter,
 },
 protos::{
 StartTimerCommandAttributes,
 },
 enums::v1::{CommandType, EventType},
- history::v1::{history_event, HistoryEvent, TimerCanceledEventAttributes},
+ history::v1::{
+ history_event, HistoryEvent, TimerCanceledEventAttributes,
+ TimerFiredEventAttributes,
+ },
 },
 },
 };
@@ -32,14 +36,14 @@ fsm! {
 StartCommandCreated --(CommandStartTimer) --> StartCommandCreated;
 StartCommandCreated --(TimerStarted(HistoryEventId), on_timer_started) --> StartCommandRecorded;
- StartCommandCreated --(Cancel, shared on_cancel) --> Canceled;
+ StartCommandCreated --(Cancel, on_cancel) --> Canceled;

- StartCommandRecorded --(TimerFired(HistoryEvent), on_timer_fired) --> Fired;
+ StartCommandRecorded --(TimerFired(TimerFiredEventAttributes), shared on_timer_fired) --> Fired;
 StartCommandRecorded --(Cancel, shared on_cancel) --> CancelTimerCommandCreated;

 CancelTimerCommandCreated --(Cancel) --> CancelTimerCommandCreated;
 CancelTimerCommandCreated
- --(CommandCancelTimer, shared on_command_cancel_timer) --> CancelTimerCommandSent;
+ --(CommandCancelTimer, on_command_cancel_timer) --> CancelTimerCommandSent;

 CancelTimerCommandSent --(TimerCanceled) --> Canceled;
 }

 fsm! 
{ #[derive(Debug)] pub(super) enum TimerMachineCommand { AddCommand(AddCommand), - Complete(HistoryEvent), + Complete, + Canceled, } /// Creates a new, scheduled, timer as a [CancellableCommand] @@ -77,9 +82,7 @@ impl TimerMachine { fn new(attribs: StartTimerCommandAttributes) -> Self { Self { state: Created {}.into(), - shared_state: SharedState { - timer_attributes: attribs, - }, + shared_state: SharedState { attrs: attribs }, } } } @@ -91,7 +94,15 @@ impl TryFrom for TimerMachineEvents { Ok(match EventType::from_i32(e.event_type) { Some(EventType::TimerStarted) => Self::TimerStarted(e.event_id), Some(EventType::TimerCanceled) => Self::TimerCanceled, - Some(EventType::TimerFired) => Self::TimerFired(e), + Some(EventType::TimerFired) => { + if let Some(history_event::Attributes::TimerFiredEventAttributes(attrs)) = + e.attributes + { + Self::TimerFired(attrs) + } else { + return Err(()); + } + } _ => return Err(()), }) } @@ -111,25 +122,7 @@ impl TryFrom for TimerMachineEvents { #[derive(Default, Clone)] pub(super) struct SharedState { - timer_attributes: StartTimerCommandAttributes, -} - -impl SharedState { - fn into_timer_canceled_event_command(self) -> TimerMachineCommand { - let attrs = TimerCanceledEventAttributes { - identity: "workflow".to_string(), - timer_id: self.timer_attributes.timer_id, - ..Default::default() - }; - let event = HistoryEvent { - event_type: EventType::TimerCanceled as i32, - attributes: Some(history_event::Attributes::TimerCanceledEventAttributes( - attrs, - )), - ..Default::default() - }; - TimerMachineCommand::Complete(event) - } + attrs: StartTimerCommandAttributes, } #[derive(Default, Clone)] @@ -139,7 +132,7 @@ impl Created { pub(super) fn on_schedule(self, dat: SharedState) -> TimerMachineTransition { let cmd = Command { command_type: CommandType::StartTimer as i32, - attributes: Some(dat.timer_attributes.into()), + attributes: Some(dat.attrs.into()), }; TimerMachineTransition::commands::<_, StartCommandCreated>(vec![ TimerMachineCommand::AddCommand(cmd.into()), @@ -150,11 +143,8 @@ impl Created { #[derive(Default, Clone)] pub(super) struct CancelTimerCommandCreated {} impl CancelTimerCommandCreated { - pub(super) fn on_command_cancel_timer(self, dat: SharedState) -> TimerMachineTransition { - TimerMachineTransition::ok( - vec![dat.into_timer_canceled_event_command()], - Canceled::default(), - ) + pub(super) fn on_command_cancel_timer(self) -> TimerMachineTransition { + TimerMachineTransition::ok(vec![TimerMachineCommand::Canceled], Canceled::default()) } } @@ -180,13 +170,8 @@ impl StartCommandCreated { // TODO: Java recorded an initial event ID, but it seemingly was never used. TimerMachineTransition::default::() } - pub(super) fn on_cancel(mut self, dat: SharedState) -> TimerMachineTransition { - // Cancel the initial command - which just sets a "canceled" flag in a wrapper of a - // proto command. TODO: Does this make any sense? 
- no - propagate up - TimerMachineTransition::ok( - vec![dat.into_timer_canceled_event_command()], - Canceled::default(), - ) + pub(super) fn on_cancel(mut self) -> TimerMachineTransition { + TimerMachineTransition::ok(vec![TimerMachineCommand::Canceled], Canceled::default()) } } @@ -194,15 +179,26 @@ impl StartCommandCreated { pub(super) struct StartCommandRecorded {} impl StartCommandRecorded { - pub(super) fn on_timer_fired(self, event: HistoryEvent) -> TimerMachineTransition { - TimerMachineTransition::ok(vec![TimerMachineCommand::Complete(event)], Fired::default()) + pub(super) fn on_timer_fired( + self, + dat: SharedState, + attrs: TimerFiredEventAttributes, + ) -> TimerMachineTransition { + if dat.attrs.timer_id != attrs.timer_id { + TimerMachineTransition::Err(WFMachinesError::MalformedEventDetail(format!( + "Timer fired event did not have expected timer id {}!", + dat.attrs.timer_id + ))) + } else { + TimerMachineTransition::ok(vec![TimerMachineCommand::Complete], Fired::default()) + } } pub(super) fn on_cancel(self, dat: SharedState) -> TimerMachineTransition { let cmd = Command { command_type: CommandType::CancelTimer as i32, attributes: Some( CancelTimerCommandAttributes { - timer_id: dat.timer_attributes.timer_id, + timer_id: dat.attrs.timer_id, } .into(), ), @@ -223,8 +219,12 @@ impl WFMachinesAdapter for TimerMachine { ) -> Result, WFMachinesError> { match my_command { // Fire the completion - TimerMachineCommand::Complete(_event) => Ok(vec![TimerFiredTaskAttributes { - timer_id: self.shared_state.timer_attributes.timer_id.clone(), + TimerMachineCommand::Complete => Ok(vec![TimerFiredTaskAttributes { + timer_id: self.shared_state.attrs.timer_id.clone(), + } + .into()]), + TimerMachineCommand::Canceled => Ok(vec![TimerCanceledTaskAttributes { + timer_id: self.shared_state.attrs.timer_id.clone(), } .into()]), TimerMachineCommand::AddCommand(_) => { @@ -254,8 +254,7 @@ mod test { }; use futures::{channel::mpsc::Sender, FutureExt, SinkExt}; use rstest::{fixture, rstest}; - use std::sync::Arc; - use std::{error::Error, time::Duration}; + use std::{error::Error, sync::Arc, time::Duration}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; #[fixture] @@ -281,10 +280,10 @@ mod test { */ let twd = TestWorkflowDriver::new(|mut command_sink: CommandSender| async move { let timer = StartTimerCommandAttributes { - timer_id: "Sometimer".to_string(), + timer_id: "timer1".to_string(), start_to_fire_timeout: Some(Duration::from_secs(5).into()), }; - command_sink.timer(timer).await; + command_sink.timer(timer, true); let complete = CompleteWorkflowExecutionCommandAttributes::default(); command_sink.send(complete.into()); @@ -349,4 +348,41 @@ mod test { CommandType::CompleteWorkflowExecution as i32 ); } + + #[test] + fn cancellation() { + let twd = TestWorkflowDriver::new(|mut command_sink: CommandSender| async move { + let timer = StartTimerCommandAttributes { + timer_id: "timer1".to_string(), + start_to_fire_timeout: Some(Duration::from_secs(5).into()), + }; + let cancel_timer = StartTimerCommandAttributes { + timer_id: "cancel_timer".to_string(), + start_to_fire_timeout: Some(Duration::from_secs(500).into()), + }; + let cancel_this = command_sink.timer(cancel_timer, false); + command_sink.timer(timer, true); + // cancel_this.cancel(); + + let complete = CompleteWorkflowExecutionCommandAttributes::default(); + command_sink.send(complete.into()); + }); + + let mut t = TestHistoryBuilder::default(); + let mut state_machines = + WorkflowMachines::new("wfid".to_string(), 
"runid".to_string(), Box::new(twd)); + + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_workflow_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_started_event_id, + timer_id: "timer1".to_string(), + }), + ); + t.add_workflow_task_scheduled_and_started(); + assert_eq!(2, t.as_history().get_workflow_task_count(None).unwrap()); + } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 0a34c6332..ada963f77 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -84,6 +84,10 @@ pub enum WFMachinesError { UnexpectedEvent(HistoryEvent), #[error("Event {0:?} was malformed: {1}")] MalformedEvent(HistoryEvent, String), + // Expected to be transformed into a `MalformedEvent` with the full event by workflow machines, + // when emitted by a sub-machine + #[error("{0}")] + MalformedEventDetail(String), #[error("Command type {0:?} was not expected")] UnexpectedCommand(CommandType), #[error("No command was scheduled for event {0:?}")] From b7499e5a2ba161aa5e9a04a6c1aa34c3e30c896b Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Fri, 12 Feb 2021 10:13:52 -0800 Subject: [PATCH 17/51] Ensure wf machines attaches full event to malformed details --- src/machines/workflow_machines.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index ada963f77..09898c2a7 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -400,7 +400,13 @@ impl WorkflowMachines { event: &HistoryEvent, has_next_event: bool, ) -> Result<()> { - let triggers = sm.handle_event(event, has_next_event)?; + let triggers = sm.handle_event(event, has_next_event).map_err(|e| { + if let WFMachinesError::MalformedEventDetail(s) = e { + WFMachinesError::MalformedEvent(event.clone(), s) + } else { + e + } + })?; event!(Level::DEBUG, msg = "Machine produced triggers", ?triggers); for trigger in triggers { match trigger { From 8e8aac647205dbc940b82a636fa2a6803411adaa Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Fri, 12 Feb 2021 10:35:28 -0800 Subject: [PATCH 18/51] Test for mismatched id error --- src/machines/timer_state_machine.rs | 32 +++++++++++++++++++++++++++++ src/protosext/history_info.rs | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index e2c434172..84c194cc4 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -349,6 +349,38 @@ mod test { ); } + #[test] + fn mismatched_timer_ids_errors() { + let twd = TestWorkflowDriver::new(|mut command_sink: CommandSender| async move { + let timer = StartTimerCommandAttributes { + timer_id: "realid".to_string(), + start_to_fire_timeout: Some(Duration::from_secs(5).into()), + }; + command_sink.timer(timer, true); + }); + + let mut t = TestHistoryBuilder::default(); + let mut state_machines = + WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); + + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_workflow_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: 
timer_started_event_id,
 timer_id: "badid".to_string(),
 }),
 );
 t.add_workflow_task_scheduled_and_started();
 assert!(t
 .handle_workflow_task_take_cmds(&mut state_machines, None)
 .unwrap_err()
 .to_string()
 .contains("Timer fired event did not have expected timer id realid!"))
 }

 #[test]
 fn cancellation() {
 let twd = TestWorkflowDriver::new(|mut command_sink: CommandSender| async move {
diff --git a/src/protosext/history_info.rs b/src/protosext/history_info.rs
index 4ef36537b..870af3743 100644
--- a/src/protosext/history_info.rs
+++ b/src/protosext/history_info.rs
@@ -24,7 +24,7 @@ pub enum HistoryInfoError {
 FailedOrTimeout(HistoryEvent),
 #[error("Last item in history wasn't WorkflowTaskStarted")]
 HistoryEndsUnexpectedly,
- #[error("Underlying error in workflow machine")]
+ #[error("Underlying error in workflow machine: {0:?}")]
 UnderlyingMachineError(#[from] WFMachinesError),
 }

From d9d516789c7e6f5a2a5c1083c5a2fe5961d01004 Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Fri, 12 Feb 2021 16:50:43 -0800
Subject: [PATCH 19/51] Some various error enhancements

---
 .gitignore | 1 +
 fsm/Cargo.toml | 2 --
 .../complete_workflow_state_machine.rs | 9 ++++--
 src/machines/mod.rs | 29 +++++++++----------
 src/machines/timer_state_machine.rs | 14 +++++++--
 src/machines/workflow_machines.rs | 16 ++++++----
 src/machines/workflow_task_state_machine.rs | 15 ++++++++--
 7 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/.gitignore b/.gitignore
index e8609ab8c..07b4d6d81 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ Cargo.lock
 # Ignore generated protobuf files
 src/protos/*.rs
 !src/protos/mod.rs
+/tarpaulin-report.html
diff --git a/fsm/Cargo.toml b/fsm/Cargo.toml
index 534af9f77..291343391 100644
--- a/fsm/Cargo.toml
+++ b/fsm/Cargo.toml
@@ -7,7 +7,5 @@ license = "MIT"
 license-file = "LICENSE.txt"

 [dependencies]
-thiserror = "1.0"
-derive_more = "0.99"
 state_machine_procmacro = { path = "state_machine_procmacro" }
 state_machine_trait = { path = "state_machine_trait" }
diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs
index 6bc3266d2..b48b939ad 100644
--- a/src/machines/complete_workflow_state_machine.rs
+++ b/src/machines/complete_workflow_state_machine.rs
@@ -67,12 +67,17 @@ impl CompleteWorkflowMachine {
 }

 impl TryFrom<HistoryEvent> for CompleteWorkflowMachineEvents {
- type Error = ();
+ type Error = WFMachinesError;

 fn try_from(e: HistoryEvent) -> Result<Self, Self::Error> {
 Ok(match EventType::from_i32(e.event_type) {
 Some(EventType::WorkflowExecutionCompleted) => Self::WorkflowExecutionCompleted,
- _ => return Err(()),
+ _ => {
+ return Err(WFMachinesError::UnexpectedEvent(
+ e,
+ "Complete workflow machine does not handle this event",
+ ))
+ }
 })
 }
 }
diff --git a/src/machines/mod.rs b/src/machines/mod.rs
index 35473e12c..eec921114 100644
--- a/src/machines/mod.rs
+++ b/src/machines/mod.rs
@@ -155,6 +155,7 @@ where
 SM: StateMachine + CheckStateMachineInFinal + WFMachinesAdapter + Clone + Send,
 <SM as StateMachine>::Event: TryFrom<HistoryEvent>,
 <SM as StateMachine>::Event: TryFrom<CommandType>,
+ WFMachinesError: From<<<SM as StateMachine>::Event as TryFrom<HistoryEvent>>::Error>,
 <SM as StateMachine>::Command: Debug,
 <SM as StateMachine>::Error: Into<WFMachinesError> + 'static + Send + Sync,
 {
@@ -193,23 +194,21 @@ where
 %event,
 machine_name = %self.name()
 );
- if let Ok(converted_event) = event.clone().try_into() {
- match self.on_event_mut(converted_event) {
- Ok(c) => {
- event!(Level::DEBUG, msg = "Machine produced commands", ?c);
- let mut triggers = vec![];
- for cmd in c {
- triggers.extend(self.adapt_response(event, has_next_event, cmd)?);
- }
- Ok(triggers)
+ let 
converted_event = event.clone().try_into()?; + match self.on_event_mut(converted_event) { + Ok(c) => { + event!(Level::DEBUG, msg = "Machine produced commands", ?c); + let mut triggers = vec![]; + for cmd in c { + triggers.extend(self.adapt_response(event, has_next_event, cmd)?); } - Err(MachineError::InvalidTransition) => { - Err(WFMachinesError::UnexpectedEvent(event.clone())) - } - Err(MachineError::Underlying(e)) => Err(e.into()), + Ok(triggers) } - } else { - Err(WFMachinesError::UnexpectedEvent(event.clone())) + Err(MachineError::InvalidTransition) => Err(WFMachinesError::UnexpectedEvent( + event.clone(), + "The handling machine says the transition is invalid", + )), + Err(MachineError::Underlying(e)) => Err(e.into()), } } } diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 84c194cc4..c15b0962f 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -88,7 +88,7 @@ impl TimerMachine { } impl TryFrom for TimerMachineEvents { - type Error = (); + type Error = WFMachinesError; fn try_from(e: HistoryEvent) -> Result { Ok(match EventType::from_i32(e.event_type) { @@ -100,10 +100,18 @@ impl TryFrom for TimerMachineEvents { { Self::TimerFired(attrs) } else { - return Err(()); + return Err(WFMachinesError::MalformedEvent( + e, + "Timer fired attribs were unset".to_string(), + )); } } - _ => return Err(()), + _ => { + return Err(WFMachinesError::UnexpectedEvent( + e, + "Timer machine does not handle this event", + )) + } }) } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 09898c2a7..4d145d3ea 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -80,8 +80,8 @@ pub(super) enum WorkflowTrigger { #[derive(thiserror::Error, Debug)] pub enum WFMachinesError { - #[error("Event {0:?} was not expected")] - UnexpectedEvent(HistoryEvent), + #[error("Event {0:?} was not expected: {1}")] + UnexpectedEvent(HistoryEvent, &'static str), #[error("Event {0:?} was malformed: {1}")] MalformedEvent(HistoryEvent, String), // Expected to be transformed into a `MalformedEvent` with the full event by workflow machines, @@ -147,8 +147,9 @@ impl WorkflowMachines { self.handle_command_event(event)?; return Ok(()); } - let event_type = EventType::from_i32(event.event_type) - .ok_or_else(|| WFMachinesError::UnexpectedEvent(event.clone()))?; + let event_type = EventType::from_i32(event.event_type).ok_or_else(|| { + WFMachinesError::UnexpectedEvent(event.clone(), "The event type is unknown") + })?; if self.replaying && self.current_started_event_id >= self.previous_started_event_id @@ -325,7 +326,12 @@ impl WorkflowMachines { Some(EventType::WorkflowExecutionCancelRequested) => { // TODO: Cancel callbacks } - _ => return Err(WFMachinesError::UnexpectedEvent(event.clone())), + _ => { + return Err(WFMachinesError::UnexpectedEvent( + event.clone(), + "The event is non a non-stateful event, but we tried to handle it as one", + )) + } } Ok(()) } diff --git a/src/machines/workflow_task_state_machine.rs b/src/machines/workflow_task_state_machine.rs index 5a2ea6698..a8e494d8d 100644 --- a/src/machines/workflow_task_state_machine.rs +++ b/src/machines/workflow_task_state_machine.rs @@ -63,8 +63,12 @@ impl WFMachinesAdapter for WorkflowTaskMachine { task_started_event_id, time, } => { - let event_type = EventType::from_i32(event.event_type) - .ok_or_else(|| WFMachinesError::UnexpectedEvent(event.clone()))?; + let event_type = 
EventType::from_i32(event.event_type).ok_or_else(|| { + WFMachinesError::UnexpectedEvent( + event.clone(), + "WfTask machine could not interpret event type", + ) + })?; let cur_event_past_or_at_start = event.event_id >= task_started_event_id; if event_type == EventType::WorkflowTaskStarted && (!cur_event_past_or_at_start || has_next_event) @@ -100,7 +104,12 @@ impl TryFrom for WorkflowTaskMachineEvents { Some(EventType::WorkflowTaskTimedOut) => Self::WorkflowTaskTimedOut, Some(EventType::WorkflowTaskCompleted) => Self::WorkflowTaskCompleted, Some(EventType::WorkflowTaskFailed) => Self::WorkflowTaskFailed, - _ => return Err(WFMachinesError::UnexpectedEvent(e)), + _ => { + return Err(WFMachinesError::UnexpectedEvent( + e, + "Event does not apply to a wf task machine", + )) + } }) } } From 5c7901fe4bd53dea7089d551b959b816baaba5ce Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 16 Feb 2021 14:40:55 -0800 Subject: [PATCH 20/51] Rename WorkflowTrigger -> MachineResponse --- src/machines/complete_workflow_state_machine.rs | 4 ++-- src/machines/mod.rs | 8 ++++---- src/machines/timer_state_machine.rs | 4 ++-- src/machines/workflow_machines.rs | 6 +++--- src/machines/workflow_task_state_machine.rs | 6 +++--- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs index b48b939ad..0c2f6a835 100644 --- a/src/machines/complete_workflow_state_machine.rs +++ b/src/machines/complete_workflow_state_machine.rs @@ -1,4 +1,4 @@ -use crate::machines::workflow_machines::WorkflowTrigger; +use crate::machines::workflow_machines::MachineResponse; use crate::{ machines::{ workflow_machines::WorkflowMachines, AddCommand, CancellableCommand, WFCommand, @@ -132,7 +132,7 @@ impl WFMachinesAdapter for CompleteWorkflowMachine { _event: &HistoryEvent, _has_next_event: bool, _my_command: CompleteWFCommand, - ) -> Result, WFMachinesError> { + ) -> Result, WFMachinesError> { Ok(vec![]) } } diff --git a/src/machines/mod.rs b/src/machines/mod.rs index eec921114..fc29e4cdf 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -38,7 +38,7 @@ pub(crate) mod test_help; pub(crate) use workflow_machines::{WFMachinesError, WorkflowMachines}; use crate::{ - machines::workflow_machines::WorkflowTrigger, + machines::workflow_machines::MachineResponse, protos::{ coresdk::{self, command::Variant, wf_activation_job}, temporal::api::{ @@ -147,7 +147,7 @@ trait TemporalStateMachine: CheckStateMachineInFinal + Send { &mut self, event: &HistoryEvent, has_next_event: bool, - ) -> Result, WFMachinesError>; + ) -> Result, WFMachinesError>; } impl TemporalStateMachine for SM @@ -187,7 +187,7 @@ where &mut self, event: &HistoryEvent, has_next_event: bool, - ) -> Result, WFMachinesError> { + ) -> Result, WFMachinesError> { event!( Level::DEBUG, msg = "handling event", @@ -240,7 +240,7 @@ trait WFMachinesAdapter: StateMachine { event: &HistoryEvent, has_next_event: bool, my_command: Self::Command, - ) -> Result, WFMachinesError>; + ) -> Result, WFMachinesError>; } /// A command which can be cancelled, associated with the state machine that produced it diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index c15b0962f..c7e0294a9 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -3,7 +3,7 @@ use crate::protos::coresdk::TimerCanceledTaskAttributes; use crate::{ machines::{ - workflow_machines::{WFMachinesError, WorkflowMachines, WorkflowTrigger}, + 
workflow_machines::{MachineResponse, WFMachinesError, WorkflowMachines}, AddCommand, CancellableCommand, WFCommand, WFMachinesAdapter, }, protos::{ @@ -224,7 +224,7 @@ impl WFMachinesAdapter for TimerMachine { _event: &HistoryEvent, _has_next_event: bool, my_command: TimerMachineCommand, - ) -> Result, WFMachinesError> { + ) -> Result, WFMachinesError> { match my_command { // Fire the completion TimerMachineCommand::Complete => Ok(vec![TimerFiredTaskAttributes { diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 4d145d3ea..8a6c998cc 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -70,7 +70,7 @@ pub(crate) struct WorkflowMachines { /// Returned by [TemporalStateMachine]s when handling events #[derive(Debug, derive_more::From)] #[must_use] -pub(super) enum WorkflowTrigger { +pub(super) enum MachineResponse { PushWFJob(#[from(forward)] wf_activation_job::Attributes), TriggerWFTaskStarted { task_started_event_id: i64, @@ -416,11 +416,11 @@ impl WorkflowMachines { event!(Level::DEBUG, msg = "Machine produced triggers", ?triggers); for trigger in triggers { match trigger { - WorkflowTrigger::PushWFJob(a) => { + MachineResponse::PushWFJob(a) => { self.drive_me.on_activation_job(&a); self.outgoing_wf_activation_jobs.push_back(a); } - WorkflowTrigger::TriggerWFTaskStarted { + MachineResponse::TriggerWFTaskStarted { task_started_event_id, time, } => { diff --git a/src/machines/workflow_task_state_machine.rs b/src/machines/workflow_task_state_machine.rs index a8e494d8d..a2c64c22d 100644 --- a/src/machines/workflow_task_state_machine.rs +++ b/src/machines/workflow_task_state_machine.rs @@ -1,6 +1,6 @@ #![allow(clippy::enum_variant_names)] -use crate::machines::workflow_machines::WorkflowTrigger; +use crate::machines::workflow_machines::MachineResponse; use crate::{ machines::{ workflow_machines::{WFMachinesError, WorkflowMachines}, @@ -57,7 +57,7 @@ impl WFMachinesAdapter for WorkflowTaskMachine { event: &HistoryEvent, has_next_event: bool, my_command: WFTaskMachineCommand, - ) -> Result, WFMachinesError> { + ) -> Result, WFMachinesError> { match my_command { WFTaskMachineCommand::WFTaskStartedTrigger { task_started_event_id, @@ -77,7 +77,7 @@ impl WFMachinesAdapter for WorkflowTaskMachine { // want to iterate. 
return Ok(vec![]);
 }
- Ok(vec![WorkflowTrigger::TriggerWFTaskStarted {
+ Ok(vec![MachineResponse::TriggerWFTaskStarted {
 task_started_event_id,
 time,
 }])
From e1f7627339d3004463028a80b4ae13f3296fc3e7 Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Wed, 17 Feb 2021 08:59:25 -0800
Subject: [PATCH 21/51] Moving to desktop

---
 src/lib.rs | 22 ++++++------
 src/machines/test_help/mod.rs | 4 +--
 src/workflow/concurrency_manager.rs | 51 +++++++++++++++++++++++++++++
 src/workflow/mod.rs | 24 +++++---------
 4 files changed, 72 insertions(+), 29 deletions(-)
 create mode 100644 src/workflow/concurrency_manager.rs

diff --git a/src/lib.rs b/src/lib.rs
index d0379593e..c865bc7b2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,6 +19,7 @@ mod workflow;
 pub use pollers::{ServerGateway, ServerGatewayApis, ServerGatewayOptions};
 pub use url::Url;

+use crate::workflow::WorkflowConcurrencyManager;
 use crate::{
 machines::{InconvertibleCommandError, WFCommand},
 protos::{
@@ -104,7 +105,7 @@
 /// Provides work in the form of responses the server would send from polling task Qs
 server_gateway: Arc<WP>,
 /// Key is run id
- workflow_machines: DashMap<String, WorkflowManager>,
+ workflow_machines: WorkflowConcurrencyManager,
 /// Maps task tokens to workflow run ids
 workflow_task_tokens: DashMap<Vec<u8>, String>,

@@ -185,13 +186,12 @@
 match wfstatus {
 Status::Successful(success) => {
 self.push_lang_commands(&run_id, success)?;
- self.access_machine(&run_id, |mgr| {
- let commands = mgr.machines.get_commands();
- self.runtime.block_on(
- self.server_gateway
- .complete_workflow_task(task_token, commands),
- )
- })?;
+ let commands =
+ self.access_machine(&run_id, |mgr| Ok(mgr.machines.get_commands()))?;
+ self.runtime.block_on(
+ self.server_gateway
+ .complete_workflow_task(task_token, commands),
+ )?;
 }
 Status::Failed(_) => {}
 }
@@ -382,7 +382,7 @@ mod test {
 attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)),
 }]
 );
- assert!(core.workflow_machines.get(run_id).is_some());
+ assert!(core.workflow_machines.exists(run_id));

 let task_tok = res.task_token;
 core.complete_task(TaskCompletion::ok_from_api_attrs(
@@ -461,7 +461,7 @@ mod test {
 attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)),
 }]
 );
- assert!(core.workflow_machines.get(run_id).is_some());
+ assert!(core.workflow_machines.exists(run_id));

 let task_tok = res.task_token;
 core.complete_task(TaskCompletion::ok_from_api_attrs(
@@ -541,7 +541,7 @@ mod test {
 attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)),
 }]
 );
- assert!(core.workflow_machines.get(run_id).is_some());
+ assert!(core.workflow_machines.exists(run_id));

 let task_tok = res.task_token;
 core.complete_task(TaskCompletion::ok_from_api_attrs(
diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs
index 9c4a73df8..86852375f 100644
--- a/src/machines/test_help/mod.rs
+++ b/src/machines/test_help/mod.rs
@@ -66,8 +66,8 @@ pub(crate) fn build_fake_core(
 CoreSDK {
 runtime,
 server_gateway: Arc::new(mock_gateway),
- workflow_machines: DashMap::new(),
- workflow_task_tokens: DashMap::new(),
+ workflow_machines: Default::default(),
+ workflow_task_tokens: Default::default(),
 pending_activations: Default::default(),
 }
 }
diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs
new file mode 100644
index 000000000..a6c210368
--- /dev/null
+++ b/src/workflow/concurrency_manager.rs
@@ -0,0 +1,51 @@
+use crate::{
+ machines::ProtoCommand,
+ workflow::{NextWfActivation, WfManagerProtected, WorkflowManager},
+ CoreError, Result,
+};
+use 
crossbeam::channel::Sender;
+use dashmap::DashMap;
+
+type MachineSender = Sender<Box<dyn FnOnce(&mut WfManagerProtected) -> Result<()> + Send>>;
+
+// TODO: If can't be totally generic, could do enum of Fn's with defined responses
+/// Responses that workflow machines can respond with from their thread
+enum WfMgrResponse {
+ Nothing,
+ Activation(NextWfActivation),
+ Commands(Vec<ProtoCommand>),
+}
+
+#[derive(Default)]
+pub(crate) struct WorkflowConcurrencyManager {
+ machines: DashMap<String, MachineSender>,
+}
+
+impl WorkflowConcurrencyManager {
+ pub fn exists(&self, run_id: &str) -> bool {
+ self.machines.contains_key(run_id)
+ }
+
+ pub fn create_or_update(&self, run_id: &str) -> Result<NextWfActivation> {
+ unimplemented!()
+ }
+
+ /// Access a workflow manager to do something with it. Ideally, the return type could be generic
+ /// but in practice I couldn't find a way to do it. If we need more and more response types,
+ /// it's worth trying again.
+ pub fn access<F, Fout>(&self, run_id: &str, mutator: F) -> Result<Fout>
+ where
+ F: FnOnce(&mut WfManagerProtected) -> Result<Fout>,
+ F: Send,
+ {
+ if let Some(m) = self.machines.get(run_id) {
+ // m.value().send(Box::new(mutator)).unwrap();
+ unimplemented!()
+ } else {
+ Err(CoreError::MissingMachines(run_id.to_string()))
+ }
+ }
+}
+
+trait BeSendSync: Send + Sync {}
+impl BeSendSync for WorkflowConcurrencyManager {}
diff --git a/src/workflow/mod.rs b/src/workflow/mod.rs
index 8b03c5588..1a0b98a22 100644
--- a/src/workflow/mod.rs
+++ b/src/workflow/mod.rs
@@ -1,6 +1,8 @@
 mod bridge;
+mod concurrency_manager;

 pub(crate) use bridge::WorkflowBridge;
+pub(crate) use concurrency_manager::WorkflowConcurrencyManager;

 use crate::{
 machines::{ProtoCommand, WFCommand, WorkflowMachines},
@@ -73,10 +75,12 @@ pub(crate) struct WfManagerProtected {
 /// The last recorded history we received from the server for this workflow run. This must be
 /// kept because the lang side polls & completes for every workflow task, but we do not need
 /// to poll the server that often during replay.
- pub last_history_from_server: History,
- pub last_history_task_count: usize,
+ last_history_from_server: History,
+ last_history_task_count: usize,
 /// The current workflow task number this run is on. Starts at one and monotonically increases.
- pub current_wf_task_num: usize,
+ current_wf_task_num: usize,
+
+ _temp: std::rc::Rc<u8>,
 }

 impl WorkflowManager {
@@ -101,6 +105,7 @@ impl WorkflowManager {
 last_history_task_count: history.get_workflow_task_count(None)?,
 last_history_from_server: history,
 current_wf_task_num: 1,
+ _temp: std::rc::Rc::new(8),
 };
 Ok(Self {
 data: Arc::new(Mutex::new(protected)),
@@ -166,16 +171,3 @@ impl WfManagerProtected {
 }
 }
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- fn enforcer<W: Send + Sync>(_: W) {}
-
- // Enforce thread-safeness of wf manager
- #[test]
- fn is_threadsafe() {
- enforcer(WorkflowManager::new(Default::default()));
- }
-}
From 23b2f4731526b34c85ebe2600bd06b37b98f2c3 Mon Sep 17 00:00:00 2001
From: Spencer Judge 
Date: Wed, 17 Feb 2021 13:20:33 -0800
Subject: [PATCH 22/51] Most of the way there. Fixing deadlocks. 
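The shape this is heading toward: every `WorkflowManager` lives on one dedicated thread owned by `WorkflowConcurrencyManager`, and callers reach a manager by sending a boxed closure over a crossbeam channel, with the reply coming back on a per-request response channel. A minimal sketch of that round trip (the names `Req` and `demo` are hypothetical, and `String` stands in for the real manager state):

```rust
use crossbeam::channel::{bounded, unbounded};
use std::thread;

// A request is just a closure to run against the thread-owned state; the
// closure itself sends the response, which keeps the reply type generic.
type Req = Box<dyn FnOnce(&mut String) + Send>;

fn demo() {
    let (req_tx, req_rx) = unbounded::<Req>();
    // The owning thread serially applies requests to its (possibly !Send) state.
    let owner = thread::spawn(move || {
        let mut state = String::from("workflow manager");
        for req in req_rx {
            req(&mut state);
        }
    });
    // Caller side: pair the closure with a bounded(1) response channel and block.
    let (resp_tx, resp_rx) = bounded(1);
    req_tx
        .send(Box::new(move |s: &mut String| {
            resp_tx.send(s.len()).unwrap();
        }))
        .unwrap();
    assert_eq!(resp_rx.recv().unwrap(), "workflow manager".len());
    drop(req_tx); // disconnect, ending the owner thread's loop
    owner.join().unwrap();
}
```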
--- src/lib.rs | 45 +++----- src/machines/test_help/mod.rs | 4 +- src/workflow/concurrency_manager.rs | 168 +++++++++++++++++++++++----- src/workflow/mod.rs | 31 +---- 4 files changed, 163 insertions(+), 85 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c865bc7b2..4f9a1c80c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,7 @@ mod workflow; pub use pollers::{ServerGateway, ServerGatewayApis, ServerGatewayOptions}; pub use url::Url; -use crate::workflow::WorkflowConcurrencyManager; +use crate::workflow::{WorkflowConcurrencyManager, WorkflowManager}; use crate::{ machines::{InconvertibleCommandError, WFCommand}, protos::{ @@ -30,10 +30,11 @@ use crate::{ temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse, }, protosext::HistoryInfoError, - workflow::{NextWfActivation, WfManagerProtected, WorkflowManager}, + workflow::NextWfActivation, }; use crossbeam::queue::SegQueue; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::DashMap; +use std::fmt::Debug; use std::{convert::TryInto, sync::mpsc::SendError, sync::Arc}; use tokio::runtime::Runtime; use tonic::codegen::http::uri::InvalidUri; @@ -83,7 +84,7 @@ pub fn init(opts: CoreInitOptions) -> Result { Ok(CoreSDK { runtime, server_gateway: Arc::new(work_provider), - workflow_machines: Default::default(), + workflow_machines: WorkflowConcurrencyManager::new(), workflow_task_tokens: Default::default(), pending_activations: Default::default(), }) @@ -235,25 +236,10 @@ impl CoreSDK { self.workflow_task_tokens .insert(task_token.clone(), run_id.clone()); - match self.workflow_machines.entry(run_id.clone()) { - Entry::Occupied(mut existing) => { - if let Some(history) = poll_wf_resp.history { - let activation = existing - .get_mut() - .lock()? - .feed_history_from_server(history)?; - Ok((activation, run_id)) - } else { - Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) - } - } - Entry::Vacant(vacant) => { - let wfm = WorkflowManager::new(poll_wf_resp)?; - let activation = wfm.lock()?.get_next_activation()?; - vacant.insert(wfm); - Ok((activation, run_id)) - } - } + let activation = self + .workflow_machines + .create_or_update(&run_id, poll_wf_resp)?; + Ok((activation, run_id)) } else { Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) } @@ -279,14 +265,10 @@ impl CoreSDK { /// machines. 
fn access_machine<F, Fout>(&self, run_id: &str, mutator: F) -> Result<Fout>
     where
-        F: FnOnce(&mut WfManagerProtected) -> Result<Fout>,
+        F: FnOnce(&mut WorkflowManager) -> Result<Fout> + Send + 'static,
+        Fout: Send + Debug + 'static,
     {
-        if let Some(mut machines) = self.workflow_machines.get_mut(run_id) {
-            let mut mgr = machines.value_mut().lock()?;
-            mutator(&mut mgr)
-        } else {
-            Err(CoreError::MissingMachines(run_id.to_string()))
-        }
+        self.workflow_machines.access(run_id, mutator)
     }
 }
 
@@ -343,7 +325,7 @@ mod test {
     };
 
     #[test]
-    fn timer_test_across_wf_bridge() {
+    fn single_timer_test_across_wf_bridge() {
         let wfid = "fake_wf_id";
         let run_id = "fake_run_id";
         let timer_id = "fake_timer".to_string();
@@ -408,6 +390,7 @@ mod test {
             task_tok,
         ))
         .unwrap();
+        dbg!("Done!!!!");
     }
 
     #[test]
diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs
index 86852375f..f669750d9 100644
--- a/src/machines/test_help/mod.rs
+++ b/src/machines/test_help/mod.rs
@@ -6,6 +6,7 @@ mod workflow_driver;
 pub(crate) use history_builder::TestHistoryBuilder;
 pub(super) use workflow_driver::{CommandSender, TestWFCommand, TestWorkflowDriver};
 
+use crate::workflow::WorkflowConcurrencyManager;
 use crate::{
     pollers::MockServerGateway,
     protos::temporal::api::common::v1::WorkflowExecution,
@@ -15,7 +16,6 @@ use crate::{
     },
     CoreSDK,
 };
-use dashmap::DashMap;
 use rand::{thread_rng, Rng};
 use std::{collections::VecDeque, sync::Arc};
 use tokio::runtime::Runtime;
@@ -66,7 +66,7 @@ pub(crate) fn build_fake_core(
     CoreSDK {
         runtime,
         server_gateway: Arc::new(mock_gateway),
-        workflow_machines: Default::default(),
+        workflow_machines: WorkflowConcurrencyManager::new(),
         workflow_task_tokens: Default::default(),
         pending_activations: Default::default(),
     }
diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs
index a6c210368..2c39db312 100644
--- a/src/workflow/concurrency_manager.rs
+++ b/src/workflow/concurrency_manager.rs
@@ -1,51 +1,165 @@
 use crate::{
-    machines::ProtoCommand,
-    workflow::{NextWfActivation, WfManagerProtected, WorkflowManager},
+    protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse,
+    workflow::{NextWfActivation, WorkflowManager},
     CoreError, Result,
 };
-use crossbeam::channel::Sender;
-use dashmap::DashMap;
+use crossbeam::channel::{bounded, unbounded, Select, Sender, TryRecvError};
+use dashmap::{mapref::entry::Entry, DashMap};
+use std::fmt::Debug;
+use std::time::Duration;
+use std::{thread, thread::JoinHandle};
 
-type MachineSender = Sender<Box<dyn FnOnce(&mut WfManagerProtected) -> Result<()> + Send>>;
+type MachineSender = Sender<Box<dyn FnOnce(&mut WorkflowManager) + Send>>;
+type MachineCreatorMsg = (
+    PollWorkflowTaskQueueResponse,
+    Sender<MachineCreatorResponseMsg>,
+);
+type MachineCreatorResponseMsg = Result<(NextWfActivation, MachineSender)>;
 
-// TODO: If can't be totally generic, could do enum of Fn's with defined responses
-/// Responses that workflow machines can respond with from their thread
-enum WfMgrResponse {
-    Nothing,
-    Activation(NextWfActivation),
-    Commands(Vec<ProtoCommand>),
-}
-
-#[derive(Default)]
 pub(crate) struct WorkflowConcurrencyManager {
     machines: DashMap<String, MachineSender>,
+    wf_thread: JoinHandle<()>,
+    machine_creator: Sender<MachineCreatorMsg>,
 }
 
 impl WorkflowConcurrencyManager {
+    pub fn new() -> Self {
+        let (machine_creator, create_rcv) = unbounded::<MachineCreatorMsg>();
+
+        let wf_thread = thread::spawn(move || {
+            let mut machine_rcvs = vec![];
+            loop {
+                // If there's a message ready on the creation channel, make a new machine
+                // and put it's receiver into the list, replying with the machine's activation and
+                // a channel to send requests to it, or an error otherwise.
+ // TODO: handle disconnected / other channel errors + match create_rcv.try_recv() { + Ok((pwtqr, resp_chan)) => match WorkflowManager::new(pwtqr) + .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) + { + Ok((activation, wfm)) => { + dbg!("Creating machine"); + let (machine_sender, machine_rcv) = unbounded(); + machine_rcvs.push((machine_rcv, wfm)); + resp_chan.send(Ok((activation, machine_sender))).unwrap(); + } + Err(e) => { + resp_chan.send(Err(e)).unwrap(); + } + }, + Err(TryRecvError::Disconnected) => { + dbg!("Channel disconnected!"); + break; + } + Err(TryRecvError::Empty) => {} + } + + // Having created any new machines, we now check if there are any pending requests + // to interact with the machines. If multiple requests are pending they are dealt + // with in random order. + let mut sel = Select::new(); + for (rcv, _) in machine_rcvs.iter() { + sel.recv(rcv); + } + match sel.try_ready() { + Ok(index) => match machine_rcvs[index].0.try_recv() { + Ok(func) => { + dbg!("Blorgp"); + // Recall that calling this function also sends the response + func(&mut machine_rcvs[index].1); + } + Err(_) => {} + }, + Err(_) => {} + } + // TODO: remove probably + std::thread::sleep(Duration::from_millis(10)); + } + }); + + Self { + machines: Default::default(), + wf_thread, + machine_creator, + } + } + pub fn exists(&self, run_id: &str) -> bool { self.machines.contains_key(run_id) } - pub fn create_or_update(&self, run_id: &str) -> Result { - unimplemented!() + pub fn create_or_update( + &self, + run_id: &str, + poll_wf_resp: PollWorkflowTaskQueueResponse, + ) -> Result { + match self.machines.entry(run_id.to_string()) { + Entry::Occupied(_existing) => { + dbg!("Existing machine"); + if let Some(history) = poll_wf_resp.history { + // TODO: Sort of dumb to use entry here now + let activation = self.access(run_id, |wfm: &mut WorkflowManager| { + wfm.feed_history_from_server(history) + })?; + dbg!("Past access"); + Ok(activation) + } else { + Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) + } + } + Entry::Vacant(vacant) => { + dbg!("Create machine"); + // Creates a channel for the response to attempting to create the machine, sends + // the task q response, and waits for the result of machine creation along with + // the activation + let (resp_send, resp_rcv) = bounded(1); + self.machine_creator + .send((poll_wf_resp, resp_send)) + .unwrap(); + let (activation, machine_sender) = resp_rcv.recv().unwrap()?; + vacant.insert(machine_sender); + Ok(activation) + } + } } - /// Access a workflow manager to do something with it. Ideally, the return type could be generic - /// but in practice I couldn't find a way to do it. If we need more and more response types, - /// it's worth trying again. 
pub fn access(&self, run_id: &str, mutator: F) -> Result where - F: FnOnce(&mut WfManagerProtected) -> Result, - F: Send, + F: FnOnce(&mut WorkflowManager) -> Result + Send + 'static, + Fout: Send + Debug + 'static, { - if let Some(m) = self.machines.get(run_id) { - // m.value().send(Box::new(mutator)).unwrap(); - unimplemented!() - } else { - Err(CoreError::MissingMachines(run_id.to_string())) - } + dbg!("Machines get in access"); + let m = self + .machines + .get(run_id) + .ok_or_else(|| CoreError::MissingMachines(run_id.to_string()))?; + dbg!("completed Machines get in access"); + + // This code fetches the channel for a workflow manager and sends it a modified version of + // of closure the caller provided which includes a channel for the response, and sends + // the result of the user-provided closure down that response channel. + let (sender, receiver) = bounded(1); + let f = move |x: &mut WorkflowManager| { + dbg!("Trying to send response"); + let _ = sender.send(dbg!(mutator(x))); + dbg!("sent it"); + }; + // TODO: Clean up unwraps + m.send(Box::new(f)).unwrap(); + receiver.recv().unwrap() + } +} + +impl Drop for WorkflowConcurrencyManager { + fn drop(&mut self) { + dbg!("Droppin bruh"); } } trait BeSendSync: Send + Sync {} impl BeSendSync for WorkflowConcurrencyManager {} + +#[cfg(test)] +mod tests { + use super::*; +} diff --git a/src/workflow/mod.rs b/src/workflow/mod.rs index 1a0b98a22..a85723197 100644 --- a/src/workflow/mod.rs +++ b/src/workflow/mod.rs @@ -19,10 +19,7 @@ use crate::{ protosext::HistoryInfo, CoreError, Result, }; -use std::{ - ops::DerefMut, - sync::{mpsc::Sender, Arc, Mutex}, -}; +use std::sync::mpsc::Sender; use tracing::Level; /// Implementors can provide new workflow tasks to the SDK. The connection to the server is the real @@ -63,13 +60,9 @@ pub trait StartWorkflowExecutionApi { ) -> Result; } -/// Manages concurrent access to an instance of a [WorkflowMachines], which is not thread-safe, -/// as well as other data associated with that specific workflow run. +/// Manages an instance of a [WorkflowMachines], which is not thread-safe, as well as other data +/// associated with that specific workflow run. pub(crate) struct WorkflowManager { - data: Arc>, -} -/// Inner data for [WorkflowManager] -pub(crate) struct WfManagerProtected { pub machines: WorkflowMachines, pub command_sink: Sender>, /// The last recorded history we received from the server for this workflow run. This must be @@ -99,36 +92,24 @@ impl WorkflowManager { let (wfb, cmd_sink) = WorkflowBridge::new(); let state_machines = WorkflowMachines::new(we.workflow_id, we.run_id, Box::new(wfb)); - let protected = WfManagerProtected { + Ok(Self { machines: state_machines, command_sink: cmd_sink, last_history_task_count: history.get_workflow_task_count(None)?, last_history_from_server: history, current_wf_task_num: 1, _temp: std::rc::Rc::new(8), - }; - Ok(Self { - data: Arc::new(Mutex::new(protected)), }) } - - pub fn lock(&self) -> Result + '_> { - Ok(self.data.lock().map_err(|_| { - CoreError::LockPoisoned( - "A workflow manager lock was poisoned. This should be impossible since they \ - are run on one thread." - .to_string(), - ) - })?) - } } +#[derive(Debug)] pub(crate) struct NextWfActivation { pub activation: Option, pub more_activations_needed: bool, } -impl WfManagerProtected { +impl WorkflowManager { /// Given history that was just obtained from the server, pipe it into this workflow's machines. /// /// Should only be called when a workflow has caught up on replay. 
It will return a workflow From 0155521329c0f893d7668c82080100d96bb29b4c Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 17 Feb 2021 13:46:25 -0800 Subject: [PATCH 23/51] Deadlocks are fixed! --- src/workflow/concurrency_manager.rs | 71 ++++++++++++++--------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 2c39db312..d22bb5d93 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -4,10 +4,12 @@ use crate::{ CoreError, Result, }; use crossbeam::channel::{bounded, unbounded, Select, Sender, TryRecvError}; -use dashmap::{mapref::entry::Entry, DashMap}; -use std::fmt::Debug; -use std::time::Duration; -use std::{thread, thread::JoinHandle}; +use dashmap::DashMap; +use std::{ + fmt::Debug, + thread::{self, JoinHandle}, + time::Duration, +}; type MachineSender = Sender>; type MachineCreatorMsg = ( @@ -38,7 +40,6 @@ impl WorkflowConcurrencyManager { .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) { Ok((activation, wfm)) => { - dbg!("Creating machine"); let (machine_sender, machine_rcv) = unbounded(); machine_rcvs.push((machine_rcv, wfm)); resp_chan.send(Ok((activation, machine_sender))).unwrap(); @@ -93,33 +94,27 @@ impl WorkflowConcurrencyManager { run_id: &str, poll_wf_resp: PollWorkflowTaskQueueResponse, ) -> Result { - match self.machines.entry(run_id.to_string()) { - Entry::Occupied(_existing) => { - dbg!("Existing machine"); - if let Some(history) = poll_wf_resp.history { - // TODO: Sort of dumb to use entry here now - let activation = self.access(run_id, |wfm: &mut WorkflowManager| { - wfm.feed_history_from_server(history) - })?; - dbg!("Past access"); - Ok(activation) - } else { - Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) - } - } - Entry::Vacant(vacant) => { - dbg!("Create machine"); - // Creates a channel for the response to attempting to create the machine, sends - // the task q response, and waits for the result of machine creation along with - // the activation - let (resp_send, resp_rcv) = bounded(1); - self.machine_creator - .send((poll_wf_resp, resp_send)) - .unwrap(); - let (activation, machine_sender) = resp_rcv.recv().unwrap()?; - vacant.insert(machine_sender); + if self.exists(run_id) { + if let Some(history) = poll_wf_resp.history { + // TODO: Sort of dumb to use entry here now + let activation = self.access(run_id, |wfm: &mut WorkflowManager| { + wfm.feed_history_from_server(history) + })?; Ok(activation) + } else { + Err(CoreError::BadDataFromWorkProvider(poll_wf_resp)) } + } else { + // Creates a channel for the response to attempting to create the machine, sends + // the task q response, and waits for the result of machine creation along with + // the activation + let (resp_send, resp_rcv) = bounded(1); + self.machine_creator + .send((poll_wf_resp, resp_send)) + .unwrap(); + let (activation, machine_sender) = resp_rcv.recv().unwrap()?; + self.machines.insert(run_id.to_string(), machine_sender); + Ok(activation) } } @@ -128,31 +123,31 @@ impl WorkflowConcurrencyManager { F: FnOnce(&mut WorkflowManager) -> Result + Send + 'static, Fout: Send + Debug + 'static, { - dbg!("Machines get in access"); let m = self .machines .get(run_id) .ok_or_else(|| CoreError::MissingMachines(run_id.to_string()))?; - dbg!("completed Machines get in access"); // This code fetches the channel for a workflow manager and sends it a modified version of // of closure the caller provided which includes a channel for the response, and 
sends // the result of the user-provided closure down that response channel. let (sender, receiver) = bounded(1); let f = move |x: &mut WorkflowManager| { - dbg!("Trying to send response"); let _ = sender.send(dbg!(mutator(x))); - dbg!("sent it"); }; // TODO: Clean up unwraps m.send(Box::new(f)).unwrap(); receiver.recv().unwrap() } -} -impl Drop for WorkflowConcurrencyManager { - fn drop(&mut self) { - dbg!("Droppin bruh"); + /// Attempt to join the thread where the workflow machines live. + /// + /// # Panics + /// If the workflow machine thread panicked + pub fn shutdown(self) { + self.wf_thread + .join() + .expect("Workflow manager thread should shut down cleanly"); } } From b23ae725a7106e4a8393fd553394b0b859828b89 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 17 Feb 2021 16:23:24 -0800 Subject: [PATCH 24/51] Some cleanup and UTs --- src/lib.rs | 25 +++---- src/workflow/concurrency_manager.rs | 103 +++++++++++++++++++++++----- 2 files changed, 94 insertions(+), 34 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4f9a1c80c..1ca858ecd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,6 @@ mod workflow; pub use pollers::{ServerGateway, ServerGatewayApis, ServerGatewayOptions}; pub use url::Url; -use crate::workflow::{WorkflowConcurrencyManager, WorkflowManager}; use crate::{ machines::{InconvertibleCommandError, WFCommand}, protos::{ @@ -30,7 +29,7 @@ use crate::{ temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse, }, protosext::HistoryInfoError, - workflow::NextWfActivation, + workflow::{NextWfActivation, WorkflowConcurrencyManager}, }; use crossbeam::queue::SegQueue; use dashmap::DashMap; @@ -131,8 +130,9 @@ where // replaying, and issue those tasks before bothering the server. if let Some(pa) = self.pending_activations.pop() { event!(Level::DEBUG, msg = "Applying pending activations", ?pa); - let next_activation = - self.access_machine(&pa.run_id, |mgr| mgr.get_next_activation())?; + let next_activation = self + .workflow_machines + .access(&pa.run_id, |mgr| mgr.get_next_activation())?; let task_token = pa.task_token.clone(); if next_activation.more_activations_needed { self.pending_activations.push(pa); @@ -187,8 +187,9 @@ where match wfstatus { Status::Successful(success) => { self.push_lang_commands(&run_id, success)?; - let commands = - self.access_machine(&run_id, |mgr| Ok(mgr.machines.get_commands()))?; + let commands = self + .workflow_machines + .access(&run_id, |mgr| Ok(mgr.machines.get_commands()))?; self.runtime.block_on( self.server_gateway .complete_workflow_task(task_token, commands), @@ -253,23 +254,13 @@ impl CoreSDK { .into_iter() .map(|c| c.try_into().map_err(Into::into)) .collect::>>()?; - self.access_machine(run_id, |mgr| { + self.workflow_machines.access(run_id, |mgr| { mgr.command_sink.send(cmds)?; mgr.machines.event_loop(); Ok(()) })?; Ok(()) } - - /// Use a closure to access the machines for a workflow run, handles locking and missing - /// machines. - fn access_machine(&self, run_id: &str, mutator: F) -> Result - where - F: FnOnce(&mut WorkflowManager) -> Result + Send + 'static, - Fout: Send + Debug + 'static, - { - self.workflow_machines.access(run_id, mutator) - } } /// The error type returned by interactions with [Core] diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index d22bb5d93..cd51077c4 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -1,3 +1,6 @@ +//! 
Ultimately it would be nice to make this generic and push it out into it's own crate but +//! doing so is nontrivial + use crate::{ protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse, workflow::{NextWfActivation, WorkflowManager}, @@ -7,34 +10,49 @@ use crossbeam::channel::{bounded, unbounded, Select, Sender, TryRecvError}; use dashmap::DashMap; use std::{ fmt::Debug, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, thread::{self, JoinHandle}, time::Duration, }; +use tracing::Level; -type MachineSender = Sender>; -type MachineCreatorMsg = ( - PollWorkflowTaskQueueResponse, - Sender, -); -type MachineCreatorResponseMsg = Result<(NextWfActivation, MachineSender)>; - +/// Provides a thread-safe way to access workflow machines which live exclusively on one thread +/// managed by this struct. We could make this generic for any collection of things which need +/// to live on one thread, if desired. pub(crate) struct WorkflowConcurrencyManager { - machines: DashMap, + machines: DashMap, wf_thread: JoinHandle<()>, machine_creator: Sender, + shutdown_flag: Arc, } +/// The tx side of a channel which accepts closures to mutably operate on a workflow manager +type MachineMutationSender = Sender>; +/// This is +type MachineCreatorMsg = ( + PollWorkflowTaskQueueResponse, + Sender, +); +type MachineCreatorResponseMsg = Result<(NextWfActivation, MachineMutationSender)>; + impl WorkflowConcurrencyManager { pub fn new() -> Self { let (machine_creator, create_rcv) = unbounded::(); + let shutdown_flag = Arc::new(AtomicBool::new(false)); + let shutdown_flag_for_thread = shutdown_flag.clone(); let wf_thread = thread::spawn(move || { let mut machine_rcvs = vec![]; loop { + if shutdown_flag_for_thread.load(Ordering::Relaxed) { + break; + } // If there's a message ready on the creation channel, make a new machine // and put it's receiver into the list, replying with the machine's activation and // a channel to send requests to it, or an error otherwise. - // TODO: handle disconnected / other channel errors match create_rcv.try_recv() { Ok((pwtqr, resp_chan)) => match WorkflowManager::new(pwtqr) .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) @@ -49,7 +67,12 @@ impl WorkflowConcurrencyManager { } }, Err(TryRecvError::Disconnected) => { - dbg!("Channel disconnected!"); + event!( + Level::WARN, + "Sending side of workflow machine creator was dropped. Likely the \ + WorkflowConcurrencyManager was dropped. This indicates a failure to \ + call shutdown." 
+ ); break; } Err(TryRecvError::Empty) => {} @@ -62,16 +85,17 @@ impl WorkflowConcurrencyManager { for (rcv, _) in machine_rcvs.iter() { sel.recv(rcv); } - match sel.try_ready() { - Ok(index) => match machine_rcvs[index].0.try_recv() { + if let Ok(index) = sel.try_ready() { + match machine_rcvs[index].0.try_recv() { Ok(func) => { - dbg!("Blorgp"); // Recall that calling this function also sends the response func(&mut machine_rcvs[index].1); } - Err(_) => {} - }, - Err(_) => {} + Err(TryRecvError::Disconnected) => { + panic!("Individual workflow machine channels should never be dropped"); + } + Err(TryRecvError::Empty) => {} + } } // TODO: remove probably std::thread::sleep(Duration::from_millis(10)); @@ -82,6 +106,7 @@ impl WorkflowConcurrencyManager { machines: Default::default(), wf_thread, machine_creator, + shutdown_flag, } } @@ -96,7 +121,6 @@ impl WorkflowConcurrencyManager { ) -> Result { if self.exists(run_id) { if let Some(history) = poll_wf_resp.history { - // TODO: Sort of dumb to use entry here now let activation = self.access(run_id, |wfm: &mut WorkflowManager| { wfm.feed_history_from_server(history) })?; @@ -145,6 +169,7 @@ impl WorkflowConcurrencyManager { /// # Panics /// If the workflow machine thread panicked pub fn shutdown(self) { + self.shutdown_flag.store(true, Ordering::Relaxed); self.wf_thread .join() .expect("Workflow manager thread should shut down cleanly"); @@ -157,4 +182,48 @@ impl BeSendSync for WorkflowConcurrencyManager {} #[cfg(test)] mod tests { use super::*; + use crate::machines::test_help::TestHistoryBuilder; + use crate::protos::temporal::api::common::v1::WorkflowExecution; + use crate::protos::temporal::api::enums::v1::EventType; + use crate::protos::temporal::api::history::v1::History; + + // We test mostly error paths here since the happy paths are well covered by the tests of the + // core sdk itself, and setting up the fake data is onerous here. If we make the concurrency + // manager generic, testing the happy path is simpler. 
+ + #[test] + fn can_shutdown_after_creating_machine() { + let mgr = WorkflowConcurrencyManager::new(); + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_workflow_task(); + + let activation = mgr + .create_or_update( + "some_run_id", + PollWorkflowTaskQueueResponse { + history: Some(History { + events: t.get_history_info(1).unwrap().events, + }), + workflow_execution: Some(WorkflowExecution { + workflow_id: "wid".to_string(), + run_id: "rid".to_string(), + }), + task_token: vec![1], + ..Default::default() + }, + ) + .unwrap(); + assert!(activation.activation.is_some()); + + mgr.shutdown(); + } + + #[test] + fn returns_errors_on_creation() { + let mgr = WorkflowConcurrencyManager::new(); + let res = mgr.create_or_update("some_run_id", PollWorkflowTaskQueueResponse::default()); + // Should whine that we didn't provide history + assert_matches!(res.unwrap_err(), CoreError::BadDataFromWorkProvider(_)) + } } From ace7a1c62c6a00f13a16009d6b4d6d0ea7db0ad0 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 17 Feb 2021 16:51:32 -0800 Subject: [PATCH 25/51] Replace all unwraps with expectations --- src/lib.rs | 8 +++++--- src/workflow/concurrency_manager.rs | 29 ++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1ca858ecd..056b03ef8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,8 +33,11 @@ use crate::{ }; use crossbeam::queue::SegQueue; use dashmap::DashMap; -use std::fmt::Debug; -use std::{convert::TryInto, sync::mpsc::SendError, sync::Arc}; +use std::{ + convert::TryInto, + fmt::Debug, + sync::{mpsc::SendError, Arc}, +}; use tokio::runtime::Runtime; use tonic::codegen::http::uri::InvalidUri; use tracing::Level; @@ -381,7 +384,6 @@ mod test { task_tok, )) .unwrap(); - dbg!("Done!!!!"); } #[test] diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index cd51077c4..8e69c5bd1 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -45,6 +45,9 @@ impl WorkflowConcurrencyManager { let shutdown_flag_for_thread = shutdown_flag.clone(); let wf_thread = thread::spawn(move || { + // TODO: We need to remove things from here at some point, but that wasn't implemented + // in core SDK yet either - once we're ready to remove things it should be simple to + // add a removal method. 
let mut machine_rcvs = vec![]; loop { if shutdown_flag_for_thread.load(Ordering::Relaxed) { @@ -60,10 +63,14 @@ impl WorkflowConcurrencyManager { Ok((activation, wfm)) => { let (machine_sender, machine_rcv) = unbounded(); machine_rcvs.push((machine_rcv, wfm)); - resp_chan.send(Ok((activation, machine_sender))).unwrap(); + resp_chan + .send(Ok((activation, machine_sender))) + .expect("wfm create resp rx side can't be dropped"); } Err(e) => { - resp_chan.send(Err(e)).unwrap(); + resp_chan + .send(Err(e)) + .expect("wfm create resp rx side can't be dropped"); } }, Err(TryRecvError::Disconnected) => { @@ -135,8 +142,10 @@ impl WorkflowConcurrencyManager { let (resp_send, resp_rcv) = bounded(1); self.machine_creator .send((poll_wf_resp, resp_send)) - .unwrap(); - let (activation, machine_sender) = resp_rcv.recv().unwrap()?; + .expect("wfm creation channel can't be dropped if we are inside this method"); + let (activation, machine_sender) = resp_rcv + .recv() + .expect("wfm create resp channel can't be dropped, it is in this stackframe")?; self.machines.insert(run_id.to_string(), machine_sender); Ok(activation) } @@ -155,13 +164,15 @@ impl WorkflowConcurrencyManager { // This code fetches the channel for a workflow manager and sends it a modified version of // of closure the caller provided which includes a channel for the response, and sends // the result of the user-provided closure down that response channel. - let (sender, receiver) = bounded(1); + let (resp_tx, resp_rx) = bounded(1); let f = move |x: &mut WorkflowManager| { - let _ = sender.send(dbg!(mutator(x))); + let _ = resp_tx.send(mutator(x)); }; - // TODO: Clean up unwraps - m.send(Box::new(f)).unwrap(); - receiver.recv().unwrap() + m.send(Box::new(f)) + .expect("wfm mutation send can't fail, if it does a wfm is missing from their thread"); + resp_rx + .recv() + .expect("wfm access resp channel can't be dropped, it is in this stackframe") } /// Attempt to join the thread where the workflow machines live. 
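
Each expect message introduced above names the invariant that makes the panic unreachable: the
response receiver for a given request lives in the very stack frame that is about to recv on it,
so the paired send can only fail if the machines thread itself has died. A reduced sketch of that
request/response round trip, with hypothetical names, showing where each expectation comes from:

use crossbeam::channel::{bounded, unbounded, Sender};
use std::thread;

// A request carries its own reply channel, the way MachineCreatorMsg does.
type Request = (u64, Sender<u64>);

fn main() {
    let (req_tx, req_rx) = unbounded::<Request>();
    let worker = thread::spawn(move || {
        for (n, reply) in req_rx {
            // The caller is blocked on the paired receiver; a failed send
            // here would only mean it stopped waiting, so ignoring it is safe.
            let _ = reply.send(n * 2);
        }
    });

    let (resp_tx, resp_rx) = bounded(1);
    req_tx
        .send((21, resp_tx))
        .expect("worker owns the request receiver and hasn't exited");
    let doubled = resp_rx
        .recv()
        .expect("reply sender can't be dropped before the worker responds");
    assert_eq!(doubled, 42);

    // Dropping the request sender disconnects the worker's loop cleanly.
    drop(req_tx);
    worker.join().expect("worker thread should shut down cleanly");
}

Writing the invariant into the expect string means a violated assumption surfaces as a readable
panic rather than a bare unwrap backtrace.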
From 2d171ba7cf8bbfba83996a6a2b08724e82ded56d Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 09:59:47 -0800 Subject: [PATCH 26/51] Clean up unused warning --- src/workflow/concurrency_manager.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 8e69c5bd1..27ac569bf 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -179,6 +179,7 @@ impl WorkflowConcurrencyManager { /// /// # Panics /// If the workflow machine thread panicked + #[allow(unused)] // TODO: Will be used when other shutdown PR is merged pub fn shutdown(self) { self.shutdown_flag.store(true, Ordering::Relaxed); self.wf_thread From 75e3da4f58124f5f90438f269b4885f31e4233ac Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 10:26:53 -0800 Subject: [PATCH 27/51] Sensibly drop workflow managers when send side is dropped * doc cleanup --- src/workflow/concurrency_manager.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 27ac569bf..7d4aef793 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -6,7 +6,7 @@ use crate::{ workflow::{NextWfActivation, WorkflowManager}, CoreError, Result, }; -use crossbeam::channel::{bounded, unbounded, Select, Sender, TryRecvError}; +use crossbeam::channel::{bounded, unbounded, Receiver, Select, Sender, TryRecvError}; use dashmap::DashMap; use std::{ fmt::Debug, @@ -23,6 +23,9 @@ use tracing::Level; /// managed by this struct. We could make this generic for any collection of things which need /// to live on one thread, if desired. pub(crate) struct WorkflowConcurrencyManager { + // TODO: We need to remove things from here at some point, but that wasn't implemented + // in core SDK yet either - once we're ready to remove things, they can be removed from this + // map and the wfm thread will drop the machines. machines: DashMap, wf_thread: JoinHandle<()>, machine_creator: Sender, @@ -31,11 +34,15 @@ pub(crate) struct WorkflowConcurrencyManager { /// The tx side of a channel which accepts closures to mutably operate on a workflow manager type MachineMutationSender = Sender>; -/// This is +type MachineMutationReceiver = Receiver>; +/// This is the message sent from the concurrency manager to the dedicated thread in order to +/// instantiate a new workflow manager type MachineCreatorMsg = ( PollWorkflowTaskQueueResponse, Sender, ); +/// The response to [MachineCreatorMsg], which includes the wf activation and the channel to +/// send requests to the newly instantiated workflow manager. type MachineCreatorResponseMsg = Result<(NextWfActivation, MachineMutationSender)>; impl WorkflowConcurrencyManager { @@ -45,10 +52,7 @@ impl WorkflowConcurrencyManager { let shutdown_flag_for_thread = shutdown_flag.clone(); let wf_thread = thread::spawn(move || { - // TODO: We need to remove things from here at some point, but that wasn't implemented - // in core SDK yet either - once we're ready to remove things it should be simple to - // add a removal method. 
- let mut machine_rcvs = vec![]; + let mut machine_rcvs: Vec<(MachineMutationReceiver, WorkflowManager)> = vec![]; loop { if shutdown_flag_for_thread.load(Ordering::Relaxed) { break; @@ -99,7 +103,17 @@ impl WorkflowConcurrencyManager { func(&mut machine_rcvs[index].1); } Err(TryRecvError::Disconnected) => { - panic!("Individual workflow machine channels should never be dropped"); + // This is expected when core is done with a workflow manager. IE: is + // ready to remove it from the cache. It dropping the send side from the + // concurrency manager is the signal to this thread that the workflow + // manager can be dropped. + let wfid = &machine_rcvs[index].1.machines.workflow_id; + event!( + Level::DEBUG, + "Workflow manager thread done with workflow id {}", + wfid + ); + machine_rcvs.remove(index); } Err(TryRecvError::Empty) => {} } From 42feb9ba1ef1aaec4b7ee8efe72af3a984b725ad Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 10:42:26 -0800 Subject: [PATCH 28/51] Extract function for dedicated thread --- src/lib.rs | 27 ++++++ src/workflow/concurrency_manager.rs | 145 ++++++++++++++-------------- 2 files changed, 102 insertions(+), 70 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 056b03ef8..a96609a60 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -317,6 +317,7 @@ mod test { }, }, }; + use std::time::Duration; #[test] fn single_timer_test_across_wf_bridge() { @@ -544,4 +545,30 @@ mod test { )) .unwrap(); } + + // #[test] + // fn test_cpu() { + // let wfid = "fake_wf_id"; + // let run_id = "fake_run_id"; + // let timer_1_id = "timer1".to_string(); + // let task_queue = "test-task-queue"; + // + // let mut t = TestHistoryBuilder::default(); + // t.add_by_type(EventType::WorkflowExecutionStarted); + // t.add_workflow_task(); + // let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + // t.add( + // EventType::TimerFired, + // history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + // started_event_id: timer_started_event_id, + // timer_id: timer_1_id.clone(), + // }), + // ); + // t.add_workflow_task_scheduled_and_started(); + // // NOTE! What makes this a replay test is the server only responds with *one* batch here. + // // So, server is polled once, but lang->core interactions look just like non-replay test. + // let core = build_fake_core(wfid, run_id, &mut t, &[2]); + // + // std::thread::sleep(Duration::from_secs(10000000)); + // } } diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 7d4aef793..31b92f7c9 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -15,7 +15,6 @@ use std::{ Arc, }, thread::{self, JoinHandle}, - time::Duration, }; use tracing::Level; @@ -52,75 +51,7 @@ impl WorkflowConcurrencyManager { let shutdown_flag_for_thread = shutdown_flag.clone(); let wf_thread = thread::spawn(move || { - let mut machine_rcvs: Vec<(MachineMutationReceiver, WorkflowManager)> = vec![]; - loop { - if shutdown_flag_for_thread.load(Ordering::Relaxed) { - break; - } - // If there's a message ready on the creation channel, make a new machine - // and put it's receiver into the list, replying with the machine's activation and - // a channel to send requests to it, or an error otherwise. 
- match create_rcv.try_recv() { - Ok((pwtqr, resp_chan)) => match WorkflowManager::new(pwtqr) - .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) - { - Ok((activation, wfm)) => { - let (machine_sender, machine_rcv) = unbounded(); - machine_rcvs.push((machine_rcv, wfm)); - resp_chan - .send(Ok((activation, machine_sender))) - .expect("wfm create resp rx side can't be dropped"); - } - Err(e) => { - resp_chan - .send(Err(e)) - .expect("wfm create resp rx side can't be dropped"); - } - }, - Err(TryRecvError::Disconnected) => { - event!( - Level::WARN, - "Sending side of workflow machine creator was dropped. Likely the \ - WorkflowConcurrencyManager was dropped. This indicates a failure to \ - call shutdown." - ); - break; - } - Err(TryRecvError::Empty) => {} - } - - // Having created any new machines, we now check if there are any pending requests - // to interact with the machines. If multiple requests are pending they are dealt - // with in random order. - let mut sel = Select::new(); - for (rcv, _) in machine_rcvs.iter() { - sel.recv(rcv); - } - if let Ok(index) = sel.try_ready() { - match machine_rcvs[index].0.try_recv() { - Ok(func) => { - // Recall that calling this function also sends the response - func(&mut machine_rcvs[index].1); - } - Err(TryRecvError::Disconnected) => { - // This is expected when core is done with a workflow manager. IE: is - // ready to remove it from the cache. It dropping the send side from the - // concurrency manager is the signal to this thread that the workflow - // manager can be dropped. - let wfid = &machine_rcvs[index].1.machines.workflow_id; - event!( - Level::DEBUG, - "Workflow manager thread done with workflow id {}", - wfid - ); - machine_rcvs.remove(index); - } - Err(TryRecvError::Empty) => {} - } - } - // TODO: remove probably - std::thread::sleep(Duration::from_millis(10)); - } + WorkflowConcurrencyManager::workflow_thread(create_rcv, shutdown_flag_for_thread) }); Self { @@ -200,6 +131,80 @@ impl WorkflowConcurrencyManager { .join() .expect("Workflow manager thread should shut down cleanly"); } + + /// The implementation of the dedicated thread workflow managers live on + fn workflow_thread( + create_rcv: Receiver, + shutdown_flag_for_thread: Arc, + ) { + let mut machine_rcvs: Vec<(MachineMutationReceiver, WorkflowManager)> = vec![]; + loop { + if shutdown_flag_for_thread.load(Ordering::Relaxed) { + break; + } + // If there's a message ready on the creation channel, make a new machine + // and put it's receiver into the list, replying with the machine's activation and + // a channel to send requests to it, or an error otherwise. + match create_rcv.try_recv() { + Ok((pwtqr, resp_chan)) => match WorkflowManager::new(pwtqr) + .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) + { + Ok((activation, wfm)) => { + let (machine_sender, machine_rcv) = unbounded(); + machine_rcvs.push((machine_rcv, wfm)); + resp_chan + .send(Ok((activation, machine_sender))) + .expect("wfm create resp rx side can't be dropped"); + } + Err(e) => { + resp_chan + .send(Err(e)) + .expect("wfm create resp rx side can't be dropped"); + } + }, + Err(TryRecvError::Disconnected) => { + event!( + Level::WARN, + "Sending side of workflow machine creator was dropped. Likely the \ + WorkflowConcurrencyManager was dropped. This indicates a failure to \ + call shutdown." + ); + break; + } + Err(TryRecvError::Empty) => {} + } + + // Having created any new machines, we now check if there are any pending requests + // to interact with the machines. 
If multiple requests are pending they are dealt + // with in random order. + let mut sel = Select::new(); + for (rcv, _) in machine_rcvs.iter() { + sel.recv(rcv); + } + if let Ok(index) = sel.try_ready() { + match machine_rcvs[index].0.try_recv() { + Ok(func) => { + // Recall that calling this function also sends the response + func(&mut machine_rcvs[index].1); + } + Err(TryRecvError::Disconnected) => { + // This is expected when core is done with a workflow manager. IE: is + // ready to remove it from the cache. It dropping the send side from the + // concurrency manager is the signal to this thread that the workflow + // manager can be dropped. + let wfid = &machine_rcvs[index].1.machines.workflow_id; + event!( + Level::DEBUG, + "Workflow manager thread done with workflow id {}", + wfid + ); + machine_rcvs.remove(index); + } + Err(TryRecvError::Empty) => {} + } + } + } + } } trait BeSendSync: Send + Sync {} From 139d43f11d82be9ecff5b0ff8073d93811bc9a1f Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 11:40:16 -0800 Subject: [PATCH 29/51] Function breakdown --- src/workflow/concurrency_manager.rs | 121 +++++++++++++++++----------- 1 file changed, 72 insertions(+), 49 deletions(-) diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 31b92f7c9..f07ab58b5 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -145,33 +145,13 @@ impl WorkflowConcurrencyManager { // If there's a message ready on the creation channel, make a new machine // and put it's receiver into the list, replying with the machine's activation and // a channel to send requests to it, or an error otherwise. - match create_rcv.try_recv() { - Ok((pwtqr, resp_chan)) => match WorkflowManager::new(pwtqr) - .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) - { - Ok((activation, wfm)) => { - let (machine_sender, machine_rcv) = unbounded(); - machine_rcvs.push((machine_rcv, wfm)); - resp_chan - .send(Ok((activation, machine_sender))) - .expect("wfm create resp rx side can't be dropped"); - } - Err(e) => { - resp_chan - .send(Err(e)) - .expect("wfm create resp rx side can't be dropped"); - } - }, - Err(TryRecvError::Disconnected) => { - event!( - Level::WARN, - "Sending side of workflow machine creator was dropped. Likely the \ - WorkflowConcurrencyManager was dropped. This indicates a failure to \ - call shutdown." - ); - break; - } - Err(TryRecvError::Empty) => {} + let maybe_create_chan_msg = create_rcv.try_recv(); + let should_break = WorkflowConcurrencyManager::handle_creation_message( + &mut machine_rcvs, + maybe_create_chan_msg, + ); + if should_break { + break; } // Having created any new machines, we now check if there are any pending requests @@ -182,34 +162,77 @@ impl WorkflowConcurrencyManager { sel.recv(rcv); } if let Ok(index) = sel.try_ready() { - match machine_rcvs[index].0.try_recv() { - Ok(func) => { - // Recall that calling this function also sends the response - func(&mut machine_rcvs[index].1); - } - Err(TryRecvError::Disconnected) => { - // This is expected when core is done with a workflow manager. IE: is - // ready to remove it from the cache. It dropping the send side from the - // concurrency manager is the signal to this thread that the workflow - // manager can be dropped. 
- let wfid = &machine_rcvs[index].1.machines.workflow_id; - event!( - Level::DEBUG, - "Workflow manager thread done with workflow id {}", - wfid - ); - machine_rcvs.remove(index); - } - Err(TryRecvError::Empty) => {} + WorkflowConcurrencyManager::handle_access_msg(index, &mut machine_rcvs) + } + } + } + + /// Handles requests to access/mutate a workflow manager. The passed in index indicates which + /// machine in the `machine_rcvs` vec is ready to be read from. + fn handle_access_msg( + index: usize, + machine_rcvs: &mut Vec<(MachineMutationReceiver, WorkflowManager)>, + ) { + match machine_rcvs[index].0.try_recv() { + Ok(func) => { + // Recall that calling this function also sends the response + func(&mut machine_rcvs[index].1); + } + Err(TryRecvError::Disconnected) => { + // This is expected when core is done with a workflow manager. IE: is + // ready to remove it from the cache. It dropping the send side from the + // concurrency manager is the signal to this thread that the workflow + // manager can be dropped. + let wfid = &machine_rcvs[index].1.machines.workflow_id; + event!( + Level::DEBUG, + "Workflow manager thread done with workflow id {}", + wfid + ); + machine_rcvs.remove(index); + } + Err(TryRecvError::Empty) => {} + } + } + + /// Handle requests to create new workflow managers. Returns true if the creation channel + /// was dropped and dedicated thread loop should be exited. + fn handle_creation_message( + machine_rcvs: &mut Vec<(MachineMutationReceiver, WorkflowManager)>, + maybe_create_chan_msg: Result, + ) -> bool { + match maybe_create_chan_msg { + Ok((pwtqr, resp_chan)) => match WorkflowManager::new(pwtqr) + .and_then(|mut wfm| Ok((wfm.get_next_activation()?, wfm))) + { + Ok((activation, wfm)) => { + let (machine_sender, machine_rcv) = unbounded(); + machine_rcvs.push((machine_rcv, wfm)); + resp_chan + .send(Ok((activation, machine_sender))) + .expect("wfm create resp rx side can't be dropped"); } + Err(e) => { + resp_chan + .send(Err(e)) + .expect("wfm create resp rx side can't be dropped"); + } + }, + Err(TryRecvError::Disconnected) => { + event!( + Level::WARN, + "Sending side of workflow machine creator was dropped. Likely the \ + WorkflowConcurrencyManager was dropped. This indicates a failure to \ + call shutdown." 
+ ); + return true; } + Err(TryRecvError::Empty) => {} } + false } } -trait BeSendSync: Send + Sync {} -impl BeSendSync for WorkflowConcurrencyManager {} - #[cfg(test)] mod tests { use super::*; From 39f75ca1757c82e8f962bd2227f2c06b8f65ad9a Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 12:03:13 -0800 Subject: [PATCH 30/51] Fix busy looping --- src/lib.rs | 27 ------ src/workflow/concurrency_manager.rs | 122 ++++++++++++++-------------- 2 files changed, 63 insertions(+), 86 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a96609a60..056b03ef8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -317,7 +317,6 @@ mod test { }, }, }; - use std::time::Duration; #[test] fn single_timer_test_across_wf_bridge() { @@ -545,30 +544,4 @@ mod test { )) .unwrap(); } - - // #[test] - // fn test_cpu() { - // let wfid = "fake_wf_id"; - // let run_id = "fake_run_id"; - // let timer_1_id = "timer1".to_string(); - // let task_queue = "test-task-queue"; - // - // let mut t = TestHistoryBuilder::default(); - // t.add_by_type(EventType::WorkflowExecutionStarted); - // t.add_workflow_task(); - // let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - // t.add( - // EventType::TimerFired, - // history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - // started_event_id: timer_started_event_id, - // timer_id: timer_1_id.clone(), - // }), - // ); - // t.add_workflow_task_scheduled_and_started(); - // // NOTE! What makes this a replay test is the server only responds with *one* batch here. - // // So, server is polled once, but lang->core interactions look just like non-replay test. - // let core = build_fake_core(wfid, run_id, &mut t, &[2]); - // - // std::thread::sleep(Duration::from_secs(10000000)); - // } } diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index f07ab58b5..9abc4c901 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -10,10 +10,6 @@ use crossbeam::channel::{bounded, unbounded, Receiver, Select, Sender, TryRecvEr use dashmap::DashMap; use std::{ fmt::Debug, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, thread::{self, JoinHandle}, }; use tracing::Level; @@ -28,7 +24,7 @@ pub(crate) struct WorkflowConcurrencyManager { machines: DashMap, wf_thread: JoinHandle<()>, machine_creator: Sender, - shutdown_flag: Arc, + shutdown_chan: Sender, } /// The tx side of a channel which accepts closures to mutably operate on a workflow manager @@ -47,18 +43,17 @@ type MachineCreatorResponseMsg = Result<(NextWfActivation, MachineMutationSender impl WorkflowConcurrencyManager { pub fn new() -> Self { let (machine_creator, create_rcv) = unbounded::(); - let shutdown_flag = Arc::new(AtomicBool::new(false)); - let shutdown_flag_for_thread = shutdown_flag.clone(); + let (shutdown_chan, shutdown_rx) = bounded(1); let wf_thread = thread::spawn(move || { - WorkflowConcurrencyManager::workflow_thread(create_rcv, shutdown_flag_for_thread) + WorkflowConcurrencyManager::workflow_thread(create_rcv, shutdown_rx) }); Self { machines: Default::default(), wf_thread, machine_creator, - shutdown_flag, + shutdown_chan, } } @@ -126,72 +121,53 @@ impl WorkflowConcurrencyManager { /// If the workflow machine thread panicked #[allow(unused)] // TODO: Will be used when other shutdown PR is merged pub fn shutdown(self) { - self.shutdown_flag.store(true, Ordering::Relaxed); + let _ = self.shutdown_chan.send(true); self.wf_thread .join() .expect("Workflow manager thread should shut 
down cleanly"); } /// The implementation of the dedicated thread workflow managers live on - fn workflow_thread( - create_rcv: Receiver, - shutdown_flag_for_thread: Arc, - ) { + fn workflow_thread(create_rcv: Receiver, shutdown_rx: Receiver) { let mut machine_rcvs: Vec<(MachineMutationReceiver, WorkflowManager)> = vec![]; loop { - if shutdown_flag_for_thread.load(Ordering::Relaxed) { - break; - } - // If there's a message ready on the creation channel, make a new machine - // and put it's receiver into the list, replying with the machine's activation and - // a channel to send requests to it, or an error otherwise. - let maybe_create_chan_msg = create_rcv.try_recv(); - let should_break = WorkflowConcurrencyManager::handle_creation_message( - &mut machine_rcvs, - maybe_create_chan_msg, - ); - if should_break { - break; - } + // To avoid needing to busy loop, we want to block until either a creation message + // arrives, or any machine access request arrives, so we cram all of them into a big + // select. If multiple messages are ready at once they're handled in random order. This + // is OK because they all go to independent workflows. - // Having created any new machines, we now check if there are any pending requests - // to interact with the machines. If multiple requests are pending they are dealt - // with in random order. + // **IMPORTANT** the first operation in the select is always reading from the shutdown + // channel, and the second is always reading from the creation channel. let mut sel = Select::new(); + sel.recv(&shutdown_rx); + sel.recv(&create_rcv); for (rcv, _) in machine_rcvs.iter() { sel.recv(rcv); } - if let Ok(index) = sel.try_ready() { - WorkflowConcurrencyManager::handle_access_msg(index, &mut machine_rcvs) - } - } - } - /// Handles requests to access/mutate a workflow manager. The passed in index indicates which - /// machine in the `machine_rcvs` vec is ready to be read from. - fn handle_access_msg( - index: usize, - machine_rcvs: &mut Vec<(MachineMutationReceiver, WorkflowManager)>, - ) { - match machine_rcvs[index].0.try_recv() { - Ok(func) => { - // Recall that calling this function also sends the response - func(&mut machine_rcvs[index].1); - } - Err(TryRecvError::Disconnected) => { - // This is expected when core is done with a workflow manager. IE: is - // ready to remove it from the cache. It dropping the send side from the - // concurrency manager is the signal to this thread that the workflow - // manager can be dropped. - let wfid = &machine_rcvs[index].1.machines.workflow_id; - event!( - Level::DEBUG, - "Workflow manager thread done with workflow id {}", - wfid + let index = sel.ready(); + if index == 0 { + // Shutdown seen + break; + } else if index == 1 { + // If there's a message ready on the creation channel, make a new machine + // and put it's receiver into the list, replying with the machine's activation and + // a channel to send requests to it, or an error otherwise. 
+ let maybe_create_chan_msg = create_rcv.try_recv(); + let should_break = WorkflowConcurrencyManager::handle_creation_message( + &mut machine_rcvs, + maybe_create_chan_msg, ); - machine_rcvs.remove(index); + if should_break { + break; + } + } else { + // If there's a message ready on the creation channel, make a new machine + + // We must subtract two to account for the shutdown and creation channels reads + // being the first two operations in the select + WorkflowConcurrencyManager::handle_access_msg(index - 2, &mut machine_rcvs) } - Err(TryRecvError::Empty) => {} } } @@ -231,6 +207,34 @@ impl WorkflowConcurrencyManager { } false } + + /// Handles requests to access/mutate a workflow manager. The passed in index indicates which + /// machine in the `machine_rcvs` vec is ready to be read from. + fn handle_access_msg( + index: usize, + machine_rcvs: &mut Vec<(MachineMutationReceiver, WorkflowManager)>, + ) { + match machine_rcvs[index].0.try_recv() { + Ok(func) => { + // Recall that calling this function also sends the response + func(&mut machine_rcvs[index].1); + } + Err(TryRecvError::Disconnected) => { + // This is expected when core is done with a workflow manager. IE: is + // ready to remove it from the cache. It dropping the send side from the + // concurrency manager is the signal to this thread that the workflow + // manager can be dropped. + let wfid = &machine_rcvs[index].1.machines.workflow_id; + event!( + Level::DEBUG, + "Workflow manager thread done with workflow id {}", + wfid + ); + machine_rcvs.remove(index); + } + Err(TryRecvError::Empty) => {} + } + } } #[cfg(test)] From e2566eb7f1c87a13551a8cbe43f0203b8b9e90c6 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 13:48:17 -0800 Subject: [PATCH 31/51] Server is a bit inconsistent w/ timing in parallel timer test. --- tests/integ_tests/simple_wf_tests.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 7218fb6c5..95109e3ae 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -106,8 +106,9 @@ fn parallel_timer_workflow() { task.task_token, )) .unwrap(); - // Wait long enough for both timers to complete - std::thread::sleep(Duration::from_millis(1000)); + // Wait long enough for both timers to complete. Server seems to be a bit weird about actually + // sending both of these in one go, so we need to wait longer than you would expect. + std::thread::sleep(Duration::from_millis(1500)); let task = core.poll_task(task_q).unwrap(); assert_matches!( task.get_wf_jobs().as_slice(), From bb97e12f223ee5ac2adfcb07af46e825b9989b38 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 13:49:14 -0800 Subject: [PATCH 32/51] Remove leftover temp RC variable --- src/workflow/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/workflow/mod.rs b/src/workflow/mod.rs index a85723197..40cc3fb55 100644 --- a/src/workflow/mod.rs +++ b/src/workflow/mod.rs @@ -72,8 +72,6 @@ pub(crate) struct WorkflowManager { last_history_task_count: usize, /// The current workflow task number this run is on. Starts at one and monotonically increases. 
current_wf_task_num: usize, - - _temp: std::rc::Rc, } impl WorkflowManager { @@ -98,7 +96,6 @@ impl WorkflowManager { last_history_task_count: history.get_workflow_task_count(None)?, last_history_from_server: history, current_wf_task_num: 1, - _temp: std::rc::Rc::new(8), }) } } From fa15c4960701f70f52c023e7dec0275e5e867eef Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 14:08:15 -0800 Subject: [PATCH 33/51] Comment correction --- src/workflow/concurrency_manager.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 9abc4c901..2e8a241bf 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -1,4 +1,4 @@ -//! Ultimately it would be nice to make this generic and push it out into it's own crate but +//! Ultimately it would be nice to make this generic and push it out into its own crate but //! doing so is nontrivial use crate::{ @@ -151,7 +151,7 @@ impl WorkflowConcurrencyManager { break; } else if index == 1 { // If there's a message ready on the creation channel, make a new machine - // and put it's receiver into the list, replying with the machine's activation and + // and put its receiver into the list, replying with the machine's activation and // a channel to send requests to it, or an error otherwise. let maybe_create_chan_msg = create_rcv.try_recv(); let should_break = WorkflowConcurrencyManager::handle_creation_message( @@ -162,7 +162,7 @@ impl WorkflowConcurrencyManager { break; } } else { - // If there's a message ready on the creation channel, make a new machine + // If we're here, a request to access a workflow manager is ready. // We must subtract two to account for the shutdown and creation channels reads // being the first two operations in the select From 8f1cb1b3a361c3ab8fb55e008f9730fbb8f4c404 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Thu, 18 Feb 2021 17:14:15 -0800 Subject: [PATCH 34/51] Got things compiling with Rc but it's a mess --- Cargo.toml | 2 - src/lib.rs | 6 +- .../complete_workflow_state_machine.rs | 21 +-- src/machines/mod.rs | 87 +++++++--- src/machines/test_help/history_builder.rs | 2 +- src/machines/test_help/workflow_driver.rs | 22 +-- src/machines/timer_state_machine.rs | 117 ++++++++----- src/machines/workflow_machines.rs | 156 +++++++++++++----- src/machines/workflow_task_state_machine.rs | 3 + src/protosext/history_info.rs | 2 +- src/workflow/concurrency_manager.rs | 2 +- 11 files changed, 284 insertions(+), 136 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 75dd0de6b..442df0564 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,8 +19,6 @@ displaydoc = "0.1" env_logger = "0.8" futures = "0.3" log = "0.4" -opentelemetry-jaeger = "0.10" -opentelemetry = "0.11.2" prost = "0.7" prost-types = "0.7" thiserror = "1.0" diff --git a/src/lib.rs b/src/lib.rs index 056b03ef8..2ea1bbbf6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -327,7 +327,7 @@ mod test { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, @@ -396,7 +396,7 @@ mod test { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); let timer_2_started_event_id = 
t.add_get_event_id(EventType::TimerStarted, None); t.add( @@ -496,7 +496,7 @@ mod test { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs index 0c2f6a835..fc64209db 100644 --- a/src/machines/complete_workflow_state_machine.rs +++ b/src/machines/complete_workflow_state_machine.rs @@ -1,8 +1,8 @@ -use crate::machines::workflow_machines::MachineResponse; +use crate::machines::CommandAndMachine; use crate::{ machines::{ - workflow_machines::WorkflowMachines, AddCommand, CancellableCommand, WFCommand, - WFMachinesAdapter, WFMachinesError, + workflow_machines::MachineResponse, workflow_machines::WorkflowMachines, Cancellable, + WFCommand, WFMachinesAdapter, WFMachinesError, }, protos::temporal::api::{ command::v1::{Command, CompleteWorkflowExecutionCommandAttributes}, @@ -11,7 +11,6 @@ use crate::{ }, }; use rustfsm::{fsm, StateMachine, TransitionResult}; -use std::cell::RefCell; use std::{convert::TryFrom, rc::Rc}; fsm! { @@ -31,17 +30,17 @@ fsm! { #[derive(Debug)] pub(super) enum CompleteWFCommand { - AddCommand(AddCommand), + AddCommand(Command), } /// Complete a workflow pub(super) fn complete_workflow( attribs: CompleteWorkflowExecutionCommandAttributes, -) -> CancellableCommand { +) -> CommandAndMachine { let (machine, add_cmd) = CompleteWorkflowMachine::new_scheduled(attribs); - CancellableCommand::Active { - command: add_cmd.command, - machine: Box::new(machine), + CommandAndMachine { + command: add_cmd, + machine: Rc::new(machine), } } @@ -49,7 +48,7 @@ impl CompleteWorkflowMachine { /// Create a new WF machine and schedule it pub(crate) fn new_scheduled( attribs: CompleteWorkflowExecutionCommandAttributes, - ) -> (Self, AddCommand) { + ) -> (Self, Command) { let mut s = Self { state: Created {}.into(), shared_state: attribs, @@ -136,3 +135,5 @@ impl WFMachinesAdapter for CompleteWorkflowMachine { Ok(vec![]) } } + +impl Cancellable for CompleteWorkflowMachine {} diff --git a/src/machines/mod.rs b/src/machines/mod.rs index fc29e4cdf..fd0852201 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -1,6 +1,7 @@ #[allow(unused)] mod workflow_machines; +// TODO: Move all these inside a submachines module #[allow(unused)] mod activity_state_machine; #[allow(unused)] @@ -43,8 +44,8 @@ use crate::{ coresdk::{self, command::Variant, wf_activation_job}, temporal::api::{ command::v1::{ - command::Attributes, Command, CompleteWorkflowExecutionCommandAttributes, - StartTimerCommandAttributes, + command::Attributes, CancelTimerCommandAttributes, Command, + CompleteWorkflowExecutionCommandAttributes, StartTimerCommandAttributes, }, enums::v1::CommandType, history::v1::{ @@ -56,6 +57,7 @@ use crate::{ }; use prost::alloc::fmt::Formatter; use rustfsm::{MachineError, StateMachine}; +use std::rc::Rc; use std::{ convert::{TryFrom, TryInto}, fmt::Debug, @@ -93,13 +95,6 @@ pub(crate) trait ActivationListener { fn on_activation_job(&mut self, _activation: &wf_activation_job::Attributes) {} } -/// The struct for [WFCommand::AddCommand] -#[derive(Debug, derive_more::From)] -pub(crate) struct AddCommand { - /// The protobuf command - pub(crate) command: Command, -} - /// [DrivenWorkflow]s respond with these when called, to indicate what they want to do next. 
/// EX: Create a new timer, complete the workflow, etc.
#[derive(Debug, derive_more::From)]
@@ -107,6 +102,7 @@ pub enum WFCommand {
 /// Returned when we need to wait for the lang sdk to send us something
 NoCommandsFromLang,
 AddTimer(StartTimerCommandAttributes),
+ CancelTimer(CancelTimerCommandAttributes),
 CompleteWorkflow(CompleteWorkflowExecutionCommandAttributes),
}
@@ -148,11 +144,14 @@ trait TemporalStateMachine: CheckStateMachineInFinal + Send {
 event: &HistoryEvent,
 has_next_event: bool,
 ) -> Result<Vec<MachineResponse>, WFMachinesError>;
+
+ /// Attempt to cancel the command associated with this state machine, if it is cancellable
+ fn cancel(&mut self) -> Result<MachineResponse, WFMachinesError>;
}

impl<SM> TemporalStateMachine for SM
where
- SM: StateMachine + CheckStateMachineInFinal + WFMachinesAdapter + Clone + Send,
+ SM: StateMachine + CheckStateMachineInFinal + WFMachinesAdapter + Cancellable + Clone + Send,
 <SM as StateMachine>::Event: TryFrom<HistoryEvent>,
 <SM as StateMachine>::Event: TryFrom<CommandType>,
 WFMachinesError: From<<<SM as StateMachine>::Event as TryFrom<HistoryEvent>>::Error>,
@@ -211,6 +210,17 @@ where
 Err(MachineError::Underlying(e)) => Err(e.into()),
 }
 }
+
+ fn cancel(&mut self) -> Result<MachineResponse, WFMachinesError> {
+ let res = self.cancel();
+ dbg!(&res);
+ res.map_err(|e| match e {
+ MachineError::InvalidTransition => {
+ WFMachinesError::InvalidTransition("while attempting to cancel")
+ }
+ MachineError::Underlying(e) => e.into(),
+ })
+ }
}

/// Exists purely to allow generic implementation of `is_final_state` for all [StateMachine]
@@ -243,25 +253,52 @@ trait WFMachinesAdapter: StateMachine {
 ) -> Result<Vec<MachineResponse>, WFMachinesError>;
}

-/// A command which can be cancelled, associated with the state machine that produced it
+trait Cancellable: StateMachine {
+ /// Cancel the machine / the command represented by the machine.
+ ///
+ /// # Panics
+ /// * If the machine is not cancellable. It's a logic error on our part to call it on such
+ /// machines.
+ fn cancel(&mut self) -> Result<MachineResponse, MachineError<Self::Error>> {
+ // It's a logic error on our part if this is ever called on a machine that can't actually
+ // be cancelled TODO: Result instead?
+ panic!(format!("This type of machine cannot be cancelled"))
+ }
+}
+
+// TODO: Distinction is maybe unimportant
#[derive(Debug)]
-#[allow(clippy::large_enum_variant)]
-enum CancellableCommand {
- // TODO: You'll be used soon, friend.
- #[allow(dead_code)]
- Cancelled,
- Active {
- /// The inner protobuf command, if None, command has been cancelled
- command: ProtoCommand,
- machine: Box<dyn TemporalStateMachine>,
- },
+enum NewOrExistingCommand {
+ New(CommandAndMachine),
+ Existing(CommandAndMachine),
}

-impl CancellableCommand {
- #[allow(dead_code)] // TODO: Use
- pub(super) fn cancel(&mut self) {
- *self = CancellableCommand::Cancelled;
+impl NewOrExistingCommand {
+ fn machine(&self) -> &dyn TemporalStateMachine {
+ match self {
+ NewOrExistingCommand::New(n) => &*n.machine,
+ NewOrExistingCommand::Existing(e) => &*e.machine,
+ }
+ }
+ fn machine_mut(&mut self) -> &mut dyn TemporalStateMachine {
+ match self {
+ // TODO: Not this
+ NewOrExistingCommand::New(n) => Rc::get_mut(&mut n.machine).unwrap(),
+ NewOrExistingCommand::Existing(e) => Rc::get_mut(&mut e.machine).unwrap(),
+ }
 }
+ fn command(&self) -> &ProtoCommand {
+ match self {
+ NewOrExistingCommand::New(n) => &n.command,
+ NewOrExistingCommand::Existing(e) => &e.command,
+ }
+ }
+}
+
+#[derive(Debug)]
+struct CommandAndMachine {
+ command: ProtoCommand,
+ machine: Rc<dyn TemporalStateMachine>,
+}

impl Debug for dyn TemporalStateMachine {
diff --git a/src/machines/test_help/history_builder.rs b/src/machines/test_help/history_builder.rs
index 8a3230790..7ff37e3f4 100644
--- a/src/machines/test_help/history_builder.rs
+++ b/src/machines/test_help/history_builder.rs
@@ -55,7 +55,7 @@ impl TestHistoryBuilder {
 /// EVENT_TYPE_WORKFLOW_TASK_STARTED
 /// EVENT_TYPE_WORKFLOW_TASK_COMPLETED
 /// ```
- pub fn add_workflow_task(&mut self) {
+ pub fn add_full_wf_task(&mut self) {
 self.add_workflow_task_scheduled_and_started();
 self.add_workflow_task_completed();
 }
diff --git a/src/machines/test_help/workflow_driver.rs b/src/machines/test_help/workflow_driver.rs
index b369b1fb4..05496b2a3 100644
--- a/src/machines/test_help/workflow_driver.rs
+++ b/src/machines/test_help/workflow_driver.rs
@@ -1,3 +1,4 @@
+use crate::protos::temporal::api::command::v1::CancelTimerCommandAttributes;
use crate::{
 machines::{ActivationListener, DrivenWorkflow, WFCommand},
 protos::{
@@ -64,7 +65,6 @@ where
impl<F> ActivationListener for TestWorkflowDriver<F> {
 fn on_activation_job(&mut self, activation: &Attributes) {
 if let Attributes::TimerFired(TimerFiredTaskAttributes { timer_id }) = activation {
- dbg!(&timer_id);
 Arc::get_mut(&mut self.cache)
 .unwrap()
 .unblocked_timers
@@ -96,10 +96,10 @@ where
 let cmds = receiver.into_iter();

- let mut last_cmd = None;
+ let mut emit_these = vec![];
 for cmd in cmds {
 match cmd {
- TestWFCommand::WFCommand(c) => last_cmd = Some(c),
+ TestWFCommand::WFCommand(c) => emit_these.push(c),
 TestWFCommand::Waiting => {
 // Ignore further commands since we're waiting on something
 break;
 }
 }
 }

- event!(Level::DEBUG, msg = "Test wf driver emitting", ?last_cmd);
+ event!(Level::DEBUG, msg = "Test wf driver emitting", ?emit_these);

- // Return only the last command, since that's what would've been yielded in a real wf
- if let Some(c) = last_cmd {
- vec![c]
- } else {
- vec![]
- }
+ emit_these
 }

 fn signal(&mut self, _attribs: WorkflowExecutionSignaledEventAttributes) {}
@@ -161,6 +156,13 @@ impl CommandSender {
 finished
 }

+ pub fn cancel_timer(&mut self, timer_id: &str) {
+ let c = WFCommand::CancelTimer(CancelTimerCommandAttributes {
+ timer_id: timer_id.to_string(),
+ });
+ self.chan.send(c.into()).unwrap();
+ }
+
 pub fn send(&mut self, c: WFCommand) {
 self.chan.send(c.into()).unwrap();
 }
diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs
index
c7e0294a9..977b317af 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -1,13 +1,15 @@ #![allow(clippy::large_enum_variant)] -use crate::protos::coresdk::TimerCanceledTaskAttributes; +use crate::machines::CommandAndMachine; use crate::{ machines::{ workflow_machines::{MachineResponse, WFMachinesError, WorkflowMachines}, - AddCommand, CancellableCommand, WFCommand, WFMachinesAdapter, + Cancellable, TemporalStateMachine, WFCommand, WFMachinesAdapter, }, protos::{ - coresdk::{HistoryEventId, TimerFiredTaskAttributes, WfActivation}, + coresdk::{ + HistoryEventId, TimerCanceledTaskAttributes, TimerFiredTaskAttributes, WfActivation, + }, temporal::api::{ command::v1::{ command::Attributes, CancelTimerCommandAttributes, Command, @@ -21,9 +23,13 @@ use crate::{ }, }, }; -use rustfsm::{fsm, StateMachine, TransitionResult}; -use std::sync::Arc; -use std::{cell::RefCell, convert::TryFrom, rc::Rc, sync::atomic::Ordering}; +use rustfsm::{fsm, MachineError, StateMachine, TransitionResult}; +use std::{ + cell::RefCell, + convert::TryFrom, + rc::Rc, + sync::{atomic::Ordering, Arc}, +}; use tracing::Level; fsm! { @@ -50,29 +56,32 @@ fsm! { #[derive(Debug)] pub(super) enum TimerMachineCommand { - AddCommand(AddCommand), + // TODO: Perhaps just remove this + AddCommand(Command), Complete, Canceled, + IssueCancelCmd(Command), } /// Creates a new, scheduled, timer as a [CancellableCommand] -pub(super) fn new_timer(attribs: StartTimerCommandAttributes) -> CancellableCommand { +pub(super) fn new_timer(attribs: StartTimerCommandAttributes) -> CommandAndMachine { let (timer, add_cmd) = TimerMachine::new_scheduled(attribs); - CancellableCommand::Active { - command: add_cmd.command, - machine: Box::new(timer), + CommandAndMachine { + command: add_cmd, + machine: Rc::new(timer), } } impl TimerMachine { /// Create a new timer and immediately schedule it - pub(crate) fn new_scheduled(attribs: StartTimerCommandAttributes) -> (Self, AddCommand) { + pub(crate) fn new_scheduled(attribs: StartTimerCommandAttributes) -> (Self, Command) { let mut s = Self::new(attribs); let cmd = match s .on_event_mut(TimerMachineEvents::Schedule) .expect("Scheduling timers doesn't fail") .pop() { + // TODO: This seems silly - why bother with the command at all? 
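+ // (Best guess at the answer: driving the Schedule event is what makes the
+ // FSM emit the StartTimer command in the first place, so new_scheduled just
+ // harvests it here so the proto command and the machine that tracks it can
+ // travel together as a CommandAndMachine.)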
Some(TimerMachineCommand::AddCommand(c)) => c, _ => panic!("Timer on_schedule must produce command"), }; @@ -201,6 +210,7 @@ impl StartCommandRecorded { TimerMachineTransition::ok(vec![TimerMachineCommand::Complete], Fired::default()) } } + pub(super) fn on_cancel(self, dat: SharedState) -> TimerMachineTransition { let cmd = Command { command_type: CommandType::CancelTimer as i32, @@ -212,7 +222,7 @@ impl StartCommandRecorded { ), }; TimerMachineTransition::ok( - vec![TimerMachineCommand::AddCommand(cmd.into())], + vec![TimerMachineCommand::IssueCancelCmd(cmd)], CancelTimerCommandCreated::default(), ) } @@ -225,19 +235,31 @@ impl WFMachinesAdapter for TimerMachine { _has_next_event: bool, my_command: TimerMachineCommand, ) -> Result, WFMachinesError> { - match my_command { + Ok(match my_command { // Fire the completion - TimerMachineCommand::Complete => Ok(vec![TimerFiredTaskAttributes { + TimerMachineCommand::Complete => vec![TimerFiredTaskAttributes { timer_id: self.shared_state.attrs.timer_id.clone(), } - .into()]), - TimerMachineCommand::Canceled => Ok(vec![TimerCanceledTaskAttributes { + .into()], + TimerMachineCommand::Canceled => vec![TimerCanceledTaskAttributes { timer_id: self.shared_state.attrs.timer_id.clone(), } - .into()]), + .into()], + TimerMachineCommand::IssueCancelCmd(c) => vec![MachineResponse::IssueNewCommand(c)], TimerMachineCommand::AddCommand(_) => { unreachable!() } + }) + } +} + +impl Cancellable for TimerMachine { + fn cancel(&mut self) -> Result> { + match self.on_event_mut(TimerMachineEvents::Cancel)?.pop() { + Some(TimerMachineCommand::IssueCancelCmd(cmd)) => { + Ok(MachineResponse::IssueNewCommand(cmd)) + } + _ => panic!("Invalid cancel event response"), } } } @@ -262,6 +284,7 @@ mod test { }; use futures::{channel::mpsc::Sender, FutureExt, SinkExt}; use rstest::{fixture, rstest}; + use rustfsm::MachineError; use std::{error::Error, sync::Arc, time::Duration}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; @@ -302,7 +325,7 @@ mod test { WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, @@ -372,7 +395,7 @@ mod test { WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, @@ -391,21 +414,26 @@ mod test { #[test] fn cancellation() { - let twd = TestWorkflowDriver::new(|mut command_sink: CommandSender| async move { - let timer = StartTimerCommandAttributes { - timer_id: "timer1".to_string(), - start_to_fire_timeout: Some(Duration::from_secs(5).into()), - }; - let cancel_timer = StartTimerCommandAttributes { - timer_id: "cancel_timer".to_string(), - start_to_fire_timeout: Some(Duration::from_secs(500).into()), - }; - let cancel_this = command_sink.timer(cancel_timer, false); - command_sink.timer(timer, true); - // cancel_this.cancel(); + let twd = TestWorkflowDriver::new(|mut cmd_sink: CommandSender| async move { + let cancel_this = cmd_sink.timer( + StartTimerCommandAttributes { + timer_id: "cancel_timer".to_string(), + start_to_fire_timeout: Some(Duration::from_secs(500).into()), + }, + false, + ); + cmd_sink.timer( + StartTimerCommandAttributes { + timer_id: 
"wait_timer".to_string(), + start_to_fire_timeout: Some(Duration::from_secs(5).into()), + }, + true, + ); + // Cancel the first timer after having waited on the second + cmd_sink.cancel_timer("cancel_timer"); let complete = CompleteWorkflowExecutionCommandAttributes::default(); - command_sink.send(complete.into()); + cmd_sink.send(complete.into()); }); let mut t = TestHistoryBuilder::default(); @@ -413,16 +441,29 @@ mod test { WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add_full_wf_task(); + let cancel_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); + let wait_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id: "timer1".to_string(), + started_event_id: wait_timer_started_id, + timer_id: "wait_timer".to_string(), + }), + ); + t.add_full_wf_task(); + t.add( + EventType::TimerCanceled, + history_event::Attributes::TimerCanceledEventAttributes(TimerCanceledEventAttributes { + started_event_id: cancel_timer_started_id, + timer_id: "cancel_timer".to_string(), + ..Default::default() }), ); t.add_workflow_task_scheduled_and_started(); - assert_eq!(2, t.as_history().get_workflow_task_count(None).unwrap()); + // dbg!(t.as_history()); + let commands = t + .handle_workflow_task_take_cmds(&mut state_machines, None) + .unwrap(); } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 8a6c998cc..7a46ff05e 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -1,14 +1,16 @@ +use crate::machines::{CommandAndMachine, NewOrExistingCommand}; +use crate::protos::temporal::api::command::v1::Command; use crate::{ machines::{ complete_workflow_state_machine::complete_workflow, timer_state_machine::new_timer, - workflow_task_state_machine::WorkflowTaskMachine, ActivationListener, CancellableCommand, - DrivenWorkflow, ProtoCommand, TemporalStateMachine, WFCommand, + workflow_task_state_machine::WorkflowTaskMachine, ActivationListener, DrivenWorkflow, + ProtoCommand, TemporalStateMachine, WFCommand, }, protos::coresdk::WfActivationJob, protos::{ coresdk::{wf_activation_job, StartWorkflowTaskAttributes, WfActivation}, temporal::api::{ - command::v1::StartTimerCommandAttributes, + command::v1::{command, StartTimerCommandAttributes}, common::v1::WorkflowExecution, enums::v1::{CommandType, EventType}, history::v1::{history_event, HistoryEvent}, @@ -16,12 +18,12 @@ use crate::{ }, }; use futures::Future; -use rustfsm::StateMachine; +use rustfsm::{MachineError, StateMachine}; use std::{ borrow::BorrowMut, - cell::RefCell, collections::{HashMap, HashSet, VecDeque}, ops::DerefMut, + rc::Rc, sync::{atomic::AtomicBool, Arc}, time::SystemTime, }; @@ -51,15 +53,23 @@ pub(crate) struct WorkflowMachines { /// A mapping for accessing all the machines, where the key is the id of the initiating event /// for that machine. - machines_by_id: HashMap>, - - /// Queued commands which have been produced by machines and await processing - commands: VecDeque, - /// Commands generated by the currently processing workflow task. + machines_by_id: HashMap>, + + /// Maps timer ids as created by workflow authors to their initiating event IDs. 
There is no + /// reason to force the lang side to track event IDs, so we do it for them. + /// TODO: Make this apply to *all* cancellable things + /// TODO: Rc (Weak, really?) to machine, rather than this map-to-map nonsense + timer_id_to_initiating_event: HashMap, + + /// Queued commands which have been produced by machines and await processing / being sent to + /// the server. + commands: VecDeque, + /// Commands generated by the currently processing workflow task, which will eventually be + /// transferred to `commands` /// /// Old note: It is a queue as commands can be added (due to marker based commands) while /// iterating over already added commands. - current_wf_task_commands: VecDeque, + current_wf_task_commands: VecDeque, /// Outgoing activation jobs that need to be sent to the lang sdk outgoing_wf_activation_jobs: VecDeque, @@ -72,6 +82,7 @@ pub(crate) struct WorkflowMachines { #[must_use] pub(super) enum MachineResponse { PushWFJob(#[from(forward)] wf_activation_job::Attributes), + IssueNewCommand(Command), TriggerWFTaskStarted { task_started_event_id: i64, time: SystemTime, @@ -93,6 +104,10 @@ pub enum WFMachinesError { #[error("No command was scheduled for event {0:?}")] NoCommandScheduledForEvent(HistoryEvent), + #[error("Machine encountered an invalid transition: {0}")] + InvalidTransition(&'static str), + + // TODO: Don't really need anyhow here? #[error("Underlying error {0:?}")] Underlying(#[from] anyhow::Error), } @@ -114,20 +129,13 @@ impl WorkflowMachines { replaying: false, current_wf_time: None, machines_by_id: Default::default(), + timer_id_to_initiating_event: Default::default(), commands: Default::default(), current_wf_task_commands: Default::default(), outgoing_wf_activation_jobs: Default::default(), } } - /// Create a new timer for this workflow with the provided attributes and sender. The sender - /// is sent `true` when the timer completes. - /// - /// Returns the command and a future that will resolve when the timer completes - pub(super) fn new_timer(&mut self, attribs: StartTimerCommandAttributes) -> CancellableCommand { - new_timer(attribs) - } - /// Returns the id of the last seen WorkflowTaskStarted event pub(crate) fn get_last_started_event_id(&self) -> i64 { self.current_started_event_id @@ -165,7 +173,11 @@ impl WorkflowMachines { // borrowing from ourself mutably. let mut maybe_machine = self.machines_by_id.remove(&initial_cmd_id); if let Some(mut sm) = maybe_machine.as_mut() { - self.submachine_handle_event((*sm).borrow_mut(), event, has_next_event)?; + self.submachine_handle_event( + Rc::get_mut(sm).expect("TODO: Fix"), + event, + has_next_event, + )?; } else { event!( Level::ERROR, @@ -218,9 +230,9 @@ impl WorkflowMachines { self.event_loop(); } - /// A command event is an event which is generated from a command emitted by a past decision. - /// Each command has a correspondent event. For example ScheduleActivityTaskCommand is recorded - /// to the history as ActivityTaskScheduledEvent. + /// A command event is an event which is generated from a command emitted as a result of + /// performing a workflow task. Each command has a corresponding event. For example + /// ScheduleActivityTaskCommand is recorded to the history as ActivityTaskScheduledEvent. /// /// Command events always follow WorkflowTaskCompletedEvent. /// @@ -233,7 +245,7 @@ impl WorkflowMachines { // return; // } - let consumed_cmd = loop { + let mut consumed_cmd = loop { // handleVersionMarker can skip a marker event if the getVersion call was removed. 
// In this case we don't want to consume a command. -- we will need to replace it back // to the front when implementing, or something better @@ -246,16 +258,16 @@ impl WorkflowMachines { // Feed the machine the event let mut break_later = false; - if let CancellableCommand::Active { - ref mut machine, .. - } = &mut command - { - self.submachine_handle_event((*machine).borrow_mut(), event, true)?; - // TODO: Handle invalid event errors - // * More special handling for version machine - see java - // * Command/machine supposed to have cancelled itself + self.submachine_handle_event(command.machine_mut(), event, true)?; + + // TODO: Handle invalid event errors + // * More special handling for version machine - see java + // * Command/machine supposed to have cancelled itself + // If the command is cancelled or otherwise complete, move on to the next one. + // TODO: Why? + if !command.machine().is_final_state() { break_later = true; } @@ -266,9 +278,32 @@ impl WorkflowMachines { // TODO: validate command - if let CancellableCommand::Active { machine, .. } = consumed_cmd { - if !machine.is_final_state() { - self.machines_by_id.insert(event.event_id, machine); + match consumed_cmd { + // TODO: If new/existing distinction does not matter that lines up with this + NewOrExistingCommand::New(CommandAndMachine { command, machine }) => { + if !machine.is_final_state() { + self.machines_by_id.insert(event.event_id, machine); + // Additionally, some command types have user-created identifiers that may need to + // be associated with the event id, so that when (ex) a request to cancel them is + // issued we can identify them. + match command { + Command { + attributes: + Some(command::Attributes::StartTimerCommandAttributes( + StartTimerCommandAttributes { timer_id, .. }, + )), + .. + } => { + self.timer_id_to_initiating_event + .insert(timer_id, event.event_id); + } + _ => (), + } + } + } + NewOrExistingCommand::Existing(_) => { + dbg!("Broooo", consumed_cmd); + panic!("Ahh what's going on"); } } @@ -318,7 +353,7 @@ impl WorkflowMachines { has_next_event, )?; self.machines_by_id - .insert(event.event_id, Box::new(wf_task_sm)); + .insert(event.event_id, Rc::new(wf_task_sm)); } Some(EventType::WorkflowExecutionSignaled) => { // TODO: Signal callbacks @@ -336,13 +371,15 @@ impl WorkflowMachines { Ok(()) } - /// Fetches commands ready for processing from the state machines + /// Fetches commands which are ready for processing from the state machines, generally to be + /// sent off to the server. They are not removed from the internal queue, that happens when + /// corresponding history events from the server are being handled. pub(crate) fn get_commands(&mut self) -> Vec { self.commands .iter() .filter_map(|c| { - if let CancellableCommand::Active { command, .. } = c { - Some(command.clone()) + if !c.machine().is_final_state() { + Some(c.command().clone()) } else { None } @@ -400,9 +437,9 @@ impl WorkflowMachines { /// Wrapper for calling [TemporalStateMachine::handle_event] which appropriately takes action /// on the returned triggers - fn submachine_handle_event>( + fn submachine_handle_event( &mut self, - mut sm: TSM, + sm: &mut dyn TemporalStateMachine, event: &HistoryEvent, has_next_event: bool, ) -> Result<()> { @@ -426,6 +463,7 @@ impl WorkflowMachines { } => { self.task_started(task_started_event_id, time); } + _ => panic!("TODO: Should anything else be possible here? 
Probably?"), } } Ok(()) @@ -433,23 +471,51 @@ impl WorkflowMachines { fn handle_driven_results(&mut self, results: Vec) { for cmd in results { - // I don't love how boilerplatey this is + // I don't love how boilerplate this is for just pushing new commands, and how + // weird it feels for cancels. match cmd { WFCommand::AddTimer(attrs) => { - let timer = self.new_timer(attrs); - self.current_wf_task_commands.push_back(timer); + dbg!(&attrs.timer_id); + let timer = new_timer(attrs); + self.current_wf_task_commands + .push_back(NewOrExistingCommand::New(timer)); + } + WFCommand::CancelTimer(attrs) => { + // TODO: real errors + if let Some(event_id) = self.timer_id_to_initiating_event.get(&attrs.timer_id) { + let machine = self.machines_by_id.get_mut(event_id); + if let Some(machine) = machine { + // TODO: Fix this all up + let res = dbg!(Rc::get_mut(machine).unwrap().cancel()); + if let Ok(MachineResponse::IssueNewCommand(c)) = res { + self.current_wf_task_commands.push_back( + NewOrExistingCommand::Existing(CommandAndMachine { + command: c, + machine: machine.clone(), + }), + ) + } + } else { + panic!("Ahh no associated timer machine") + } + } else { + panic!("Ahh no associated timer") + } } WFCommand::CompleteWorkflow(attrs) => { self.current_wf_task_commands - .push_back(complete_workflow(attrs)); + .push_back(NewOrExistingCommand::New(complete_workflow(attrs))); } WFCommand::NoCommandsFromLang => (), } } } + /// Transfer commands from `current_wf_task_commands` to `commands`, so they may be sent off + /// to the server. fn prepare_commands(&mut self) { while let Some(c) = self.current_wf_task_commands.pop_front() { + dbg!(&c); // TODO - some special case stuff that can maybe be managed differently? // handleCommand should be called even on canceled ones to support mutableSideEffect // command.handleCommand(command.getCommandType()); diff --git a/src/machines/workflow_task_state_machine.rs b/src/machines/workflow_task_state_machine.rs index a2c64c22d..494a6c5e7 100644 --- a/src/machines/workflow_task_state_machine.rs +++ b/src/machines/workflow_task_state_machine.rs @@ -1,6 +1,7 @@ #![allow(clippy::enum_variant_names)] use crate::machines::workflow_machines::MachineResponse; +use crate::machines::Cancellable; use crate::{ machines::{ workflow_machines::{WFMachinesError, WorkflowMachines}, @@ -122,6 +123,8 @@ impl TryFrom for WorkflowTaskMachineEvents { } } +impl Cancellable for WorkflowTaskMachine {} + #[derive(Debug, Clone)] pub(super) struct SharedState { wf_task_started_event_id: i64, diff --git a/src/protosext/history_info.rs b/src/protosext/history_info.rs index 870af3743..3ce264f1e 100644 --- a/src/protosext/history_info.rs +++ b/src/protosext/history_info.rs @@ -179,7 +179,7 @@ mod tests { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 2e8a241bf..1daaca3b4 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -254,7 +254,7 @@ mod tests { let mgr = WorkflowConcurrencyManager::new(); let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); + t.add_full_wf_task(); let activation = mgr .create_or_update( From 7e7b44a4750255136daa0247db94dcb0dbb2a395 Mon Sep 17 00:00:00 2001 From: Spencer 
Judge Date: Thu, 18 Feb 2021 17:50:54 -0800 Subject: [PATCH 35/51] Fixed exploding Rc, still a mess. --- .../complete_workflow_state_machine.rs | 3 ++- src/machines/mod.rs | 19 ++++++++------ src/machines/timer_state_machine.rs | 2 +- src/machines/workflow_machines.rs | 25 ++++++++----------- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs index fc64209db..93bc95461 100644 --- a/src/machines/complete_workflow_state_machine.rs +++ b/src/machines/complete_workflow_state_machine.rs @@ -11,6 +11,7 @@ use crate::{ }, }; use rustfsm::{fsm, StateMachine, TransitionResult}; +use std::cell::RefCell; use std::{convert::TryFrom, rc::Rc}; fsm! { @@ -40,7 +41,7 @@ pub(super) fn complete_workflow( let (machine, add_cmd) = CompleteWorkflowMachine::new_scheduled(attribs); CommandAndMachine { command: add_cmd, - machine: Rc::new(machine), + machine: Rc::new(RefCell::new(machine)), } } diff --git a/src/machines/mod.rs b/src/machines/mod.rs index fd0852201..944a770ab 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -57,6 +57,8 @@ use crate::{ }; use prost::alloc::fmt::Formatter; use rustfsm::{MachineError, StateMachine}; +use std::cell::RefCell; +use std::ops::{Deref, DerefMut}; use std::rc::Rc; use std::{ convert::{TryFrom, TryInto}, @@ -274,17 +276,16 @@ enum NewOrExistingCommand { } impl NewOrExistingCommand { - fn machine(&self) -> &dyn TemporalStateMachine { + fn machine(&self) -> impl Deref + '_ { match self { - NewOrExistingCommand::New(n) => &*n.machine, - NewOrExistingCommand::Existing(e) => &*e.machine, + NewOrExistingCommand::New(n) => n.machine.borrow(), + NewOrExistingCommand::Existing(e) => e.machine.borrow(), } } - fn machine_mut(&mut self) -> &mut dyn TemporalStateMachine { + fn machine_mut(&mut self) -> impl DerefMut + '_ { match self { - // TODO: Not this - NewOrExistingCommand::New(n) => Rc::get_mut(&mut n.machine).unwrap(), - NewOrExistingCommand::Existing(e) => Rc::get_mut(&mut e.machine).unwrap(), + NewOrExistingCommand::New(n) => n.machine.borrow_mut(), + NewOrExistingCommand::Existing(e) => e.machine.borrow_mut(), } } fn command(&self) -> &ProtoCommand { @@ -295,10 +296,12 @@ impl NewOrExistingCommand { } } +type MachineRef = Rc>; + #[derive(Debug)] struct CommandAndMachine { command: ProtoCommand, - machine: Rc, + machine: MachineRef, } impl Debug for dyn TemporalStateMachine { diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 977b317af..1fd758bb7 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -68,7 +68,7 @@ pub(super) fn new_timer(attribs: StartTimerCommandAttributes) -> CommandAndMachi let (timer, add_cmd) = TimerMachine::new_scheduled(attribs); CommandAndMachine { command: add_cmd, - machine: Rc::new(timer), + machine: Rc::new(RefCell::new(timer)), } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 1f6931112..c16a75dec 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -1,4 +1,4 @@ -use crate::machines::{CommandAndMachine, NewOrExistingCommand}; +use crate::machines::{CommandAndMachine, MachineRef, NewOrExistingCommand}; use crate::protos::temporal::api::command::v1::Command; use crate::{ machines::{ @@ -19,6 +19,7 @@ use crate::{ }; use futures::Future; use rustfsm::{MachineError, StateMachine}; +use std::cell::RefCell; use std::{ borrow::BorrowMut, collections::{HashMap, HashSet, 
VecDeque},
 ops::DerefMut,
+ rc::Rc,
 sync::{atomic::AtomicBool, Arc},
 time::SystemTime,
};
@@ -53,7 +54,7 @@ pub(crate) struct WorkflowMachines {
 /// A mapping for accessing all the machines, where the key is the id of the initiating event
 /// for that machine.
- machines_by_id: HashMap<i64, Rc<dyn TemporalStateMachine>>,
+ machines_by_id: HashMap<i64, MachineRef>,

 /// Maps timer ids as created by workflow authors to their initiating event IDs. There is no
 /// reason to force the lang side to track event IDs, so we do it for them.
@@ -172,12 +173,8 @@ impl WorkflowMachines {
 // We remove the machine while it handles events, then return it, to avoid
 // borrowing from ourself mutably.
 let mut maybe_machine = self.machines_by_id.remove(&initial_cmd_id);
- if let Some(mut sm) = maybe_machine.as_mut() {
- self.submachine_handle_event(
- Rc::get_mut(sm).expect("TODO: Fix"),
- event,
- has_next_event,
- )?;
+ if let Some(sm) = maybe_machine.as_ref() {
+ self.submachine_handle_event((**sm).borrow_mut(), event, has_next_event)?;
 } else {
 event!(
 Level::ERROR,
@@ -189,7 +186,7 @@
 // Restore machine if not in its final state
 if let Some(sm) = maybe_machine {
- if !sm.is_final_state() {
+ if !sm.borrow().is_final_state() {
 self.machines_by_id.insert(initial_cmd_id, sm);
 }
 }
@@ -281,7 +278,7 @@
 match consumed_cmd {
 // TODO: If new/existing distinction does not matter that lines up with this
 NewOrExistingCommand::New(CommandAndMachine { command, machine }) => {
- if !machine.is_final_state() {
+ if !machine.borrow().is_final_state() {
 self.machines_by_id.insert(event.event_id, machine);
@@ -353,7 +350,7 @@
 has_next_event,
 )?;
 self.machines_by_id
- .insert(event.event_id, Rc::new(wf_task_sm));
+ .insert(event.event_id, Rc::new(RefCell::new(wf_task_sm)));
 }
 Some(EventType::WorkflowExecutionSignaled) => {
 // TODO: Signal callbacks
@@ -439,7 +436,7 @@
 fn submachine_handle_event(
 &mut self,
- sm: &mut dyn TemporalStateMachine,
+ mut sm: impl DerefMut<Target = dyn TemporalStateMachine>,
 event: &HistoryEvent,
 has_next_event: bool,
 ) -> Result<()> {
@@ -483,10 +480,10 @@
 WFCommand::CancelTimer(attrs) => {
 // TODO: real errors
 if let Some(event_id) = self.timer_id_to_initiating_event.get(&attrs.timer_id) {
- let machine = self.machines_by_id.get_mut(event_id);
+ let machine = self.machines_by_id.get(event_id);
 if let Some(machine) = machine {
 // TODO: Fix this all up
- let res = dbg!(Rc::get_mut(machine).unwrap().cancel());
+ let res = dbg!((**machine).borrow_mut().cancel());
 if let Ok(MachineResponse::IssueNewCommand(c)) = res {
 self.current_wf_task_commands.push_back(
 NewOrExistingCommand::Existing(CommandAndMachine {

From 0f4a32cc3fefedea4089eb0d00b8346c96239b9d Mon Sep 17 00:00:00 2001
From: Spencer Judge
Date: Fri, 19 Feb 2021 10:07:13 -0800
Subject: [PATCH 36/51] Incremental timer cancel UT is working

---
 fsm/state_machine_procmacro/src/lib.rs | 4 +++-
 src/machines/mod.rs | 16 ++++++++++++----
 src/machines/timer_state_machine.rs | 10 +++++++++-
 src/machines/workflow_machines.rs | 2 ++
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/fsm/state_machine_procmacro/src/lib.rs b/fsm/state_machine_procmacro/src/lib.rs
index 12a9a9d6c..6a19af7ea 100644
--- a/fsm/state_machine_procmacro/src/lib.rs
+++ b/fsm/state_machine_procmacro/src/lib.rs
@@ -329,7 +329,9 @@ impl StateMachineDefinition {
 .flat_map(|t| vec![t.from.clone(), t.to.clone()])
 .collect();
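 // Note on the two lines added below: each generated state variant gains a
 // `#[display(fmt = ...)]` attribute and the state enum derives `Display`, so
 // a machine can report which state it was in when a transition fails (this
 // is what feeds the new `InvalidTransitionDuringEvent` error).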
let state_variants = states.iter().map(|s| { + let statestr = s.to_string(); quote! { + #[display(fmt=#statestr)] #s(#s) } }); @@ -349,7 +351,7 @@ impl StateMachineDefinition { } }; let states_enum = quote! { - #[derive(::derive_more::From, Clone)] + #[derive(::derive_more::From, Clone, ::derive_more::Display)] #visibility enum #state_enum_name { #(#state_variants),* } diff --git a/src/machines/mod.rs b/src/machines/mod.rs index 944a770ab..1b2f36630 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -58,6 +58,7 @@ use crate::{ use prost::alloc::fmt::Formatter; use rustfsm::{MachineError, StateMachine}; use std::cell::RefCell; +use std::fmt::Display; use std::ops::{Deref, DerefMut}; use std::rc::Rc; use std::{ @@ -158,6 +159,7 @@ where ::Event: TryFrom, WFMachinesError: From<<::Event as TryFrom>::Error>, ::Command: Debug, + ::State: Display, ::Error: Into + 'static + Send + Sync, { fn name(&self) -> &str { @@ -205,10 +207,16 @@ where } Ok(triggers) } - Err(MachineError::InvalidTransition) => Err(WFMachinesError::UnexpectedEvent( - event.clone(), - "The handling machine says the transition is invalid", - )), + Err(MachineError::InvalidTransition) => { + Err(WFMachinesError::InvalidTransitionDuringEvent( + event.clone(), + format!( + "{} in state {} says the transition is invalid", + self.name(), + self.state() + ), + )) + } Err(MachineError::Underlying(e)) => Err(e.into()), } } diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 1fd758bb7..5e741daca 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -463,7 +463,15 @@ mod test { t.add_workflow_task_scheduled_and_started(); // dbg!(t.as_history()); let commands = t - .handle_workflow_task_take_cmds(&mut state_machines, None) + .handle_workflow_task_take_cmds(&mut state_machines, Some(1)) .unwrap(); + assert_eq!(commands.len(), 2); + assert_eq!(commands[0].command_type, CommandType::StartTimer as i32); + assert_eq!(commands[1].command_type, CommandType::StartTimer as i32); + let commands = t + .handle_workflow_task_take_cmds(&mut state_machines, Some(2)) + .unwrap(); + dbg!(&commands); + assert_eq!(commands.len(), 2); } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index c16a75dec..ceb14a68c 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -94,6 +94,8 @@ pub(super) enum MachineResponse { pub enum WFMachinesError { #[error("Event {0:?} was not expected: {1}")] UnexpectedEvent(HistoryEvent, &'static str), + #[error("Event {0:?} was not expected: {1}")] + InvalidTransitionDuringEvent(HistoryEvent, String), #[error("Event {0:?} was malformed: {1}")] MalformedEvent(HistoryEvent, String), // Expected to be transformed into a `MalformedEvent` with the full event by workflow machines, From fe41ba20ae10dc0b40dfd42f658e1cebee44bb0b Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Fri, 19 Feb 2021 10:13:02 -0800 Subject: [PATCH 37/51] :tada: Working! Needed to refine what a "cancelled command" really meant to get things going here. 
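The distinction that matters is whether the cancel happened before the command
was ever shipped to the server. Roughly, as an illustrative sketch (not the
literal code in this diff):

    // `machine` here is any cancellable machine, e.g. a timer machine
    if machine.was_cancelled_before_sent_to_server() {
        // the buffered StartTimer command can simply be dropped
    } else {
        // the server already knows about the timer, so cancel() must emit
        // a CancelTimer command to send back with the task completion
    }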
Also fixed an incorrect transition in the timer FSM, and fixed the test wf
driver to handle workflow execution completions.
---
 Cargo.toml | 5 +-
 src/core_tracing.rs | 21 ++++
 src/lib.rs | 1 +
 src/machines/mod.rs | 59 ++++-------
 src/machines/test_help/history_builder.rs | 16 ++-
 src/machines/test_help/workflow_driver.rs | 19 +++-
 src/machines/timer_state_machine.rs | 90 +++++++++++++---
 src/machines/workflow_machines.rs | 122 ++++++++++++----------
 src/protos/mod.rs | 9 ++
 src/workflow/bridge.rs | 3 +-
 10 files changed, 229 insertions(+), 116 deletions(-)
 create mode 100644 src/core_tracing.rs

diff --git a/Cargo.toml b/Cargo.toml
index 60e4be72b..85d4cddb2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,12 +18,15 @@ displaydoc = "0.1"
env_logger = "0.8"
futures = "0.3"
log = "0.4"
+once_cell = "1.5"
+opentelemetry-jaeger = "0.11"
+opentelemetry = "0.12"
prost = "0.7"
prost-types = "0.7"
thiserror = "1.0"
tokio = { version = "1.1", features = ["rt", "rt-multi-thread"] }
tracing = { version = "0.1", features = ["log"] }
-tracing-opentelemetry = "0.10"
+tracing-opentelemetry = "0.11"
tracing-subscriber = "0.2"
url = "2.2"
rand = "0.8.3"
diff --git a/src/core_tracing.rs b/src/core_tracing.rs
new file mode 100644
index 000000000..7d3eb0cc6
--- /dev/null
+++ b/src/core_tracing.rs
@@ -0,0 +1,21 @@
+use once_cell::sync::OnceCell;
+use opentelemetry_jaeger::Uninstall;
+use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
+
+static TRACING_INIT: OnceCell<Uninstall> = OnceCell::new();
+
+pub(crate) fn tracing_init() {
+ let _ = env_logger::try_init();
+ // TRACING_INIT.get_or_init(|| {
+ // let (tracer, uninstall) = opentelemetry_jaeger::new_pipeline()
+ // .with_service_name("coresdk")
+ // .install()
+ // .unwrap();
+ // let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer);
+ // tracing_subscriber::registry()
+ // .with(opentelemetry)
+ // .try_init()
+ // .unwrap();
+ // uninstall
+ // });
+}
diff --git a/src/lib.rs b/src/lib.rs
index 0df9106f4..3b5025609 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@ extern crate tracing;

pub mod protos;

+pub(crate) mod core_tracing;
mod machines;
mod pollers;
mod protosext;
diff --git a/src/machines/mod.rs b/src/machines/mod.rs
index 1b2f36630..7b692a555 100644
--- a/src/machines/mod.rs
+++ b/src/machines/mod.rs
@@ -57,13 +57,11 @@ use crate::{
};
use prost::alloc::fmt::Formatter;
use rustfsm::{MachineError, StateMachine};
-use std::cell::RefCell;
-use std::fmt::Display;
-use std::ops::{Deref, DerefMut};
-use std::rc::Rc;
use std::{
+ cell::RefCell,
 convert::{TryFrom, TryInto},
- fmt::Debug,
+ fmt::{Debug, Display},
+ rc::Rc,
};
use tracing::Level;

@@ -73,7 +71,7 @@ pub(crate) type ProtoCommand = Command;
/// drive it, start it, signal it, cancel it, etc.
pub(crate) trait DrivenWorkflow: ActivationListener + Send {
 /// Start the workflow
- fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes) -> Vec<WFCommand>;
+ fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes);

 /// Obtain any output from the workflow's recent execution(s). Because the lang sdk is
 /// responsible for calling workflow code as a result of receiving tasks from
@@ -150,6 +148,10 @@ trait TemporalStateMachine: CheckStateMachineInFinal + Send {

 /// Attempt to cancel the command associated with this state machine, if it is cancellable
 fn cancel(&mut self) -> Result<MachineResponse, WFMachinesError>;
+
+ /// Should return true if the command was cancelled before we sent it to the server. Always
Always + /// returns false for non-cancellable machines + fn was_cancelled_before_sent_to_server(&self) -> bool; } impl TemporalStateMachine for SM @@ -171,7 +173,8 @@ where Level::DEBUG, msg = "handling command", ?command_type, - machine_name = %self.name() + machine_name = %self.name(), + state = %self.state() ); if let Ok(converted_command) = command_type.try_into() { match self.on_event_mut(converted_command) { @@ -195,12 +198,15 @@ where Level::DEBUG, msg = "handling event", %event, - machine_name = %self.name() + machine_name = %self.name(), + state = %self.state() ); let converted_event = event.clone().try_into()?; match self.on_event_mut(converted_event) { Ok(c) => { - event!(Level::DEBUG, msg = "Machine produced commands", ?c); + if !c.is_empty() { + event!(Level::DEBUG, msg = "Machine produced commands", ?c, state = %self.state()); + } let mut triggers = vec![]; for cmd in c { triggers.extend(self.adapt_response(event, has_next_event, cmd)?); @@ -223,7 +229,6 @@ where fn cancel(&mut self) -> Result { let res = self.cancel(); - dbg!(&res); res.map_err(|e| match e { MachineError::InvalidTransition => { WFMachinesError::InvalidTransition("while attempting to cancel") @@ -231,6 +236,10 @@ where MachineError::Underlying(e) => e.into(), }) } + + fn was_cancelled_before_sent_to_server(&self) -> bool { + self.was_cancelled_before_sent_to_server() + } } /// Exists purely to allow generic implementation of `is_final_state` for all [StateMachine] @@ -274,38 +283,14 @@ trait Cancellable: StateMachine { // be cancelled TODO: Result instead? panic!(format!("This type of machine cannot be cancelled")) } -} - -// TODO: Distinction is maybe unimportant -#[derive(Debug)] -enum NewOrExistingCommand { - New(CommandAndMachine), - Existing(CommandAndMachine), -} -impl NewOrExistingCommand { - fn machine(&self) -> impl Deref + '_ { - match self { - NewOrExistingCommand::New(n) => n.machine.borrow(), - NewOrExistingCommand::Existing(e) => e.machine.borrow(), - } - } - fn machine_mut(&mut self) -> impl DerefMut + '_ { - match self { - NewOrExistingCommand::New(n) => n.machine.borrow_mut(), - NewOrExistingCommand::Existing(e) => e.machine.borrow_mut(), - } - } - fn command(&self) -> &ProtoCommand { - match self { - NewOrExistingCommand::New(n) => &n.command, - NewOrExistingCommand::Existing(e) => &e.command, - } + /// Should return true if the command was cancelled before we sent it to the server + fn was_cancelled_before_sent_to_server(&self) -> bool { + false } } type MachineRef = Rc>; - #[derive(Debug)] struct CommandAndMachine { command: ProtoCommand, diff --git a/src/machines/test_help/history_builder.rs b/src/machines/test_help/history_builder.rs index 7ff37e3f4..3474f3964 100644 --- a/src/machines/test_help/history_builder.rs +++ b/src/machines/test_help/history_builder.rs @@ -1,5 +1,7 @@ use super::Result; -use crate::protos::temporal::api::history::v1::History; +use crate::protos::temporal::api::history::v1::{ + History, WorkflowExecutionCompletedEventAttributes, +}; use crate::{ machines::{workflow_machines::WorkflowMachines, ProtoCommand}, protos::temporal::api::{ @@ -23,6 +25,7 @@ pub struct TestHistoryBuilder { current_event_id: i64, workflow_task_scheduled_event_id: i64, previous_started_event_id: i64, + previous_task_completed_id: i64, } impl TestHistoryBuilder { @@ -85,7 +88,16 @@ impl TestHistoryBuilder { scheduled_event_id: self.workflow_task_scheduled_event_id, ..Default::default() }; - self.build_and_push_event(EventType::WorkflowTaskCompleted, attrs.into()); + let id = 
self.add_get_event_id(EventType::WorkflowTaskCompleted, Some(attrs.into())); + self.previous_task_completed_id = id; + } + + pub fn add_workflow_execution_completed(&mut self) { + let attrs = WorkflowExecutionCompletedEventAttributes { + workflow_task_completed_event_id: self.previous_task_completed_id, + ..Default::default() + }; + self.build_and_push_event(EventType::WorkflowExecutionCompleted, attrs.into()); } pub fn as_history(&self) -> History { diff --git a/src/machines/test_help/workflow_driver.rs b/src/machines/test_help/workflow_driver.rs index 05496b2a3..29bcb21a4 100644 --- a/src/machines/test_help/workflow_driver.rs +++ b/src/machines/test_help/workflow_driver.rs @@ -32,6 +32,8 @@ use tracing::Level; pub(in crate::machines) struct TestWorkflowDriver { wf_function: F, cache: Arc, + /// Set to true if a workflow execution completed/failed/cancelled/etc has been issued + sent_final_execution: bool, } #[derive(Default, Debug)] @@ -58,6 +60,7 @@ where Self { wf_function: workflow_fn, cache: Default::default(), + sent_final_execution: false, } } } @@ -78,12 +81,17 @@ where F: Fn(CommandSender) -> Fut + Send + Sync, Fut: Future, { - fn start(&mut self, _attribs: WorkflowExecutionStartedEventAttributes) -> Vec { + fn start(&mut self, _attribs: WorkflowExecutionStartedEventAttributes) { event!(Level::DEBUG, msg = "Test WF driver start called"); - vec![] } fn fetch_workflow_iteration_output(&mut self) -> Vec { + // If we have already sent the command to complete the workflow, we don't want + // to re-run the worfklow again. + if self.sent_final_execution { + return vec![]; + } + let (sender, receiver) = CommandSender::new(self.cache.clone()); // Call the closure that produces the workflow future let wf_future = (self.wf_function)(sender); @@ -99,7 +107,12 @@ where let mut emit_these = vec![]; for cmd in cmds { match cmd { - TestWFCommand::WFCommand(c) => emit_these.push(c), + TestWFCommand::WFCommand(c) => { + if let WFCommand::CompleteWorkflow(_) = &c { + self.sent_final_execution = true; + } + emit_these.push(c); + } TestWFCommand::Waiting => { // Ignore further commands since we're waiting on something break; diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 5e741daca..0341088f0 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -23,6 +23,7 @@ use crate::{ }, }, }; +use futures::FutureExt; use rustfsm::{fsm, MachineError, StateMachine, TransitionResult}; use std::{ cell::RefCell, @@ -42,7 +43,7 @@ fsm! 
{ StartCommandCreated --(CommandStartTimer) --> StartCommandCreated; StartCommandCreated --(TimerStarted(HistoryEventId), on_timer_started) --> StartCommandRecorded; - StartCommandCreated --(Cancel, on_cancel) --> Canceled; + StartCommandCreated --(Cancel, shared on_cancel) --> Canceled; StartCommandRecorded --(TimerFired(TimerFiredEventAttributes), shared on_timer_fired) --> Fired; StartCommandRecorded --(Cancel, shared on_cancel) --> CancelTimerCommandCreated; @@ -91,7 +92,10 @@ impl TimerMachine { fn new(attribs: StartTimerCommandAttributes) -> Self { Self { state: Created {}.into(), - shared_state: SharedState { attrs: attribs }, + shared_state: SharedState { + attrs: attribs, + cancelled_before_sent: false, + }, } } } @@ -140,6 +144,7 @@ impl TryFrom for TimerMachineEvents { #[derive(Default, Clone)] pub(super) struct SharedState { attrs: StartTimerCommandAttributes, + cancelled_before_sent: bool, } #[derive(Default, Clone)] @@ -161,7 +166,10 @@ impl Created { pub(super) struct CancelTimerCommandCreated {} impl CancelTimerCommandCreated { pub(super) fn on_command_cancel_timer(self) -> TimerMachineTransition { - TimerMachineTransition::ok(vec![TimerMachineCommand::Canceled], Canceled::default()) + TimerMachineTransition::ok( + vec![TimerMachineCommand::Canceled], + CancelTimerCommandSent::default(), + ) } } @@ -187,8 +195,15 @@ impl StartCommandCreated { // TODO: Java recorded an initial event ID, but it seemingly was never used. TimerMachineTransition::default::() } - pub(super) fn on_cancel(mut self) -> TimerMachineTransition { - TimerMachineTransition::ok(vec![TimerMachineCommand::Canceled], Canceled::default()) + pub(super) fn on_cancel(mut self, dat: SharedState) -> TimerMachineTransition { + TimerMachineTransition::ok_shared( + vec![TimerMachineCommand::Canceled], + Canceled::default(), + SharedState { + cancelled_before_sent: true, + ..dat + }, + ) } } @@ -212,6 +227,7 @@ impl StartCommandRecorded { } pub(super) fn on_cancel(self, dat: SharedState) -> TimerMachineTransition { + dbg!("On cancel!"); let cmd = Command { command_type: CommandType::CancelTimer as i32, attributes: Some( @@ -259,9 +275,13 @@ impl Cancellable for TimerMachine { Some(TimerMachineCommand::IssueCancelCmd(cmd)) => { Ok(MachineResponse::IssueNewCommand(cmd)) } - _ => panic!("Invalid cancel event response"), + x => panic!(format!("Invalid cancel event response {:?}", x)), } } + + fn was_cancelled_before_sent_to_server(&self) -> bool { + self.shared_state().cancelled_before_sent + } } #[cfg(test)] @@ -286,10 +306,10 @@ mod test { use rstest::{fixture, rstest}; use rustfsm::MachineError; use std::{error::Error, sync::Arc, time::Duration}; - use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; #[fixture] fn fire_happy_hist() -> (TestHistoryBuilder, WorkflowMachines) { + crate::core_tracing::tracing_init(); /* 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED @@ -341,7 +361,7 @@ mod test { #[rstest] fn test_fire_happy_path_inc(fire_happy_hist: (TestHistoryBuilder, WorkflowMachines)) { - let s = span!(Level::DEBUG, "Test start", t = "inc"); + let s = span!(Level::DEBUG, "Test start", t = "happy_inc"); let _enter = s.enter(); let (t, mut state_machines) = fire_happy_hist; @@ -366,12 +386,12 @@ mod test { #[rstest] fn test_fire_happy_path_full(fire_happy_hist: (TestHistoryBuilder, WorkflowMachines)) { - let s = span!(Level::DEBUG, "Test start", t = "full"); + let s = span!(Level::DEBUG, "Test start", t = "happy_full"); let _enter = s.enter(); let (t, mut 
state_machines) = fire_happy_hist; let commands = t - .handle_workflow_task_take_cmds(&mut state_machines, Some(2)) + .handle_workflow_task_take_cmds(&mut state_machines, None) .unwrap(); assert_eq!(commands.len(), 1); assert_eq!( @@ -412,8 +432,10 @@ mod test { .contains("Timer fired event did not have expected timer id realid!")) } - #[test] - fn cancellation() { + #[fixture] + fn cancellation_setup() -> (TestHistoryBuilder, WorkflowMachines) { + crate::core_tracing::tracing_init(); + let twd = TestWorkflowDriver::new(|mut cmd_sink: CommandSender| async move { let cancel_this = cmd_sink.timer( StartTimerCommandAttributes { @@ -451,7 +473,9 @@ mod test { timer_id: "wait_timer".to_string(), }), ); + // 8 t.add_full_wf_task(); + // 11 t.add( EventType::TimerCanceled, history_event::Attributes::TimerCanceledEventAttributes(TimerCanceledEventAttributes { @@ -460,8 +484,17 @@ mod test { ..Default::default() }), ); - t.add_workflow_task_scheduled_and_started(); - // dbg!(t.as_history()); + // 12 + t.add_workflow_execution_completed(); + (t, state_machines) + } + + #[rstest] + fn incremental_cancellation(cancellation_setup: (TestHistoryBuilder, WorkflowMachines)) { + let s = span!(Level::DEBUG, "Test start", t = "cancel_inc"); + let _enter = s.enter(); + + let (t, mut state_machines) = cancellation_setup; let commands = t .handle_workflow_task_take_cmds(&mut state_machines, Some(1)) .unwrap(); @@ -471,7 +504,34 @@ mod test { let commands = t .handle_workflow_task_take_cmds(&mut state_machines, Some(2)) .unwrap(); - dbg!(&commands); assert_eq!(commands.len(), 2); + assert_eq!(commands[0].command_type, CommandType::CancelTimer as i32); + assert_eq!( + commands[1].command_type, + CommandType::CompleteWorkflowExecution as i32 + ); + // TODO in Java no commands are prepared or anything for 11 and 12 + // but I'm screwing up on event 10, the last WFTC. + // Problem seems to be timer machine's cancel gets called a second time, when it shouldn't + // be, which might really just be a problem with the way the test is driven, as it looks + // like no commands should be emitted on last (4th) iteration of wf. Think that's it. + let commands = t + .handle_workflow_task_take_cmds(&mut state_machines, None) + .unwrap(); + // There should be no commands - the wf completed at the same time the timer was cancelled + assert_eq!(commands.len(), 0); + } + + #[rstest] + fn full_cancellation(cancellation_setup: (TestHistoryBuilder, WorkflowMachines)) { + let s = span!(Level::DEBUG, "Test start", t = "cancel_full"); + let _enter = s.enter(); + + let (t, mut state_machines) = cancellation_setup; + let commands = t + .handle_workflow_task_take_cmds(&mut state_machines, None) + .unwrap(); + // There should be no commands - the wf completed at the same time the timer was cancelled + assert_eq!(commands.len(), 0); } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index ceb14a68c..178b7482b 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -1,4 +1,4 @@ -use crate::machines::{CommandAndMachine, MachineRef, NewOrExistingCommand}; +use crate::machines::{CommandAndMachine, MachineRef}; use crate::protos::temporal::api::command::v1::Command; use crate::{ machines::{ @@ -64,13 +64,13 @@ pub(crate) struct WorkflowMachines { /// Queued commands which have been produced by machines and await processing / being sent to /// the server. 
- commands: VecDeque, + commands: VecDeque, /// Commands generated by the currently processing workflow task, which will eventually be /// transferred to `commands` /// /// Old note: It is a queue as commands can be added (due to marker based commands) while /// iterating over already added commands. - current_wf_task_commands: VecDeque, + current_wf_task_commands: VecDeque, /// Outgoing activation jobs that need to be sent to the lang sdk outgoing_wf_activation_jobs: VecDeque, @@ -243,6 +243,7 @@ impl WorkflowMachines { // if (handleLocalActivityMarker(event)) { // return; // } + event!(Level::DEBUG, msg = "handling command event", current_commands = ?self.commands); let mut consumed_cmd = loop { // handleVersionMarker can skip a marker event if the getVersion call was removed. @@ -258,15 +259,22 @@ impl WorkflowMachines { // Feed the machine the event let mut break_later = false; - self.submachine_handle_event(command.machine_mut(), event, true)?; + self.submachine_handle_event((*command.machine).borrow_mut(), event, true)?; - // TODO: Handle invalid event errors + // TODO: // * More special handling for version machine - see java - // * Command/machine supposed to have cancelled itself - - // If the command is cancelled or otherwise complete, move on to the next one. - // TODO: Why? - if !command.machine().is_final_state() { + // * Commands cancelled this iteration are allowed to not match the event? + + // TODO: In java this is `if !command.isCancelled()`, using the old `CancellableCommand` + // Weirdly, a timer machine being in `CANCELED` does *not* count as `isCancelled`, and + // it specifically seems to mean that `cancel` was called this iteration on the + // cancellable command, so really this is some weird edge case about handling commands + // which were cancelled in this iteration differently -- recalling that `isCancelled` + // really only applies to "cancelled before we sent it to the server" + if !(*command.machine) + .borrow() + .was_cancelled_before_sent_to_server() + { break_later = true; } @@ -277,32 +285,24 @@ impl WorkflowMachines { // TODO: validate command - match consumed_cmd { - // TODO: If new/existing distinction does not matter that lines up with this - NewOrExistingCommand::New(CommandAndMachine { command, machine }) => { - if !machine.borrow().is_final_state() { - self.machines_by_id.insert(event.event_id, machine); - // Additionally, some command types have user-created identifiers that may need to - // be associated with the event id, so that when (ex) a request to cancel them is - // issued we can identify them. - match command { - Command { - attributes: - Some(command::Attributes::StartTimerCommandAttributes( - StartTimerCommandAttributes { timer_id, .. }, - )), - .. - } => { - self.timer_id_to_initiating_event - .insert(timer_id, event.event_id); - } - _ => (), - } + if !consumed_cmd.machine.borrow().is_final_state() { + self.machines_by_id + .insert(event.event_id, consumed_cmd.machine); + // Additionally, some command types have user-created identifiers that may need to + // be associated with the event id, so that when (ex) a request to cancel them is + // issued we can identify them. + match consumed_cmd.command { + Command { + attributes: + Some(command::Attributes::StartTimerCommandAttributes( + StartTimerCommandAttributes { timer_id, .. }, + )), + .. 
+ } => { + self.timer_id_to_initiating_event + .insert(timer_id, event.event_id); } - } - NewOrExistingCommand::Existing(_) => { - dbg!("Broooo", consumed_cmd); - panic!("Ahh what's going on"); + _ => (), } } @@ -334,8 +334,7 @@ impl WorkflowMachines { } .into(), ); - let results = self.drive_me.start(attrs.clone()); - self.handle_driven_results(results); + self.drive_me.start(attrs.clone()); } else { return Err(WFMachinesError::MalformedEvent( event.clone(), @@ -377,9 +376,10 @@ impl WorkflowMachines { self.commands .iter() .filter_map(|c| { - if !c.machine().is_final_state() { - Some(c.command().clone()) + if !(*c.machine).borrow().is_final_state() { + Some(c.command.clone()) } else { + dbg!("Final state!!!!!"); None } }) @@ -449,7 +449,9 @@ impl WorkflowMachines { e } })?; - event!(Level::DEBUG, msg = "Machine produced triggers", ?triggers); + if !triggers.is_empty() { + event!(Level::DEBUG, msg = "Machine produced triggers", ?triggers); + } for trigger in triggers { match trigger { MachineResponse::PushWFJob(a) => { @@ -474,10 +476,8 @@ impl WorkflowMachines { // weird it feels for cancels. match cmd { WFCommand::AddTimer(attrs) => { - dbg!(&attrs.timer_id); let timer = new_timer(attrs); - self.current_wf_task_commands - .push_back(NewOrExistingCommand::New(timer)); + self.current_wf_task_commands.push_back(timer); } WFCommand::CancelTimer(attrs) => { // TODO: real errors @@ -485,14 +485,20 @@ impl WorkflowMachines { let machine = self.machines_by_id.get(event_id); if let Some(machine) = machine { // TODO: Fix this all up - let res = dbg!((**machine).borrow_mut().cancel()); - if let Ok(MachineResponse::IssueNewCommand(c)) = res { - self.current_wf_task_commands.push_back( - NewOrExistingCommand::Existing(CommandAndMachine { + let res = (**machine).borrow_mut().cancel(); + match res { + Ok(MachineResponse::IssueNewCommand(c)) => { + self.current_wf_task_commands.push_back(CommandAndMachine { command: c, machine: machine.clone(), - }), - ) + }) + } + Ok(v) => { + dbg!(v); + } + Err(e) => { + panic!(format!("Cancel timer error {:?}", e)); + } } } else { panic!("Ahh no associated timer machine") @@ -503,7 +509,7 @@ impl WorkflowMachines { } WFCommand::CompleteWorkflow(attrs) => { self.current_wf_task_commands - .push_back(NewOrExistingCommand::New(complete_workflow(attrs))); + .push_back(complete_workflow(attrs)); } WFCommand::NoCommandsFromLang => (), } @@ -511,14 +517,18 @@ impl WorkflowMachines { } /// Transfer commands from `current_wf_task_commands` to `commands`, so they may be sent off - /// to the server. + /// to the server. While doing so, [TemporalStateMachine::handle_command] is called on the + /// machine associated with the command. + #[instrument(level = "debug", skip(self))] fn prepare_commands(&mut self) { - while let Some(c) = self.current_wf_task_commands.pop_front() { - dbg!(&c); - // TODO - some special case stuff that can maybe be managed differently? 
- // handleCommand should be called even on canceled ones to support mutableSideEffect - // command.handleCommand(command.getCommandType()); + event!(Level::DEBUG, msg = "start prepare_commands", + cur_wf_task_cmds = ?self.current_wf_task_commands); + while let Some(mut c) = self.current_wf_task_commands.pop_front() { + // TODO: This conversion sux -- probably add to NewOrExistingCommand + let cmd_type = CommandType::from_i32(c.command.command_type).unwrap(); + (*c.machine).borrow_mut().handle_command(cmd_type); self.commands.push_back(c); } + event!(Level::DEBUG, msg = "end prepare_commands", commands = ?self.commands); } } diff --git a/src/protos/mod.rs b/src/protos/mod.rs index b0c223057..f7536a822 100644 --- a/src/protos/mod.rs +++ b/src/protos/mod.rs @@ -149,14 +149,23 @@ pub mod temporal { let mut history = self.events.iter().peekable(); while let Some(event) = history.next() { let next_event = history.peek(); + + if event.is_final_wf_execution_event() { + // If the workflow is complete, we're done. + // TODO: Should we throw err if next event is populated? + return Ok(count); + } + if let Some(upto) = up_to_event_id { if event.event_id > upto { return Ok(count); } } + let next_is_completed = next_event.map_or(false, |ne| { ne.event_type == EventType::WorkflowTaskCompleted as i32 }); + if event.event_type == EventType::WorkflowTaskStarted as i32 && (next_event.is_none() || next_is_completed) { diff --git a/src/workflow/bridge.rs b/src/workflow/bridge.rs index c4b8bc699..cc6f0895a 100644 --- a/src/workflow/bridge.rs +++ b/src/workflow/bridge.rs @@ -32,10 +32,9 @@ impl WorkflowBridge { } impl DrivenWorkflow for WorkflowBridge { - fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes) -> Vec { + fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes) { event!(Level::DEBUG, msg = "Workflow bridge start called", ?attribs); self.started_attrs = Some(attribs); - vec![] } fn fetch_workflow_iteration_output(&mut self) -> Vec { From 8495c982727fcbb4e5a4e819824ec31996046ce6 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 10:28:20 -0800 Subject: [PATCH 38/51] Fix errors and remove extra indirection in timer machine lookup Next: Use weak ref instead --- src/machines/test_help/workflow_driver.rs | 4 +- src/machines/timer_state_machine.rs | 1 - src/machines/workflow_machines.rs | 68 +++++++++++------------ src/protos/mod.rs | 2 +- src/protosext/history_info.rs | 14 +++-- 5 files changed, 45 insertions(+), 44 deletions(-) diff --git a/src/machines/test_help/workflow_driver.rs b/src/machines/test_help/workflow_driver.rs index 29bcb21a4..d6edc8506 100644 --- a/src/machines/test_help/workflow_driver.rs +++ b/src/machines/test_help/workflow_driver.rs @@ -87,7 +87,9 @@ where fn fetch_workflow_iteration_output(&mut self) -> Vec { // If we have already sent the command to complete the workflow, we don't want - // to re-run the worfklow again. + // to re-run the workflow again. + // TODO: This would be better to solve by actually pausing the workflow properly rather + // than doing the re-run the whole thing every time deal. 
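The counting rule the history hunk above implements: walk events with one event of lookahead, stop once the workflow execution is complete or the requested event id has been passed, and count a workflow task whenever a WorkflowTaskStarted event is either last in the batch or directly followed by WorkflowTaskCompleted. A self-contained sketch with a stand-in event type:

    #[derive(PartialEq)]
    enum Ev {
        Started,
        Completed,
        Other,
        Final, // terminal workflow execution event
    }

    fn count_wf_tasks(events: &[Ev]) -> usize {
        let mut count = 0;
        let mut history = events.iter().peekable();
        while let Some(event) = history.next() {
            if *event == Ev::Final {
                return count; // workflow is complete; we're done
            }
            let next_is_completed = history.peek().map_or(false, |ne| **ne == Ev::Completed);
            if *event == Ev::Started && (history.peek().is_none() || next_is_completed) {
                count += 1;
            }
        }
        count
    }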
if self.sent_final_execution { return vec![]; } diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 0341088f0..334aebc29 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -227,7 +227,6 @@ impl StartCommandRecorded { } pub(super) fn on_cancel(self, dat: SharedState) -> TimerMachineTransition { - dbg!("On cancel!"); let cmd = Command { command_type: CommandType::CancelTimer as i32, attributes: Some( diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 178b7482b..489057431 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -59,8 +59,7 @@ pub(crate) struct WorkflowMachines { /// Maps timer ids as created by workflow authors to their initiating event IDs. There is no /// reason to force the lang side to track event IDs, so we do it for them. /// TODO: Make this apply to *all* cancellable things - /// TODO: Rc (Weak, really?) to machine, rather than this map-to-map nonsense - timer_id_to_initiating_event: HashMap, + timer_id_to_machine: HashMap, /// Queued commands which have been produced by machines and await processing / being sent to /// the server. @@ -81,7 +80,7 @@ pub(crate) struct WorkflowMachines { /// Returned by [TemporalStateMachine]s when handling events #[derive(Debug, derive_more::From)] #[must_use] -pub(super) enum MachineResponse { +pub(crate) enum MachineResponse { PushWFJob(#[from(forward)] wf_activation_job::Attributes), IssueNewCommand(Command), TriggerWFTaskStarted { @@ -91,7 +90,7 @@ pub(super) enum MachineResponse { } #[derive(thiserror::Error, Debug)] -pub enum WFMachinesError { +pub(crate) enum WFMachinesError { #[error("Event {0:?} was not expected: {1}")] UnexpectedEvent(HistoryEvent, &'static str), #[error("Event {0:?} was not expected: {1}")] @@ -106,6 +105,10 @@ pub enum WFMachinesError { UnexpectedCommand(CommandType), #[error("No command was scheduled for event {0:?}")] NoCommandScheduledForEvent(HistoryEvent), + #[error("Machine response {0:?} was not expected: {1}")] + UnexpectedMachineResponse(MachineResponse, &'static str), + #[error("Command was missing its associated machine: {0}")] + MissingAssociatedMachine(String), #[error("Machine encountered an invalid transition: {0}")] InvalidTransition(&'static str), @@ -132,7 +135,7 @@ impl WorkflowMachines { replaying: false, current_wf_time: None, machines_by_id: Default::default(), - timer_id_to_initiating_event: Default::default(), + timer_id_to_machine: Default::default(), commands: Default::default(), current_wf_task_commands: Default::default(), outgoing_wf_activation_jobs: Default::default(), @@ -265,12 +268,6 @@ impl WorkflowMachines { // * More special handling for version machine - see java // * Commands cancelled this iteration are allowed to not match the event? 
- // TODO: In java this is `if !command.isCancelled()`, using the old `CancellableCommand` - // Weirdly, a timer machine being in `CANCELED` does *not* count as `isCancelled`, and - // it specifically seems to mean that `cancel` was called this iteration on the - // cancellable command, so really this is some weird edge case about handling commands - // which were cancelled in this iteration differently -- recalling that `isCancelled` - // really only applies to "cancelled before we sent it to the server" if !(*command.machine) .borrow() .was_cancelled_before_sent_to_server() @@ -287,7 +284,7 @@ impl WorkflowMachines { if !consumed_cmd.machine.borrow().is_final_state() { self.machines_by_id - .insert(event.event_id, consumed_cmd.machine); + .insert(event.event_id, consumed_cmd.machine.clone()); // Additionally, some command types have user-created identifiers that may need to // be associated with the event id, so that when (ex) a request to cancel them is // issued we can identify them. @@ -299,8 +296,8 @@ impl WorkflowMachines { )), .. } => { - self.timer_id_to_initiating_event - .insert(timer_id, event.event_id); + self.timer_id_to_machine + .insert(timer_id, consumed_cmd.machine); } _ => (), } @@ -470,7 +467,7 @@ impl WorkflowMachines { Ok(()) } - fn handle_driven_results(&mut self, results: Vec) { + fn handle_driven_results(&mut self, results: Vec) -> Result<()> { for cmd in results { // I don't love how boilerplate this is for just pushing new commands, and how // weird it feels for cancels. @@ -480,31 +477,27 @@ impl WorkflowMachines { self.current_wf_task_commands.push_back(timer); } WFCommand::CancelTimer(attrs) => { - // TODO: real errors - if let Some(event_id) = self.timer_id_to_initiating_event.get(&attrs.timer_id) { - let machine = self.machines_by_id.get(event_id); - if let Some(machine) = machine { - // TODO: Fix this all up - let res = (**machine).borrow_mut().cancel(); - match res { - Ok(MachineResponse::IssueNewCommand(c)) => { - self.current_wf_task_commands.push_back(CommandAndMachine { - command: c, - machine: machine.clone(), - }) - } - Ok(v) => { - dbg!(v); - } - Err(e) => { - panic!(format!("Cancel timer error {:?}", e)); - } + if let Some(machine) = self.timer_id_to_machine.get(&attrs.timer_id) { + let res = (**machine).borrow_mut().cancel()?; + match res { + MachineResponse::IssueNewCommand(c) => { + self.current_wf_task_commands.push_back(CommandAndMachine { + command: c, + machine: machine.clone(), + }) + } + v => { + return Err(WFMachinesError::UnexpectedMachineResponse( + v, + "When cancelling timer", + )) } - } else { - panic!("Ahh no associated timer machine") } } else { - panic!("Ahh no associated timer") + return Err(WFMachinesError::MissingAssociatedMachine(format!( + "Timer with id {} was missing associated machine", + attrs.timer_id + ))); } } WFCommand::CompleteWorkflow(attrs) => { @@ -514,6 +507,7 @@ impl WorkflowMachines { WFCommand::NoCommandsFromLang => (), } } + Ok(()) } /// Transfer commands from `current_wf_task_commands` to `commands`, so they may be sent off diff --git a/src/protos/mod.rs b/src/protos/mod.rs index f7536a822..3fc2385e9 100644 --- a/src/protos/mod.rs +++ b/src/protos/mod.rs @@ -140,7 +140,7 @@ pub mod temporal { /// /// If `up_to_event_id` is provided, the count will be returned as soon as /// processing advances past that id. 
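Condensed shape of the reworked cancel path: one map lookup, with a missing machine or an unexpected machine response surfaced as a typed error rather than a panic. MachineRef is assumed here to be the Rc<RefCell<dyn ...>> alias in use at this point in the series:

    use std::{cell::RefCell, collections::HashMap, rc::Rc};

    trait Machine {
        // true if cancelling produced a server-bound CancelTimer command
        fn cancel(&mut self) -> bool;
    }
    type MachineRef = Rc<RefCell<dyn Machine>>;

    #[derive(Debug)]
    enum CancelError {
        MissingAssociatedMachine(String),
        UnexpectedMachineResponse(&'static str),
    }

    fn cancel_timer(
        timers: &HashMap<String, MachineRef>,
        timer_id: &str,
    ) -> Result<(), CancelError> {
        let machine = timers.get(timer_id).ok_or_else(|| {
            CancelError::MissingAssociatedMachine(format!(
                "Missing associated machine for cancelling timer {}",
                timer_id
            ))
        })?;
        if machine.borrow_mut().cancel() {
            Ok(())
        } else {
            Err(CancelError::UnexpectedMachineResponse("When cancelling timer"))
        }
    }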
- pub fn get_workflow_task_count( + pub(crate) fn get_workflow_task_count( &self, up_to_event_id: Option, ) -> Result { diff --git a/src/protosext/history_info.rs b/src/protosext/history_info.rs index 3ce264f1e..9988a099a 100644 --- a/src/protosext/history_info.rs +++ b/src/protosext/history_info.rs @@ -1,5 +1,5 @@ use crate::{ - machines::{WFMachinesError, WorkflowMachines}, + machines::WorkflowMachines, protos::temporal::api::enums::v1::EventType, protos::temporal::api::history::v1::{History, HistoryEvent}, }; @@ -24,8 +24,10 @@ pub enum HistoryInfoError { FailedOrTimeout(HistoryEvent), #[error("Last item in history wasn't WorkflowTaskStarted")] HistoryEndsUnexpectedly, + + // We erase the underlying error type here to keep from leaking it into public #[error("Underlying error in workflow machine: {0:?}")] - UnderlyingMachineError(#[from] WFMachinesError), + UnderlyingMachineError(#[from] anyhow::Error), } impl HistoryInfo { @@ -128,7 +130,9 @@ impl HistoryInfo { if next_event.is_none() || next_is_completed { started_id = event.event_id; if next_event.is_none() { - wf_machines.handle_event(event, false)?; + wf_machines + .handle_event(event, false) + .map_err(anyhow::Error::from)?; return Ok(()); } } else if next_event.is_some() && !next_is_failed_or_timeout { @@ -136,7 +140,9 @@ impl HistoryInfo { } } - wf_machines.handle_event(event, next_event.is_some())?; + wf_machines + .handle_event(event, next_event.is_some()) + .map_err(anyhow::Error::from)?; if next_event.is_none() { if event.is_final_wf_execution_event() { From 6015f20d2c89df42803d50f3851d37c50ecb12be Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 12:21:36 -0800 Subject: [PATCH 39/51] Fix tracing warnings --- src/core_tracing.rs | 27 +++++++++++++++------------ src/lib.rs | 2 +- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/core_tracing.rs b/src/core_tracing.rs index 7d3eb0cc6..86d66a5a0 100644 --- a/src/core_tracing.rs +++ b/src/core_tracing.rs @@ -3,19 +3,22 @@ use opentelemetry_jaeger::Uninstall; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; static TRACING_INIT: OnceCell = OnceCell::new(); +const TRACING_ENABLE_ENV_VAR: &str = "TEMPORAL_CORE_TRACING"; pub(crate) fn tracing_init() { let _ = env_logger::try_init(); - // TRACING_INIT.get_or_init(|| { - // let (tracer, uninstall) = opentelemetry_jaeger::new_pipeline() - // .with_service_name("coresdk") - // .install() - // .unwrap(); - // let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); - // tracing_subscriber::registry() - // .with(opentelemetry) - // .try_init() - // .unwrap(); - // uninstall - // }); + if std::env::var(TRACING_ENABLE_ENV_VAR).is_ok() { + TRACING_INIT.get_or_init(|| { + let (tracer, uninstall) = opentelemetry_jaeger::new_pipeline() + .with_service_name("coresdk") + .install() + .unwrap(); + let opentelemetry = tracing_opentelemetry::layer().with_tracer(tracer); + tracing_subscriber::registry() + .with(opentelemetry) + .try_init() + .unwrap(); + uninstall + }); + } } diff --git a/src/lib.rs b/src/lib.rs index 3b5025609..c4a481f41 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,7 +79,7 @@ pub struct CoreInitOptions { /// * Will panic if called from within an async context, as it will construct a runtime and you /// cannot construct a runtime from within a runtime. 
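The tracing change gates the Jaeger pipeline behind an environment variable and a OnceCell, so the install happens at most once per process no matter how many times init runs. The gate looks like this; the pipeline construction itself is elided, and in the real code the cell stores the returned Uninstall guard to keep the exporter alive:

    use once_cell::sync::OnceCell;

    static INSTALLED: OnceCell<()> = OnceCell::new();
    const ENABLE_VAR: &str = "TEMPORAL_CORE_TRACING";

    fn tracing_init_sketch() {
        if std::env::var(ENABLE_VAR).is_ok() {
            let _ = INSTALLED.get_or_init(|| {
                // build the tracer pipeline / subscriber here, exactly once
            });
        }
    }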
pub fn init(opts: CoreInitOptions) -> Result { - let _ = env_logger::try_init(); + core_tracing::tracing_init(); let runtime = Runtime::new().map_err(CoreError::TokioInitError)?; // Initialize server client let work_provider = runtime.block_on(opts.gateway_opts.connect())?; From 98f4dc92904996c30bb0ec70f7f4b3c8eb2af78e Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 12:26:24 -0800 Subject: [PATCH 40/51] Clippy fixes --- .../complete_workflow_state_machine.rs | 2 +- src/machines/mod.rs | 4 ++-- src/machines/timer_state_machine.rs | 2 +- src/machines/workflow_machines.rs | 23 +++++++++---------- src/protosext/history_info.rs | 1 + 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs index 93bc95461..8bdae312d 100644 --- a/src/machines/complete_workflow_state_machine.rs +++ b/src/machines/complete_workflow_state_machine.rs @@ -106,7 +106,7 @@ impl Created { attributes: Some(dat.into()), }; TransitionResult::commands::<_, CompleteWorkflowCommandCreated>(vec![ - CompleteWFCommand::AddCommand(cmd.into()), + CompleteWFCommand::AddCommand(cmd), ]) } } diff --git a/src/machines/mod.rs b/src/machines/mod.rs index 7b692a555..e11e75b64 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -280,8 +280,8 @@ trait Cancellable: StateMachine { /// machines. fn cancel(&mut self) -> Result> { // It's a logic error on our part if this is ever called on a machine that can't actually - // be cancelled TODO: Result instead? - panic!(format!("This type of machine cannot be cancelled")) + // be cancelled + panic!(format!("Machine {} cannot be cancelled", self.name())) } /// Should return true if the command was cancelled before we sent it to the server diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 334aebc29..0df7b92a1 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -157,7 +157,7 @@ impl Created { attributes: Some(dat.attrs.into()), }; TimerMachineTransition::commands::<_, StartCommandCreated>(vec![ - TimerMachineCommand::AddCommand(cmd.into()), + TimerMachineCommand::AddCommand(cmd), ]) } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 489057431..b8b4c4679 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -80,6 +80,7 @@ pub(crate) struct WorkflowMachines { /// Returned by [TemporalStateMachine]s when handling events #[derive(Debug, derive_more::From)] #[must_use] +#[allow(clippy::large_enum_variant)] pub(crate) enum MachineResponse { PushWFJob(#[from(forward)] wf_activation_job::Attributes), IssueNewCommand(Command), @@ -288,18 +289,16 @@ impl WorkflowMachines { // Additionally, some command types have user-created identifiers that may need to // be associated with the event id, so that when (ex) a request to cancel them is // issued we can identify them. - match consumed_cmd.command { - Command { - attributes: - Some(command::Attributes::StartTimerCommandAttributes( - StartTimerCommandAttributes { timer_id, .. }, - )), - .. - } => { - self.timer_id_to_machine - .insert(timer_id, consumed_cmd.machine); - } - _ => (), + if let Command { + attributes: + Some(command::Attributes::StartTimerCommandAttributes( + StartTimerCommandAttributes { timer_id, .. }, + )), + .. 
+ } = consumed_cmd.command + { + self.timer_id_to_machine + .insert(timer_id, consumed_cmd.machine); } } diff --git a/src/protosext/history_info.rs b/src/protosext/history_info.rs index 9988a099a..7a98f579e 100644 --- a/src/protosext/history_info.rs +++ b/src/protosext/history_info.rs @@ -14,6 +14,7 @@ pub(crate) struct HistoryInfo { type Result = std::result::Result; #[derive(thiserror::Error, Debug)] +#[allow(clippy::large_enum_variant)] pub enum HistoryInfoError { #[error("Latest wf started id and previous one are equal! ${previous_started_event_id:?}")] UnexpectedEventId { From 6bc7bb120fbd4c757cf98c87b434a49473b10b76 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 16:46:21 -0800 Subject: [PATCH 41/51] Use slotmap instead of Rc> stuff --- Cargo.toml | 1 + .../complete_workflow_state_machine.rs | 11 +- src/machines/mod.rs | 7 +- src/machines/timer_state_machine.rs | 10 +- src/machines/workflow_machines.rs | 150 +++++++++++------- 5 files changed, 103 insertions(+), 76 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 85d4cddb2..5de8931a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ opentelemetry-jaeger = "0.11" opentelemetry = "0.12" prost = "0.7" prost-types = "0.7" +slotmap = "1.0" thiserror = "1.0" tokio = { version = "1.1", features = ["rt", "rt-multi-thread"] } tracing = { version = "0.1", features = ["log"] } diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs index 8bdae312d..2f8e9f292 100644 --- a/src/machines/complete_workflow_state_machine.rs +++ b/src/machines/complete_workflow_state_machine.rs @@ -1,8 +1,7 @@ -use crate::machines::CommandAndMachine; use crate::{ machines::{ - workflow_machines::MachineResponse, workflow_machines::WorkflowMachines, Cancellable, - WFCommand, WFMachinesAdapter, WFMachinesError, + workflow_machines::MachineResponse, Cancellable, NewMachineWithCommand, WFCommand, + WFMachinesAdapter, WFMachinesError, }, protos::temporal::api::{ command::v1::{Command, CompleteWorkflowExecutionCommandAttributes}, @@ -37,11 +36,11 @@ pub(super) enum CompleteWFCommand { /// Complete a workflow pub(super) fn complete_workflow( attribs: CompleteWorkflowExecutionCommandAttributes, -) -> CommandAndMachine { +) -> NewMachineWithCommand { let (machine, add_cmd) = CompleteWorkflowMachine::new_scheduled(attribs); - CommandAndMachine { + NewMachineWithCommand { command: add_cmd, - machine: Rc::new(RefCell::new(machine)), + machine, } } diff --git a/src/machines/mod.rs b/src/machines/mod.rs index e11e75b64..dcf197cdf 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -58,10 +58,8 @@ use crate::{ use prost::alloc::fmt::Formatter; use rustfsm::{MachineError, StateMachine}; use std::{ - cell::RefCell, convert::{TryFrom, TryInto}, fmt::{Debug, Display}, - rc::Rc, }; use tracing::Level; @@ -290,11 +288,10 @@ trait Cancellable: StateMachine { } } -type MachineRef = Rc>; #[derive(Debug)] -struct CommandAndMachine { +struct NewMachineWithCommand { command: ProtoCommand, - machine: MachineRef, + machine: T, } impl Debug for dyn TemporalStateMachine { diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 0df7b92a1..e0d74b1bb 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -1,6 +1,6 @@ #![allow(clippy::large_enum_variant)] -use crate::machines::CommandAndMachine; +use crate::machines::NewMachineWithCommand; use crate::{ machines::{ workflow_machines::{MachineResponse, WFMachinesError, 
WorkflowMachines}, @@ -65,11 +65,13 @@ pub(super) enum TimerMachineCommand { } /// Creates a new, scheduled, timer as a [CancellableCommand] -pub(super) fn new_timer(attribs: StartTimerCommandAttributes) -> CommandAndMachine { +pub(super) fn new_timer( + attribs: StartTimerCommandAttributes, +) -> NewMachineWithCommand { let (timer, add_cmd) = TimerMachine::new_scheduled(attribs); - CommandAndMachine { + NewMachineWithCommand { command: add_cmd, - machine: Rc::new(RefCell::new(timer)), + machine: timer, } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index b8b4c4679..11a77c850 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -1,14 +1,11 @@ -use crate::machines::{CommandAndMachine, MachineRef}; -use crate::protos::temporal::api::command::v1::Command; use crate::{ machines::{ complete_workflow_state_machine::complete_workflow, timer_state_machine::new_timer, workflow_task_state_machine::WorkflowTaskMachine, ActivationListener, DrivenWorkflow, - ProtoCommand, TemporalStateMachine, WFCommand, + NewMachineWithCommand, ProtoCommand, TemporalStateMachine, WFCommand, }, - protos::coresdk::WfActivationJob, protos::{ - coresdk::{wf_activation_job, StartWorkflowTaskAttributes, WfActivation}, + coresdk::{wf_activation_job, StartWorkflowTaskAttributes, WfActivation, WfActivationJob}, temporal::api::{ command::v1::{command, StartTimerCommandAttributes}, common::v1::WorkflowExecution, @@ -19,12 +16,11 @@ use crate::{ }; use futures::Future; use rustfsm::{MachineError, StateMachine}; -use std::cell::RefCell; +use slotmap::{DefaultKey, SlotMap}; use std::{ - borrow::BorrowMut, + borrow::{Borrow, BorrowMut}, collections::{HashMap, HashSet, VecDeque}, ops::DerefMut, - rc::Rc, sync::{atomic::AtomicBool, Arc}, time::SystemTime, }; @@ -52,14 +48,15 @@ pub(crate) struct WorkflowMachines { /// The current workflow time if it has been established current_wf_time: Option, + all_machines: SlotMap>, + /// A mapping for accessing all the machines, where the key is the id of the initiating event /// for that machine. - machines_by_id: HashMap, + machines_by_event_id: HashMap, - /// Maps timer ids as created by workflow authors to their initiating event IDs. There is no - /// reason to force the lang side to track event IDs, so we do it for them. - /// TODO: Make this apply to *all* cancellable things - timer_id_to_machine: HashMap, + /// Maps timer ids as created by workflow authors to their associated machines + /// TODO: Make this apply to *all* cancellable things, once we've added more. Key can be enum. + timer_id_to_machine: HashMap, /// Queued commands which have been produced by machines and await processing / being sent to /// the server. @@ -77,13 +74,20 @@ pub(crate) struct WorkflowMachines { drive_me: Box, } +slotmap::new_key_type! 
{ struct MachineKey; } +#[derive(Debug)] +struct CommandAndMachine { + command: ProtoCommand, + machine: MachineKey, +} + /// Returned by [TemporalStateMachine]s when handling events #[derive(Debug, derive_more::From)] #[must_use] #[allow(clippy::large_enum_variant)] pub(crate) enum MachineResponse { PushWFJob(#[from(forward)] wf_activation_job::Attributes), - IssueNewCommand(Command), + IssueNewCommand(ProtoCommand), TriggerWFTaskStarted { task_started_event_id: i64, time: SystemTime, @@ -135,7 +139,8 @@ impl WorkflowMachines { previous_started_event_id: 0, replaying: false, current_wf_time: None, - machines_by_id: Default::default(), + all_machines: Default::default(), + machines_by_event_id: Default::default(), timer_id_to_machine: Default::default(), commands: Default::default(), current_wf_task_commands: Default::default(), @@ -178,9 +183,9 @@ impl WorkflowMachines { Some(initial_cmd_id) => { // We remove the machine while we it handles events, then return it, to avoid // borrowing from ourself mutably. - let mut maybe_machine = self.machines_by_id.remove(&initial_cmd_id); - if let Some(sm) = maybe_machine.as_ref() { - self.submachine_handle_event((**sm).borrow_mut(), event, has_next_event)?; + let mut maybe_machine = self.machines_by_event_id.remove(&initial_cmd_id); + if let Some(sm) = maybe_machine { + self.submachine_handle_event(sm, event, has_next_event)?; } else { event!( Level::ERROR, @@ -192,8 +197,8 @@ impl WorkflowMachines { // Restore machine if not in it's final state if let Some(sm) = maybe_machine { - if !sm.borrow().is_final_state() { - self.machines_by_id.insert(initial_cmd_id, sm); + if !self.machine(sm).is_final_state() { + self.machines_by_event_id.insert(initial_cmd_id, sm); } } } @@ -263,14 +268,14 @@ impl WorkflowMachines { // Feed the machine the event let mut break_later = false; - self.submachine_handle_event((*command.machine).borrow_mut(), event, true)?; + self.submachine_handle_event(command.machine, event, true)?; // TODO: // * More special handling for version machine - see java // * Commands cancelled this iteration are allowed to not match the event? - if !(*command.machine) - .borrow() + if !self + .machine(command.machine) .was_cancelled_before_sent_to_server() { break_later = true; @@ -283,13 +288,13 @@ impl WorkflowMachines { // TODO: validate command - if !consumed_cmd.machine.borrow().is_final_state() { - self.machines_by_id - .insert(event.event_id, consumed_cmd.machine.clone()); + if !self.machine(consumed_cmd.machine).is_final_state() { + self.machines_by_event_id + .insert(event.event_id, consumed_cmd.machine); // Additionally, some command types have user-created identifiers that may need to // be associated with the event id, so that when (ex) a request to cancel them is // issued we can identify them. - if let Command { + if let ProtoCommand { attributes: Some(command::Attributes::StartTimerCommandAttributes( StartTimerCommandAttributes { timer_id, .. 
}, @@ -341,13 +346,9 @@ impl WorkflowMachines { } Some(EventType::WorkflowTaskScheduled) => { let mut wf_task_sm = WorkflowTaskMachine::new(self.workflow_task_started_event_id); - self.submachine_handle_event( - &mut wf_task_sm as &mut dyn TemporalStateMachine, - event, - has_next_event, - )?; - self.machines_by_id - .insert(event.event_id, Rc::new(RefCell::new(wf_task_sm))); + let key = self.all_machines.insert(Box::new(wf_task_sm)); + self.submachine_handle_event(key, event, has_next_event)?; + self.machines_by_event_id.insert(event.event_id, key); } Some(EventType::WorkflowExecutionSignaled) => { // TODO: Signal callbacks @@ -372,10 +373,9 @@ impl WorkflowMachines { self.commands .iter() .filter_map(|c| { - if !(*c.machine).borrow().is_final_state() { + if !self.machine(c.machine).is_final_state() { Some(c.command.clone()) } else { - dbg!("Final state!!!!!"); None } }) @@ -434,10 +434,11 @@ impl WorkflowMachines { /// on the returned triggers fn submachine_handle_event( &mut self, - mut sm: impl DerefMut, + sm: MachineKey, event: &HistoryEvent, has_next_event: bool, ) -> Result<()> { + let mut sm = self.all_machines.get_mut(sm).expect("Machine must exist"); let triggers = sm.handle_event(event, has_next_event).map_err(|e| { if let WFMachinesError::MalformedEventDetail(s) = e { WFMachinesError::MalformedEvent(event.clone(), s) @@ -472,36 +473,38 @@ impl WorkflowMachines { // weird it feels for cancels. match cmd { WFCommand::AddTimer(attrs) => { - let timer = new_timer(attrs); + let timer = self.add_new_machine(new_timer(attrs)); self.current_wf_task_commands.push_back(timer); } WFCommand::CancelTimer(attrs) => { - if let Some(machine) = self.timer_id_to_machine.get(&attrs.timer_id) { - let res = (**machine).borrow_mut().cancel()?; - match res { - MachineResponse::IssueNewCommand(c) => { - self.current_wf_task_commands.push_back(CommandAndMachine { - command: c, - machine: machine.clone(), - }) - } - v => { - return Err(WFMachinesError::UnexpectedMachineResponse( - v, - "When cancelling timer", - )) - } + let mkey = *self + .timer_id_to_machine + .get(&attrs.timer_id) + .ok_or_else(|| { + WFMachinesError::MissingAssociatedMachine(format!( + "Missing associated machine for cancelling timer {}", + &attrs.timer_id + )) + })?; + let res = self.machine_mut(mkey).cancel()?; + match res { + MachineResponse::IssueNewCommand(c) => { + self.current_wf_task_commands.push_back(CommandAndMachine { + command: c, + machine: mkey, + }) + } + v => { + return Err(WFMachinesError::UnexpectedMachineResponse( + v, + "When cancelling timer", + )) } - } else { - return Err(WFMachinesError::MissingAssociatedMachine(format!( - "Timer with id {} was missing associated machine", - attrs.timer_id - ))); } } WFCommand::CompleteWorkflow(attrs) => { - self.current_wf_task_commands - .push_back(complete_workflow(attrs)); + let cwfm = self.add_new_machine(complete_workflow(attrs)); + self.current_wf_task_commands.push_back(cwfm); } WFCommand::NoCommandsFromLang => (), } @@ -519,9 +522,34 @@ impl WorkflowMachines { while let Some(mut c) = self.current_wf_task_commands.pop_front() { // TODO: This conversion sux -- probably add to NewOrExistingCommand let cmd_type = CommandType::from_i32(c.command.command_type).unwrap(); - (*c.machine).borrow_mut().handle_command(cmd_type); + self.machine_mut(c.machine).handle_command(cmd_type); self.commands.push_back(c); } event!(Level::DEBUG, msg = "end prepare_commands", commands = ?self.commands); } + + fn add_new_machine( + &mut self, + machine: NewMachineWithCommand, + ) -> 
CommandAndMachine { + let k = self.all_machines.insert(Box::new(machine.machine)); + CommandAndMachine { + command: machine.command, + machine: k, + } + } + + fn machine(&self, m: MachineKey) -> &dyn TemporalStateMachine { + self.all_machines + .get(m) + .expect("Machine must exist") + .borrow() + } + + fn machine_mut(&mut self, m: MachineKey) -> &mut (dyn TemporalStateMachine + 'static) { + self.all_machines + .get_mut(m) + .expect("Machine must exist") + .borrow_mut() + } } From 6653bfc2acfa85335a6490f005be721835a5336b Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 18:54:50 -0800 Subject: [PATCH 42/51] Add higher level test, dedupe replay testing stuff with rstest --- src/lib.rs | 77 ++++++++++++++++++++++++++++++--------------- src/machines/mod.rs | 1 + src/protos/mod.rs | 4 +++ 3 files changed, 56 insertions(+), 26 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c4a481f41..ebaa6a2a9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -311,16 +311,19 @@ mod test { }, temporal::api::{ command::v1::{ - CompleteWorkflowExecutionCommandAttributes, StartTimerCommandAttributes, + CancelTimerCommandAttributes, CompleteWorkflowExecutionCommandAttributes, + StartTimerCommandAttributes, }, enums::v1::EventType, + history::v1::TimerCanceledEventAttributes, history::v1::{history_event, TimerFiredEventAttributes}, }, }, }; + use rstest::rstest; - #[test] - fn single_timer_test_across_wf_bridge() { + #[rstest(hist_batches, case::incremental(&[1, 2]), case::replay(&[2]))] + fn single_timer_test_across_wf_bridge(hist_batches: &[usize]) { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; let timer_id = "fake_timer".to_string(); @@ -350,7 +353,7 @@ mod test { 8: EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let core = build_fake_core(wfid, run_id, &mut t, &[1, 2]); + let core = build_fake_core(wfid, run_id, &mut t, hist_batches); let res = core.poll_task(task_queue).unwrap(); assert_matches!( @@ -387,8 +390,8 @@ mod test { .unwrap(); } - #[test] - fn parallel_timer_test_across_wf_bridge() { + #[rstest(hist_batches, case::incremental(&[1, 2]), case::replay(&[2]))] + fn parallel_timer_test_across_wf_bridge(hist_batches: &[usize]) { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; let timer_1_id = "timer1".to_string(); @@ -429,7 +432,7 @@ mod test { 10: EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let core = build_fake_core(wfid, run_id, &mut t, &[1, 2]); + let core = build_fake_core(wfid, run_id, &mut t, hist_batches); let res = core.poll_task(task_queue).unwrap(); assert_matches!( @@ -485,31 +488,40 @@ mod test { .unwrap(); } - #[test] - fn single_timer_whole_replay_test_across_wf_bridge() { - let s = span!(Level::DEBUG, "Test start", t = "bridge"); - let _enter = s.enter(); - + #[rstest(hist_batches, case::incremental(&[1, 2]), case::replay(&[2]))] + fn timer_cancel_test_across_wf_bridge(hist_batches: &[usize]) { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; - let timer_1_id = "timer1".to_string(); + let timer_id = "wait_timer".to_string(); + let cancel_timer_id = "cancel_timer".to_string(); let task_queue = "test-task-queue"; let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); t.add_full_wf_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + let wait_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); + let cancel_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); t.add( EventType::TimerFired, 
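The storage pattern patch 41 lands on: one SlotMap owns every state machine, and every secondary index holds a small copyable key rather than an Rc. A compilable sketch against slotmap 1.0 with a stand-in machine trait:

    use slotmap::SlotMap;
    use std::collections::HashMap;

    slotmap::new_key_type! { struct Key; }

    trait Machine {
        fn is_final_state(&self) -> bool;
    }

    struct Machines {
        all: SlotMap<Key, Box<dyn Machine>>,
        by_event_id: HashMap<i64, Key>,    // initiating event id -> machine
        by_timer_id: HashMap<String, Key>, // author-visible timer id -> machine
    }

    impl Machines {
        fn new() -> Self {
            Self {
                all: SlotMap::with_key(),
                by_event_id: HashMap::new(),
                by_timer_id: HashMap::new(),
            }
        }

        fn add(&mut self, m: Box<dyn Machine>) -> Key {
            self.all.insert(m)
        }

        fn machine(&self, k: Key) -> &dyn Machine {
            self.all.get(k).expect("Machine must exist").as_ref()
        }
    }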
history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id: timer_1_id.clone(), + started_event_id: wait_timer_started_id, + timer_id: timer_id.clone(), }), ); - t.add_workflow_task_scheduled_and_started(); - // NOTE! What makes this a replay test is the server only responds with *one* batch here. - // So, server is polled once, but lang->core interactions look just like non-replay test. - let core = build_fake_core(wfid, run_id, &mut t, &[2]); + // 8 + t.add_full_wf_task(); + // 11 + t.add( + EventType::TimerCanceled, + history_event::Attributes::TimerCanceledEventAttributes(TimerCanceledEventAttributes { + started_event_id: cancel_timer_started_id, + timer_id: cancel_timer_id.clone(), + ..Default::default() + }), + ); + // 12 + t.add_workflow_execution_completed(); + let core = build_fake_core(wfid, run_id, &mut t, hist_batches); let res = core.poll_task(task_queue).unwrap(); assert_matches!( @@ -522,11 +534,18 @@ mod test { let task_tok = res.task_token; core.complete_task(TaskCompletion::ok_from_api_attrs( - vec![StartTimerCommandAttributes { - timer_id: timer_1_id, - ..Default::default() - } - .into()], + vec![ + StartTimerCommandAttributes { + timer_id, + ..Default::default() + } + .into(), + StartTimerCommandAttributes { + timer_id: cancel_timer_id.clone(), + ..Default::default() + } + .into(), + ], task_tok, )) .unwrap(); @@ -540,7 +559,13 @@ mod test { ); let task_tok = res.task_token; core.complete_task(TaskCompletion::ok_from_api_attrs( - vec![CompleteWorkflowExecutionCommandAttributes { result: None }.into()], + vec![ + CancelTimerCommandAttributes { + timer_id: cancel_timer_id, + } + .into(), + CompleteWorkflowExecutionCommandAttributes { result: None }.into(), + ], task_tok, )) .unwrap(); diff --git a/src/machines/mod.rs b/src/machines/mod.rs index dcf197cdf..29a263659 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -119,6 +119,7 @@ impl TryFrom for WFCommand { .. 
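The rstest pattern these tests settle on: one test body, two history deliveries, where &[1, 2] feeds history to the mock gateway incrementally over two polls and &[2] hands the whole history over at once, exercising replay. The same shape in miniature:

    use rstest::rstest;

    #[rstest(hist_batches, case::incremental(&[1, 2]), case::replay(&[2]))]
    fn replay_and_incremental_share_a_body(hist_batches: &[usize]) {
        // both cases run identical assertions; only the delivery differs
        assert!(!hist_batches.is_empty());
    }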
})) => match attrs { Attributes::StartTimerCommandAttributes(s) => Ok(WFCommand::AddTimer(s)), + Attributes::CancelTimerCommandAttributes(s) => Ok(WFCommand::CancelTimer(s)), Attributes::CompleteWorkflowExecutionCommandAttributes(c) => { Ok(WFCommand::CompleteWorkflow(c)) } diff --git a/src/protos/mod.rs b/src/protos/mod.rs index 3fc2385e9..6bf344a59 100644 --- a/src/protos/mod.rs +++ b/src/protos/mod.rs @@ -92,6 +92,10 @@ pub mod temporal { command_type: CommandType::StartTimer as i32, attributes: Some(a), }, + a @ Attributes::CancelTimerCommandAttributes(_) => Self { + command_type: CommandType::CancelTimer as i32, + attributes: Some(a), + }, a @ Attributes::CompleteWorkflowExecutionCommandAttributes(_) => Self { command_type: CommandType::CompleteWorkflowExecution as i32, attributes: Some(a), From 64b7527cb065d986568842ca868ea1c5b13f5e9f Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 19:03:47 -0800 Subject: [PATCH 43/51] Integration test --- src/machines/timer_state_machine.rs | 5 +-- tests/integ_tests/simple_wf_tests.rs | 55 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index e0d74b1bb..cd4665a3a 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -370,14 +370,13 @@ mod test { let commands = t .handle_workflow_task_take_cmds(&mut state_machines, Some(1)) .unwrap(); - dbg!(&commands); - dbg!(state_machines.get_wf_activation()); + state_machines.get_wf_activation(); assert_eq!(commands.len(), 1); assert_eq!(commands[0].command_type, CommandType::StartTimer as i32); let commands = t .handle_workflow_task_take_cmds(&mut state_machines, Some(2)) .unwrap(); - dbg!(state_machines.get_wf_activation()); + state_machines.get_wf_activation(); assert_eq!(commands.len(), 1); assert_eq!( commands[0].command_type, diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 95109e3ae..2697a019b 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -1,6 +1,7 @@ use assert_matches::assert_matches; use rand::{self, Rng}; use std::{convert::TryFrom, env, time::Duration}; +use temporal_sdk_core::protos::temporal::api::command::v1::CancelTimerCommandAttributes; use temporal_sdk_core::{ protos::{ coresdk::{wf_activation_job, TaskCompletion, TimerFiredTaskAttributes, WfActivationJob}, @@ -134,3 +135,57 @@ fn parallel_timer_workflow() { )) .unwrap(); } + +#[test] +fn timer_cancel_workflow() { + let task_q = "timer_cancel_workflow"; + let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { + Ok(addr) => addr, + Err(_) => "http://localhost:7233".to_owned(), + }; + let url = Url::try_from(&*temporal_server_address).unwrap(); + let gateway_opts = ServerGatewayOptions { + namespace: NAMESPACE.to_string(), + identity: "none".to_string(), + worker_binary_id: "".to_string(), + long_poll_timeout: Duration::from_secs(60), + target_url: url, + }; + let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap(); + let mut rng = rand::thread_rng(); + let workflow_id: u32 = rng.gen(); + dbg!(create_workflow(&core, task_q, &workflow_id.to_string())); + let timer_id = "wait_timer"; + let cancel_timer_id = "cancel_timer"; + let task = core.poll_task(task_q).unwrap(); + core.complete_task(TaskCompletion::ok_from_api_attrs( + vec![ + StartTimerCommandAttributes { + timer_id: timer_id.to_string(), + start_to_fire_timeout: 
Some(Duration::from_millis(50).into()), + ..Default::default() + } + .into(), + StartTimerCommandAttributes { + timer_id: cancel_timer_id.to_string(), + start_to_fire_timeout: Some(Duration::from_secs(10).into()), + ..Default::default() + } + .into(), + ], + task.task_token, + )) + .unwrap(); + let task = dbg!(core.poll_task(task_q).unwrap()); + core.complete_task(TaskCompletion::ok_from_api_attrs( + vec![ + CancelTimerCommandAttributes { + timer_id: cancel_timer_id.to_string(), + } + .into(), + CompleteWorkflowExecutionCommandAttributes { result: None }.into(), + ], + task.task_token, + )) + .unwrap(); +} From 6dcc989929f218aae68fc6be15e708ec0759c080 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 19:08:55 -0800 Subject: [PATCH 44/51] Various cleanups Fix unknown command unwrap Remove pointless WFMachinesError::Underlying Clean up weird add command stuff when creating new timers Big 'ol clippy cleanup Fix old things still called triggers instead of machine responses --- src/lib.rs | 5 +- .../complete_workflow_state_machine.rs | 7 +- src/machines/mod.rs | 12 +-- src/machines/test_help/mod.rs | 2 +- src/machines/timer_state_machine.rs | 92 ++++++------------- src/machines/workflow_machines.rs | 82 +++++++++-------- src/machines/workflow_task_state_machine.rs | 6 +- tests/integ_tests/simple_wf_tests.rs | 3 + 8 files changed, 87 insertions(+), 122 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ebaa6a2a9..09348069e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ mod workflow; pub use pollers::{ServerGateway, ServerGatewayApis, ServerGatewayOptions}; pub use url::Url; +use crate::machines::WFMachinesError; use crate::{ machines::{InconvertibleCommandError, WFCommand}, protos::{ @@ -260,7 +261,7 @@ impl CoreSDK { .collect::>>()?; self.workflow_machines.access(run_id, |mgr| { mgr.command_sink.send(cmds)?; - mgr.machines.iterate_machines(); + mgr.machines.iterate_machines()?; Ok(()) })?; Ok(()) @@ -284,6 +285,8 @@ pub enum CoreError { UninterpretableCommand(#[from] InconvertibleCommandError), /// Underlying error in history processing UnderlyingHistError(#[from] HistoryInfoError), + /// Underlying error in state machines + UnderlyingMachinesError(#[from] WFMachinesError), /// Task token had nothing associated with it: {0:?} NothingFoundForTaskToken(Vec), /// Error calling the service: {0:?} diff --git a/src/machines/complete_workflow_state_machine.rs b/src/machines/complete_workflow_state_machine.rs index 2f8e9f292..9b31dc5a7 100644 --- a/src/machines/complete_workflow_state_machine.rs +++ b/src/machines/complete_workflow_state_machine.rs @@ -1,7 +1,7 @@ use crate::{ machines::{ - workflow_machines::MachineResponse, Cancellable, NewMachineWithCommand, WFCommand, - WFMachinesAdapter, WFMachinesError, + workflow_machines::MachineResponse, Cancellable, NewMachineWithCommand, WFMachinesAdapter, + WFMachinesError, }, protos::temporal::api::{ command::v1::{Command, CompleteWorkflowExecutionCommandAttributes}, @@ -10,8 +10,7 @@ use crate::{ }, }; use rustfsm::{fsm, StateMachine, TransitionResult}; -use std::cell::RefCell; -use std::{convert::TryFrom, rc::Rc}; +use std::convert::TryFrom; fsm! 
{ pub(super) diff --git a/src/machines/mod.rs b/src/machines/mod.rs index 29a263659..c3ad06212 100644 --- a/src/machines/mod.rs +++ b/src/machines/mod.rs @@ -1,4 +1,3 @@ -#[allow(unused)] mod workflow_machines; // TODO: Move all these inside a submachines module @@ -10,7 +9,6 @@ mod cancel_external_state_machine; mod cancel_workflow_state_machine; #[allow(unused)] mod child_workflow_state_machine; -#[allow(unused)] mod complete_workflow_state_machine; #[allow(unused)] mod continue_as_new_workflow_state_machine; @@ -24,13 +22,11 @@ mod mutable_side_effect_state_machine; mod side_effect_state_machine; #[allow(unused)] mod signal_external_state_machine; -#[allow(unused)] mod timer_state_machine; #[allow(unused)] mod upsert_search_attributes_state_machine; #[allow(unused)] mod version_state_machine; -#[allow(unused)] mod workflow_task_state_machine; #[cfg(test)] @@ -137,7 +133,7 @@ trait TemporalStateMachine: CheckStateMachineInFinal + Send { fn name(&self) -> &str; fn handle_command(&mut self, command_type: CommandType) -> Result<(), WFMachinesError>; - /// Tell the state machine to handle some event. Returns a list of triggers that can be used + /// Tell the state machine to handle some event. Returns a list of responses that can be used /// to update the overall state of the workflow. EX: To issue outgoing WF activations. fn handle_event( &mut self, @@ -206,11 +202,11 @@ where if !c.is_empty() { event!(Level::DEBUG, msg = "Machine produced commands", ?c, state = %self.state()); } - let mut triggers = vec![]; + let mut machine_responses = vec![]; for cmd in c { - triggers.extend(self.adapt_response(event, has_next_event, cmd)?); + machine_responses.extend(self.adapt_response(event, has_next_event, cmd)?); } - Ok(triggers) + Ok(machine_responses) } Err(MachineError::InvalidTransition) => { Err(WFMachinesError::InvalidTransitionDuringEvent( diff --git a/src/machines/test_help/mod.rs b/src/machines/test_help/mod.rs index f669750d9..61e3cf1cf 100644 --- a/src/machines/test_help/mod.rs +++ b/src/machines/test_help/mod.rs @@ -4,7 +4,7 @@ mod history_builder; mod workflow_driver; pub(crate) use history_builder::TestHistoryBuilder; -pub(super) use workflow_driver::{CommandSender, TestWFCommand, TestWorkflowDriver}; +pub(super) use workflow_driver::{CommandSender, TestWorkflowDriver}; use crate::workflow::WorkflowConcurrencyManager; use crate::{ diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index cd4665a3a..834b14b71 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -1,37 +1,21 @@ #![allow(clippy::large_enum_variant)] -use crate::machines::NewMachineWithCommand; use crate::{ machines::{ - workflow_machines::{MachineResponse, WFMachinesError, WorkflowMachines}, - Cancellable, TemporalStateMachine, WFCommand, WFMachinesAdapter, + workflow_machines::{MachineResponse, WFMachinesError}, + Cancellable, NewMachineWithCommand, WFMachinesAdapter, }, protos::{ - coresdk::{ - HistoryEventId, TimerCanceledTaskAttributes, TimerFiredTaskAttributes, WfActivation, - }, + coresdk::{HistoryEventId, TimerCanceledTaskAttributes, TimerFiredTaskAttributes}, temporal::api::{ - command::v1::{ - command::Attributes, CancelTimerCommandAttributes, Command, - StartTimerCommandAttributes, - }, + command::v1::{CancelTimerCommandAttributes, Command, StartTimerCommandAttributes}, enums::v1::{CommandType, EventType}, - history::v1::{ - history_event, HistoryEvent, TimerCanceledEventAttributes, - TimerFiredEventAttributes, - }, + 
history::v1::{history_event, HistoryEvent, TimerFiredEventAttributes}, }, }, }; -use futures::FutureExt; use rustfsm::{fsm, MachineError, StateMachine, TransitionResult}; -use std::{ - cell::RefCell, - convert::TryFrom, - rc::Rc, - sync::{atomic::Ordering, Arc}, -}; -use tracing::Level; +use std::convert::TryFrom; fsm! { pub(super) name TimerMachine; @@ -39,7 +23,7 @@ fsm! { error WFMachinesError; shared_state SharedState; - Created --(Schedule, shared on_schedule) --> StartCommandCreated; + Created --(Schedule, on_schedule) --> StartCommandCreated; StartCommandCreated --(CommandStartTimer) --> StartCommandCreated; StartCommandCreated --(TimerStarted(HistoryEventId), on_timer_started) --> StartCommandRecorded; @@ -57,13 +41,17 @@ fsm! { #[derive(Debug)] pub(super) enum TimerMachineCommand { - // TODO: Perhaps just remove this - AddCommand(Command), Complete, Canceled, IssueCancelCmd(Command), } +#[derive(Default, Clone)] +pub(super) struct SharedState { + attrs: StartTimerCommandAttributes, + cancelled_before_sent: bool, +} + /// Creates a new, scheduled, timer as a [CancellableCommand] pub(super) fn new_timer( attribs: StartTimerCommandAttributes, @@ -79,14 +67,11 @@ impl TimerMachine { /// Create a new timer and immediately schedule it pub(crate) fn new_scheduled(attribs: StartTimerCommandAttributes) -> (Self, Command) { let mut s = Self::new(attribs); - let cmd = match s - .on_event_mut(TimerMachineEvents::Schedule) - .expect("Scheduling timers doesn't fail") - .pop() - { - // TODO: This seems silly - why bother with the command at all? - Some(TimerMachineCommand::AddCommand(c)) => c, - _ => panic!("Timer on_schedule must produce command"), + s.on_event_mut(TimerMachineEvents::Schedule) + .expect("Scheduling timers doesn't fail"); + let cmd = Command { + command_type: CommandType::StartTimer as i32, + attributes: Some(s.shared_state().attrs.clone().into()), }; (s, cmd) } @@ -143,24 +128,12 @@ impl TryFrom for TimerMachineEvents { } } -#[derive(Default, Clone)] -pub(super) struct SharedState { - attrs: StartTimerCommandAttributes, - cancelled_before_sent: bool, -} - #[derive(Default, Clone)] pub(super) struct Created {} impl Created { - pub(super) fn on_schedule(self, dat: SharedState) -> TimerMachineTransition { - let cmd = Command { - command_type: CommandType::StartTimer as i32, - attributes: Some(dat.attrs.into()), - }; - TimerMachineTransition::commands::<_, StartCommandCreated>(vec![ - TimerMachineCommand::AddCommand(cmd), - ]) + pub(super) fn on_schedule(self) -> TimerMachineTransition { + TimerMachineTransition::default::() } } @@ -197,7 +170,7 @@ impl StartCommandCreated { // TODO: Java recorded an initial event ID, but it seemingly was never used. 
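After this cleanup, new_scheduled runs the Schedule transition first and then derives the StartTimer command straight from the machine's shared attributes, rather than smuggling it through an AddCommand variant. A rough sketch with a string standing in for the proto command:

    #[derive(Clone, Default)]
    struct TimerAttrs {
        timer_id: String,
    }

    struct TimerSketch {
        attrs: TimerAttrs,
    }

    impl TimerSketch {
        fn new_scheduled(attrs: TimerAttrs) -> (Self, String) {
            let s = Self { attrs };
            // (Schedule transition elided; it carries no data here)
            let cmd = format!("StartTimer {{ timer_id: {} }}", s.attrs.timer_id);
            (s, cmd)
        }
    }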
TimerMachineTransition::default::() } - pub(super) fn on_cancel(mut self, dat: SharedState) -> TimerMachineTransition { + pub(super) fn on_cancel(self, dat: SharedState) -> TimerMachineTransition { TimerMachineTransition::ok_shared( vec![TimerMachineCommand::Canceled], Canceled::default(), @@ -263,9 +236,6 @@ impl WFMachinesAdapter for TimerMachine { } .into()], TimerMachineCommand::IssueCancelCmd(c) => vec![MachineResponse::IssueNewCommand(c)], - TimerMachineCommand::AddCommand(_) => { - unreachable!() - } }) } } @@ -290,23 +260,17 @@ mod test { use super::*; use crate::{ machines::{ - complete_workflow_state_machine::complete_workflow, - test_help::{CommandSender, TestHistoryBuilder, TestWFCommand, TestWorkflowDriver}, + test_help::{CommandSender, TestHistoryBuilder, TestWorkflowDriver}, workflow_machines::WorkflowMachines, - DrivenWorkflow, WFCommand, }, protos::temporal::api::{ command::v1::CompleteWorkflowExecutionCommandAttributes, - history::v1::{ - TimerFiredEventAttributes, WorkflowExecutionCanceledEventAttributes, - WorkflowExecutionSignaledEventAttributes, WorkflowExecutionStartedEventAttributes, - }, + history::v1::{TimerCanceledEventAttributes, TimerFiredEventAttributes}, }, }; - use futures::{channel::mpsc::Sender, FutureExt, SinkExt}; use rstest::{fixture, rstest}; - use rustfsm::MachineError; - use std::{error::Error, sync::Arc, time::Duration}; + use std::time::Duration; + use tracing::Level; #[fixture] fn fire_happy_hist() -> (TestHistoryBuilder, WorkflowMachines) { @@ -342,7 +306,7 @@ mod test { }); let mut t = TestHistoryBuilder::default(); - let mut state_machines = + let state_machines = WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); t.add_by_type(EventType::WorkflowExecutionStarted); @@ -437,7 +401,7 @@ mod test { crate::core_tracing::tracing_init(); let twd = TestWorkflowDriver::new(|mut cmd_sink: CommandSender| async move { - let cancel_this = cmd_sink.timer( + let _cancel_this = cmd_sink.timer( StartTimerCommandAttributes { timer_id: "cancel_timer".to_string(), start_to_fire_timeout: Some(Duration::from_secs(500).into()), @@ -459,7 +423,7 @@ mod test { }); let mut t = TestHistoryBuilder::default(); - let mut state_machines = + let state_machines = WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); t.add_by_type(EventType::WorkflowExecutionStarted); diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 11a77c850..cb2dd9a48 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -1,27 +1,22 @@ use crate::{ machines::{ complete_workflow_state_machine::complete_workflow, timer_state_machine::new_timer, - workflow_task_state_machine::WorkflowTaskMachine, ActivationListener, DrivenWorkflow, - NewMachineWithCommand, ProtoCommand, TemporalStateMachine, WFCommand, + workflow_task_state_machine::WorkflowTaskMachine, DrivenWorkflow, NewMachineWithCommand, + ProtoCommand, TemporalStateMachine, WFCommand, }, protos::{ - coresdk::{wf_activation_job, StartWorkflowTaskAttributes, WfActivation, WfActivationJob}, + coresdk::{wf_activation_job, StartWorkflowTaskAttributes, WfActivation}, temporal::api::{ command::v1::{command, StartTimerCommandAttributes}, - common::v1::WorkflowExecution, enums::v1::{CommandType, EventType}, history::v1::{history_event, HistoryEvent}, }, }, }; -use futures::Future; -use rustfsm::{MachineError, StateMachine}; -use slotmap::{DefaultKey, SlotMap}; +use slotmap::SlotMap; use std::{ borrow::{Borrow, BorrowMut}, - 
collections::{HashMap, HashSet, VecDeque}, - ops::DerefMut, - sync::{atomic::AtomicBool, Arc}, + collections::{HashMap, VecDeque}, time::SystemTime, }; use tracing::Level; @@ -85,7 +80,7 @@ struct CommandAndMachine { #[derive(Debug, derive_more::From)] #[must_use] #[allow(clippy::large_enum_variant)] -pub(crate) enum MachineResponse { +pub enum MachineResponse { PushWFJob(#[from(forward)] wf_activation_job::Attributes), IssueNewCommand(ProtoCommand), TriggerWFTaskStarted { @@ -95,7 +90,8 @@ pub(crate) enum MachineResponse { } #[derive(thiserror::Error, Debug)] -pub(crate) enum WFMachinesError { +// TODO: Some of these are redundant with MachineError -- we should try to dedupe / simplify +pub enum WFMachinesError { #[error("Event {0:?} was not expected: {1}")] UnexpectedEvent(HistoryEvent, &'static str), #[error("Event {0:?} was not expected: {1}")] @@ -108,6 +104,8 @@ pub(crate) enum WFMachinesError { MalformedEventDetail(String), #[error("Command type {0:?} was not expected")] UnexpectedCommand(CommandType), + #[error("Command type {0} is not known")] + UnknownCommandType(i32), #[error("No command was scheduled for event {0:?}")] NoCommandScheduledForEvent(HistoryEvent), #[error("Machine response {0:?} was not expected: {1}")] @@ -117,10 +115,6 @@ pub(crate) enum WFMachinesError { #[error("Machine encountered an invalid transition: {0}")] InvalidTransition(&'static str), - - // TODO: Don't really need anyhow here? - #[error("Underlying error {0:?}")] - Underlying(#[from] anyhow::Error), } impl WorkflowMachines { @@ -183,7 +177,7 @@ impl WorkflowMachines { Some(initial_cmd_id) => { // We remove the machine while we it handles events, then return it, to avoid // borrowing from ourself mutably. - let mut maybe_machine = self.machines_by_event_id.remove(&initial_cmd_id); + let maybe_machine = self.machines_by_event_id.remove(&initial_cmd_id); if let Some(sm) = maybe_machine { self.submachine_handle_event(sm, event, has_next_event)?; } else { @@ -210,7 +204,11 @@ impl WorkflowMachines { /// Called when we want to run the event loop because a workflow task started event has /// triggered - pub(super) fn task_started(&mut self, task_started_event_id: i64, time: SystemTime) { + pub(super) fn task_started( + &mut self, + task_started_event_id: i64, + time: SystemTime, + ) -> Result<()> { let s = span!(Level::DEBUG, "Task started trigger"); let _enter = s.enter(); @@ -235,7 +233,8 @@ impl WorkflowMachines { self.current_started_event_id = task_started_event_id; self.set_current_time(time); - self.iterate_machines(); + self.iterate_machines()?; + Ok(()) } /// A command event is an event which is generated from a command emitted as a result of @@ -254,12 +253,12 @@ impl WorkflowMachines { // } event!(Level::DEBUG, msg = "handling command event", current_commands = ?self.commands); - let mut consumed_cmd = loop { + let consumed_cmd = loop { // handleVersionMarker can skip a marker event if the getVersion call was removed. // In this case we don't want to consume a command. 
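How machine responses fan out once a submachine has handled an event: activation jobs head toward the lang side, freshly issued commands rejoin the outgoing command flow, and TriggerWFTaskStarted re-enters task_started. A condensed dispatch with stand-in types:

    enum Response {
        PushWfJob(String),
        IssueNewCommand(String),
        TriggerWfTaskStarted { task_started_event_id: i64 },
    }

    fn dispatch(responses: Vec<Response>, jobs: &mut Vec<String>, commands: &mut Vec<String>) {
        for r in responses {
            match r {
                Response::PushWfJob(job) => jobs.push(job),
                Response::IssueNewCommand(cmd) => commands.push(cmd),
                Response::TriggerWfTaskStarted { task_started_event_id: _ } => {
                    // the real code calls task_started(event_id, time) here
                }
            }
        }
    }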
-- we will need to replace it back // to the front when implementing, or something better let maybe_command = self.commands.pop_front(); - let mut command = if let Some(c) = maybe_command { + let command = if let Some(c) = maybe_command { c } else { return Err(WFMachinesError::NoCommandScheduledForEvent(event.clone())); @@ -345,7 +344,7 @@ impl WorkflowMachines { } } Some(EventType::WorkflowTaskScheduled) => { - let mut wf_task_sm = WorkflowTaskMachine::new(self.workflow_task_started_event_id); + let wf_task_sm = WorkflowTaskMachine::new(self.workflow_task_started_event_id); let key = self.all_machines.insert(Box::new(wf_task_sm)); self.submachine_handle_event(key, event, has_next_event)?; self.machines_by_event_id.insert(event.event_id, key); @@ -423,34 +422,38 @@ impl WorkflowMachines { /// Iterate the state machines, which consists of grabbing any pending outgoing commands from /// the workflow, handling them, and preparing them to be sent off to the server. - pub(crate) fn iterate_machines(&mut self) { + pub(crate) fn iterate_machines(&mut self) -> Result<()> { let results = self.drive_me.fetch_workflow_iteration_output(); - self.handle_driven_results(results); - - self.prepare_commands(); + self.handle_driven_results(results)?; + self.prepare_commands()?; + Ok(()) } /// Wrapper for calling [TemporalStateMachine::handle_event] which appropriately takes action - /// on the returned triggers + /// on the returned machine responses fn submachine_handle_event( &mut self, sm: MachineKey, event: &HistoryEvent, has_next_event: bool, ) -> Result<()> { - let mut sm = self.all_machines.get_mut(sm).expect("Machine must exist"); - let triggers = sm.handle_event(event, has_next_event).map_err(|e| { + let sm = self.all_machines.get_mut(sm).expect("Machine must exist"); + let machine_responses = sm.handle_event(event, has_next_event).map_err(|e| { if let WFMachinesError::MalformedEventDetail(s) = e { WFMachinesError::MalformedEvent(event.clone(), s) } else { e } })?; - if !triggers.is_empty() { - event!(Level::DEBUG, msg = "Machine produced triggers", ?triggers); + if !machine_responses.is_empty() { + event!( + Level::DEBUG, + msg = "Machine produced responses", + ?machine_responses + ); } - for trigger in triggers { - match trigger { + for response in machine_responses { + match response { MachineResponse::PushWFJob(a) => { self.drive_me.on_activation_job(&a); self.outgoing_wf_activation_jobs.push_back(a); @@ -459,7 +462,7 @@ impl WorkflowMachines { task_started_event_id, time, } => { - self.task_started(task_started_event_id, time); + self.task_started(task_started_event_id, time)?; } _ => panic!("TODO: Should anything else be possible here? Probably?"), } @@ -516,16 +519,17 @@ impl WorkflowMachines { /// to the server. While doing so, [TemporalStateMachine::handle_command] is called on the /// machine associated with the command. 
#[instrument(level = "debug", skip(self))] - fn prepare_commands(&mut self) { + fn prepare_commands(&mut self) -> Result<()> { event!(Level::DEBUG, msg = "start prepare_commands", cur_wf_task_cmds = ?self.current_wf_task_commands); - while let Some(mut c) = self.current_wf_task_commands.pop_front() { - // TODO: This conversion sux -- probably add to NewOrExistingCommand - let cmd_type = CommandType::from_i32(c.command.command_type).unwrap(); - self.machine_mut(c.machine).handle_command(cmd_type); + while let Some(c) = self.current_wf_task_commands.pop_front() { + let cmd_type = CommandType::from_i32(c.command.command_type) + .ok_or(WFMachinesError::UnknownCommandType(c.command.command_type))?; + self.machine_mut(c.machine).handle_command(cmd_type)?; self.commands.push_back(c); } event!(Level::DEBUG, msg = "end prepare_commands", commands = ?self.commands); + Ok(()) } fn add_new_machine( diff --git a/src/machines/workflow_task_state_machine.rs b/src/machines/workflow_task_state_machine.rs index 494a6c5e7..085ffff1d 100644 --- a/src/machines/workflow_task_state_machine.rs +++ b/src/machines/workflow_task_state_machine.rs @@ -3,10 +3,7 @@ use crate::machines::workflow_machines::MachineResponse; use crate::machines::Cancellable; use crate::{ - machines::{ - workflow_machines::{WFMachinesError, WorkflowMachines}, - WFMachinesAdapter, - }, + machines::{workflow_machines::WFMachinesError, WFMachinesAdapter}, protos::temporal::api::{ enums::v1::{CommandType, EventType}, history::v1::HistoryEvent, @@ -14,7 +11,6 @@ use crate::{ }; use rustfsm::{fsm, TransitionResult}; use std::{convert::TryFrom, time::SystemTime}; -use tracing::Level; fsm! { pub(super) name WorkflowTaskMachine; diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 2697a019b..7d4ddcc76 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -16,6 +16,9 @@ use temporal_sdk_core::{ // restarted, because pulling from the same task queue produces tasks for the previous failed // workflows. Fix that. +// TODO: We should also get expected histories for these tests and confirm that the history +// at the end matches. 
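prepare_commands now surfaces an unknown numeric command type as a typed error instead of unwrapping. prost-generated enums expose from_i32 returning Option, so the shape is from_i32 plus ok_or_else; a stand-in version:

    #[derive(Debug)]
    enum CmdType {
        StartTimer,
        CancelTimer,
    }

    fn cmd_type_from_i32(v: i32) -> Option<CmdType> {
        match v {
            1 => Some(CmdType::StartTimer), // wire values are illustrative
            2 => Some(CmdType::CancelTimer),
            _ => None,
        }
    }

    fn classify(raw: i32) -> Result<CmdType, String> {
        cmd_type_from_i32(raw).ok_or_else(|| format!("Command type {} is not known", raw))
    }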
+ const NAMESPACE: &str = "default"; #[tokio::main] From f6884236426da4dfd189aa67b7777db7f66f6f38 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Tue, 23 Feb 2021 23:00:30 -0800 Subject: [PATCH 45/51] Use shutdown of workflow managers in core shutdown --- src/lib.rs | 51 +++++++++++++++-------------- src/workflow/concurrency_manager.rs | 12 ++++--- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 43e1c995c..17327ba4b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -239,6 +239,7 @@ where fn shutdown(&self) -> Result<(), CoreError> { self.shutdown_requested.store(true, Ordering::SeqCst); + self.workflow_machines.shutdown(); Ok(()) } } @@ -331,9 +332,8 @@ pub enum CoreError { #[cfg(test)] mod test { use super::*; - use crate::machines::test_help::FakeCore; use crate::{ - machines::test_help::{build_fake_core, TestHistoryBuilder}, + machines::test_help::{build_fake_core, FakeCore, TestHistoryBuilder}, protos::{ coresdk::{ wf_activation_job, TaskCompletion, TimerFiredTaskAttributes, WfActivationJob, @@ -349,14 +349,15 @@ mod test { }, }, }; - use rstest::rstest; + use rstest::{fixture, rstest}; - #[rstest(hist_batches, case::incremental(&[1, 2]), case::replay(&[2]))] - fn single_timer_test_across_wf_bridge(hist_batches: &[usize]) { + const TASK_Q: &str = "test-task-queue"; + const RUN_ID: &str = "fake_run_id"; + + #[fixture(hist_batches=&[])] + fn single_timer_setup(hist_batches: &[usize]) -> FakeCore { let wfid = "fake_wf_id"; - let run_id = "fake_run_id"; let timer_id = "fake_timer".to_string(); - let task_queue = "test-task-queue"; let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); @@ -366,7 +367,7 @@ mod test { EventType::TimerFired, history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { started_event_id: timer_started_event_id, - timer_id: timer_id.clone(), + timer_id, }), ); t.add_workflow_task_scheduled_and_started(); @@ -382,21 +383,28 @@ mod test { 8: EVENT_TYPE_WORKFLOW_TASK_STARTED --- */ - let core = build_fake_core(wfid, run_id, &mut t, hist_batches); + let core = build_fake_core(wfid, RUN_ID, &mut t, hist_batches); + core + } - let res = core.poll_task(task_queue).unwrap(); + #[rstest(core, + case::incremental(single_timer_setup(&[1, 2])), + case::replay(single_timer_setup(&[2])) + )] + fn single_timer_test_across_wf_bridge(core: FakeCore) { + let res = core.poll_task(TASK_Q).unwrap(); assert_matches!( res.get_wf_jobs().as_slice(), [WfActivationJob { attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)), }] ); - assert!(core.workflow_machines.exists(run_id)); + assert!(core.workflow_machines.exists(RUN_ID)); let task_tok = res.task_token; core.complete_task(TaskCompletion::ok_from_api_attrs( vec![StartTimerCommandAttributes { - timer_id, + timer_id: "fake_timer".to_string(), ..Default::default() } .into()], @@ -404,7 +412,7 @@ mod test { )) .unwrap(); - let res = core.poll_task(task_queue).unwrap(); + let res = core.poll_task(TASK_Q).unwrap(); assert_matches!( res.get_wf_jobs().as_slice(), [WfActivationJob { @@ -598,21 +606,16 @@ mod test { task_tok, )) .unwrap(); - core } - #[rstest] - fn single_timer_whole_replay_test_across_wf_bridge(_single_timer_whole_replay: FakeCore) { - // Nothing to do here -- whole real test is in fixture. 
Rstest properly handles leading `_` - } + #[rstest(single_timer_setup(&[1]))] + fn after_shutdown_server_is_not_polled(single_timer_setup: FakeCore) { + let res = single_timer_setup.poll_task(TASK_Q).unwrap(); + assert_eq!(res.get_wf_jobs().len(), 1); - #[rstest] - fn after_shutdown_server_is_not_polled(single_timer_whole_replay: FakeCore) { - single_timer_whole_replay.shutdown().unwrap(); + single_timer_setup.shutdown().unwrap(); assert_matches!( - single_timer_whole_replay - .poll_task("irrelevant") - .unwrap_err(), + single_timer_setup.poll_task(TASK_Q).unwrap_err(), CoreError::ShuttingDown ); } diff --git a/src/workflow/concurrency_manager.rs b/src/workflow/concurrency_manager.rs index 1daaca3b4..c9a40f467 100644 --- a/src/workflow/concurrency_manager.rs +++ b/src/workflow/concurrency_manager.rs @@ -10,6 +10,7 @@ use crossbeam::channel::{bounded, unbounded, Receiver, Select, Sender, TryRecvEr use dashmap::DashMap; use std::{ fmt::Debug, + sync::Mutex, thread::{self, JoinHandle}, }; use tracing::Level; @@ -22,7 +23,7 @@ pub(crate) struct WorkflowConcurrencyManager { // in core SDK yet either - once we're ready to remove things, they can be removed from this // map and the wfm thread will drop the machines. machines: DashMap, - wf_thread: JoinHandle<()>, + wf_thread: Mutex>>, machine_creator: Sender, shutdown_chan: Sender, } @@ -51,7 +52,7 @@ impl WorkflowConcurrencyManager { Self { machines: Default::default(), - wf_thread, + wf_thread: Mutex::new(Some(wf_thread)), machine_creator, shutdown_chan, } @@ -119,10 +120,13 @@ impl WorkflowConcurrencyManager { /// /// # Panics /// If the workflow machine thread panicked - #[allow(unused)] // TODO: Will be used when other shutdown PR is merged - pub fn shutdown(self) { + pub fn shutdown(&self) { let _ = self.shutdown_chan.send(true); self.wf_thread + .lock() + .expect("Workflow manager thread mutex must be lockable") + .take() + .unwrap() .join() .expect("Workflow manager thread should shut down cleanly"); } From ce8600d557bb597723443682945627384323fe5f Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 24 Feb 2021 09:54:16 -0800 Subject: [PATCH 46/51] Remove probably unneeded todo --- src/protos/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/protos/mod.rs b/src/protos/mod.rs index 6bf344a59..0893a5e9c 100644 --- a/src/protos/mod.rs +++ b/src/protos/mod.rs @@ -156,7 +156,6 @@ pub mod temporal { if event.is_final_wf_execution_event() { // If the workflow is complete, we're done. - // TODO: Should we throw err if next event is populated? 
return Ok(count); } From a8cabe8ea10eeec4029b5af886d3fbfdd50157b5 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 24 Feb 2021 10:30:41 -0800 Subject: [PATCH 47/51] Dedupe histories in tests --- src/lib.rs | 124 +++++----------------------- src/machines/timer_state_machine.rs | 68 ++------------- src/machines/workflow_machines.rs | 3 +- src/protosext/history_info.rs | 31 +------ src/test_help/canned_histories.rs | 104 +++++++++++++++++++++++ src/test_help/mod.rs | 1 + 6 files changed, 136 insertions(+), 195 deletions(-) create mode 100644 src/test_help/canned_histories.rs create mode 100644 src/test_help/mod.rs diff --git a/src/lib.rs b/src/lib.rs index 17327ba4b..bd783a473 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,9 @@ mod pollers; mod protosext; mod workflow; +#[cfg(test)] +mod test_help; + pub use pollers::{ServerGateway, ServerGatewayApis, ServerGatewayOptions}; pub use url::Url; @@ -333,21 +336,17 @@ pub enum CoreError { mod test { use super::*; use crate::{ - machines::test_help::{build_fake_core, FakeCore, TestHistoryBuilder}, + machines::test_help::{build_fake_core, FakeCore}, protos::{ coresdk::{ wf_activation_job, TaskCompletion, TimerFiredTaskAttributes, WfActivationJob, }, - temporal::api::{ - command::v1::{ - CancelTimerCommandAttributes, CompleteWorkflowExecutionCommandAttributes, - StartTimerCommandAttributes, - }, - enums::v1::EventType, - history::v1::TimerCanceledEventAttributes, - history::v1::{history_event, TimerFiredEventAttributes}, + temporal::api::command::v1::{ + CancelTimerCommandAttributes, CompleteWorkflowExecutionCommandAttributes, + StartTimerCommandAttributes, }, }, + test_help::canned_histories, }; use rstest::{fixture, rstest}; @@ -357,32 +356,8 @@ mod test { #[fixture(hist_batches=&[])] fn single_timer_setup(hist_batches: &[usize]) -> FakeCore { let wfid = "fake_wf_id"; - let timer_id = "fake_timer".to_string(); - - let mut t = TestHistoryBuilder::default(); - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id, - }), - ); - t.add_workflow_task_scheduled_and_started(); - /* - 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED - 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 3: EVENT_TYPE_WORKFLOW_TASK_STARTED - --- - 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED - 5: EVENT_TYPE_TIMER_STARTED - 6: EVENT_TYPE_TIMER_FIRED - 7: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 8: EVENT_TYPE_WORKFLOW_TASK_STARTED - --- - */ + + let mut t = canned_histories::single_timer("fake_timer"); let core = build_fake_core(wfid, RUN_ID, &mut t, hist_batches); core } @@ -431,44 +406,11 @@ mod test { fn parallel_timer_test_across_wf_bridge(hist_batches: &[usize]) { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; - let timer_1_id = "timer1".to_string(); - let timer_2_id = "timer2".to_string(); + let timer_1_id = "timer1"; + let timer_2_id = "timer2"; let task_queue = "test-task-queue"; - let mut t = TestHistoryBuilder::default(); - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - let timer_2_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: 
timer_started_event_id, - timer_id: timer_1_id.clone(), - }), - ); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_2_started_event_id, - timer_id: timer_2_id.clone(), - }), - ); - t.add_workflow_task_scheduled_and_started(); - /* - 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED - 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 3: EVENT_TYPE_WORKFLOW_TASK_STARTED - --- - 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED - 5: EVENT_TYPE_TIMER_STARTED - 6: EVENT_TYPE_TIMER_STARTED - 7: EVENT_TYPE_TIMER_FIRED - 8: EVENT_TYPE_TIMER_FIRED - 9: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 10: EVENT_TYPE_WORKFLOW_TASK_STARTED - --- - */ + let mut t = canned_histories::parallel_timer(timer_1_id, timer_2_id); let core = build_fake_core(wfid, run_id, &mut t, hist_batches); let res = core.poll_task(task_queue).unwrap(); @@ -484,12 +426,12 @@ mod test { core.complete_task(TaskCompletion::ok_from_api_attrs( vec![ StartTimerCommandAttributes { - timer_id: timer_1_id.clone(), + timer_id: timer_1_id.to_string(), ..Default::default() } .into(), StartTimerCommandAttributes { - timer_id: timer_2_id.clone(), + timer_id: timer_2_id.to_string(), ..Default::default() } .into(), @@ -529,35 +471,11 @@ mod test { fn timer_cancel_test_across_wf_bridge(hist_batches: &[usize]) { let wfid = "fake_wf_id"; let run_id = "fake_run_id"; - let timer_id = "wait_timer".to_string(); - let cancel_timer_id = "cancel_timer".to_string(); + let timer_id = "wait_timer"; + let cancel_timer_id = "cancel_timer"; let task_queue = "test-task-queue"; - let mut t = TestHistoryBuilder::default(); - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let wait_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); - let cancel_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: wait_timer_started_id, - timer_id: timer_id.clone(), - }), - ); - // 8 - t.add_full_wf_task(); - // 11 - t.add( - EventType::TimerCanceled, - history_event::Attributes::TimerCanceledEventAttributes(TimerCanceledEventAttributes { - started_event_id: cancel_timer_started_id, - timer_id: cancel_timer_id.clone(), - ..Default::default() - }), - ); - // 12 - t.add_workflow_execution_completed(); + let mut t = canned_histories::cancel_timer(timer_id, cancel_timer_id); let core = build_fake_core(wfid, run_id, &mut t, hist_batches); let res = core.poll_task(task_queue).unwrap(); @@ -573,12 +491,12 @@ mod test { core.complete_task(TaskCompletion::ok_from_api_attrs( vec![ StartTimerCommandAttributes { - timer_id, + timer_id: cancel_timer_id.to_string(), ..Default::default() } .into(), StartTimerCommandAttributes { - timer_id: cancel_timer_id.clone(), + timer_id: timer_id.to_string(), ..Default::default() } .into(), @@ -598,7 +516,7 @@ mod test { core.complete_task(TaskCompletion::ok_from_api_attrs( vec![ CancelTimerCommandAttributes { - timer_id: cancel_timer_id, + timer_id: cancel_timer_id.to_string(), } .into(), CompleteWorkflowExecutionCommandAttributes { result: None }.into(), diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 834b14b71..5f91bf20f 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -263,10 +263,8 @@ mod test { test_help::{CommandSender, TestHistoryBuilder, TestWorkflowDriver}, workflow_machines::WorkflowMachines, }, - 
protos::temporal::api::{ - command::v1::CompleteWorkflowExecutionCommandAttributes, - history::v1::{TimerCanceledEventAttributes, TimerFiredEventAttributes}, - }, + protos::temporal::api::command::v1::CompleteWorkflowExecutionCommandAttributes, + test_help::canned_histories, }; use rstest::{fixture, rstest}; use std::time::Duration; @@ -276,15 +274,6 @@ mod test { fn fire_happy_hist() -> (TestHistoryBuilder, WorkflowMachines) { crate::core_tracing::tracing_init(); /* - 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED - 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 3: EVENT_TYPE_WORKFLOW_TASK_STARTED - 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED - 5: EVENT_TYPE_TIMER_STARTED - 6: EVENT_TYPE_TIMER_FIRED - 7: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 8: EVENT_TYPE_WORKFLOW_TASK_STARTED - We have two versions of this test, one which processes the history in two calls, and one which replays all of it in one go. The former will run the event loop three times total, and the latter two. @@ -305,21 +294,10 @@ mod test { command_sink.send(complete.into()); }); - let mut t = TestHistoryBuilder::default(); + let t = canned_histories::single_timer("timer1"); let state_machines = WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id: "timer1".to_string(), - }), - ); - t.add_workflow_task_scheduled_and_started(); assert_eq!(2, t.as_history().get_workflow_task_count(None).unwrap()); (t, state_machines) } @@ -374,21 +352,10 @@ mod test { command_sink.timer(timer, true); }); - let mut t = TestHistoryBuilder::default(); + let t = canned_histories::single_timer("badid"); let mut state_machines = WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id: "badid".to_string(), - }), - ); - t.add_workflow_task_scheduled_and_started(); assert!(t .handle_workflow_task_take_cmds(&mut state_machines, None) .unwrap_err() @@ -422,34 +389,9 @@ mod test { cmd_sink.send(complete.into()); }); - let mut t = TestHistoryBuilder::default(); + let t = canned_histories::cancel_timer("wait_timer", "cancel_timer"); let state_machines = WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); - - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let cancel_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); - let wait_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: wait_timer_started_id, - timer_id: "wait_timer".to_string(), - }), - ); - // 8 - t.add_full_wf_task(); - // 11 - t.add( - EventType::TimerCanceled, - history_event::Attributes::TimerCanceledEventAttributes(TimerCanceledEventAttributes { - started_event_id: cancel_timer_started_id, - timer_id: "cancel_timer".to_string(), - ..Default::default() - }), - ); - // 12 - 
t.add_workflow_execution_completed(); (t, state_machines) } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index cb2dd9a48..684053f52 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -382,7 +382,8 @@ impl WorkflowMachines { } /// Returns the next activation that needs to be performed by the lang sdk. Things like unblock - /// timer, etc. + /// timer, etc. This does *not* cause any advancement of the state machines, it merely drains + /// from the outgoing queue of activation jobs. pub(crate) fn get_wf_activation(&mut self) -> Option { if self.outgoing_wf_activation_jobs.is_empty() { None diff --git a/src/protosext/history_info.rs b/src/protosext/history_info.rs index 7a98f579e..7d831af0d 100644 --- a/src/protosext/history_info.rs +++ b/src/protosext/history_info.rs @@ -165,37 +165,12 @@ impl HistoryInfo { #[cfg(test)] mod tests { - use super::*; - use crate::{ - machines::test_help::TestHistoryBuilder, - protos::temporal::api::history::v1::{history_event, TimerFiredEventAttributes}, - }; + use crate::test_help::canned_histories; #[test] fn history_info_constructs_properly() { - /* - 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED - 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 3: EVENT_TYPE_WORKFLOW_TASK_STARTED - 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED - 5: EVENT_TYPE_TIMER_STARTED - 6: EVENT_TYPE_TIMER_FIRED - 7: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 8: EVENT_TYPE_WORKFLOW_TASK_STARTED - */ - let mut t = TestHistoryBuilder::default(); - - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_full_wf_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id: "timer1".to_string(), - }), - ); - t.add_workflow_task_scheduled_and_started(); + let t = canned_histories::single_timer("timer1"); + let history_info = t.get_history_info(1).unwrap(); assert_eq!(3, history_info.events.len()); let history_info = t.get_history_info(2).unwrap(); diff --git a/src/test_help/canned_histories.rs b/src/test_help/canned_histories.rs new file mode 100644 index 000000000..a93ea0ce1 --- /dev/null +++ b/src/test_help/canned_histories.rs @@ -0,0 +1,104 @@ +use crate::machines::test_help::TestHistoryBuilder; +use crate::protos::temporal::api::enums::v1::EventType; +use crate::protos::temporal::api::history::v1::{ + history_event, TimerCanceledEventAttributes, TimerFiredEventAttributes, +}; + +/// 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED +/// 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 3: EVENT_TYPE_WORKFLOW_TASK_STARTED +/// 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED +/// 5: EVENT_TYPE_TIMER_STARTED +/// 6: EVENT_TYPE_TIMER_FIRED +/// 7: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 8: EVENT_TYPE_WORKFLOW_TASK_STARTED +pub fn single_timer(timer_id: &str) -> TestHistoryBuilder { + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_full_wf_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_started_event_id, + timer_id: timer_id.to_string(), + }), + ); + t.add_workflow_task_scheduled_and_started(); + t +} + +/// 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED +/// 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 3: 
EVENT_TYPE_WORKFLOW_TASK_STARTED +/// 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED +/// 5: EVENT_TYPE_TIMER_STARTED (cancel) +/// 6: EVENT_TYPE_TIMER_STARTED (wait) +/// 7: EVENT_TYPE_TIMER_FIRED (wait) +/// 8: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 9: EVENT_TYPE_WORKFLOW_TASK_STARTED +/// 10: EVENT_TYPE_WORKFLOW_TASK_COMPLETED +/// 11: EVENT_TYPE_TIMER_CANCELED (cancel) +/// 12: EVENT_TYPE_WORKFLOW_EXECUTION_COMPLETED +pub fn cancel_timer(wait_timer_id: &str, cancel_timer_id: &str) -> TestHistoryBuilder { + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_full_wf_task(); + let cancel_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); + let wait_timer_started_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: wait_timer_started_id, + timer_id: wait_timer_id.to_string(), + }), + ); + // 8 + t.add_full_wf_task(); + // 11 + t.add( + EventType::TimerCanceled, + history_event::Attributes::TimerCanceledEventAttributes(TimerCanceledEventAttributes { + started_event_id: cancel_timer_started_id, + timer_id: cancel_timer_id.to_string(), + ..Default::default() + }), + ); + // 12 + t.add_workflow_execution_completed(); + t +} + +/// 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED +/// 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 3: EVENT_TYPE_WORKFLOW_TASK_STARTED +/// 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED +/// 5: EVENT_TYPE_TIMER_STARTED +/// 6: EVENT_TYPE_TIMER_STARTED +/// 7: EVENT_TYPE_TIMER_FIRED +/// 8: EVENT_TYPE_TIMER_FIRED +/// 9: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 10: EVENT_TYPE_WORKFLOW_TASK_STARTED +pub fn parallel_timer(timer1: &str, timer2: &str) -> TestHistoryBuilder { + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_full_wf_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + let timer_2_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_started_event_id, + timer_id: timer1.to_string(), + }), + ); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_2_started_event_id, + timer_id: timer2.to_string(), + }), + ); + t.add_workflow_task_scheduled_and_started(); + t +} diff --git a/src/test_help/mod.rs b/src/test_help/mod.rs new file mode 100644 index 000000000..e0fd84f78 --- /dev/null +++ b/src/test_help/mod.rs @@ -0,0 +1 @@ +pub mod canned_histories; From 68ca783ad3bfb42d7b621489e25f37c9ac9d823b Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 24 Feb 2021 15:20:06 -0800 Subject: [PATCH 48/51] Fix up merge problems --- src/lib.rs | 36 ++------------------- src/machines/test_help/history_builder.rs | 12 +++---- src/machines/timer_state_machine.rs | 12 +++++-- src/machines/workflow_machines.rs | 14 ++++---- src/machines/workflow_task_state_machine.rs | 5 +-- src/test_help/canned_histories.rs | 32 +++++++++++++++++- 6 files changed, 56 insertions(+), 55 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0e0eefed8..af5be324a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -545,40 +545,10 @@ mod test { let wfid = "fake_wf_id"; let run_id = "CA733AB0-8133-45F6-A4C1-8D375F61AE8B"; let original_run_id = "86E39A5F-AE31-4626-BDFE-398EE072D156"; - 
let timer_1_id = "timer1".to_string(); + let timer_1_id = "timer1"; let task_queue = "test-task-queue"; - /* - 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED - 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 3: EVENT_TYPE_WORKFLOW_TASK_STARTED - 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED - 5: EVENT_TYPE_TIMER_STARTED - 6: EVENT_TYPE_TIMER_FIRED - 7: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 8: EVENT_TYPE_WORKFLOW_TASK_STARTED - 9: EVENT_TYPE_WORKFLOW_TASK_FAILED - 10: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED - 11: EVENT_TYPE_WORKFLOW_TASK_STARTED - */ - let mut t = TestHistoryBuilder::default(); - t.add_by_type(EventType::WorkflowExecutionStarted); - t.add_workflow_task(); - let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); - t.add( - EventType::TimerFired, - history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { - started_event_id: timer_started_event_id, - timer_id: timer_1_id.clone(), - }), - ); - t.add_workflow_task_scheduled_and_started(); - t.add_workflow_task_failed(WorkflowTaskFailedCause::ResetWorkflow, original_run_id); - - t.add_workflow_task_scheduled_and_started(); - - // NOTE! What makes this a replay test is the server only responds with *one* batch here. - // So, server is polled once, but lang->core interactions look just like non-replay test. + let mut t = canned_histories::workflow_fails_after_timer(timer_1_id, original_run_id); let core = build_fake_core(wfid, run_id, &mut t, &[2]); let res = core.poll_task(task_queue).unwrap(); @@ -598,7 +568,7 @@ mod test { let task_tok = res.task_token; core.complete_task(TaskCompletion::ok_from_api_attrs( vec![StartTimerCommandAttributes { - timer_id: timer_1_id, + timer_id: timer_1_id.to_string(), ..Default::default() } .into()], diff --git a/src/machines/test_help/history_builder.rs b/src/machines/test_help/history_builder.rs index 58f7e2a93..e81743a10 100644 --- a/src/machines/test_help/history_builder.rs +++ b/src/machines/test_help/history_builder.rs @@ -1,16 +1,12 @@ use super::Result; -use crate::protos::temporal::api::enums::v1::WorkflowTaskFailedCause; -use crate::protos::temporal::api::history::v1::{ - History, WorkflowExecutionCompletedEventAttributes, -}; -use crate::protos::temporal::api::history::v1::{History, WorkflowTaskFailedEventAttributes}; use crate::{ machines::{workflow_machines::WorkflowMachines, ProtoCommand}, protos::temporal::api::{ - enums::v1::EventType, + enums::v1::{EventType, WorkflowTaskFailedCause}, history::v1::{ - history_event::Attributes, HistoryEvent, TimerStartedEventAttributes, - WorkflowExecutionStartedEventAttributes, WorkflowTaskCompletedEventAttributes, + history_event::Attributes, History, HistoryEvent, TimerStartedEventAttributes, + WorkflowExecutionCompletedEventAttributes, WorkflowExecutionStartedEventAttributes, + WorkflowTaskCompletedEventAttributes, WorkflowTaskFailedEventAttributes, WorkflowTaskScheduledEventAttributes, WorkflowTaskStartedEventAttributes, }, }, diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index c32504d1b..b7800f2e6 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -276,9 +276,15 @@ mod test { fn fire_happy_hist() -> (TestHistoryBuilder, WorkflowMachines) { crate::core_tracing::tracing_init(); /* - We have two versions of this test, one which processes the history in two calls, - and one which replays all of it in one go. Both versions must produce the same - two activations. 
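The comment being rewritten in this hunk contrasts incremental history processing with whole-history replay. A tiny illustrative sketch of the invariant both schedules share, namely that chunking must not change what is ultimately seen; the function and data here are made up for illustration.

    // Feed the same event list to a consumer in one or more "poll" batches; the
    // events seen must be identical regardless of how the history was chunked.
    fn feed_in_batches(events: &[u32], batch_ends: &[usize]) -> Vec<u32> {
        let mut seen = Vec::new();
        let mut start = 0;
        for &end in batch_ends {
            seen.extend_from_slice(&events[start..end]);
            start = end;
        }
        seen
    }

    fn main() {
        let history = [1, 2, 3, 4, 5, 6, 7, 8];
        // Incremental: two server responses, split at a workflow task boundary.
        // Replay: everything in one response.
        assert_eq!(
            feed_in_batches(&history, &[3, 8]),
            feed_in_batches(&history, &[8])
        );
    }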
+ We have two versions of this test, one which processes the history in two calls, and one + which replays all of it in one go. Both versions must produce the same two activations. + However, The former will iterate the machines three times and the latter will iterate + them twice. + + There are two workflow tasks, so it seems we should iterate two times, but the reason + for the extra iteration in the incremental version is that we need to "wait" for the + timer to fire. In the all-in-one-go test, the timer is created and resolved in the same + task, hence no extra loop. */ let twd = TestWorkflowDriver::new(|mut command_sink: CommandSender| async move { let timer = StartTimerCommandAttributes { diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index d739a6955..e9389ee7e 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -1,4 +1,3 @@ -use crate::machines::workflow_machines::WFMachinesError::MalformedEvent; use crate::{ machines::{ complete_workflow_state_machine::complete_workflow, timer_state_machine::new_timer, @@ -7,8 +6,8 @@ use crate::{ }, protos::{ coresdk::{ - wf_activation_job, wf_activation_job::Attributes::RandomSeedUpdated, - RandomSeedUpdatedAttributes, StartWorkflowTaskAttributes, WfActivation, + wf_activation_job, RandomSeedUpdatedAttributes, StartWorkflowTaskAttributes, + WfActivation, }, temporal::api::{ command::v1::{command, StartTimerCommandAttributes}, @@ -20,11 +19,11 @@ use crate::{ use slotmap::SlotMap; use std::{ borrow::{Borrow, BorrowMut}, - collections::{HashMap, VecDeque}, + collections::{hash_map::DefaultHasher, HashMap, VecDeque}, + hash::{Hash, Hasher}, time::SystemTime, }; use tracing::Level; -use uuid::Uuid; type Result = std::result::Result; @@ -476,7 +475,7 @@ impl WorkflowMachines { } => { self.task_started(task_started_event_id, time)?; } - WorkflowTrigger::UpdateRunIdOnWorkflowReset { run_id: new_run_id } => { + MachineResponse::UpdateRunIdOnWorkflowReset { run_id: new_run_id } => { self.outgoing_wf_activation_jobs.push_back( wf_activation_job::Attributes::RandomSeedUpdated( RandomSeedUpdatedAttributes { @@ -485,6 +484,9 @@ impl WorkflowMachines { ), ); } + MachineResponse::IssueNewCommand(_) => { + panic!("Issue new command machine response not expected here") + } } } Ok(()) diff --git a/src/machines/workflow_task_state_machine.rs b/src/machines/workflow_task_state_machine.rs index 015b7cb4b..70a0469f6 100644 --- a/src/machines/workflow_task_state_machine.rs +++ b/src/machines/workflow_task_state_machine.rs @@ -11,10 +11,7 @@ use crate::{ }, }; use rustfsm::{fsm, TransitionResult}; -use std::panic::resume_unwind; use std::{convert::TryFrom, time::SystemTime}; -use tracing::Level; -use uuid::Uuid; fsm! 
{ pub(super) name WorkflowTaskMachine; @@ -87,7 +84,7 @@ impl WFMachinesAdapter for WorkflowTaskMachine { }]) } WFTaskMachineCommand::RunIdOnWorkflowResetUpdate { run_id } => { - Ok(vec![WorkflowTrigger::UpdateRunIdOnWorkflowReset { run_id }]) + Ok(vec![MachineResponse::UpdateRunIdOnWorkflowReset { run_id }]) } } } diff --git a/src/test_help/canned_histories.rs b/src/test_help/canned_histories.rs index a93ea0ce1..b2b43341f 100644 --- a/src/test_help/canned_histories.rs +++ b/src/test_help/canned_histories.rs @@ -1,5 +1,5 @@ use crate::machines::test_help::TestHistoryBuilder; -use crate::protos::temporal::api::enums::v1::EventType; +use crate::protos::temporal::api::enums::v1::{EventType, WorkflowTaskFailedCause}; use crate::protos::temporal::api::history::v1::{ history_event, TimerCanceledEventAttributes, TimerFiredEventAttributes, }; @@ -102,3 +102,33 @@ pub fn parallel_timer(timer1: &str, timer2: &str) -> TestHistoryBuilder { t.add_workflow_task_scheduled_and_started(); t } + +/// 1: EVENT_TYPE_WORKFLOW_EXECUTION_STARTED +/// 2: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 3: EVENT_TYPE_WORKFLOW_TASK_STARTED +/// 4: EVENT_TYPE_WORKFLOW_TASK_COMPLETED +/// 5: EVENT_TYPE_TIMER_STARTED +/// 6: EVENT_TYPE_TIMER_FIRED +/// 7: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 8: EVENT_TYPE_WORKFLOW_TASK_STARTED +/// 9: EVENT_TYPE_WORKFLOW_TASK_FAILED +/// 10: EVENT_TYPE_WORKFLOW_TASK_SCHEDULED +/// 11: EVENT_TYPE_WORKFLOW_TASK_STARTED +pub fn workflow_fails_after_timer(timer_id: &str, original_run_id: &str) -> TestHistoryBuilder { + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_full_wf_task(); + let timer_started_event_id = t.add_get_event_id(EventType::TimerStarted, None); + t.add( + EventType::TimerFired, + history_event::Attributes::TimerFiredEventAttributes(TimerFiredEventAttributes { + started_event_id: timer_started_event_id, + timer_id: timer_id.to_string(), + }), + ); + t.add_workflow_task_scheduled_and_started(); + t.add_workflow_task_failed(WorkflowTaskFailedCause::ResetWorkflow, original_run_id); + + t.add_workflow_task_scheduled_and_started(); + t +} From 1e2f7c7fb283d71ee0bab1276c08c3510be45dc8 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 24 Feb 2021 17:00:07 -0800 Subject: [PATCH 49/51] Add UT for cancelling before sending to server --- src/machines/timer_state_machine.rs | 51 +++++++++++++++++---- src/machines/workflow_machines.rs | 29 +++++------- src/machines/workflow_task_state_machine.rs | 2 - 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index b7800f2e6..4c0a06e99 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -244,12 +244,11 @@ impl WFMachinesAdapter for TimerMachine { impl Cancellable for TimerMachine { fn cancel(&mut self) -> Result> { - match self.on_event_mut(TimerMachineEvents::Cancel)?.pop() { - Some(TimerMachineCommand::IssueCancelCmd(cmd)) => { - Ok(MachineResponse::IssueNewCommand(cmd)) - } + Ok(match self.on_event_mut(TimerMachineEvents::Cancel)?.pop() { + Some(TimerMachineCommand::IssueCancelCmd(cmd)) => MachineResponse::IssueNewCommand(cmd), + Some(TimerMachineCommand::Canceled) => MachineResponse::NoOp, x => panic!(format!("Invalid cancel event response {:?}", x)), - } + }) } fn was_cancelled_before_sent_to_server(&self) -> bool { @@ -419,11 +418,6 @@ mod test { commands[1].command_type, CommandType::CompleteWorkflowExecution as i32 ); - // TODO in Java no commands are 
prepared or anything for 11 and 12 - // but I'm screwing up on event 10, the last WFTC. - // Problem seems to be timer machine's cancel gets called a second time, when it shouldn't - // be, which might really just be a problem with the way the test is driven, as it looks - // like no commands should be emitted on last (4th) iteration of wf. Think that's it. let commands = t .handle_workflow_task_take_cmds(&mut state_machines, None) .unwrap(); @@ -443,4 +437,41 @@ mod test { // There should be no commands - the wf completed at the same time the timer was cancelled assert_eq!(commands.len(), 0); } + + #[test] + fn cancel_before_sent_to_server() { + crate::core_tracing::tracing_init(); + + let twd = TestWorkflowDriver::new(|mut cmd_sink: CommandSender| async move { + cmd_sink.timer( + StartTimerCommandAttributes { + timer_id: "cancel_timer".to_string(), + start_to_fire_timeout: Some(Duration::from_secs(500).into()), + }, + false, + ); + // Immediately cancel the timer + cmd_sink.cancel_timer("cancel_timer"); + + let complete = CompleteWorkflowExecutionCommandAttributes::default(); + cmd_sink.send(complete.into()); + }); + + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_full_wf_task(); + t.add_workflow_task_scheduled_and_started(); + + let mut state_machines = + WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); + + let commands = t + .handle_workflow_task_take_cmds(&mut state_machines, None) + .unwrap(); + assert_eq!(commands.len(), 1); + assert_eq!( + commands[0].command_type, + CommandType::CompleteWorkflowExecution as i32 + ); + } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index e9389ee7e..280c90d75 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -10,7 +10,6 @@ use crate::{ WfActivation, }, temporal::api::{ - command::v1::{command, StartTimerCommandAttributes}, enums::v1::{CommandType, EventType}, history::v1::{history_event, HistoryEvent}, }, @@ -94,6 +93,7 @@ pub enum MachineResponse { UpdateRunIdOnWorkflowReset { run_id: String, }, + NoOp, } #[derive(thiserror::Error, Debug)] @@ -297,20 +297,6 @@ impl WorkflowMachines { if !self.machine(consumed_cmd.machine).is_final_state() { self.machines_by_event_id .insert(event.event_id, consumed_cmd.machine); - // Additionally, some command types have user-created identifiers that may need to - // be associated with the event id, so that when (ex) a request to cancel them is - // issued we can identify them. - if let ProtoCommand { - attributes: - Some(command::Attributes::StartTimerCommandAttributes( - StartTimerCommandAttributes { timer_id, .. }, - )), - .. - } = consumed_cmd.command - { - self.timer_id_to_machine - .insert(timer_id, consumed_cmd.machine); - } } Ok(()) @@ -484,6 +470,7 @@ impl WorkflowMachines { ), ); } + MachineResponse::NoOp => (), MachineResponse::IssueNewCommand(_) => { panic!("Issue new command machine response not expected here") } @@ -494,11 +481,11 @@ impl WorkflowMachines { fn handle_driven_results(&mut self, results: Vec) -> Result<()> { for cmd in results { - // I don't love how boilerplate this is for just pushing new commands, and how - // weird it feels for cancels. 
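The dispatch just below routes lang-side commands to their machines, and cancelling a timer requires mapping the user-visible timer id back to the machine that owns it. A small sketch of that bookkeeping with a plain HashMap; the struct and key type are simplified stand-ins, not the crate's real definitions.

    use std::collections::HashMap;

    #[derive(Default)]
    struct Machines {
        next_key: usize,
        timer_id_to_machine: HashMap<String, usize>,
    }

    impl Machines {
        /// Register a new timer machine and remember which machine owns the
        /// user-visible timer id, so a later cancel command can find it.
        fn add_timer(&mut self, timer_id: &str) -> usize {
            let key = self.next_key;
            self.next_key += 1;
            self.timer_id_to_machine.insert(timer_id.to_string(), key);
            key
        }

        /// Look up the machine that owns `timer_id`, if any.
        fn machine_for_cancel(&self, timer_id: &str) -> Option<usize> {
            self.timer_id_to_machine.get(timer_id).copied()
        }
    }

    fn main() {
        let mut machines = Machines::default();
        let key = machines.add_timer("cancel_timer");
        assert_eq!(machines.machine_for_cancel("cancel_timer"), Some(key));
        assert_eq!(machines.machine_for_cancel("unknown"), None);
    }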
match cmd { WFCommand::AddTimer(attrs) => { + let tid = attrs.timer_id.clone(); let timer = self.add_new_machine(new_timer(attrs)); + self.timer_id_to_machine.insert(tid, timer.machine); self.current_wf_task_commands.push_back(timer); } WFCommand::CancelTimer(attrs) => { @@ -519,6 +506,7 @@ impl WorkflowMachines { machine: mkey, }) } + MachineResponse::NoOp => {} v => { return Err(WFMachinesError::UnexpectedMachineResponse( v, @@ -547,7 +535,12 @@ impl WorkflowMachines { while let Some(c) = self.current_wf_task_commands.pop_front() { let cmd_type = CommandType::from_i32(c.command.command_type) .ok_or(WFMachinesError::UnknownCommandType(c.command.command_type))?; - self.machine_mut(c.machine).handle_command(cmd_type)?; + if !self + .machine(c.machine) + .was_cancelled_before_sent_to_server() + { + self.machine_mut(c.machine).handle_command(cmd_type)?; + } self.commands.push_back(c); } event!(Level::DEBUG, msg = "end prepare_commands", commands = ?self.commands); diff --git a/src/machines/workflow_task_state_machine.rs b/src/machines/workflow_task_state_machine.rs index 70a0469f6..1fad9a517 100644 --- a/src/machines/workflow_task_state_machine.rs +++ b/src/machines/workflow_task_state_machine.rs @@ -74,8 +74,6 @@ impl WFMachinesAdapter for WorkflowTaskMachine { if event_type == EventType::WorkflowTaskStarted && (!cur_event_past_or_at_start || has_next_event) { - // Last event in history is a task started event, so we don't - // want to iterate. return Ok(vec![]); } Ok(vec![MachineResponse::TriggerWFTaskStarted { From 431064e60e650b9456fc277841df6157d297f07c Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 24 Feb 2021 17:11:51 -0800 Subject: [PATCH 50/51] Add higher level & integ test. Need to fix hist ending w/ WECompleted --- src/lib.rs | 46 ++++++++++++++++++++++++++++ src/machines/timer_state_machine.rs | 3 +- tests/integ_tests/simple_wf_tests.rs | 40 ++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index af5be324a..bfaa5d496 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -333,6 +333,8 @@ pub enum CoreError { #[cfg(test)] mod test { use super::*; + use crate::machines::test_help::TestHistoryBuilder; + use crate::protos::temporal::api::enums::v1::EventType; use crate::{ machines::test_help::{build_fake_core, FakeCore}, protos::{ @@ -595,4 +597,48 @@ mod test { )) .unwrap(); } + + #[rstest(hist_batches, case::incremental(&[1, 2]), case::replay(&[2]))] + fn cancel_timer_before_sent_wf_bridge(hist_batches: &[usize]) { + let wfid = "fake_wf_id"; + let run_id = "fake_run_id"; + let cancel_timer_id = "cancel_timer"; + let task_queue = "test-task-queue"; + + let mut t = TestHistoryBuilder::default(); + t.add_by_type(EventType::WorkflowExecutionStarted); + t.add_full_wf_task(); + t.add_workflow_task_scheduled_and_started(); + + let core = build_fake_core(wfid, run_id, &mut t, hist_batches); + + let res = core.poll_task(task_queue).unwrap(); + assert_matches!( + res.get_wf_jobs().as_slice(), + [WfActivationJob { + attributes: Some(wf_activation_job::Attributes::StartWorkflow(_)), + }] + ); + + let task_tok = res.task_token; + core.complete_task(TaskCompletion::ok_from_api_attrs( + vec![ + StartTimerCommandAttributes { + timer_id: cancel_timer_id.to_string(), + ..Default::default() + } + .into(), + CancelTimerCommandAttributes { + timer_id: cancel_timer_id.to_string(), + ..Default::default() + } + .into(), + CompleteWorkflowExecutionCommandAttributes { result: None }.into(), + ], + task_tok, + )) + .unwrap(); + // Really only here to 
appease mock expectations for incremental + core.poll_task(task_queue).unwrap(); + } } diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 4c0a06e99..09717fb7f 100644 --- a/src/machines/timer_state_machine.rs +++ b/src/machines/timer_state_machine.rs @@ -440,8 +440,6 @@ mod test { #[test] fn cancel_before_sent_to_server() { - crate::core_tracing::tracing_init(); - let twd = TestWorkflowDriver::new(|mut cmd_sink: CommandSender| async move { cmd_sink.timer( StartTimerCommandAttributes { @@ -460,6 +458,7 @@ mod test { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); t.add_full_wf_task(); + // TODO: Isn't working when this is completed, which should work t.add_workflow_task_scheduled_and_started(); let mut state_machines = diff --git a/tests/integ_tests/simple_wf_tests.rs b/tests/integ_tests/simple_wf_tests.rs index 7d4ddcc76..a48731261 100644 --- a/tests/integ_tests/simple_wf_tests.rs +++ b/tests/integ_tests/simple_wf_tests.rs @@ -192,3 +192,43 @@ fn timer_cancel_workflow() { )) .unwrap(); } + +#[test] +fn timer_immediate_cancel_workflow() { + let task_q = "timer_cancel_workflow"; + let temporal_server_address = match env::var("TEMPORAL_SERVICE_ADDRESS") { + Ok(addr) => addr, + Err(_) => "http://localhost:7233".to_owned(), + }; + let url = Url::try_from(&*temporal_server_address).unwrap(); + let gateway_opts = ServerGatewayOptions { + namespace: NAMESPACE.to_string(), + identity: "none".to_string(), + worker_binary_id: "".to_string(), + long_poll_timeout: Duration::from_secs(60), + target_url: url, + }; + let core = temporal_sdk_core::init(CoreInitOptions { gateway_opts }).unwrap(); + let mut rng = rand::thread_rng(); + let workflow_id: u32 = rng.gen(); + create_workflow(&core, task_q, &workflow_id.to_string()); + let cancel_timer_id = "cancel_timer"; + let task = core.poll_task(task_q).unwrap(); + core.complete_task(TaskCompletion::ok_from_api_attrs( + vec![ + StartTimerCommandAttributes { + timer_id: cancel_timer_id.to_string(), + ..Default::default() + } + .into(), + CancelTimerCommandAttributes { + timer_id: cancel_timer_id.to_string(), + ..Default::default() + } + .into(), + CompleteWorkflowExecutionCommandAttributes { result: None }.into(), + ], + task.task_token, + )) + .unwrap(); +} From b5b601c72ea86e79410adffc37cc91a9ca5d79c1 Mon Sep 17 00:00:00 2001 From: Spencer Judge Date: Wed, 24 Feb 2021 18:42:48 -0800 Subject: [PATCH 51/51] Make history w/ workflow execution completed work for cancel b4 sent --- src/lib.rs | 7 ++++--- src/machines/timer_state_machine.rs | 10 +++------- src/machines/workflow_machines.rs | 12 +++++++----- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bfaa5d496..09405e935 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -608,7 +608,7 @@ mod test { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); t.add_full_wf_task(); - t.add_workflow_task_scheduled_and_started(); + t.add_workflow_execution_completed(); let core = build_fake_core(wfid, run_id, &mut t, hist_batches); @@ -638,7 +638,8 @@ mod test { task_tok, )) .unwrap(); - // Really only here to appease mock expectations for incremental - core.poll_task(task_queue).unwrap(); + if hist_batches.len() > 1 { + core.poll_task(task_queue).unwrap(); + } } } diff --git a/src/machines/timer_state_machine.rs b/src/machines/timer_state_machine.rs index 09717fb7f..94b4c4415 100644 --- a/src/machines/timer_state_machine.rs +++ 
b/src/machines/timer_state_machine.rs @@ -440,6 +440,7 @@ mod test { #[test] fn cancel_before_sent_to_server() { + crate::core_tracing::tracing_init(); let twd = TestWorkflowDriver::new(|mut cmd_sink: CommandSender| async move { cmd_sink.timer( StartTimerCommandAttributes { @@ -458,8 +459,7 @@ mod test { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); t.add_full_wf_task(); - // TODO: Isn't working when this is completed, which should work - t.add_workflow_task_scheduled_and_started(); + t.add_workflow_execution_completed(); let mut state_machines = WorkflowMachines::new("wfid".to_string(), "runid".to_string(), Box::new(twd)); @@ -467,10 +467,6 @@ mod test { let commands = t .handle_workflow_task_take_cmds(&mut state_machines, None) .unwrap(); - assert_eq!(commands.len(), 1); - assert_eq!( - commands[0].command_type, - CommandType::CompleteWorkflowExecution as i32 - ); + assert_eq!(commands.len(), 0); } } diff --git a/src/machines/workflow_machines.rs b/src/machines/workflow_machines.rs index 280c90d75..362a7745d 100644 --- a/src/machines/workflow_machines.rs +++ b/src/machines/workflow_machines.rs @@ -273,17 +273,19 @@ impl WorkflowMachines { // Feed the machine the event let mut break_later = false; + let canceled_before_sent = self + .machine(command.machine) + .was_cancelled_before_sent_to_server(); - self.submachine_handle_event(command.machine, event, true)?; + if !canceled_before_sent { + self.submachine_handle_event(command.machine, event, true)?; + } // TODO: // * More special handling for version machine - see java // * Commands cancelled this iteration are allowed to not match the event? - if !self - .machine(command.machine) - .was_cancelled_before_sent_to_server() - { + if !canceled_before_sent { break_later = true; }
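The series closes with cancel-before-sent working end to end: a command whose machine was cancelled before it ever reached the server is neither matched against history events nor handled as an outgoing command. The real code keeps such commands in its queue and merely skips processing them; the sketch below condenses that guard to a filter, with illustrative types rather than the crate's own.

    struct PendingCommand {
        name: &'static str,
        cancelled_before_sent: bool,
    }

    /// Keep only commands that still need to reach the server; commands cancelled
    /// before being sent are never matched against history.
    fn prepare_commands(pending: Vec<PendingCommand>) -> Vec<PendingCommand> {
        pending
            .into_iter()
            .filter(|c| !c.cancelled_before_sent)
            .collect()
    }

    fn main() {
        let pending = vec![
            PendingCommand { name: "StartTimer", cancelled_before_sent: true },
            PendingCommand { name: "CancelTimer", cancelled_before_sent: true },
            PendingCommand { name: "CompleteWorkflowExecution", cancelled_before_sent: false },
        ];
        let out = prepare_commands(pending);
        // Only the workflow completion survives to be sent or matched.
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].name, "CompleteWorkflowExecution");
    }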