Skip to content

Commit 75a20a3

Browse files
committed
Fixing some tsan issues
- Prometheus civeweb server reads from the heap from worker thread. Moving registry creation before exposer creation to avoid this. - Protecting access to txnid set in the Mock stats classes - on_timer calls unwanted PolicyStatsManager->sendRequest() which is used for sending stats request per table to OVS. This is not needed for stats UTs since we are populating xid and simulating stats messages from ovs to agent. Added a new method update_state() which just updates stats state and genie objects. Note: made this for service stats mgr alone. Can be extended to other stats managers in future. - Setting up logging level needs protection. Avoiding that for now. - Add getProtoVersion in MockConnection used for stats tests - Address flakiness: SecGrp ut timer set to 100s to avoid interfering with on_timer calls. - Moving away from aggressive 10ms timer in Contract and Service stats UT to avoid data race flakiness. Now its kept at 300ms. This is adequate enough to wait for IntFlowManager to create flows and for stats mgr to update state. There are still 7 pending tsan errors in my build vm. This is WIP. Signed-off-by: Gautam Venkataramanan <gautam.chennai@gmail.com> Change-Id: I96a0c233906e0f5af8f6567baf9152f1ec74f3f2
1 parent e7a01bf commit 75a20a3

10 files changed

+57
-46
lines changed

agent-ovs/lib/PrometheusManager.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -848,11 +848,11 @@ void PrometheusManager::start (bool exposeLocalHostOnly, bool exposeEpSvcNan_)
848848
* Note: Port #9612 has been reserved for opflex here:
849849
* https://github.com/prometheus/prometheus/wiki/Default-port-allocations
850850
*/
851+
registry_ptr = make_shared<Registry>();
851852
if (exposeLocalHostOnly)
852853
exposer_ptr = unique_ptr<Exposer>(new Exposer{"127.0.0.1:9612", "/metrics", 1});
853854
else
854855
exposer_ptr = unique_ptr<Exposer>(new Exposer{"9612", "/metrics", 1});
855-
registry_ptr = make_shared<Registry>();
856856

857857
/* Initialize Metric families which can be created during
858858
* init time */

agent-ovs/ovs/PolicyStatsManager.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ void PolicyStatsManager::stop(bool unregister_listener) {
9797
{
9898
std::lock_guard<std::mutex> lock(timer_mutex);
9999
if (timer) {
100+
LOG(DEBUG) << "timer cancelled";
100101
timer->cancel();
101102
}
102103
}

agent-ovs/ovs/ServiceStatsManager.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,8 @@ void ServiceStatsManager::stop() {
5959
PolicyStatsManager::stop();
6060
}
6161

62-
void ServiceStatsManager::on_timer(const error_code& ec) {
63-
if (ec) {
64-
std::lock_guard<std::mutex> lock(timer_mutex);
65-
// shut down the timer when we get a cancellation
66-
LOG(DEBUG) << "Resetting timer, error: " << ec.message();
67-
timer.reset();
68-
return;
69-
}
70-
62+
void ServiceStatsManager::update_state (const error_code& ec)
63+
{
7164
// Request Switch Manager to provide flow entries
7265
{
7366
TableState::cookie_callback_t cb_func;
@@ -139,10 +132,22 @@ void ServiceStatsManager::on_timer(const error_code& ec) {
139132
// have already created the objects.
140133
updateServiceStatsObjects(&svrCountersMap);
141134
}
135+
}
142136

137+
void ServiceStatsManager::on_timer(const error_code& ec) {
138+
if (ec) {
139+
std::lock_guard<std::mutex> lock(timer_mutex);
140+
// shut down the timer when we get a cancellation
141+
LOG(DEBUG) << "Resetting timer, error: " << ec.message();
142+
timer.reset();
143+
return;
144+
}
145+
146+
update_state(ec);
143147
sendRequest(IntFlowManager::STATS_TABLE_ID);
144148
sendRequest(IntFlowManager::SERVICE_NEXTHOP_TABLE_ID);
145149
sendRequest(IntFlowManager::SERVICE_REV_TABLE_ID);
150+
146151
if (!stopping) {
147152
std::lock_guard<std::mutex> lock(timer_mutex);
148153
if (timer) {

agent-ovs/ovs/include/ServiceStatsManager.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class ServiceStatsManager : public PolicyStatsManager {
5959
void stop();
6060

6161
/**
62-
* Timer interval handler. For unit tests only.
62+
* Timer interval handler
6363
*/
6464
void on_timer(const boost::system::error_code& ec) override;
6565

@@ -71,6 +71,11 @@ class ServiceStatsManager : public PolicyStatsManager {
7171
ofpbuf *msg,
7272
struct ofputil_flow_removed* fentry=NULL) override;
7373

74+
/**
75+
* Update stats state
76+
*/
77+
void update_state(const boost::system::error_code& ec);
78+
7479
/** Interface: ObjectListener */
7580
// Note: This is used to delete observer MOs.
7681
// EpToSvc and SvcToEp objects are removed in IntFlowManager

agent-ovs/ovs/test/ContractStatsManager_test.cpp

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class MockContractStatsManager : public ContractStatsManager {
5757
: ContractStatsManager(agent_, idGen_, switchManager_, timer_interval_) {};
5858

5959
void testInjectTxnId (uint32_t txn_id) {
60+
std::lock_guard<mutex> lock(txnMtx);
6061
txns.insert(txn_id);
6162
}
6263
};
@@ -68,7 +69,7 @@ class ContractStatsManagerFixture : public PolicyStatsManagerFixture {
6869
intFlowManager(agent, switchManager, idGen,
6970
ctZoneManager, tunnelEpManager),
7071
contractStatsManager(&agent, idGen,
71-
switchManager, 10),
72+
switchManager, 300),
7273
policyManager(agent.getPolicyManager()) {
7374
switchManager.setMaxFlowTables(IntFlowManager::NUM_FLOW_TABLES);
7475
intFlowManager.start();
@@ -225,12 +226,8 @@ struct ofpbuf *makeFlowStatReplyMessage(MockConnection *pConn,
225226

226227
bool ContractStatsManagerFixture::checkNewFlowMapSize (size_t pol_table_size)
227228
{
228-
// Call on_timer function to process the flow entries received from
229-
// switchManager.
230-
boost::system::error_code ec;
231-
ec = make_error_code(boost::system::errc::success);
232-
contractStatsManager.on_timer(ec);
233-
229+
//on_timer will kick in via agent_io thread. That will update
230+
//stats state to indicate all necessary flows have been initialized
234231
std::lock_guard<std::mutex> lock(contractStatsManager.pstatMtx);
235232
if (contractStatsManager.contractState.newFlowCounterMap.size() == pol_table_size)
236233
return true;

agent-ovs/ovs/test/IntFlowManager_test.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1447,7 +1447,6 @@ BOOST_FIXTURE_TEST_CASE(loadBalancedService_vlan, VlanIntFlowManagerFixture) {
14471447

14481448
void BaseIntFlowManagerFixture::loadBalancedServiceTest() {
14491449
setConnected();
1450-
setLoggingLevel("trace");
14511450
LOG(DEBUG) << "#### Starting LB Service Test ####";
14521451

14531452
intFlowManager.egDomainUpdated(epg0->getURI());

agent-ovs/ovs/test/SecGrpStatsManager_test.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class MockSecGrpStatsManager : public SecGrpStatsManager {
6363
: SecGrpStatsManager(agent_, idGen_, switchManager_, timer_interval_) {};
6464

6565
void testInjectTxnId (uint32_t txn_id) {
66+
std::lock_guard<mutex> lock(txnMtx);
6667
txns.insert(txn_id);
6768
}
6869
};
@@ -72,7 +73,7 @@ class SecGrpStatsManagerFixture : public PolicyStatsManagerFixture {
7273
public:
7374
SecGrpStatsManagerFixture() : PolicyStatsManagerFixture(),
7475
secGrpStatsManager(&agent, idGen,
75-
switchManager, 10) {
76+
switchManager, 100000) {
7677
idGen.initNamespace("l24classifierRule");
7778
idGen.initNamespace("secGroupSet");
7879
idGen.initNamespace("secGroup");

agent-ovs/ovs/test/ServiceStatsManager_test.cpp

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class MockServiceStatsManager : public ServiceStatsManager {
5555
intFlowManager_, timer_interval_) {};
5656

5757
void testInjectTxnId (uint32_t txn_id) {
58+
std::lock_guard<mutex> lock(txnMtx);
5859
txns.insert(txn_id);
5960
}
6061
};
@@ -68,12 +69,11 @@ class ServiceStatsManagerFixture : public PolicyStatsManagerFixture {
6869
pktInHandler(agent, intFlowManager),
6970
serviceStatsManager(&agent, idGen,
7071
switchManager,
71-
intFlowManager, 10) {
72+
intFlowManager, 300) {
7273
createObjects();
7374
switchManager.setMaxFlowTables(IntFlowManager::NUM_FLOW_TABLES);
7475
intFlowManager.start();
7576

76-
setLoggingLevel("trace");
7777
// cloud nodeport tests need veth_host_ac
7878
portmapper.setPort("veth_host_ac", 72);
7979
portmapper.setPort(72, "veth_host_ac");
@@ -285,7 +285,7 @@ ServiceStatsManagerFixture::testFlowAge (PolicyStatsManager *statsManager,
285285
for (auto age = 0; age < PolicyStatsManager::MAX_AGE; age++) {
286286
boost::system::error_code ec;
287287
ec = make_error_code(boost::system::errc::success);
288-
statsManager->on_timer(ec);
288+
serviceStatsManager.update_state(ec);
289289
}
290290

291291
// 2 flows got aged
@@ -303,7 +303,7 @@ ServiceStatsManagerFixture::testFlowAge (PolicyStatsManager *statsManager,
303303

304304
boost::system::error_code ec;
305305
ec = make_error_code(boost::system::errc::success);
306-
statsManager->on_timer(ec);
306+
serviceStatsManager.update_state(ec);
307307

308308
// 2 flows get readded to new map
309309
guard.lock();
@@ -320,7 +320,7 @@ ServiceStatsManagerFixture::testFlowAge (PolicyStatsManager *statsManager,
320320
for (auto age = 0; age < PolicyStatsManager::MAX_AGE; age++) {
321321
boost::system::error_code ec;
322322
ec = make_error_code(boost::system::errc::success);
323-
statsManager->on_timer(ec);
323+
serviceStatsManager.update_state(ec);
324324
}
325325

326326
guard.lock();
@@ -335,7 +335,7 @@ ServiceStatsManagerFixture::testFlowAge (PolicyStatsManager *statsManager,
335335

336336
boost::system::error_code ec;
337337
ec = make_error_code(boost::system::errc::success);
338-
statsManager->on_timer(ec);
338+
serviceStatsManager.update_state(ec);
339339

340340
guard.lock();
341341
BOOST_CHECK_EQUAL(serviceStatsManager.statsState.newFlowCounterMap.size(), 17);
@@ -434,8 +434,8 @@ ServiceStatsManagerFixture::testFlowStatsSvcTgt (MockConnection& portConn,
434434

435435
boost::system::error_code ec;
436436
ec = make_error_code(boost::system::errc::success);
437-
// Call on_timer function to setup flow stat state.
438-
statsManager->on_timer(ec);
437+
// Call update_state() function to setup flow stat state.
438+
serviceStatsManager.update_state(ec);
439439

440440
// create first flow reply message
441441
struct ofpbuf *res_msg = makeFlowStatReplyMessage_2(&portConn,
@@ -454,9 +454,9 @@ ServiceStatsManagerFixture::testFlowStatsSvcTgt (MockConnection& portConn,
454454
LOG(DEBUG) << "1 FlowStatsReplyMessage handling successful";
455455
ofpbuf_delete(res_msg);
456456

457-
// Call on_timer function to process the stats collected
457+
// Call update_state() function to process the stats collected
458458
// and update Genie objects for stats
459-
statsManager->on_timer(ec);
459+
serviceStatsManager.update_state(ec);
460460

461461
// calculate expected packet count and byte count
462462
// that we should have in Genie object
@@ -483,9 +483,9 @@ ServiceStatsManagerFixture::testFlowStatsSvcTgt (MockConnection& portConn,
483483
LOG(DEBUG) << "2 FlowStatsReplyMessage handling successful";
484484
ofpbuf_delete(res_msg);
485485

486-
// Call on_timer function to process the stats collected
486+
// Call update_state() function to process the stats collected
487487
// and update Genie objects for stats
488-
statsManager->on_timer(ec);
488+
serviceStatsManager.update_state(ec);
489489

490490
uint32_t numFlows = entryList.size()/2;
491491
expPkts =
@@ -577,8 +577,8 @@ ServiceStatsManagerFixture::testFlowStatsPodSvc (MockConnection& portConn,
577577

578578
boost::system::error_code ec;
579579
ec = make_error_code(boost::system::errc::success);
580-
// Call on_timer function to setup flow stat state.
581-
statsManager->on_timer(ec);
580+
// Call update_state() function to setup flow stat state.
581+
serviceStatsManager.update_state(ec);
582582

583583
// create first flow reply message
584584
struct ofpbuf *res_msg = makeFlowStatReplyMessage_2(&portConn,
@@ -597,9 +597,9 @@ ServiceStatsManagerFixture::testFlowStatsPodSvc (MockConnection& portConn,
597597
LOG(DEBUG) << "1 FlowStatsReplyMessage handling successful";
598598
ofpbuf_delete(res_msg);
599599

600-
// Call on_timer function to process the stats collected
600+
// Call update_state() function to process the stats collected
601601
// and update Genie objects for stats
602-
statsManager->on_timer(ec);
602+
serviceStatsManager.update_state(ec);
603603

604604
// calculate expected packet count and byte count
605605
// that we should have in Genie object
@@ -623,9 +623,9 @@ ServiceStatsManagerFixture::testFlowStatsPodSvc (MockConnection& portConn,
623623
LOG(DEBUG) << "2 FlowStatsReplyMessage handling successful";
624624
ofpbuf_delete(res_msg);
625625

626-
// Call on_timer function to process the stats collected
626+
// Call update_state() function to process the stats collected
627627
// and update Genie objects for stats
628-
statsManager->on_timer(ec);
628+
serviceStatsManager.update_state(ec);
629629

630630
uint32_t numFlows = entryList.size()/2;
631631
expPkts =
@@ -711,8 +711,8 @@ ServiceStatsManagerFixture::testFlowRemovedSvcTgt (MockConnection& portConn,
711711

712712
boost::system::error_code ec;
713713
ec = make_error_code(boost::system::errc::success);
714-
// Call on_timer function to setup flow stat state.
715-
statsManager->on_timer(ec);
714+
// Call update_state() function to setup flow stat state.
715+
serviceStatsManager.update_state(ec);
716716

717717
// create flow removed message 1
718718
struct ofpbuf *res_msg = makeFlowRemovedMessage_2(&portConn,
@@ -749,9 +749,9 @@ ServiceStatsManagerFixture::testFlowRemovedSvcTgt (MockConnection& portConn,
749749
LOG(DEBUG) << "2 FlowRemovedMessage handling successful";
750750
ofpbuf_delete(res_msg);
751751

752-
// Call on_timer function to process the stats collected
752+
// Call update_state() function to process the stats collected
753753
// and update Genie objects for stats
754-
statsManager->on_timer(ec);
754+
serviceStatsManager.update_state(ec);
755755

756756
// calculate expected packet count and byte count
757757
// that we should have in Genie object
@@ -816,8 +816,8 @@ ServiceStatsManagerFixture::testFlowRemovedPodSvc (MockConnection& portConn,
816816

817817
boost::system::error_code ec;
818818
ec = make_error_code(boost::system::errc::success);
819-
// Call on_timer function to setup flow stat state.
820-
statsManager->on_timer(ec);
819+
// Call update_state() function to setup flow stat state.
820+
serviceStatsManager.update_state(ec);
821821

822822
// create flow removed message 1
823823
struct ofpbuf *res_msg = makeFlowRemovedMessage_2(&portConn,
@@ -854,9 +854,9 @@ ServiceStatsManagerFixture::testFlowRemovedPodSvc (MockConnection& portConn,
854854
LOG(DEBUG) << "2 FlowRemovedMessage handling successful";
855855
ofpbuf_delete(res_msg);
856856

857-
// Call on_timer function to process the stats collected
857+
// Call update_state() function to process the stats collected
858858
// and update Genie objects for stats
859-
statsManager->on_timer(ec);
859+
serviceStatsManager.update_state(ec);
860860

861861
// calculate expected packet count and byte count
862862
// that we should have in Genie object

agent-ovs/ovs/test/TableDropStatsManager_test.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class MockIntTableDropStatsManager : public BaseTableDropStatsManager {
5151
switchManager, timer_interval){};
5252

5353
void testInjectTxnId (uint32_t txn_id) {
54+
std::lock_guard<mutex> lock(txnMtx);
5455
txns.insert(txn_id);
5556
}
5657
};
@@ -64,6 +65,7 @@ class MockAccessTableDropStatsManager : public BaseTableDropStatsManager {
6465
switchManager, timer_interval) {};
6566

6667
void testInjectTxnId (uint32_t txn_id) {
68+
std::lock_guard<mutex> lock(txnMtx);
6769
txns.insert(txn_id);
6870
}
6971
};

agent-ovs/ovs/test/include/PolicyStatsManagerFixture.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class MockConnection : public SwitchConnection {
6666
msg.reset();
6767
return 0;
6868
}
69+
int GetProtocolVersion() { return OFP13_VERSION; }
6970
};
7071

7172
class PolicyStatsManagerFixture : public FlowManagerFixture {

0 commit comments

Comments
 (0)