Skip to content

Commit

Permalink
[Auto-Techsupport] Issues related to Multiple Cores crashing handled (s…
Browse files Browse the repository at this point in the history
…onic-net#1948)

#### What I did

**Issues seen when multiple cores are crashed in very quick succession:**
1) The **rate_limit_interval** is not honored. Because, i previously was finding out the last created tech-support using the glob pattern `sonic_dump_*tar*`, which  will not include the dumps which are being currently run. These existing dump will not have .tar.gz extension. Thus, modified the `get_ts_dumps` to search based on the TS_ROOT i.e `sonic_dump_*`
2) **show auto-tech support history** is not showing all the created dumps. I've previously used to take the diff of tech support dumps before and after running the invocation and used to assign the diff as the corresponding techsupport for this core. This approach is prone to race condition as we can have multiple dumps in the diff found in the interval. 
Avoided this by parsing the stdout returned by `show techsupport` invocation

#### How to verify it

1) Unit Tests
2) Generate core-dumps in very quick succession. Use the default rate limit interval. Should only see one entry in tech-support history
3) Set global rate limit interval to 0. Generate cores in quick succession. Should see a few entries in the history.
  • Loading branch information
vivekrnv committed Dec 6, 2021
1 parent 656ade1 commit 6de91af
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 42 deletions.
33 changes: 20 additions & 13 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time
import argparse
import syslog
import re
from swsscommon.swsscommon import SonicV2Connector
from utilities_common.auto_techsupport_helper import *

Expand Down Expand Up @@ -54,7 +55,6 @@ def __init__(self, core_name, container_name, db):
self.db = db
self.proc_mp = {}
self.core_ts_map = {}
self.curr_ts_list = []

def handle_core_dump_creation_event(self):
file_path = os.path.join(CORE_DUMP_DIR, self.core_name)
Expand Down Expand Up @@ -93,7 +93,7 @@ def handle_core_dump_creation_event(self):
since_cfg = self.get_since_arg()
new_file = self.invoke_ts_cmd(since_cfg)
if new_file:
self.write_to_state_db(int(time.time()), new_file[0])
self.write_to_state_db(int(time.time()), new_file)

def write_to_state_db(self, timestamp, ts_dump):
name = strip_ts_ext(ts_dump)
Expand All @@ -111,26 +111,33 @@ def get_since_arg(self):
return since_cfg
return SINCE_DEFAULT

def parse_ts_dump_name(self, ts_stdout):
""" Figure out the ts_dump name from the techsupport stdout """
matches = re.findall(TS_PTRN, ts_stdout)
if matches:
return matches[-1]
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
return ""

def invoke_ts_cmd(self, since_cfg):
since_cfg = "'" + since_cfg + "'"
cmd = " ".join(["show", "techsupport", "--since", since_cfg])
rc, _, stderr = subprocess_exec(["show", "techsupport", "--since", since_cfg], env=ENV_VAR)
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
cmd = " ".join(cmd_opts)
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
if not rc:
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr:{}".format(rc, stderr))
new_list = get_ts_dumps(True)
diff = list(set(new_list).difference(set(self.curr_ts_list)))
self.curr_ts_list = new_list
if not diff:
new_dump = self.parse_ts_dump_name(stdout)
if not new_dump:
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
else:
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, diff))
return diff
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
return new_dump

def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
"""Verify both the global and per-proc rate_limit_intervals have passed"""
self.curr_ts_list = get_ts_dumps(True)
if global_cooloff and self.curr_ts_list:
last_ts_dump_creation = os.path.getmtime(self.curr_ts_list[-1])
curr_ts_list = get_ts_dumps(True)
if global_cooloff and curr_ts_list:
last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1])
if time.time() - last_ts_dump_creation < global_cooloff:
msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
Expand Down
3 changes: 1 addition & 2 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -1339,8 +1339,7 @@ main() {
fi

# Invoke the TechSupport Cleanup Hook
setsid $(echo > /tmp/techsupport_cleanup.log;
python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &>> /tmp/techsupport_cleanup.log) &
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &

echo ${TARFILE}

Expand Down
4 changes: 2 additions & 2 deletions scripts/techsupport_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def handle_techsupport_creation_event(dump_name, db):
file_path = os.path.join(TS_DIR, dump_name)
if not verify_recent_file_creation(file_path):
return
_ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN))
_ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN_GLOB))

if db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
msg = "techsupport_cleanup is disabled. No cleanup is performed. current size occupied : {}"
Expand All @@ -40,7 +40,7 @@ def handle_techsupport_creation_event(dump_name, db):
syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes)))
return

removed_files = cleanup_process(max_ts, TS_PTRN, TS_DIR)
removed_files = cleanup_process(max_ts, TS_PTRN_GLOB, TS_DIR)
clean_state_db_entries(removed_files, db)


Expand Down
57 changes: 36 additions & 21 deletions tests/coredump_gen_handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
sys.path.append("scripts")
import coredump_gen_handler as cdump_mod

AUTO_TS_STDOUT="""
Techsupport is running with silent option. This command might take a long time.
The SAI dump is generated to /tmp/saisdkdump/sai_sdk_dump_11_22_2021_11_07_PM
/tmp/saisdkdump
"""

def set_auto_ts_cfg(redis_mock, state="disabled",
rate_limit_interval="0",
Expand Down Expand Up @@ -74,12 +79,13 @@ def test_invoc_ts_state_db_update(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand All @@ -105,12 +111,13 @@ def test_global_rate_limit_interval(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand Down Expand Up @@ -138,12 +145,13 @@ def test_per_container_rate_limit_interval(self):
"orchagent;{};swss".format(int(time.time()))})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
Expand All @@ -167,12 +175,13 @@ def test_invoc_ts_after_rate_limit_interval(self):
"orchagent;{};swss".format(int(time.time()))})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand All @@ -197,12 +206,13 @@ def test_core_dump_with_invalid_container_name(self):
populate_state_db(redis_mock, {})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/core/snmpd.12345.123.core.gz")
Expand All @@ -225,12 +235,13 @@ def test_feature_table_not_set(self):
populate_state_db(redis_mock, {})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/core/python3.12345.123.core.gz")
Expand All @@ -251,10 +262,11 @@ def test_since_argument(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport --since '4 days ago'" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
return 0, "", ""
if "--since '4 days ago'" in cmd_str:
patcher.fs.create_file(ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
elif "date --date='4 days ago'" in cmd_str:
return 0, "", ""
else:
Expand Down Expand Up @@ -284,12 +296,13 @@ def test_masic_core_dump(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand All @@ -315,10 +328,12 @@ def test_invalid_since_argument(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport --since '2 days ago'" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
return 0, "", ""
if "--since '2 days ago'" in cmd_str:
patcher.fs.create_file(ts_dump)
print(AUTO_TS_STDOUT + ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
elif "date --date='whatever'" in cmd_str:
return 1, "", "Invalid Date Format"
else:
Expand Down Expand Up @@ -370,7 +385,7 @@ def mock_cmd(cmd, env):
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
patcher.fs.set_disk_usage(2000, path="/var/core/")
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz", st_size=25)
patcher.fs.create_file("/var/core/lldpmgrd.12345.22.core.gz", st_size=25)
Expand Down
12 changes: 8 additions & 4 deletions utilities_common/auto_techsupport_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
"TIME_BUF", "SINCE_DEFAULT"
"TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB"
] + [ # Methods
"verify_recent_file_creation",
"get_ts_dumps",
Expand All @@ -30,7 +30,9 @@
CORE_DUMP_PTRN = "*.core.gz"

TS_DIR = "/var/dump"
TS_PTRN = "sonic_dump_*.tar*"
TS_ROOT = "sonic_dump_*"
TS_PTRN = "sonic_dump_.*tar.*" # Regex Exp
TS_PTRN_GLOB = "sonic_dump_*tar*" # Glob Exp

# CONFIG DB Attributes
CFG_DB = "CONFIG_DB"
Expand Down Expand Up @@ -78,8 +80,10 @@ def strip_ts_ext(ts_path):


def get_ts_dumps(full_path=False):
""" Get the list of TS dumps in the TS_DIR, sorted by the creation time """
curr_list = glob.glob(os.path.join(TS_DIR, TS_PTRN))
"""
Get the list of TS dumps in the TS_DIR, sorted by the creation time
"""
curr_list = glob.glob(os.path.join(TS_DIR, TS_ROOT))
curr_list.sort(key=os.path.getmtime)
if full_path:
return curr_list
Expand Down

0 comments on commit 6de91af

Please sign in to comment.