From 12e5949d2ae0ec830e46331e6ae12601c086af37 Mon Sep 17 00:00:00 2001
From: wwade
Date: Thu, 27 May 2021 16:34:24 -0700
Subject: [PATCH] utils: fix autoDecode error for specific sequence

Since we're probably going to be seeing either ASCII or UTF-8 input
anyway, bump the detection confidence requirement from 0.5 to 0.8.

In the case of the first test input shown in utils_test.py, it was
coming in at 0.559 for Windows-1254 and just 0.505 for UTF-8, when in
fact, it's UTF-8.

This also makes me question using chardet at all, but it probably won't
hurt at the new confidence threshold.
---
 jobrunner/test/utils_test.py | 18 ++++++++++++++++++
 jobrunner/utils.py           |  4 +++-
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 jobrunner/test/utils_test.py

diff --git a/jobrunner/test/utils_test.py b/jobrunner/test/utils_test.py
new file mode 100644
index 0000000..148a489
--- /dev/null
+++ b/jobrunner/test/utils_test.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# Copyright (c) 2021 Arista Networks, Inc. All rights reserved.
+# Arista Networks, Inc. Confidential and Proprietary.
+
+from __future__ import absolute_import, division, print_function
+
+import pytest
+
+from jobrunner.utils import autoDecode
+
+
+@pytest.mark.parametrize(("value", "encoding"), [
+    (b"Waiting for '\xe2\x9d\xaf|[Pp]db' in session "
+     b"routing-enabled-structure_0_64\n(Pdb++)\n", "utf-8"),
+    (b"hi there", "ascii"),
+])
+def testAutoDecode(value, encoding):
+    assert value.decode(encoding) == autoDecode(value)
diff --git a/jobrunner/utils.py b/jobrunner/utils.py
index d87f778..9e6c9d6 100644
--- a/jobrunner/utils.py
+++ b/jobrunner/utils.py
@@ -351,6 +351,8 @@ def sudoKillProcGroup(pgrp):
 def autoDecode(byteArray):
     detected = chardet.detect(byteArray)
     encoding = detected['encoding']
-    if detected['confidence'] < 0.5:  # very arbitrary
+    if detected['confidence'] < 0.8:  # very arbitrary
+        LOG.debug("char encoding below confidence level 0.8 (%r). "
+                  "Fall back to UTF-8.", detected)
         encoding = 'utf-8'
     return byteArray.decode(encoding)