From ed62066288e582776634fa25401d58aebad9dbaf Mon Sep 17 00:00:00 2001 From: Artur Lebedev Date: Wed, 27 May 2026 18:20:04 +0300 Subject: [PATCH 1/2] Update zip_utils.py - get_file_names fixes Resolves #3047: - adds ability to process files with any unicode characters in names - makes `get_file_names` much faster --- analyzer/windows/lib/common/zip_utils.py | 40 +++++++++++++----------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/analyzer/windows/lib/common/zip_utils.py b/analyzer/windows/lib/common/zip_utils.py index 7ea21702a6a..370f4260e80 100644 --- a/analyzer/windows/lib/common/zip_utils.py +++ b/analyzer/windows/lib/common/zip_utils.py @@ -1,7 +1,6 @@ import hashlib import logging import os -import re import shutil import subprocess from pathlib import Path @@ -14,8 +13,6 @@ log = logging.getLogger(__name__) -# FILE_NAME_REGEX = re.compile("[\s]{2}((?:[a-zA-Z0-9\.\-,_\\\\]+( [a-zA-Z0-9\.\-,_\\\\]+)?)+)\\r") -FILE_NAME_REGEX = re.compile(r"\s{2}((?:[a-zA-Z0-9.\-,_\\]+(?: [a-zA-Z0-9.\-,_\\]+)?)*)\r") FILE_EXT_OF_INTEREST = [ ".bat", ".cmd", @@ -106,38 +103,43 @@ def get_file_names(seven_zip_path, archive_path): """ log.debug([seven_zip_path, "l", archive_path]) p = subprocess.run( - [seven_zip_path, "l", archive_path], stdin=subprocess.DEVNULL, stderr=subprocess.PIPE, stdout=subprocess.PIPE + [seven_zip_path, "l", "-sccUTF-8", archive_path], + stdin=subprocess.DEVNULL, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, ) - stdoutput = p.stdout.decode() + stdoutput = p.stdout.decode("utf-8", errors="replace") stdoutput_lines = stdoutput.split("\n") - in_table = False items_under_header = False file_names = [] for line in stdoutput_lines: + line = line.rstrip("\r") if in_table: - # This is a line in the table (header or footer separators) if "-----" in line: if items_under_header: items_under_header = False else: items_under_header = True continue - - # These are the lines that we care about, since they contain the file names if 
items_under_header: - # Find the end of the line (\r), note the carriage return since 7zip will run on Windows - file_name = re.search(FILE_NAME_REGEX, line) - if file_name: - # The first capture group is the whole file name + returns - # The second capture group is just the file name - file_name = file_name.group(1) - file_names.append(file_name) + if len(line) > 53: + file_name = line[53:].strip() + if file_name: + file_names.append(file_name) else: - # Table Headers - if all(item.lower() in line.lower() for item in ("Date", "Time", "Attr", "Size", "Compressed", "Name")): + if all( + item.lower() in line.lower() + for item in ( + "Date", + "Time", + "Attr", + "Size", + "Compressed", + "Name", + ) + ): in_table = True - return file_names From 109a89758677cb910e4ac726a323083c39fee773 Mon Sep 17 00:00:00 2001 From: Artur Lebedev Date: Wed, 27 May 2026 18:41:55 +0300 Subject: [PATCH 2/2] Update zip_utils.py Apply gemini-code-assist advice --- analyzer/windows/lib/common/zip_utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/analyzer/windows/lib/common/zip_utils.py b/analyzer/windows/lib/common/zip_utils.py index 370f4260e80..a8343d23a11 100644 --- a/analyzer/windows/lib/common/zip_utils.py +++ b/analyzer/windows/lib/common/zip_utils.py @@ -109,22 +109,25 @@ def get_file_names(seven_zip_path, archive_path): stdout=subprocess.PIPE, ) stdoutput = p.stdout.decode("utf-8", errors="replace") - stdoutput_lines = stdoutput.split("\n") + stdoutput_lines = stdoutput.splitlines() in_table = False items_under_header = False file_names = [] + name_col_index = 53 for line in stdoutput_lines: - line = line.rstrip("\r") if in_table: if "-----" in line: if items_under_header: items_under_header = False else: items_under_header = True + last_space = line.rfind(" ") + if last_space != -1: + name_col_index = last_space + 1 continue if items_under_header: - if len(line) > name_col_index: + file_name 
= line[name_col_index:].strip() if file_name: file_names.append(file_name) else: