From f1c6ca368c598e1b56a734b6628669a5e9a89323 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 16:44:17 +0200 Subject: [PATCH 1/7] added test case with umlauts --- .../parser/test_emails/utf8_with_umlauts.eml | 13 +++++++++++++ tests/library/parser/test_parsed_email.py | 13 ++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tests/library/parser/test_emails/utf8_with_umlauts.eml diff --git a/tests/library/parser/test_emails/utf8_with_umlauts.eml b/tests/library/parser/test_emails/utf8_with_umlauts.eml new file mode 100644 index 0000000..a7ecd97 --- /dev/null +++ b/tests/library/parser/test_emails/utf8_with_umlauts.eml @@ -0,0 +1,13 @@ +Message-ID: +Date: Tue, 25 Jul 2023 16:07:11 +0200 +MIME-Version: 1.0 +User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 + Thunderbird/102.13.0 +Content-Language: en-US +To: dmaier@mailbox.org +From: dmaier@mailbox.org +Subject: =?UTF-8?Q?Dies_ist_ein_d=c3=a4mlicher_Test?= +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 8bit + +Dies ist ein dämlicher Test. diff --git a/tests/library/parser/test_parsed_email.py b/tests/library/parser/test_parsed_email.py index 5a5d621..119726c 100644 --- a/tests/library/parser/test_parsed_email.py +++ b/tests/library/parser/test_parsed_email.py @@ -228,4 +228,15 @@ def test_get_reloaded_content_from_html_case_3(self): def url_decode(self): import urllib.parse - self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01")) \ No newline at end of file + self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01")) + + def test_case_uf8_with_umlauts_txt(self): + eml_content = load_test_eml_file('utf8_with_umlauts.eml') + x = ParsedEmail(eml_content=eml_content) + self.assertEqual(x.get_text_content().replace('\n', ' ').strip(), 'Dies ist ein dämlicher Test.') + + + def test_case_uf8_with_umlauts_txt(self): + eml_content = load_test_eml_file('utf8_with_umlauts.eml') + x = ParsedEmail(eml_content=eml_content) + self.assertEqual(x.get_text_content().replace('\n', ' ').strip(), 'Dies ist ein dämlicher Test.') From 6120af9ee0eb99152ef5903100ac6e41eb346917 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 16:48:45 +0200 Subject: [PATCH 2/7] added Unit tests on Windows --- .github/workflows/unit-tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index f1d2dbc..5780018 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -8,13 +8,15 @@ on: jobs: build: - - runs-on: ubuntu-latest + name: Test on ${{ matrix.os }} strategy: fail-fast: false matrix: + os: [ ubuntu-latest, windows-latest ] python-version: [ "3.7", "3.8", "3.9", "3.10" , "3.11" ] + runs-on: ${{ matrix.os }} + steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From 8eec8d714f8a35f69cf46e08dcba60ffbf605de8 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 16:52:51 +0200 Subject: [PATCH 3/7] made Install dependencies work on every OS --- .github/workflows/unit-tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 5780018..4ce6901 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -23,11 +23,13 @@ jobs: uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m pip install -r requirements.txt + - name: Test with pytest run: | pytest From 50dfa3def95217e4dd1e5ff6667477200b4fee53 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 18:19:16 +0200 Subject: [PATCH 4/7] set default encoding to utf-8 when reading files --- eml_analyzer/cli_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eml_analyzer/cli_script.py b/eml_analyzer/cli_script.py index 18916a1..7a56bcc 100644 --- a/eml_analyzer/cli_script.py +++ b/eml_analyzer/cli_script.py @@ -10,7 +10,7 @@ def main(): argument_parser = argparse.ArgumentParser(prog='emlAnalyzer', description='A CLI script to analyze an email in the EML format for viewing headers, extracting attachments, etc.') - argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r'), nargs='?', default=sys.stdin) + argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r', encoding='utf-8'), nargs='?', default=sys.stdin) argument_parser.add_argument('--header', action='store_true', default=False, help="Shows the headers") argument_parser.add_argument('-x', '--tracking', action='store_true', default=False, help="Shows content which is reloaded from external resources in the HTML part") argument_parser.add_argument('-a', '--attachments', action='store_true', default=False, help="Lists attachments") From 1d49d0d0e7f4965ddd727950b6f445d3e7454749 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 19:03:20 +0200 Subject: [PATCH 5/7] do not decode if decoding is not needed --- eml_analyzer/library/parser/parsed_email.py | 48 ++++++++++++++++----- tests/library/parser/test_parsed_email.py | 2 +- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/eml_analyzer/library/parser/parsed_email.py b/eml_analyzer/library/parser/parsed_email.py index 7d6ec9e..5691fb9 100644 --- a/eml_analyzer/library/parser/parsed_email.py +++ b/eml_analyzer/library/parser/parsed_email.py @@ -86,6 +86,10 @@ def _get_first_email_payload_with_matching_type(message: email.message.Message, @staticmethod def _get_decoded_payload_from_message(message: email.message.Message) -> None or str: + transfer_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding') + if transfer_encoding in {'7bit', '8bit', 'binary'}: + return message.get_payload(decode=False) + payload_in_bytes = message.get_payload(decode=True) list_of_possible_encodings = ParsedEmail._create_list_of_possible_encodings(message=message) @@ -93,7 +97,8 @@ def _get_decoded_payload_from_message(message: email.message.Message) -> None or for encoding_format in list_of_possible_encodings: try: return payload_in_bytes.decode(encoding_format) - except ValueError: + except ValueError as error: + print('Error: ' + str(error)) continue raise PayloadDecodingException('Payload could not be decoded') @@ -102,23 +107,44 @@ def _create_list_of_possible_encodings(message: email.message.Message) -> list: """ creates a list of the most possible encodings of a payload """ list_of_possible_encodings = list() + header_values = ParsedEmail._header_lookup(message=message, key='content-type') + # at first add the encodings mentioned in the object header - for k, v in message.items(): - k = str(k).lower() - v = str(v).lower() - if k == 'content-type': - entries = v.split(';') - for entry in entries: - entry = entry.strip() - if entry.startswith('charset='): - encoding = entry.replace('charset=', '').replace('"', '') - list_of_possible_encodings.append(encoding) + for v in header_values: + entries = v.split(';') + for entry in entries: + entry = entry.strip() + if entry.startswith('charset='): + encoding = entry.replace('charset=', '').replace('"', '') + list_of_possible_encodings.append(encoding) for x in ['utf-8', 'windows-1251', 'iso-8859-1', 'us-ascii', 'iso-8859-15']: if x not in list_of_possible_encodings: list_of_possible_encodings.append(x) return list_of_possible_encodings + @staticmethod + def _payload_needs_decoding(message: email.message.Message) -> bool: + transfer_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding') + if transfer_encoding is None: + return True + return transfer_encoding not in {'7bit', '8bit', 'binary'} + + @staticmethod + def _header_lookup_first_element(message: email.message.Message, key: str) -> str or None: + for header_key, value in message.items(): + if str(header_key).lower() == key: + return str(value).lower() + return None + + @staticmethod + def _header_lookup(message: email.message.Message, key: str) -> [str]: + values = list() + for header_key, value in message.items(): + if str(header_key).lower() == key: + values.append(str(value).lower()) + return values + def get_attachments(self) -> List[Attachment]: return_list = list() counter = 0 diff --git a/tests/library/parser/test_parsed_email.py b/tests/library/parser/test_parsed_email.py index 119726c..2c2aa52 100644 --- a/tests/library/parser/test_parsed_email.py +++ b/tests/library/parser/test_parsed_email.py @@ -8,7 +8,7 @@ def load_test_eml_file(test_file) -> str: current_directory_of_the_script = os.path.dirname(__file__) test_emails = os.path.join(current_directory_of_the_script, 'test_emails') path_to_test_file = os.path.join(test_emails, test_file) - with open(path_to_test_file, mode='r') as input_file: + with open(path_to_test_file, mode='r', encoding='utf-8') as input_file: return input_file.read() From 7fe068f7de9258c37832b10dcb4d2372f096a821 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 20:45:45 +0200 Subject: [PATCH 6/7] decode values of header fields if they are encoded --- eml_analyzer/library/parser/parsed_email.py | 3 ++- .../library/parser/printable_filename.py | 25 +++++++++++++++++-- tests/library/parser/test_parsed_email.py | 12 ++++++--- .../library/parser/test_printable_filename.py | 7 ++++-- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/eml_analyzer/library/parser/parsed_email.py b/eml_analyzer/library/parser/parsed_email.py index 5691fb9..4b57aa4 100644 --- a/eml_analyzer/library/parser/parsed_email.py +++ b/eml_analyzer/library/parser/parsed_email.py @@ -7,6 +7,7 @@ from typing import NamedTuple, List, Tuple, Set from eml_analyzer.library.parser.attachment import Attachment +from eml_analyzer.library.parser.printable_filename import decode_ASCII_encoded_string from eml_analyzer.library.parser.structure_item import StructureItem @@ -48,7 +49,7 @@ def _add_error_messages(self, error_message: str) -> None: def get_header(self) -> List[Tuple[str, any]]: """ returns list of key-value pairs of header entries """ - return self._parsed_email.items() + return [(key, decode_ASCII_encoded_string(value)) for key, value in self._parsed_email.items()] def get_structure(self) -> StructureItem: return StructureItem(message=self._parsed_email) diff --git a/eml_analyzer/library/parser/printable_filename.py b/eml_analyzer/library/parser/printable_filename.py index 25c4df1..afdf731 100644 --- a/eml_analyzer/library/parser/printable_filename.py +++ b/eml_analyzer/library/parser/printable_filename.py @@ -2,6 +2,7 @@ import binascii import re import email.message +import quopri def get_printable_filename_if_existent(message: email.message.Message) -> str or None: @@ -12,7 +13,7 @@ def get_printable_filename_if_existent(message: email.message.Message) -> str or def _make_string_printable(original_string: str) -> str: - original_string = _decode_ASCII_encoded_string(string=original_string) + original_string = decode_ASCII_encoded_string(string=original_string) additional_allowed_chars = {'_', '.', '(', ')', '-', ' '} clean_name = '' @@ -24,7 +25,13 @@ def _make_string_printable(original_string: str) -> str: return clean_name -def _decode_ASCII_encoded_string(string: str) -> str: +def decode_ASCII_encoded_string(string: str) -> str: + string = _decode_ASCII_encoded_string_baseX(string=string) + string = _decode_ASCII_encoded_string_quoted_printable_string(string=string) + return string + + +def _decode_ASCII_encoded_string_baseX(string: str) -> str: """ decodes ASCII strings which are encoded like: name := "=?UTF-8?B?" + base64_encode(string) + "?=" """ pattern = re.compile(r'=\?(.+?)\?B\?(.+?)\?=', re.IGNORECASE) for match in list(re.finditer(pattern=pattern, string=string)): @@ -33,3 +40,17 @@ def _decode_ASCII_encoded_string(string: str) -> str: except binascii.Error: pass return string + + +def _decode_ASCII_encoded_string_quoted_printable_string(string: str) -> str: + pattern = re.compile(r'=\?(.+?)\?Q\?(.+?)\?=', re.IGNORECASE) + for match in list(re.finditer(pattern=pattern, string=string)): + try: + encoding = match.group(1) + encoded_string = match.group(2) + decoded_string = quopri.decodestring(encoded_string) + replacement = decoded_string.decode(encoding) + string = string.replace(match.group(0), replacement) + except binascii.Error: + pass + return string diff --git a/tests/library/parser/test_parsed_email.py b/tests/library/parser/test_parsed_email.py index 2c2aa52..76e2298 100644 --- a/tests/library/parser/test_parsed_email.py +++ b/tests/library/parser/test_parsed_email.py @@ -32,7 +32,7 @@ def test_case_1_header_subject(self): header = x.get_header() for key, value in header: if key == 'Subject': - self.assertIn(value, 'UnitTest Subject =?UTF-8?B?TcO8bmNoZW4s?=') + self.assertEqual(value, 'UnitTest Subject München,') return self.fail(msg="header subject not found") @@ -235,8 +235,12 @@ def test_case_uf8_with_umlauts_txt(self): x = ParsedEmail(eml_content=eml_content) self.assertEqual(x.get_text_content().replace('\n', ' ').strip(), 'Dies ist ein dämlicher Test.') - - def test_case_uf8_with_umlauts_txt(self): + def test_case_uf8_with_umlauts_header(self): eml_content = load_test_eml_file('utf8_with_umlauts.eml') x = ParsedEmail(eml_content=eml_content) - self.assertEqual(x.get_text_content().replace('\n', ' ').strip(), 'Dies ist ein dämlicher Test.') + header = x.get_header() + for key, value in header: + if key == 'Subject': + self.assertEqual(value, 'Dies_ist_ein_dämlicher_Test') + return + self.fail(msg="header subject not found") diff --git a/tests/library/parser/test_printable_filename.py b/tests/library/parser/test_printable_filename.py index 9855c19..3dc1cdf 100644 --- a/tests/library/parser/test_printable_filename.py +++ b/tests/library/parser/test_printable_filename.py @@ -1,6 +1,6 @@ import unittest -from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, _decode_ASCII_encoded_string +from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, decode_ASCII_encoded_string class TestPrintableFilename(unittest.TestCase): @@ -12,9 +12,10 @@ def test_decode_ASCII_encoded_string(self): ('=?UTF-8?B?4o6Y7Z+/?=', '⎘퟿'), ('=?utf-8?b?4o6Y7Z+/?=', '⎘퟿'), ('=?utf-16?b?SABlAGwAbABvAFcAbwByAGwAZAA=?=', 'HelloWorld'), + ('=?UTF-8?Q?=c3=a4?=', 'ä'), ] for value, expected in test_cases: - result = _decode_ASCII_encoded_string(string=value) + result = decode_ASCII_encoded_string(string=value) self.assertEqual(result, expected) def test_make_string_printable(self): @@ -24,6 +25,7 @@ def test_make_string_printable(self): ('Hello World', 'Hello World'), ('=?UTF-8?B?7Z+/?=', ''), # character is not printable ('=?UTF-8?B?4o6Y?=', '_'), # character is printable + ('=?UTF-8?Q?=c3=a4?=', 'ä'), # character is printable ] for value, expected in test_cases: result = _make_string_printable(original_string=value) @@ -36,6 +38,7 @@ def test_get_printable_filename_if_existent(self): ('Hello World', 'Hello World'), ('=?UTF-8?B?7Z+/?=', ''), # character is not printable ('=?UTF-8?B?4o6Y?=', '_'), # character is printable + ('=?UTF-8?Q?=c3=a4?=', 'ä'), # character is printable (None, None), ] From 247cb1f4592fee5eb929a8ce4eda84ed01f711b7 Mon Sep 17 00:00:00 2001 From: Florian Wahl <41266220+wahlflo@users.noreply.github.com> Date: Fri, 4 Aug 2023 20:46:50 +0200 Subject: [PATCH 7/7] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 258b80f..4413f76 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setuptools.setup( name="eml-analyzer", - version="2.0.3", + version="3.0.0", author="Florian Wahl", author_email="florian.wahl.developer@gmail.com", description="A cli script to analyze an E-Mail in the eml format for viewing the header, extracting attachments, etc.",