In [1]:
import os
import sys
import shutil
from definitions import ROOT_DIR

# Make the project's `scripts` directory importable by placing it on sys.path
# Adapted from Taras Alenin's answer on StackOverflow at:
# https://stackoverflow.com/a/55623567
scripts_path = os.path.join(ROOT_DIR, 'scripts')
scripts_already_importable = scripts_path in sys.path
if not scripts_already_importable:
    # Insert at the front so this directory is searched before other entries
    sys.path.insert(0, scripts_path)

# Import custom modules
from text_preprocessor import preprocess  # noqa: E402

In [2]:
# Scratch location for the fixture tree built by this notebook.
# NOTE(review): this path is relative to the working directory rather than to
# ROOT_DIR -- confirm the notebook is always run from its own folder.
test_dir = '../data/test'
preprocessing_test_dir = os.path.join(test_dir, 'preprocessing_test_data')

# Start from a clean slate so leftovers from an earlier run cannot leak into
# this one.
# Adapted from: https://stackoverflow.com/a/13118112
shutil.rmtree(preprocessing_test_dir, ignore_errors=True)
# makedirs (rather than mkdir) also creates `test_dir` when it does not exist
# yet, e.g. on a fresh checkout; it still raises if the target could not be
# removed above, so stale data is never silently reused.
os.makedirs(preprocessing_test_dir)

In [3]:
# Create the `cleaned/` subdirectory of the test tree; the per-class folders
# and the input fixture files are written beneath it.
cleaned_dir = os.path.join(preprocessing_test_dir, 'cleaned')
os.mkdir(cleaned_dir)

In [4]:
# One sub-folder per class label beneath `cleaned/`.
A_dir, notA_dir, U_dir = (
    os.path.join(cleaned_dir, label) for label in ('A', 'notA', 'U')
)

# Create them in the same order they are named above.
for class_dir in (A_dir, notA_dir, U_dir):
    os.mkdir(class_dir)

In [5]:
# Raw fixture texts, one entry per input file. Entries are grouped by class in
# sorted-label order (A, U, notA -- uppercase sorts before lowercase in
# Python's default string ordering), with `num_files` consecutive entries per
# class, so the flat index of a file is `class_index * num_files + file_index`.
# The trailing comment on each entry names the file it is written to.
test_contents = [
    'here is some text that should not be touched',           # A-0.txt
    'Here is some text that should be touched',               # A-1.txt
    'here are s0me 1ntegers',                                 # A-2.txt
    'and sp*c|al cháracter$',                                 # A-3.txt
    """here
    is  some     white space""",                              # U-0.txt
    'and back to untouched text',                             # U-1.txt
    """
    White space
    CAPITALIZATIONS
    **and special chars**!@#$%^&*()_-+=\\|~`
    """,                                                      # U-2.txt
    'white space      and CAPITALIZATIONS',                   # U-3.txt
    'white space and special chars %#^&!@*!',              # notA-0.txt
    'CAPS and spe^#@&@*!*(!cial ch372ars ',                # notA-1.txt
    '',                                                    # notA-2.txt
    ' - -------'                                             # notA-3.txt
]

In [6]:
# Number of fixture files written into each class folder.
num_files = 4

# Sorting the directory listing pins the class order, which in turn decides
# which consecutive run of `test_contents` each class receives.
canonical_class_labels = sorted(os.listdir(cleaned_dir))

# Write every fixture text to "<cleaned_dir>/<class>/<class>-<n>.txt".
for class_idx, label in enumerate(canonical_class_labels):
    offset = class_idx * num_files  # index of this class's first entry
    for file_idx in range(num_files):
        target = os.path.join(cleaned_dir, label, f"{label}-{file_idx}.txt")
        with open(target, 'w') as out:
            out.write(test_contents[offset + file_idx])

In [7]:
# Run the function under test on the fixture tree, passing the sorted class
# labels. The later cells expect the results to appear in a `preprocessed/`
# subdirectory of `preprocessing_test_dir`, mirroring the class folders.
# NOTE(review): `preprocess` comes from the project's `text_preprocessor`
# module; its exact behaviour is not visible from this notebook.
preprocess(preprocessing_test_dir, canonical_class_labels)

In [8]:
# Directory in which the checks below look for the preprocessed files.
expected_output_dir = os.path.join(preprocessing_test_dir, 'preprocessed')
# Expected file contents after preprocessing, in the same flat order as the
# input fixtures (classes in sorted-label order, `num_files` entries each).
# Judging from these fixtures, the preprocessor is expected to lower-case the
# text, drop characters other than ASCII letters and digits, collapse runs of
# whitespace to single spaces, and trim both ends.
expected_output = [
    'here is some text that should not be touched',           # A-0.txt
    'here is some text that should be touched',               # A-1.txt
    'here are s0me 1ntegers',                                 # A-2.txt
    'and spcal chracter',                                     # A-3.txt
    'here is some white space',                               # U-0.txt
    'and back to untouched text',                             # U-1.txt
    'white space capitalizations and special chars',          # U-2.txt
    'white space and capitalizations',                        # U-3.txt
    'white space and special chars',                       # notA-0.txt
    'caps and special ch372ars',                           # notA-1.txt
    '',                                                    # notA-2.txt
    ''                                                     # notA-3.txt
]

In [9]:
# The preprocessing step must have created the output directory. The second
# operand of `assert` is the failure message, so make it a useful one.
assert os.path.isdir(expected_output_dir), (
    f"Expected output directory not found: {expected_output_dir}")

In [10]:
# Compare every preprocessed file against its expected contents.
# Missing directories or files fail immediately. Content mismatches are all
# reported first and then fail the cell once at the end, so a single run shows
# every differing file while a bad result can no longer pass silently.
mismatched_files = []
for i, canonical_class in enumerate(canonical_class_labels):
    class_output_dir = os.path.join(expected_output_dir, canonical_class)
    assert os.path.isdir(class_output_dir), (
        f"Expected output directory not found: {class_output_dir}")
    for j in range(num_files):
        file_name = f"{canonical_class}-{j}.txt"
        file_path = os.path.join(class_output_dir, file_name)
        assert os.path.exists(file_path), (
            f"Expected output file not found: {file_path}")
        with open(file_path, 'r') as f:
            contents = f.read()
        # Same flat indexing that was used when the input fixtures were written
        expected = expected_output[i * num_files + j]
        if contents != expected:
            mismatched_files.append(file_name)
            print(f"""
                NOT EQUAL
                File: {file_name}
                Expected: {expected}
                Got: {contents}""")

assert not mismatched_files, (
    f"Preprocessed output differs from expected for: {mismatched_files}")

Actual orig: 'here is some text that should not be touched'
Actual prep: 'here is some text that should not be touched'
Actual orig: 'here is some text that should be touched'
Actual prep: 'here is some text that should be touched'
Actual orig: 'here are s0me 1ntegers'
Actual prep: 'here are s0me 1ntegers'
Actual orig: 'and spcal chracter'
Actual prep: 'and spcal chracter'
Actual orig: 'here is some white space'
Actual prep: 'here is some white space'
Actual orig: 'and back to untouched text'
Actual prep: 'and back to untouched text'
Actual orig: 'white space capitalizations and special chars'
Actual prep: 'white space capitalizations and special chars'
Actual orig: 'white space and capitalizations'
Actual prep: 'white space and capitalizations'
Actual orig: 'white space and special chars'
Actual prep: 'white space and special chars'
Actual orig: 'caps and special ch372ars'
Actual prep: 'caps and special ch372ars'
Actual orig: ''
Actual prep: ''
Actual orig: ''
Actual prep: ''
