In [10]:
import re, os

def clean_text(text: str):
    """
    Cleans the input text by removing unwanted characters,
    keeping punctuation marks, and converting everything to lowercase.
    
    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.
    """

    # Convert all characters to lowercase
    text = text.lower()

    # Remove all non-alphanumeric characters except spaces and punctuation
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s.,!?;:'\"-]", "", text)

    # Remove extra spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text

#Setting up access to folder of example files
folder_path = "./data"

#Setting up txt placeholder
raw_text = ""

#listing all files in the folder
file_list = os.listdir(folder_path)

#Setting up individual path for each file, reading contents, cleaning, and output
for file_name in file_list:
    #Path
    file_path = os.path.join(folder_path, file_name)

    #Read
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_text = file.read()
    
    #Cleaning & Output
    cleaned = clean_text(raw_text)
    print(f"\n--- File: {file_name} ---")
    print(f"Cleaned Text:\n{cleaned[:300]}...\n")


--- File: accounting.txt ---
Cleaned Text:
operating lease accounting under asc 842 explained with a full example by abdi ali, senior technical accounting consultant jan 21, 2025 1. operating lease treatment under asc 842 2. operating lease vs. finance lease identification under asc 842 transference of titleownership to the lessee purchase o...


--- File: badminton.txt ---
Cleaned Text:
titans of the games 5m 18s play titans of team usa at the 2024 paris olympic games video paris 2024 titans of team usa at the 2024 paris olympic games best long rallies in badminton at the 2024 paris olympics 7m 8s play best long rallies in badminton at the 2024 paris olympics video badminton best l...


--- File: cooking.txt ---
Cleaned Text:
the joylessness of cooking by helen rosner november 25, 2020 kitchen utensils withering on the kitchen counter. feelings of emptiness are normal in times of stress and uncertainty. but isnt cooking supposed to be a balm?illustration by joana avillez in theory, i