In [6]:
import json
import re # regex
import os # build file paths that work on any OS

In [None]:
# --- Step 1: Read the File and Confirm Path ---
# Sanity check: confirm the vocabulary file is where we expect it and that it
# decodes cleanly before any parsing is attempted.


def read_preview(path, n_chars=500):
    """Return the first `n_chars` characters of the text file at `path`.

    The file is decoded as 'utf-8-sig' so that a leading byte-order mark (BOM)
    is stripped instead of showing up as an invisible character at the start
    of the preview. Files without a BOM decode exactly as plain UTF-8.

    Args:
        path: location of the text file to read.
        n_chars: maximum number of characters to return.

    Returns:
        The leading text of the file (shorter than `n_chars` if the file is).

    Raises:
        FileNotFoundError: if `path` does not exist.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return f.read(n_chars)


# Build the path outside the `try` so the error handler below can always
# report it, even on a fresh kernel.
current_directory = os.getcwd()
file_path = os.path.join(current_directory, '..', 'vocab', 'domislang.txt')

print(f"Attempting to read from: {file_path}\n")

try:
    # Read only the first 500 characters to test readability.
    content_preview = read_preview(file_path)
except FileNotFoundError:
    print("\033[91m❌ ERROR: File not found.\033[0m")
    print(f"I looked for the file at this path: {file_path}")
    print("\n Check: "
          "\n - File existence"
          "\n - File name + extension"
          "\n - Path")
else:
    print("\033[92m[✓] Success!\033[0m")  # green text
    print("\n--- File Preview (First 500 Characters) ---")
    print(content_preview)

Attempting to read from: /Users/willi.wonkah/_Development/dominican-chatbot/json-conversion/scripts/../vocab/domislang.txt

[92m[✓] Success![0m

--- File Preview (First 500 Characters) ---
﻿# A
1. Abimbao - This term describes someone who has just been beaten and has visible swelling as a result / beaten up.
EX: "Juan y Manuel pelearon, yo no sé por qué, pero Manuel ta abimbao."
GS: "Juan y Manuel pelearon, yo no sé por qué, pero Manuel está mal herido."
EN: "Juan and Manuel had a fight, I don't know why, but Manuel is beaten up."
SYN: Golpeado / Mal herido
ANT:-
Usage: 8 / 10


2. Abombao - Full (When you eat a lot) / Stuffed.
EX: "Me comí un pica pollo y ahora toy abombao."
GS: 


In [32]:
# --- Step 2: Split the File into Individual Entries ---
# check: 650 terms instead of the expected 645.
# Every per-line decision is written to a debug log on disk so that the
# over-count can be traced back to the lines that caused it.

# An entry begins with "<digits>." followed by whitespace at the very start of
# the raw line, e.g. "12. Word - definition".
_ENTRY_START_RE = re.compile(r'^\d+\.\s')


def split_entries(lines):
    """Group raw text lines into one string per numbered vocabulary entry.

    An entry starts at a line matching `_ENTRY_START_RE` and runs until the
    next such line. Blank lines and section headers (lines whose stripped text
    starts with '#') are skipped. Text that appears before the first numbered
    line belongs to no entry: it is noted in the log and ignored rather than
    being turned into a bogus entry that would inflate the entry count.

    Args:
        lines: iterable of str, e.g. the result of `file.readlines()`.

    Returns:
        A tuple `(entries, log)`. `entries` is a list of str, each holding the
        stripped lines of one entry joined by newlines. `log` is a list of str
        debug messages, one "Checking line" message per input line plus a
        follow-up message for each entry start or ignored line.
    """
    entries = []
    log = []
    current = []

    for raw_line in lines:
        text = raw_line.strip()
        log.append(f"Checking line: '{text}'")

        if _ENTRY_START_RE.search(raw_line):
            log.append("  └─> Matched as a new entry start.")
            # Close the previous entry before opening the new one.
            if current:
                entries.append("\n".join(current))
            current = [text]
        elif not text or text.startswith('#'):
            # Blank separator or section header: not part of any entry.
            continue
        elif current:
            current.append(text)
        else:
            log.append("  └─> Ignored: text before the first numbered entry.")

    # Flush the final entry, which has no following start line to close it.
    if current:
        entries.append("\n".join(current))

    return entries, log


# Paths are built outside the `try` so the error handler can always report them.
current_directory = os.getcwd()
file_path = os.path.join(current_directory, '..', 'vocab', 'domislang.txt')
# debug - declare output file
debug_output_file = os.path.join(current_directory, 'output', 'debug_output.txt')

# Reset on every run so re-executing the cell never accumulates stale results.
raw_entries = []
debug_log = []

try:
    # 'utf-8-sig' strips a leading BOM, so the first header line really
    # starts with '#'.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        all_lines = f.readlines()
except FileNotFoundError:
    print(f"\033[91m❌ ERROR: File not found at {file_path}\033[0m")
else:
    raw_entries, debug_log = split_entries(all_lines)

    # Writing the log is best-effort: a failure is reported, not raised.
    try:
        # The output folder may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(debug_output_file), exist_ok=True)
        with open(debug_output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(debug_log))
        print(f"✅ Success! Debug log created at: {debug_output_file}")
    except OSError as e:
        print(f"❌ Error writing debug file: {e}")

✅ Success! Debug log created at: /Users/willi.wonkah/_Development/dominican-chatbot/json-conversion/scripts/output/debug_output.txt
