In [1]:
""" --- Step 1: Imports --- """

import json
import re # regex
import os # build file paths that work on any OS

In [5]:
""" --- Step 2: Setup File Paths --- """

# Both locations are relative to the notebook's working directory and are
# assembled with os.path.join so the separator matches the host OS.
input_file, output_file = (
    os.path.join('..', 'vocab', 'dominican-slang-example.txt'),
    os.path.join('output', 'dominican-slang-example.json'),
)

# Echo the resolved paths so a wrong working directory is easy to spot.
print(
    f"Input file set to: {input_file}",
    f"Output file set to: {output_file}",
    sep="\n",
)

Input file set to: ../vocab/dominican-slang-example.txt
Output file set to: output/dominican-slang-example.json


In [None]:
""" --- Step 3: Parse Entry Function + Test One Entry --- """

# Keys whose values are lists written as "a / b" or "a, b" in the raw text.
_LIST_KEYS = frozenset({'syn', 'ant', 'term'})

# Placeholder values meaning "no items" for a list field: empty, hyphen,
# en dash or em dash (editors often auto-replace a typed hyphen).
_EMPTY_LIST_MARKERS = frozenset({'', '-', '\u2013', '\u2014'})

# A definition separator is a line made only of dashes: three or more ASCII
# hyphens, or any run of dashes that contains an en/em dash (what "---" turns
# into when an editor applies typographic replacement).
_SEPARATOR_RE = re.compile(r'-{3,}|[-\u2013\u2014]*[\u2013\u2014][-\u2013\u2014]*')


def parse_entry(entry_text):
    """
    Parse one raw text entry into a structured dictionary.

    The first line must look like ``"<number>. <term>"``. Every following
    ``KEY: value`` line is added to the current definition; a separator line
    (``---`` or its typographic dash equivalent) closes the current definition
    and starts a new one. Lines without a colon are ignored.

    Keys are lower-cased and trailing digits are removed, so ``DEF2`` and
    ``SYN2`` are stored as ``def`` and ``syn``. Values lose their surrounding
    whitespace and double quotes. The ``syn``, ``ant`` and ``term`` keys are
    split on ``/`` and ``,`` into lists; a lone dash or an empty value gives
    an empty list, and empty items are discarded.

    Args:
        entry_text: The raw text of a single entry, lines separated by ``\\n``.

    Returns:
        A dict with ``'id'`` (the number, kept as a string), ``'term'`` and
        ``'definitions'`` (a list with one dict per definition).

    Raises:
        ValueError: If the first line does not match ``"<number>. <term>"``.
    """
    lines = entry_text.strip().split('\n')
    first_line = lines[0].strip()
    match = re.match(r'^(\d+)\.\s(.*)', first_line)
    if not match:
        raise ValueError(f"Invalid entry format: {first_line}")

    entry_dict = {
        'id': match.group(1),
        'term': match.group(2).strip(),
        'definitions': []
    }
    current_definition = {}

    for line in lines[1:]:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        if _SEPARATOR_RE.fullmatch(line):
            # Close the definition collected so far. Without this, the next
            # numbered key (e.g. DEF2 -> 'def') would overwrite the previous
            # definition's value because the digit suffix is stripped.
            if current_definition:
                entry_dict['definitions'].append(current_definition)
                current_definition = {}
            continue

        parts = line.split(':', 1)
        if len(parts) != 2:
            continue  # Not a "KEY: value" line

        # 'def2' -> 'def', 'syn2' -> 'syn', etc.
        key = re.sub(r'\d+$', '', parts[0].strip().lower())
        value = parts[1].strip().strip('"')

        if key not in _LIST_KEYS:
            current_definition[key] = value
        elif value in _EMPTY_LIST_MARKERS:
            current_definition[key] = []
        else:
            # Treat commas like slashes, split, then drop blank items that a
            # trailing or doubled delimiter would otherwise leave behind.
            items = (item.strip() for item in value.replace(',', '/').split('/'))
            current_definition[key] = [item for item in items if item]

    if current_definition:
        entry_dict['definitions'].append(current_definition)

    return entry_dict

# Smoke-test parse_entry on two hand-picked entries before running it on the
# whole file.

# Entry with two definitions separated by '---' and numbered keys (DEF2, SYN2).
test_entry = """10. Agentao / Agentá (Agentado / Agentada)
DEF: Arrogant.
EX: "Después que compraste ese carro tú ta' muy agentao."
GS: "Después que compraste ese carro, tú estás muy arrogante."
EN: "You became very arrogant after you bought that car."
SYN: Arrogante / Atrevido / Atrevida
ANT: Humilde
USAGE: 8/10
---
DEF2: Bold.
SYN2: Atrevido / Atrevida
ANT2: Humilde
USAGE: 8/10"""

# Entry with a single definition whose SYN/ANT are the "none" placeholder
# written without a space after the colon.
test_entry2 = """273. Guagua platanera
DEF: A pick-up truck, typically an old Toyota Tacoma, that goes around neighborhoods selling fruits and vegetables, often announcing their products with a megaphone.
EX: "Si pasa una guagua platanera, párala pa' comprar fruta' pa' hacer una batida."
GS: "Si pasa una guagua platanera, párala para comprar frutas para hacer un batido."
EN: "If a guagua platanera passes by, please stop it so I can buy some fruits for a smoothie."
SYN:-
ANT:-
USAGE: 10/10"""

parsed_test = parse_entry(test_entry)
print(json.dumps(parsed_test, indent=2, ensure_ascii=False))

# The second fixture used to be defined but never parsed; exercise it as well.
parsed_test2 = parse_entry(test_entry2)
print(json.dumps(parsed_test2, indent=2, ensure_ascii=False))

# Inline checks so a regression in parse_entry fails loudly on Run All.
assert parsed_test['id'] == '10'
assert len(parsed_test['definitions']) == 2, "'---' should split the entry into two definitions"
assert parsed_test['definitions'][1]['def'] == 'Bold.'
assert parsed_test2['id'] == '273'
assert len(parsed_test2['definitions']) == 1
assert parsed_test2['definitions'][0]['syn'] == [], "'SYN:-' should parse to an empty list"
assert parsed_test2['definitions'][0]['ant'] == [], "'ANT:-' should parse to an empty list"

In [None]:
""" --- Step 4: Split the File into Individual Entries --- """

def _split_raw_entries(lines):
    """
    Group the lines of the vocabulary file into one text block per entry.

    A line that starts with ``"<number>. "`` opens a new entry. Blank lines
    and lines whose first non-space character is ``#`` are dropped. Every
    other line is appended (stripped) to the entry currently being collected.

    Args:
        lines: An iterable of raw lines, with or without trailing newlines.

    Returns:
        A list of strings, each holding one entry's lines joined by ``\\n``.
    """
    entries = []
    current = []
    for raw_line in lines:
        stripped = raw_line.strip()
        if re.search(r'^\d+\.\s', raw_line):
            # A numbered heading closes the previous entry and starts a new one.
            if current:
                entries.append("\n".join(current))
            current = [stripped]
        elif stripped and not stripped.startswith('#'):
            current.append(stripped)
    if current:
        entries.append("\n".join(current))
    return entries


# 1. Read the input file and split it into raw_entries
with open(input_file, 'r', encoding='utf-8') as f:
    all_lines = f.readlines()

raw_entries = _split_raw_entries(all_lines)

# 2. Loop through each entry and parse it
all_entries_data = []
for entry_text in raw_entries:
    try:
        parsed_data = parse_entry(entry_text)
        if parsed_data:
            all_entries_data.append(parsed_data)
    except Exception as e:
        # Take the first line outside the f-string: a backslash inside an
        # f-string expression is a SyntaxError before Python 3.12, and the
        # previous '\\n' literal never matched a real newline anyway.
        first_line = entry_text.split('\n', 1)[0]
        # Report the reason too, so a skipped entry can be fixed in the source.
        print(f"⚠️ Error parsing entry starting with: {first_line} ({e})")

# 3. Verify the output
print(f"Successfully parsed {len(all_entries_data)} entries.")
print("--- First Parsed Entry ---")
if all_entries_data:
    print(json.dumps(all_entries_data[0], indent=2, ensure_ascii=False))

Successfully parsed 19 entries.
--- First Parsed Entry ---
{
  "id": "270",
  "term": "Guagua",
  "definitions": [
    {
      "def": "Bus for public transportation in DR.",
      "ex": "Si tú quieres ir pa' allá, tú tiene' que coger una guagua.",
      "gs": "Si tú quieres ir para allá tienes que tomar un autobus.",
      "en": "If you want to get there, you will have to take a bus.",
      "syn": [
        "Autobus",
        "Minibus"
      ],
      "ant": [],
      "usage": "10/10"
    }
  ]
}


In [None]:
""" --- Step 5: Write to JSON File --- """

# Make sure the destination folder exists; open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# checkout. The guard skips makedirs when output_file has no directory part.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the final list of dictionaries to a JSON file.
# ensure_ascii=False keeps accented characters readable instead of \uXXXX escapes.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_entries_data, f, indent=2, ensure_ascii=False)

print(f"✅ Success! Data written to {output_file}")