In [1]:
import os
# from gliner2 import GLiNER2
import json
from tqdm import tqdm
import re

In [5]:
# --- Notebook configuration -------------------------------------------------
# Sub-folders of DATA_DIR that are read (and mirrored under SAVE_DIR).
FOLDER_NAMES = ["Python", "ML"]

# Input: raw texts, one sub-folder per entry of FOLDER_NAMES.
DATA_DIR = "data/texts_ner"
# Output: root directory the cleaned texts are written to.
SAVE_DIR = "data/cleaned_texts_ner"

# Path to the annotation schema JSON file.
SCHEMA_PATH = "data/annotated/schema.json"

In [30]:
def clean_markdown(text: str) -> str:
    """
    Strip markdown-like markup from ``text`` for NER preprocessing.

    Line structure is preserved: markup is removed in place, every line is
    trimmed, runs of blank lines are collapsed to a single blank line and the
    result is stripped of leading/trailing whitespace.

    Removed markup:
      * header markers (up to six leading ``#``),
      * links ``[text](url)`` (only ``text`` is kept),
      * list markers at line start (``*``, ``-``, ``•``, ``—``),
      * emphasis markers (``***``, ``**bold**``, ``*italic*``, ``_italic_``).

    Underscores inside or directly attached to words (``train_test_split``,
    ``__init__``) are treated as content, not emphasis, and are left intact.

    Args:
        text: Raw text that may contain markdown markup.

    Returns:
        The cleaned text with its newlines preserved.
    """
    # NOTE: ``[^\S\r\n]`` means "whitespace that is not a line break". Using it
    # instead of ``\s`` keeps the line-anchored patterns below from swallowing
    # neighbouring blank lines.

    # 1. Remove markdown header markers (#, ##, ... ######) at line start.
    text = re.sub(r'^[^\S\r\n]*#{1,6}[^\S\r\n]*', '', text, flags=re.MULTILINE)

    # 2. Replace markdown links [text](url) with text. Done before emphasis
    #    handling so underscores/asterisks inside URLs cannot pair up with
    #    markers in the surrounding prose.
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

    # 3. Remove list markers at line start (*, -, •, —). Done before emphasis
    #    handling: otherwise the bullet "*" of "* Use *pandas*" is paired with
    #    the opening "*" of the italic span and a stray "*" is left behind.
    #    The marker must be followed by horizontal whitespace or end of line.
    text = re.sub(r'^[\t ]*[*\-•—](?:[^\S\r\n]+|$)', '', text, flags=re.MULTILINE)

    # 4. Remove bold/italic markers.
    text = re.sub(r'\*{3,}', '', text)                 # *** or ****
    text = re.sub(r'\*{2}([^*]+)\*{2}', r'\1', text)   # **bold**
    text = re.sub(r'\*([^*]+)\*', r'\1', text)         # *italic*
    # _italic_ only when the underscores are not attached to a word character,
    # so snake_case identifiers and dunder names survive unchanged.
    text = re.sub(r'(?<!\w)_([^_]+)_(?!\w)', r'\1', text)

    # 5. Strip leading/trailing whitespace on each line.
    lines = [line.strip() for line in text.splitlines()]

    # 6. Collapse multiple blank lines into a single blank line. After step 5
    #    blank lines are exactly empty, so 3+ consecutive newlines in the
    #    joined text correspond to 2+ consecutive blank lines.
    return re.sub(r'\n{3,}', '\n\n', "\n".join(lines)).strip()

In [43]:
def clean_markdown_for_ner(markdown_text: str) -> str:
    """
    Flatten markdown-formatted text into a single line of plain text for
    Named Entity Recognition (NER) training.

    Removes link markup (keeping the link text), runs of two or more
    asterisks, header markers, horizontal rules made of dashes, list markers
    at line start and pipe characters, then collapses every run of whitespace
    (including newlines and tabs) into one space.

    Single-asterisk / underscore emphasis is intentionally left untouched.

    Args:
        markdown_text: The raw text containing markdown features.

    Returns:
        The cleaned text as one continuous, single-spaced string.
    """
    text = markdown_text

    # 1. Link markup: replace the whole [Text](URL) structure with 'Text'.
    #    Must run before any other bracket/asterisk handling.
    text = re.sub(r'\[([^\]]+)\]\(([^)]*)\)', r'\1', text)

    # 2. Bold markers and asterisk separators: drop any run of two or more
    #    asterisks in one pass (covers **, ***, ****, ...), so odd-length runs
    #    do not leave a stray '*' behind.
    text = re.sub(r'\*{2,}', '', text)

    # 3. Header markers: leading '#' characters and the whitespace after them.
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # 4a. Horizontal rules made of three or more dashes. Handled before list
    #     markers so the hyphen-bullet rule never sees them.
    text = re.sub(r'^-{3,}\s*$', '', text, flags=re.MULTILINE)

    # 4b. List markers at line start, with optional indentation. The bullet
    #     character '•' is always markup; '*', '-' and '—' only count as list
    #     markers when followed by whitespace, so a line starting with
    #     '*italic*' or '-5' is not truncated.
    text = re.sub(r'^\s*(?:•\s*|[*\-—]\s+)', '', text, flags=re.MULTILINE)

    # 5. Pipes (e.g. table structure). Parentheses/brackets are kept because
    #    they may contain entity content.
    text = text.replace('|', '')

    # 6. Whitespace normalisation: split on any whitespace run and re-join
    #    with single spaces. This trims both ends and also normalises lone
    #    tabs and other single whitespace characters.
    return ' '.join(text.split())

In [44]:
# Load every .txt file from each configured folder and clean it.
# Each entry of `texts` is {'filename': '<folder>/<file>', 'text': <cleaned text>}.
texts = []
for folder in FOLDER_NAMES:
    folder_path = os.path.join(DATA_DIR, folder)
    # os.listdir returns entries in arbitrary order; sort so that the order of
    # `texts` (and therefore texts[0]) is the same on every run and platform.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, file)
        with open(file_path, encoding='utf-8') as f:
            raw_text = f.read()
        cleaned = clean_markdown(raw_text)
        texts.append({'filename': f"{folder}/{file}", 'text': cleaned})
print(len(texts))

151


In [45]:
# Spot-check the cleaner: print the cleaned text of the first loaded entry.
print(texts[0]["text"])

Middle/Senior Full-Stack Engineer — SL
Faria is a forward-thinking company that consistently delivers new features
and is passionate about staying ahead of the competition. Every day is
different, and you will be challenged to think creatively and innovate within
a multi-disciplined team of talented people. We’re a great team to work with,
seriously committed to doing our best work, and we value individuals who can
work well as part of a team.

We are seeking a Middle/Senior Full-Stack Engineer to join the development
team of the SpotLight application, where you will develop features across the
full SpotLight technology stack, working on both data platform integration and
user interface components under the guidance of the Lead Engineer.

Key Responsibilities

Implement dashboard features requiring both back-end data processing and front-end visualization.
Develop API endpoints for educational data analytics functionality
Support data extraction and transformation processes
Collaborate

In [33]:
# Create the output root and one sub-folder per configured input folder.
# Driven by FOLDER_NAMES so the directory layout cannot drift from the config.
os.makedirs(SAVE_DIR, exist_ok=True)
for folder in FOLDER_NAMES:
    os.makedirs(os.path.join(SAVE_DIR, folder), exist_ok=True)

In [34]:
# Save cleaned texts, mirroring the '<folder>/<file>' layout under SAVE_DIR.
for cleaned_text in texts:
    out_path = os.path.join(SAVE_DIR, cleaned_text['filename'])
    # 'filename' contains a sub-folder; create it here so this cell does not
    # depend on another cell having prepared the directory tree.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text['text'])