In [24]:
import json
import os
from pathlib import Path

In [2]:
def detect_language(text: str) -> str:
    """
    Classify text by the dominant alphabet among its letters.

    Every character is checked against two groups: Cyrillic (the range
    'А'..'я' plus the Ukrainian-specific letters ЇїІіЄєҐґ) and basic Latin
    (A-Z, a-z). Characters outside both groups are ignored.

    Returns:
        "Ukrainian" if Cyrillic letters outnumber Latin ones,
        "English" if Latin letters outnumber Cyrillic ones,
        "Mixed" if both occur equally often (and at least once),
        "Unknown" if the text contains no letter from either group.
    """
    def script_of(symbol):
        # Cyrillic is tested first, mirroring the priority of the two groups.
        if 'А' <= symbol <= 'я' or symbol in 'ЇїІіЄєҐґ':
            return "cyrillic"
        if 'A' <= symbol <= 'Z' or 'a' <= symbol <= 'z':
            return "latin"
        return None

    # Single pass over the input; tallies are taken from the labels afterwards.
    scripts = [script_of(symbol) for symbol in text]
    cyrillic_total = scripts.count("cyrillic")
    latin_total = scripts.count("latin")

    if cyrillic_total == latin_total:
        # A tie at zero means no recognizable letters were seen at all.
        return "Mixed" if cyrillic_total else "Unknown"
    return "Ukrainian" if cyrillic_total > latin_total else "English"

In [None]:
# Base directory: two levels above the resolved current working directory
# (parents[0] is the parent, parents[1] the grandparent).
# NOTE(review): this depends on where the notebook is launched from —
# presumably two levels below the project root; confirm.
BASE_DIR = Path.cwd().resolve().parents[1]
# Data locations under <BASE_DIR>/data, built with os.path.join (plain strings).
# JSON files listed per category by the conversion cell.
DICTS_DIR = os.path.join(BASE_DIR, 'data', 'dicts')
# Source vacancy texts, read as <category>/<index>.md.
TEXTS_DIR = os.path.join(BASE_DIR, 'data', 'texts')
# Output location for the converted <category>/<index>.txt files.
TEXT_NER_DIR = os.path.join(BASE_DIR, 'data', 'texts_ner')

In [None]:
# Convert vacancy texts from Markdown (.md) to plain text (.txt), keeping only
# the vacancies whose body is detected as English. Each output file holds the
# vacancy title on the first line, followed by the original text.

# Dictionary files (one per vacancy category) that take part in the conversion;
# anything else found in DICTS_DIR is ignored.
VACANCY_DICT_FILES = frozenset({
    "Python.json", "ML.json", "Java.json", "Android.json", "CPP.json",
    "DevOps.json", "Golang.json", "FrontEnd.json", "macOS.json", "Node.json",
    "PHP.json", "dotNET.json",
})

for dict_name in os.listdir(DICTS_DIR):
    if dict_name not in VACANCY_DICT_FILES:
        continue

    # Category name is the file name without its ".json" suffix; it doubles as
    # the sub-directory name under both TEXTS_DIR and TEXT_NER_DIR.
    category = Path(dict_name).stem

    # Load the whole index first so the JSON file is closed before the
    # per-vacancy reads and writes start.
    with open(Path(DICTS_DIR) / dict_name, "r", encoding="utf-8") as dict_file:
        vacancies = json.load(dict_file)

    source_dir = Path(TEXTS_DIR) / category
    target_dir = Path(TEXT_NER_DIR) / category

    for vacancy in vacancies:
        body = (source_dir / f'{vacancy["index"]}.md').read_text(encoding="utf-8")
        titled_text = vacancy["title"] + "\n" + body

        # Language is judged on the body alone; the title is not considered.
        if detect_language(body) != "English":
            continue

        # The category directory is created lazily, so categories without any
        # English vacancy do not leave an empty directory behind.
        target_dir.mkdir(parents=True, exist_ok=True)
        (target_dir / f'{vacancy["index"]}.txt').write_text(titled_text, encoding="utf-8")