In [40]:
import os
from gliner2 import GLiNER2
import json
from tqdm import tqdm
import re

In [68]:
# Root folder holding one sub-folder of .txt files per entry in FOLDER_NAMES.
DATA_DIR = 'data/texts_ner'
# JSON file that maps each entity label to its natural-language description.
SCHEMA_PATH = 'data/annotated/schema.json'
# Sub-folders of DATA_DIR to process; the same names are recreated under SAVE_DIR.
FOLDER_NAMES = ['Python', 'ML']
# Destination root for the generated annotation JSON files.
SAVE_DIR = 'data/another_annotated'

In [58]:
def clean_text(text):
    """
    Strip decoration characters from text and tidy its whitespace.

    Removes every '*', '#', and black-circle bullet (U+25CF) character,
    collapses runs of spaces/tabs into a single space, limits consecutive
    newlines to at most two, and trims leading/trailing whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # The bullet is spelled as an escape so the pattern survives any
    # re-encoding of this file; a mis-decoded literal inside the character
    # class would silently delete unrelated letters (e.g. accented vowels).
    text = re.sub(r'[*#\u25cf]', '', text)
    # Normalize multiple spaces/tabs to single space
    text = re.sub(r'[ \t]+', ' ', text)
    # Normalize multiple newlines to max two newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Strip leading/trailing whitespace
    return text.strip()


# Read every .txt file from each configured folder and store its cleaned text
# together with a '<folder>/<file>' identifier.
texts = []
for folder in FOLDER_NAMES:
    folder_path = os.path.join(DATA_DIR, folder)
    # os.listdir() returns entries in arbitrary, platform-dependent order;
    # sorting keeps `texts` (and everything derived from it) reproducible.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.txt'):
            file_path = os.path.join(folder_path, file)
            with open(file_path, encoding='utf-8') as f:
                raw_text = f.read()
            cleaned = clean_text(raw_text)
            # The '/' separator is relied on later when the folder is split back out.
            texts.append({'filename': f"{folder}/{file}", 'text': cleaned})
print(len(texts))

151


In [59]:
# Read the label schema from disk and echo it as the cell's output.
with open(SCHEMA_PATH, "r", encoding="utf-8") as schema_file:
    labels = json.load(schema_file)
labels

{'SKILL_HARD': "Specific technical tools, programming languages, frameworks, or methodologies. Examples: 'Python', 'React.js', 'Docker', 'machine learning', 'REST API', 'CI/CD'",
 'SKILL_SOFT': "Personal, communication, or team-related abilities that describe behavioral or interpersonal skills. Examples: 'problem-solving', 'team player', 'attention to detail', 'leadership'",
 'ENGLISH_LEVEL': "Any explicit or implied mention of English proficiency or fluency level. Examples: 'Upper-Intermediate', 'fluent English', 'B2 level', 'advanced English communication'",
 'DEGREE': "Formal education requirements or mentions of degree type or study field. Examples: 'Bachelor’s degree', 'Master’s in Computer Science', 'PhD in Engineering'",
 'EXPERIENCE_LEVEL': "Seniority or professional rank associated with the role. Examples: 'Junior', 'Middle', 'Senior', 'Lead', 'Intern', 'Principal Engineer'",
 'EXPERIENCE_YEARS': "Duration or number of years of experience required or mentioned. Examples: '

In [9]:
# Load the pretrained GLiNER2 checkpoint; as the log below shows, the first call
# fetches the config, tokenizer and weight files.
model = GLiNER2.from_pretrained("fastino/gliner2-base-v1")

config.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/823 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

🧠  Model Configuration
Encoder model      : microsoft/deberta-v3-base
Counting layer     : count_lstm_v2
Token pooling      : first


model.safetensors:   0%|          | 0.00/834M [00:00<?, ?B/s]

In [62]:
def assign_offsets(text, entities):
    """
    Locate every occurrence of each entity mention in ``text``.

    Args:
        text: The document to search.
        entities: Iterable of dicts with a 'text' key (the mention) and a
            'label' key.

    Returns:
        A list of (start, end, label) tuples sorted by start offset.
        Labels are upper-cased. Matching is case-insensitive and limited to
        whole tokens: a mention is not matched when it is directly preceded
        or followed by a word character. Each exact (start, end) span is
        reported once, with the label of the first entity that produced it;
        spans that merely overlap are all kept.
    """
    spans = []
    seen_spans = set()

    for ent in entities:
        ent_text = ent['text'].strip()
        label = ent['label'].upper()
        if not ent_text:
            continue

        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match after 'C++' or before '.NET'. For mentions
        # that start and end with a word character this is equivalent to \b.
        pattern = r'(?<!\w)' + re.escape(ent_text) + r'(?!\w)'

        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            span = (match.start(), match.end())
            if span not in seen_spans:
                seen_spans.add(span)
                spans.append((match.start(), match.end(), label))

    # Sort spans by start position (stable, so ties keep discovery order)
    spans = sorted(spans, key=lambda x: x[0])
    return spans


def dict_to_entity_list(entities_dict):
    """
    Flatten a {label: [mention, ...]} mapping into a list of
    {"text": mention, "label": label} dicts, in the mapping's iteration order.
    """
    return [
        {"text": mention, "label": label}
        for label, mentions in entities_dict.items()
        for mention in mentions
    ]

# Run the model over every document and keep the located spans next to the
# source text they were found in.
annotations = []

for doc in tqdm(texts, desc='Extracting entities'):
    doc_text = doc["text"]
    prediction = model.extract_entities(doc_text, labels)
    # Use the mapping under 'entities' when present, otherwise the result itself.
    label_to_mentions = prediction.get('entities', prediction)
    mentions = dict_to_entity_list(label_to_mentions)
    located_spans = assign_offsets(doc_text, mentions)
    annotations.append({
        "filename": doc["filename"],
        "text": doc_text,
        "entities": located_spans,
    })

Extracting entities: 100%|██████████| 151/151 [16:03<00:00,  6.38s/it]


In [66]:
# Spot-check the record layout produced by the extraction loop.
annotations[0].keys()

dict_keys(['filename', 'text', 'entities'])

In [None]:
# Dump each annotation record (filename, text, entities) as-is to
# SAVE_DIR/<folder>/<name>.json.
# NOTE(review): the export cell that follows writes to the same output paths
# with a different JSON layout, so running both leaves only that later layout
# on disk. Confirm whether this raw dump is still needed.
for folder in FOLDER_NAMES:
    output_folder = os.path.join(SAVE_DIR, folder)
    os.makedirs(output_folder, exist_ok=True)

for ann in annotations:
    # Extract folder from filename, e.g. 'Python/file.txt' → 'Python'
    folder = ann['filename'].split('/')[0]
    output_folder = os.path.join(SAVE_DIR, folder)
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite a '.txt' that occurs earlier in the file name.
    stem = os.path.splitext(os.path.basename(ann['filename']))[0]
    filename_json = stem + '.json'
    output_path = os.path.join(output_folder, filename_json)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(ann, f, ensure_ascii=False, indent=2)


In [70]:
# Export one JSON file per document to SAVE_DIR/<folder>/<name>.json.
# Each file holds the full label list under "classes" and a single
# [text, {"entities": [[start, end, label], ...]}] pair under "annotations".
# NOTE(review): presumably this is the layout the downstream annotation/training
# tool expects — confirm against its documentation.
for folder in FOLDER_NAMES:
    os.makedirs(os.path.join(SAVE_DIR, folder), exist_ok=True)

for ann in annotations:
    # 'Python/file.txt' -> 'Python'; filenames are built with '/' at load time.
    folder = ann['filename'].split('/')[0]
    output_folder = os.path.join(SAVE_DIR, folder)
    # splitext swaps only the trailing extension; str.replace would also
    # rewrite a '.txt' that occurs earlier in the file name.
    stem = os.path.splitext(os.path.basename(ann['filename']))[0]
    filename_json = stem + '.json'
    output_path = os.path.join(output_folder, filename_json)

    data_to_save = {
        "classes": list(labels.keys()),
        "annotations": [
            [
                ann["text"],
                {"entities": [[start, end, label] for start, end, label in ann["entities"]]}
            ]
        ]
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data_to_save, f, ensure_ascii=False, indent=2)
