In [13]:
import os
from gliner2 import GLiNER2
import json
from tqdm import tqdm
import re
from pathlib import Path

In [14]:
# Project root: two directory levels above the resolved current working directory.
BASE_DIR = Path.cwd().resolve().parents[1]

# Every input and output path below lives under <BASE_DIR>/data.
_DATA_ROOT = os.path.join(BASE_DIR, 'data')

# Names of the sub-folders that are iterated when reading texts and writing results.
FOLDER_NAMES = [
    'Python', 'ML', 'Android', 'DevOps', 'dotNET', 'FrontEnd',
    'Golang', 'Java', 'macOS', 'Node', 'PHP',
]

# Source texts, label schema file, and destination for the generated annotations.
CLEANTED_TEXTS_DIR = os.path.join(_DATA_ROOT, 'texts_ner_cleaned_standard')
SCHEMA_PATH = os.path.join(_DATA_ROOT, 'annotated_manual', 'schema.json')
SAVE_DIR = os.path.join(_DATA_ROOT, 'cleaned_standard_annotated')

In [15]:
def clean_text(text):
    """
    Strip bullet/markup decoration from text and normalize its whitespace.

    Removes every '*', '#', and black-circle bullet (U+25CF) character,
    collapses runs of spaces/tabs into a single space, limits consecutive
    newlines to at most two, and trims leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The bullet is written as an escape so the pattern does not depend on how
    # this file is encoded or decoded. A mis-decoded literal bullet becomes
    # several unrelated characters inside the character class (including
    # accented letters), which would then be stripped from ordinary words.
    text = re.sub(r'[*#\u25cf]', '', text)
    # Collapse runs of spaces/tabs into a single space (newlines are untouched).
    text = re.sub(r'[ \t]+', ' ', text)
    # Allow at most one blank line between paragraphs.
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Trim leading/trailing whitespace.
    return text.strip()


# Load every .txt document from each folder listed in FOLDER_NAMES.
# Each record stores the raw file content under 'text' and a
# "<folder>/<file>" identifier under 'filename'.
texts = []
for folder in FOLDER_NAMES:
    folder_path = os.path.join(CLEANTED_TEXTS_DIR, folder)
    # os.listdir() returns entries in arbitrary order; sorting keeps positions
    # in `texts` (and in anything indexed from it later) reproducible
    # between runs and machines.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.txt'):
            file_path = os.path.join(folder_path, file)
            with open(file_path, encoding='utf-8') as f:
                raw_text = f.read()
            # clean_text() is currently not applied; the raw content is stored as-is.
            texts.append({'filename': f"{folder}/{file}", 'text': raw_text})
print(len(texts))

702


In [16]:
# Parse the JSON label schema stored at SCHEMA_PATH (read as UTF-8).
labels = json.loads(Path(SCHEMA_PATH).read_text(encoding="utf-8"))
# Bare expression so the notebook displays the loaded schema.
labels

{'SKILL_HARD': "Specific technical tools, programming languages, frameworks, or methodologies. Examples: 'Python', 'React.js', 'Docker', 'machine learning', 'REST API', 'CI/CD'",
 'SKILL_SOFT': "Personal, communication, or team-related abilities that describe behavioral or interpersonal skills. Examples: 'problem-solving', 'team player', 'attention to detail', 'leadership'",
 'ENGLISH_LEVEL': "Any explicit or implied mention of English proficiency or fluency level. Examples: 'Upper-Intermediate', 'fluent English', 'B2 level', 'advanced English communication'",
 'DEGREE': "Formal education degrees or academic qualifications. Examples: 'Bachelor‚Äôs degree', 'Master‚Äôs in Computer Science', 'PhD in Engineering'",
 'EXPERIENCE_LEVEL': "Seniority or professional rank associated with the role. Examples: 'Junior', 'Middle', 'Senior', 'Lead', 'Intern'",
 'EXPERIENCE_YEARS': "Duration or number of years of experience required or mentioned. Examples: '3+ years', 'at least two years'",
 'BENEFI

In [17]:
# Identifier of the pretrained GLiNER2 checkpoint used for entity extraction.
MODEL_CHECKPOINT = "fastino/gliner2-large-v1"
model = GLiNER2.from_pretrained(MODEL_CHECKPOINT)

üß†  Model Configuration
Encoder model      : microsoft/deberta-v3-large
Counting layer     : count_lstm
Token pooling      : first


In [18]:
def assign_offsets(text, entities):
    """
    Locate every occurrence of each entity mention in ``text``.

    Args:
        text: The document to search.
        entities: Iterable of dicts with a 'text' key (the mention) and a
            'label' key (the entity class).

    Returns:
        A list of (start, end, label) tuples sorted by start offset, with the
        label upper-cased. Matching is case-insensitive and every occurrence
        of a mention is reported. A given (start, end) span is emitted only
        once; the first entity that produces it determines its label.
        Mentions that are empty after stripping are skipped. Spans of
        different mentions may overlap.
    """
    spans = []
    seen_spans = set()

    for ent in entities:
        ent_text = ent['text'].strip()
        label = ent['label'].upper()
        if not ent_text:
            continue

        # Require that the mention is not glued to a word character on either
        # side. Lookarounds are used instead of r'\b' because r'\b' needs a
        # word character on one side of the boundary, so mentions that start
        # or end with punctuation ('C++', 'C#', '.NET') would never match when
        # they are surrounded by spaces or punctuation.
        pattern = r'(?<!\w)' + re.escape(ent_text) + r'(?!\w)'

        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            span = match.span()
            if span not in seen_spans:
                seen_spans.add(span)
                spans.append((span[0], span[1], label))

    # Stable sort on the start offset only, so spans sharing a start keep
    # the order in which they were discovered.
    spans.sort(key=lambda item: item[0])
    return spans


def dict_to_entity_list(entities_dict):
    """
    Flatten a {label: [mention, ...]} mapping into a flat list of
    {"text": mention, "label": label} dicts, preserving the mapping's
    iteration order and the order of mentions within each label.
    """
    return [
        {"text": mention, "label": label}
        for label, mentions in entities_dict.items()
        for mention in mentions
    ]

# One record per loaded document: its filename, its text, and the
# (start, end, label) spans located for the entities the model extracted.
annotations = []

for doc in tqdm(texts, desc='Extracting entities'):
    doc_text = doc["text"]
    prediction = model.extract_entities(doc_text, labels)
    # Use the 'entities' entry when the result has one, otherwise the result itself.
    found_by_label = prediction.get('entities', prediction)
    located_spans = assign_offsets(doc_text, dict_to_entity_list(found_by_label))
    annotations.append({
        "filename": doc["filename"],
        "text": doc_text,
        "entities": located_spans
    })

Extracting entities:   0%|          | 0/702 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Extracting entities: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 702/702 [3:46:37<00:00, 19.37s/it]  


In [19]:
# Display the first annotation record (filename, text, entities) as a quick sanity check.
annotations[0]

{'filename': 'Python/0.txt',
 'text': 'Middle/Senior Full-Stack Engineer ‚Äî SL\nFaria is a forward-thinking company that consistently delivers new features\nand is passionate about staying ahead of the competition. Every day is\ndifferent, and you will be challenged to think creatively and innovate within\na multi-disciplined team of talented people. We‚Äôre a great team to work with,\nseriously committed to doing our best work, and we value individuals who can\nwork well as part of a team.\n\nWe are seeking a Middle/Senior Full-Stack Engineer to join the development\nteam of the SpotLight application, where you will develop features across the\nfull SpotLight technology stack, working on both data platform integration and\nuser interface components under the guidance of the Lead Engineer.\n\nKey Responsibilities\n\nImplement dashboard features requiring both back-end data processing and front-end visualization.\nDevelop API endpoints for educational data analytics functionality\nSupp

In [20]:
# Make sure an output sub-folder exists for every name in FOLDER_NAMES.
for folder in FOLDER_NAMES:
    output_folder = os.path.join(SAVE_DIR, folder)
    os.makedirs(output_folder, exist_ok=True)

# Write one JSON file per annotation record: <SAVE_DIR>/<folder>/<name>.json
for ann in annotations:
    # Extract folder from filename, e.g. 'Python/file.txt' -> 'Python'
    folder = ann['filename'].split('/')[0]
    output_folder = os.path.join(SAVE_DIR, folder)
    base_name = os.path.basename(ann['filename'])
    # Swap only the trailing '.txt' extension. str.replace('.txt', '.json')
    # would also rewrite any '.txt' that occurs earlier in the file name.
    if base_name.endswith('.txt'):
        filename_json = base_name[:-len('.txt')] + '.json'
    else:
        filename_json = base_name
    output_path = os.path.join(output_folder, filename_json)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(ann, f, ensure_ascii=False, indent=2)


In [22]:
# Group the matched substrings of one annotation record by entity label
# and print them, one label heading followed by its mentions.
file = annotations[423]
text = file['text']
entities = file['entities']

by_class = {}
for begin, finish, tag in entities:
    by_class.setdefault(tag, []).append(text[begin:finish])

for tag in by_class:
    print(tag)
    for mention in by_class[tag]:
        print(' -', mention)

ROLE
 - Golang Engineer
 - Go Engineer
COMPANY_NAME
 - Solidgate
 - Solidgate
 - Solidgate
 - Solidgate
 - Solidgate
SKILL_HARD
 - Go
 - Go
 - PostgreSQL
 - microservices
 - Apache Kafka
 - RabbitMQ
 - AWS
 - CI/CD
 - CI/CD
EXPERIENCE_YEARS
 - 3+ years
 - 1.5+ years
SKILL_SOFT
 - decision-making skills
BENEFIT
 - 30+ days off
 - unlimited sick leave
 - free office meals
 - health coverage
 - Apple gear
 - conferences
 - wellness benefits


In [None]:
# for folder in FOLDER_NAMES:
#     os.makedirs(os.path.join(SAVE_DIR, folder), exist_ok=True)

# for ann in annotations:
#     folder = ann['filename'].split('/')[0]
#     output_folder = os.path.join(SAVE_DIR, folder)
#     filename_json = os.path.basename(ann['filename']).replace('.txt', '.json')
#     output_path = os.path.join(output_folder, filename_json)

#     data_to_save = {
#         "classes": list(labels.keys()),
#         "annotations": [
#             [
#                 ann["text"],
#                 {"entities": [[start, end, label] for start, end, label in ann["entities"]]}
#             ]
#         ]
#     }

#     with open(output_path, 'w', encoding='utf-8') as f:
#         json.dump(data_to_save, f, ensure_ascii=False, indent=2)
