In [1]:
import json
import random
from pathlib import Path
from tqdm import tqdm
import os
import spacy
from spacy.tokens import DocBin, Span
from sklearn.model_selection import train_test_split
import torch

In [2]:
BASE_DIR = Path.cwd().resolve().parents[2]
DATA_DIR = os.path.join(BASE_DIR, 'data', 'cleaned_standard_annotated')
SAVE_SPACY = os.path.join(BASE_DIR, 'data', 'spacy')
CLEANTED_TEXTS_DIR = os.path.join(BASE_DIR, 'data', 'texts_ner_cleaned_standard')
SCHEMA_PATH = os.path.join(BASE_DIR, 'data', 'annotated_manual', 'schema.json')
FOLDER_NAMES = ['Python', 'ML', 'Android', 'DevOps', 'dotNET', 'FrontEnd', 'Golang', 'Java', 'macOS', 'Node', 'PHP']

In [3]:
# Loading annotated NER data
def load_annotated_data(base_dir, folder_names=None):
    """Load annotated NER samples from the JSON files found under ``base_dir``.

    Every requested category folder is walked recursively. Each ``*.json``
    file is read as an object with a ``"text"`` entry and an optional
    ``"entities"`` list of ``[start, end, label]`` triples.

    Args:
        base_dir: Directory that contains one sub-folder per category.
        folder_names: Category sub-folders to read. When ``None`` the
            notebook-level ``FOLDER_NAMES`` list is used. Folders that do
            not exist are skipped.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples. Samples whose text is empty or whitespace-only are dropped.

    Raises:
        KeyError: If a JSON file has no ``"text"`` entry.
        ValueError: If an entity does not unpack into exactly three items.
    """
    if folder_names is None:
        folder_names = FOLDER_NAMES

    data = []
    for folder in folder_names:
        # Honour the argument instead of silently reading the global DATA_DIR.
        folder_path = os.path.join(base_dir, folder)
        if not os.path.exists(folder_path):
            continue

        for root, dirnames, files in os.walk(folder_path):
            # os.walk yields entries in arbitrary OS order; sorting keeps the
            # sample order (and any seeded split made from it) reproducible.
            dirnames.sort()
            for filename in sorted(files):
                if not filename.endswith(".json"):
                    continue
                file_path = os.path.join(root, filename)
                with open(file_path, "r", encoding="utf-8") as f:
                    sample = json.load(f)
                text = sample['text']
                entities = [
                    (start, end, label)
                    for start, end, label in sample.get("entities", [])
                ]
                if text.strip():
                    data.append((text, {"entities": entities}))

    print(f"Loaded {len(data)} annotated samples")
    return data

# Load every annotated sample from the annotated-data directory into memory.
data = load_annotated_data(DATA_DIR)

Loaded 702 annotated samples


In [4]:
# Preview the first sample: its annotation dict, then each entity's offsets
# followed by its label on a separate line.
for text, item in data[:1]:
    print(item)
    for start, end, label in item['entities']:
        print(f"{start} {end} {label}", label, sep="\n")

{'entities': [(0, 13, 'EXPERIENCE_LEVEL'), (0, 6, 'EXPERIENCE_LEVEL'), (7, 13, 'EXPERIENCE_LEVEL'), (14, 33, 'ROLE'), (39, 44, 'COMPANY_NAME'), (466, 479, 'EXPERIENCE_LEVEL'), (466, 472, 'EXPERIENCE_LEVEL'), (473, 479, 'EXPERIENCE_LEVEL'), (480, 499, 'ROLE'), (733, 737, 'EXPERIENCE_LEVEL'), (1165, 1175, 'EXPERIENCE_YEARS'), (1226, 1232, 'SKILL_HARD'), (1234, 1238, 'SKILL_HARD'), (1243, 1249, 'SKILL_HARD'), (1251, 1254, 'SKILL_HARD'), (1287, 1297, 'SKILL_HARD'), (1299, 1304, 'SKILL_HARD'), (1308, 1314, 'SKILL_HARD'), (1358, 1369, 'SKILL_HARD'), (1424, 1434, 'SKILL_HARD'), (1509, 1512, 'SKILL_HARD'), (1859, 1871, 'BENEFIT'), (1940, 1950, 'BENEFIT'), (1967, 1983, 'BENEFIT'), (2013, 2021, 'BENEFIT'), (2105, 2124, 'BENEFIT'), (2127, 2132, 'COMPANY_NAME'), (2160, 2164, 'EXPERIENCE_LEVEL'), (2244, 2252, 'BENEFIT'), (2309, 2317, 'BENEFIT'), (2608, 2614, 'LOCATION'), (2622, 2637, 'LOCATION')]}
0 13 EXPERIENCE_LEVEL
EXPERIENCE_LEVEL
0 6 EXPERIENCE_LEVEL
EXPERIENCE_LEVEL
7 13 EXPERIENCE_LEVEL
EXP

In [5]:
def make_docbin_data(nlp, data):
    """Convert ``(text, annotations)`` samples into a spaCy ``DocBin``.

    Character offsets are mapped onto token boundaries: the span starts at
    the token containing ``start_char`` and ends after the token containing
    the last annotated character. Overlapping spans are then reduced with
    ``spacy.util.filter_spans`` before being stored in ``doc.ents``.

    Args:
        nlp: Callable that turns a text into a spaCy ``Doc``.
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        A ``DocBin`` holding one document per sample.

    Annotations that cannot be aligned to tokens, and annotations whose
    character range is empty or reversed, are reported and skipped instead
    of aborting the whole conversion.
    """
    db = DocBin()
    for text, item in data:
        doc = nlp(text)
        spans = []
        for start_char, end_char, label in item['entities']:
            token_start = None
            token_end = None
            # An empty or reversed range cannot describe an entity; without
            # this guard a reversed range could yield token_end < token_start
            # and make Span() raise.
            if start_char < end_char:
                for token in doc:
                    if token.idx >= end_char:
                        # Tokens are ordered by offset, so nothing further
                        # along can contain either boundary.
                        break
                    token_stop = token.idx + len(token)
                    if token.idx <= start_char < token_stop:
                        token_start = token.i
                    if token.idx < end_char <= token_stop:
                        token_end = token.i + 1
            if token_start is not None and token_end is not None:
                spans.append(Span(doc, token_start, token_end, label=label))
            else:
                print(f"Skipping invalid span: {text[start_char:end_char]} at ({start_char}, {end_char})")
        # doc.ents cannot hold overlapping spans, so reduce them first.
        filtered_spans = spacy.util.filter_spans(spans)
        doc.ents = filtered_spans
        db.add(doc)
    return db

In [6]:
# Splitting data into train, validation (dev) and test sets for spaCy
# The test set is held out and will be run at inference time

# 80% train; the remaining 20% is halved into dev and test (fixed seed).
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
nlp = spacy.blank('en')
os.makedirs(SAVE_SPACY, exist_ok=True)

# Convert each split to the DocBin format, in train / dev / test order
train_db, dev_db, test_db = [
    make_docbin_data(nlp, split) for split in (train_data, dev_data, test_data)
]

# Write every split next to the others inside SAVE_SPACY
for split_db, split_filename in (
    (train_db, "train.spacy"),
    (dev_db, "dev.spacy"),
    (test_db, "test.spacy"),
):
    split_db.to_disk(os.path.join(SAVE_SPACY, split_filename))

In [7]:
# Show the full path of the serialized test split inside SAVE_SPACY.
print(os.path.join(SAVE_SPACY, "test.spacy"))

D:\Programing\kurs 4\sem 1\nlp\project\JobVacanciesNLP\data\spacy\test.spacy


In [8]:
def inspect(path):
    """Print a short summary of the DocBin file stored at ``path``.

    Reports the path, the first 100 characters of every document that has
    no entities, the number of documents and the total entity count.
    Returns nothing.
    """
    print("Inspecting:", path)
    stored = DocBin().from_disk(path)
    doc_total = 0
    entity_total = 0
    for doc in stored.get_docs(spacy.blank("en").vocab):
        found = len(doc.ents)
        if not found:
            print("EMPTY DOC:", doc.text[:100])
        entity_total += found
        doc_total += 1
    print("Docs:", doc_total)
    print("Total entities:", entity_total)

# Summarize the saved train and dev splits, in that order.
for split_file in ("train.spacy", "dev.spacy"):
    inspect(os.path.join(SAVE_SPACY, split_file))


Inspecting: D:\Programing\kurs 4\sem 1\nlp\project\JobVacanciesNLP\data\spacy\train.spacy
Docs: 561
Total entities: 18159
Inspecting: D:\Programing\kurs 4\sem 1\nlp\project\JobVacanciesNLP\data\spacy\dev.spacy
Docs: 70
Total entities: 2292


In [9]:
# Launch spaCy training from the shell with config_transformer.cfg, writing
# the results to ./output and requesting GPU 0.
# NOTE(review): the train/dev corpus paths are presumably defined inside the
# config file — confirm they point at the files saved under SAVE_SPACY.
!python -m spacy train config_transformer.cfg --output ./output --gpu-id 0

^C


In [None]:
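# NOTE: disabled experiment, kept for reference only. It is an alternative
# DocBin conversion that aligns entities with doc.char_span(...,
# alignment_mode='expand') and resolves overlaps with a custom
# keep-the-longest filter instead of spacy.util.filter_spans.
# def filter_overlapping_spans_keep_longest(spans):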
#     # Sort spans by start index, then by length descending (longest first)
#     sorted_spans = sorted(spans, key=lambda span: (span.start, -(span.end - span.start)))
#     filtered = []
#     token_owners = {}  # token index -> span that owns it

#     for span in sorted_spans:
#         overlap = False
#         for token_idx in range(span.start, span.end):
#             if token_idx in token_owners:
#                 # Check if current span is longer than the one owning this token
#                 owner = token_owners[token_idx]
#                 owner_len = owner.end - owner.start
#                 span_len = span.end - span.start
#                 if span_len <= owner_len:
#                     overlap = True
#                     break
#                 else:
#                     # Current span longer, remove owner span tokens from token_owners
#                     for t in range(owner.start, owner.end):
#                         if token_owners.get(t) == owner:
#                             del token_owners[t]
#                     # Also remove owner from filtered list
#                     if owner in filtered:
#                         filtered.remove(owner)
#                     break
#         if not overlap:
#             for token_idx in range(span.start, span.end):
#                 token_owners[token_idx] = span
#             filtered.append(span)

#     return filtered

# nlp = spacy.blank('en')
# db = DocBin()

# for text, item in data:
#     doc = nlp(text)
#     spans = []
#     for start_char, end_char, label in item['entities']:
#         span = doc.char_span(start_char, end_char, label=label, alignment_mode='expand')
#         if span is not None:
#             spans.append(span)
#         else:
#             print(f"Skipping invalid span: {text[start_char:end_char]} at ({start_char}, {end_char})")
#     spans = filter_overlapping_spans_keep_longest(spans)

#     doc.ents = spans
#     db.add(doc)


702