In [8]:
import spacy
import os
import json
from spacy.tokens import DocBin
from tqdm import tqdm

# Directory scanned for the annotated *.json export files (relative to the
# notebook's working directory).
DATASET_PATH = "dataset"

# Accumulates every converted Doc; written to disk once the loop finishes.
doc_bin = DocBin()

# Blank English pipeline: used only to tokenize text via nlp.make_doc().
nlp = spacy.blank("en")

def spans_overlap(span1, span2):
    """Return True when the two spans share at least one position.

    Both arguments only need ``start`` and ``end`` attributes, with ``end``
    treated as exclusive: spans that merely touch (one ends where the other
    begins) are not considered overlapping.
    """
    first_is_after_second = span1.start >= span2.end
    if first_is_after_second:
        return False
    second_is_after_first = span2.start >= span1.end
    return not second_is_after_first

# Convert every annotated item found in DATASET_PATH/*.json into a spaCy Doc
# with entity spans, collect the Docs in `doc_bin`, and save them to disk.
# NOTE(review): the item layout (data.text, annotations[0].result[].value with
# start/end/labels) looks like a Label Studio-style export -- confirm.

# Running tallies reported in the summary line at the end of the cell.
total_docs = 0
successful_docs = 0
failed_docs = 0
skipped_spans = 0

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# documents in the DocBin reproducible. Filtering up front also makes the
# progress bar count only the files that are actually processed.
json_files = sorted(
    name for name in os.listdir(DATASET_PATH) if name.endswith(".json")
)

for filename in tqdm(json_files, desc="Processing JSON"):
    path = os.path.join(DATASET_PATH, filename)
    with open(path, "r", encoding="utf-8") as f:
        data_list = json.load(f)

    for item in data_list:
        total_docs += 1
        try:
            text = item["data"]["text"]
            annotations = item.get("annotations")
        except Exception as e:
            failed_docs += 1
            print(f"❌ Failed to extract text or annotations from item: {e}")
            continue

        # Items without text or without any annotation set count as failed.
        if not text or not annotations:
            failed_docs += 1
            continue

        # Only the first annotation set is used. `or []` also covers a
        # "result" key that is present but null, which would otherwise make
        # the loop below raise TypeError.
        results = annotations[0].get("result") or []
        doc = nlp.make_doc(text)
        ents = []

        for result in results:
            # `or {}` guards against an explicit null "value".
            value = result.get("value") or {}
            start = value.get("start")
            end = value.get("end")
            label_list = value.get("labels", [])

            # Entries lacking offsets or a label are ignored (not counted).
            if start is None or end is None or not label_list:
                continue

            # Only the first label of a multi-label entry is kept.
            label = label_list[0].strip()
            # "contract" shrinks the character range to the tokens that lie
            # fully inside it; char_span returns None if no span can be built.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")

            if span is None:
                skipped_spans += 1
                continue

            # doc.ents cannot hold overlapping spans: first one seen wins.
            if any(spans_overlap(span, existing) for existing in ents):
                skipped_spans += 1
                continue

            ents.append(span)

        try:
            doc.ents = ents
            doc_bin.add(doc)
            successful_docs += 1
        except Exception as e:
            failed_docs += 1
            print(f"❌ Error in item ID {item.get('id')}: {e}")

# Save output
output_file = "cv_skills_dataset.spacy"
doc_bin.to_disk(output_file)

print(f"\n✅ Saved to: {output_file}")
print(f"📊 Total: {total_docs}, ✅ Success: {successful_docs}, ❌ Failed: {failed_docs}, ⚠️ Skipped spans: {skipped_spans}")


Processing JSON: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.49it/s]


✅ Saved to: cv_skills_dataset.spacy
📊 Total: 15, ✅ Success: 15, ❌ Failed: 0, ⚠️ Skipped spans: 28





In [9]:
from spacy.tokens import DocBin
import random

# Split the converted corpus into train/dev sets and write both to disk.

# Fixed seed so the train/dev partition is the same on every run; without it
# evaluation scores are not comparable between runs of the notebook.
SPLIT_SEED = 42
# Share of documents that goes into train.spacy; the rest goes to dev.spacy.
TRAIN_FRACTION = 0.8

db = DocBin().from_disk("cv_skills_dataset.spacy")
docs = list(db.get_docs(spacy.blank("en").vocab))

# With fewer than two documents one of the two output files would be empty,
# so fail loudly here instead of writing an unusable corpus.
if len(docs) < 2:
    raise ValueError(
        f"Need at least 2 documents to build a train/dev split, found {len(docs)}."
    )

# A dedicated Random instance keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(docs)
split = int(TRAIN_FRACTION * len(docs))

DocBin(docs=docs[:split]).to_disk("train.spacy")
DocBin(docs=docs[split:]).to_disk("dev.spacy")
print("✅ Done: train.spacy and dev.spacy saved.")


✅ Done: train.spacy and dev.spacy saved.
