# Paso 1: Crear el conjunto de entrenamiento

In [5]:
import spacy
from spacy.tokens import DocBin, Span

nlp = spacy.blank("es")

train_data = [
    ("Harry dijo Lumos y la sala se iluminó.", [(11, 16, "HECHIZO")]),
    ("Alohomora abrió la puerta mágicamente.", [(0, 9, "HECHIZO")]),
    ("Con un grito de Expecto patronum, un ciervo plateado apareció.", [(16, 33, "HECHIZO")]),
    ("Hermione murmuró Expeliarmo sin dudarlo.", [(19, 29, "HECHIZO")])
]

doc_bin = DocBin()

for text, entities in train_data:
    doc = nlp(text)
    spans = [Span(doc, doc.char_span(start, end).start, doc.char_span(start, end).end, label=label) 
             for start, end, label in entities if doc.char_span(start, end) is not None]
    doc.ents = spans
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")


# Paso 2: Entrenar el modelo con spaCy CLI

In [2]:
!python -m spacy init config config.cfg --lang es --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: es
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy


[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     13.33    0.00    0.00    0.00    0.00
200     200          0.39    184.02  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00   

[2025-05-02 12:41:27,279] [INFO] Set up nlp object from config
[2025-05-02 12:41:27,291] [INFO] Pipeline: ['tok2vec', 'ner']
[2025-05-02 12:41:27,298] [INFO] Created vocabulary
[2025-05-02 12:41:27,298] [INFO] Finished initializing nlp object
[2025-05-02 12:41:27,369] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


# Paso 3: Cargar el modelo entrenado y hacer predicciones

In [6]:
import spacy
from spacy.tokens import DocBin, Span

hechizos = [
    "Expelliarmus", "Alohomora", "Lumos", "Nox", "Wingardium Leviosa", 
    "Avada Kedavra", "Crucio", "Imperio", "Expecto Patronum", "Sectumsempra"
]

nlp = spacy.blank("es")

def generar_docs_anotados(texto, hechizos):
    doc = nlp(texto)
    spans = []
    for hechizo in hechizos:
        start = 0
        while True:
            start = texto.lower().find(hechizo.lower(), start)
            if start == -1:
                break
            end = start + len(hechizo)
            span = doc.char_span(start, end, label="HECHIZO", alignment_mode="contract")
            if span is not None:
                spans.append(span)
            start = end
    doc.ents = spans
    return doc

# Cargar el libro desde línea 38
with open("J.K. Rowling - Harry Potter 1 - La Piedra Filosofal.txt", 'r', encoding='utf-8') as f:
    lineas = f.readlines()

texto = "".join(lineas[37:])
doc = generar_docs_anotados(texto, hechizos)

# Verificar que los hechizos se anotaron correctamente
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Guardar en formato spaCy
doc_bin = DocBin()
doc_bin.add(doc)
doc_bin.to_disk("train.spacy")


Alohomora 237176 237185 HECHIZO
leviosa 253129 253136 HECHIZO
leviosa 260629 260636 HECHIZO
Alohomora 409597 409606 HECHIZO
