In [6]:
import random
import re
from pathlib import Path

import pandas as pd
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding

# Define labels
LABELS = ["LOCATION", "PRICE", "PROPERTY_TYPE", "BEDROOMS", "BATHROOMS", "AMENITY"]


def _cell_to_text(raw):
    """Return a CSV cell as the string expected to appear in the sentence.

    Missing cells (``None``/NaN) become ``""``. Whole-number floats are
    rendered without the trailing ``.0``: pandas reads an integer column that
    contains blanks as float64, so ``5`` arrives as ``5.0`` and would never
    be found in the text.
    """
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return ""
    if isinstance(raw, float) and raw.is_integer():
        return str(int(raw))
    value = str(raw).strip()
    return "" if value.lower() == "nan" else value


def _find_entity_span(text, value, taken):
    """Return ``(start, end)`` of the best occurrence of *value* in *text*.

    Occurrences that overlap a span in *taken* are skipped. Among the rest,
    the first one delimited by non-alphanumeric characters (or the string
    edges) wins, so ``"5"`` prefers ``"5 bedrooms"`` over the ``5`` inside
    ``"$450,000"``. If no such occurrence exists, the first non-overlapping
    one is returned; ``None`` means nothing usable was found.
    """
    fallback = None
    # A case-insensitive regex search runs on the original string, so the
    # offsets stay valid even when lowercasing would change its length.
    for match in re.finditer(re.escape(value), text, flags=re.IGNORECASE):
        start, end = match.span()
        if any(start < t_end and t_start < end for t_start, t_end in taken):
            continue
        left_ok = start == 0 or not text[start - 1].isalnum()
        right_ok = end == len(text) or not text[end].isalnum()
        if left_ok and right_ok:
            return start, end
        if fallback is None:
            fallback = (start, end)
    return fallback


# Function to load and auto-align entities from CSV
def generate_spacy_ner_data(csv_file):
    """Build spaCy NER training tuples from a CSV file.

    The CSV must have a ``text`` column and may have one column per entry in
    ``LABELS``; each label cell holds the surface string of that entity as it
    appears in ``text``.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        with entities listed in ``LABELS`` order. Spans within one text never
        overlap. Values with no usable occurrence are reported on stdout and
        omitted; rows whose ``text`` cell is missing are reported and skipped.
    """
    df = pd.read_csv(csv_file)
    formatted_data = []

    for _, row in df.iterrows():
        text = row["text"]
        if not isinstance(text, str):
            print(f"⚠️ Skipping row without text: {text!r}")
            continue

        entities = []
        for label in LABELS:
            value = _cell_to_text(row.get(label))
            if not value:
                continue

            taken = [(start, end) for start, end, _ in entities]
            span = _find_entity_span(text, value, taken)
            if span is None:
                # Also reached when every occurrence overlaps an earlier entity.
                print(f"⚠️ Could not find '{value}' in: '{text}'")
                continue
            entities.append((span[0], span[1], label))

        formatted_data.append((text, {"entities": entities}))

    return formatted_data

# Load and process training data.
# TRAIN_DATA is a list of (text, {"entities": [(start, end, label), ...]})
# tuples. The CSV path is relative, so it is resolved against the current
# working directory when this statement runs.
TRAIN_DATA = generate_spacy_ner_data("real_estate_ner_dataset.csv")

# Validate entity alignment (optional but good practice)
def validate_entities(nlp, training_data):
    """Report training samples whose entity offsets spaCy cannot use.

    Two kinds of problems are detected:

    * ``offsets_to_biluo_tags`` raises (for example on overlapping spans);
    * an entity does not line up with token boundaries. spaCy does not raise
      for this case; it marks the affected tokens with the ``"-"`` tag, so
      the returned tags have to be inspected explicitly.

    Args:
        nlp: A spaCy pipeline; only its tokenizer (``make_doc``) is used.
        training_data: Iterable of ``(text, {"entities": [...]})`` tuples.

    Returns:
        A list of ``(text, reason)`` tuples, one per problematic sample.
        Each problem is also printed.
    """
    from spacy.training import offsets_to_biluo_tags

    problems = []
    for text, annotations in training_data:
        doc = nlp.make_doc(text)
        try:
            tags = offsets_to_biluo_tags(doc, annotations["entities"])
        except Exception as e:  # diagnostic helper: report, never abort
            reason = str(e)
        else:
            bad_tokens = [token.text for token, tag in zip(doc, tags) if tag == "-"]
            if not bad_tokens:
                continue
            reason = f"entity boundaries do not match token boundaries near {bad_tokens}"

        print(f"⚠️ Misaligned entity in: '{text}'\nReason: {reason}")
        problems.append((text, reason))

    return problems
# Load blank model: an English pipeline with no trained components, to which
# a fresh "ner" component is added.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add labels to NER so the component knows every entity type in LABELS
# before training starts.
for label in LABELS:
    ner.add_label(label)

# Train the model
# Only the "ner" component is updated; every other pipe (none for a blank
# pipeline) is disabled for the duration of training. select_pipes() and
# initialize() are the spaCy v3 replacements for the deprecated
# disable_pipes() and begin_training().
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    for iteration in range(30):
        print(f"Iteration {iteration + 1}")
        # Reshuffle in place each epoch so the batches differ between epochs.
        random.shuffle(TRAIN_DATA)
        losses = {}
        # The batch size generator is recreated per epoch: it starts at 4 and
        # grows by a factor of 1.5 per batch up to 32.
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.5))
        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            # Pass the optimizer explicitly rather than relying on the one the
            # pipeline stores internally.
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)
        # NOTE(review): the recorded run shows the loss collapsing to ~1e-38
        # within ten iterations. This loop has no held-out evaluation, so
        # overfitting cannot be ruled out from the loss alone.
        print("Losses:", losses)

# Save model: serialize the trained pipeline into a directory relative to the
# current working directory. exist_ok=True lets re-runs reuse the directory
# instead of failing.
output_dir = Path("trained_realestate_ner_2")
output_dir.mkdir(exist_ok=True)
nlp.to_disk(output_dir)
print(f"\n✅ Model saved to {output_dir}")

# Load and test model: reload the pipeline from disk (rather than reusing the
# in-memory `nlp`) so the saved artefact itself is what gets exercised.
print("\n--- Test the trained model ---")
test_text = "I want a villa in Cap Cana with 5 bedrooms and a pool for $450,000"
loaded_nlp = spacy.load(output_dir)
doc = loaded_nlp(test_text)

# Show extracted entities, one "<text> -> <label>" line per predicted span.
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

# Convert predicted entities into a flat {label: text} record
def ner_output_to_model_input(doc, all_labels):
    """Map each label in *all_labels* to the text of its predicted entity.

    Args:
        doc: Object exposing ``ents``, an iterable of spans with ``label_``
            and ``text`` attributes (e.g. a processed spaCy ``Doc``).
        all_labels: Labels to include as keys of the result.

    Returns:
        A dict with one key per label, in the order given. Labels without a
        predicted entity map to ``None``. If a label occurs more than once,
        the last occurrence wins. Entities with other labels are ignored.
    """
    slots = dict.fromkeys(all_labels)
    for span in doc.ents:
        tag = span.label_
        if tag not in slots:
            continue
        slots[tag] = span.text
    return slots

# Flatten the test document's entities into a {label: text-or-None} record
# covering every label in LABELS, then print it.
parsed_input = ner_output_to_model_input(doc, LABELS)
print("\nParsed Input for Model:")
print(parsed_input)


Iteration 1
Losses: {'ner': np.float32(784.6445)}
Iteration 2
Losses: {'ner': np.float32(456.4576)}
Iteration 3
Losses: {'ner': np.float32(16.20474)}
Iteration 4
Losses: {'ner': np.float32(0.07802634)}
Iteration 5
Losses: {'ner': np.float32(4.641822e-12)}
Iteration 6
Losses: {'ner': np.float32(2.1370937e-18)}
Iteration 7
Losses: {'ner': np.float32(7.404255e-21)}
Iteration 8
Losses: {'ner': np.float32(2.5152164e-35)}
Iteration 9
Losses: {'ner': np.float32(2.3666992e-35)}
Iteration 10
Losses: {'ner': np.float32(2.9567711e-38)}
Iteration 11
Losses: {'ner': np.float32(2.9329711e-34)}
Iteration 12
Losses: {'ner': np.float32(1.059711e-38)}
Iteration 13
Losses: {'ner': np.float32(1.417541e-32)}
Iteration 14
Losses: {'ner': np.float32(3.1501428e-29)}
Iteration 15
Losses: {'ner': np.float32(1.2822e-41)}
Iteration 16
Losses: {'ner': np.float32(2.5972427e-35)}
Iteration 17
Losses: {'ner': np.float32(2.06271e-40)}
Iteration 18
Losses: {'ner': np.float32(1.59258e-40)}
Iteration 19
Losses: {'ner': n