In [3]:
import random
import re

# Configurable number of records to generate
def generate_train_data(num_records=100, seed=0):
    """Generate synthetic real-estate search queries with entity annotations.

    Each record is a ``(text, {"entities": [(start, end, label), ...]})``
    tuple, where ``start``/``end`` are character offsets into ``text``
    (end exclusive). Labels emitted: PROPERTY_TYPE, LOCATION, BEDROOMS,
    BATHROOMS, PRICE and AMENITY. A label is only emitted when its surface
    form actually occurs in the generated sentence.
    NOTE(review): this record shape looks like spaCy's tuple training
    format -- confirm with whatever consumes the output.

    Args:
        num_records: Number of examples to generate. Zero or a negative
            value yields an empty list.
        seed: Seed for the private random generator. The default of 0 keeps
            the output reproducible between calls; pass ``None`` to seed
            from system entropy instead.

    Returns:
        A list of ``num_records`` annotated examples.
    """
    # Options
    # NOTE(review): "Bavaro" is listed twice, which doubles its sampling
    # weight. Kept as-is so the random draw sequence stays unchanged;
    # drop the duplicate if the extra weight is not intended.
    locations = [
        "Punta Cana", "Bavaro", "Santo Domingo", "Casa de Campo", "Jarabacoa",
        "Santiago", "Cap Cana", "La Romana", "Bavaro"
    ]
    property_types = ["house", "condo", "apartment", "villa"]
    bedroom_options = [1, 2, 3, 4, 5]
    bathroom_options = [1, 2, 3, 4]
    prices = [120000, 150000, 180000, 200000, 300000, 400000, 500000, 600000, 700000]
    amenities_options = [
        "balcony", "private garden", "rooftop terrace", "gated community",
        "pool", "parking", "pet friendly", "sea view"
    ]

    # Sentence templates with placeholders. {bd_phrase}/{ba_phrase} carry a
    # correctly pluralised "<n> bedroom(s)" / "<n> bathroom(s)" so that a
    # count of 1 does not produce "1 bedrooms".
    templates = [
        "Show me a {pt} in {loc} with {bd_phrase} and {ba_phrase} for {price}{amenity_part}.",
        "I'm looking for a {bd}-bedroom {pt} in {loc} for around {price}{amenity_part}.",
        "Need a {pt} in {loc} with {ba_phrase}, price {price}{amenity_part}.",
        "Searching for a {bd}-bedroom, {ba}-bathroom {pt} in {loc} under {price}{amenity_part}.",
        "Looking for a {pt} with {bd_phrase} in {loc}, price about {price}{amenity_part}.",
        "Interested in a property in {loc}, preferably a {pt} under {price}{amenity_part}.",
        "Do you have any {pt}s in {loc} for under {price} with {bd_phrase}{amenity_part}?"
    ]

    # A private generator keeps the run reproducible without reseeding the
    # process-wide ``random`` module behind the caller's back.
    rng = random.Random(seed)
    train_data = []

    for _ in range(num_records):
        # The order of the draws below is significant: it fixes which
        # records a given seed produces.
        pt = rng.choice(property_types)
        loc = rng.choice(locations)
        bd = rng.choice(bedroom_options)
        ba = rng.choice(bathroom_options)
        price_str = f"${rng.choice(prices):,}"

        # Randomly include one or two amenities in 60% of the records.
        chosen = []
        amenity_part = ""
        if rng.random() < 0.6:
            chosen = rng.sample(amenities_options, rng.randint(1, 2))
            amenity_part = " with " + ", ".join(chosen)

        bd_phrase = f"{bd} bedroom" if bd == 1 else f"{bd} bedrooms"
        ba_phrase = f"{ba} bathroom" if ba == 1 else f"{ba} bathrooms"

        text = rng.choice(templates).format(
            pt=pt, loc=loc, bd=bd, ba=ba,
            bd_phrase=bd_phrase, ba_phrase=ba_phrase,
            price=price_str, amenity_part=amenity_part,
        )
        text = text.replace("  ", " ")

        # (label, candidate surface forms). The first candidate present in
        # the text is annotated; a label whose forms are all absent (the
        # chosen template does not mention it) is skipped.
        mentions = [
            ("PROPERTY_TYPE", (pt,)),
            ("LOCATION", (loc,)),
            ("BEDROOMS", (f"{bd}-bedroom", bd_phrase)),
            ("BATHROOMS", (f"{ba}-bathroom", ba_phrase)),
            ("PRICE", (price_str,)),
        ]
        mentions.extend(("AMENITY", (amen,)) for amen in chosen)

        entities = []
        for label, candidates in mentions:
            for phrase in candidates:
                start = text.find(phrase)
                if start != -1:
                    entities.append((start, start + len(phrase), label))
                    break

        train_data.append((text, {"entities": entities}))

    return train_data


# Example usage
if __name__ == "__main__":
    # Script entry point: generate the examples and dump them as an
    # importable Python module that defines a single TRAIN_DATA literal.
    NUM_RECORDS = 50  # adjust as needed
    TRAIN_DATA = generate_train_data(NUM_RECORDS)
    # Python source files are decoded as UTF-8 by default, so write the
    # module with that encoding explicitly instead of the locale default.
    with open("generated_train_data.py", "w", encoding="utf-8") as f:
        f.write(f"TRAIN_DATA = {TRAIN_DATA!r}")
    print(f"Generated {NUM_RECORDS} training examples to generated_train_data.py")


Generated 50 training examples to generated_train_data.py
