##### Event Data Generator 🏟️

Generate synthetic event data for the Veneto region.

- Random cities, venues, and event types
- Realistic dates and descriptions
- Output: `veneto_events.json`

In [1]:
import json
import random
from datetime import datetime, timedelta

# City -> venue list mapping, read from the shared dataset folder.
# The generation loop picks a random key (city) and then a random entry of
# that key's list (venue).
with open("../dataset/villages_places.json", "r", encoding="utf-8") as f:
    locations = json.load(f)

# Categories an event can be assigned to.
event_types = [
    "Music",
    "Sport",
    "Food & Drink",
    "Arts & Crafts",
    "Theatre",
    "Tour",
    "Workshop",
    "Festival",
    "Conference",
    "Exhibition",
]

# Prefix of every event URL; the numeric event id is appended to it.
base_url = "https://example.com/veneto-events/"

def random_date():
    """Return a random event time window inside September-October 2025.

    The start day is drawn uniformly from 2025-09-01 to 2025-10-31 and is
    given a random hour and minute.  The event then lasts between 1 and 10
    whole hours.  An end that would run past the window is clamped to the
    last second of 2025-10-31, so the end is always strictly after the start.

    Returns:
        A ``(start, end)`` tuple of ISO-8601 strings with second precision
        and a literal ``Z`` suffix (e.g. ``"2025-09-28T13:47:00Z"``), or
        ``(None, None)`` if the window is empty.  The datetimes are naive;
        the ``Z`` is appended as text.
    """
    first_day = datetime(2025, 9, 1)
    # The window closes at the END of the last day.  Clamping to midnight at
    # the start of Oct 31 would push the end of any event starting that day
    # back to (or before) its own start time.
    end_date_limit = datetime(2025, 10, 31, 23, 59, 59)

    days_difference = (end_date_limit.date() - first_day.date()).days
    if days_difference < 0:
        return None, None

    start_date = first_day + timedelta(days=random.randint(0, days_difference))
    random_hours = random.randint(0, 23)
    random_minutes = random.randint(0, 59)
    start_date = start_date.replace(
        hour=random_hours, minute=random_minutes, second=0, microsecond=0
    )

    duration_hours = random.randint(1, 10)
    end_date = min(start_date + timedelta(hours=duration_hours), end_date_limit)

    return (
        start_date.isoformat(timespec='seconds') + 'Z',
        end_date.isoformat(timespec='seconds') + 'Z',
    )

def generate_random_description(event_type, city, venue):
    """Return one randomly chosen marketing blurb for an event.

    Args:
        event_type: Event category; it appears lower-cased in the text.
        city: City name inserted into the text.
        venue: Venue name inserted into the text.

    Returns:
        One of ten fixed sentences with the three values filled in.
    """
    kind = event_type.lower()
    templates = (
        "Prepare for an unforgettable {kind} experience in the stunning city of {city}, held at the magnificent {venue}. This event promises to be a highlight of the season!",
        "Dive into the world of {kind} at this exciting gathering in {city}. Located at the renowned {venue}, it's an event you won't want to miss.",
        "Explore the vibrant {kind} scene in {city} with this special event at {venue}. Get ready for a day filled with discovery and enjoyment.",
        "A unique {kind} opportunity awaits you in {city}. Join us at the charming {venue} for an event designed to inspire and entertain.",
        "Immerse yourself in the rich culture of {city} with this engaging {kind} event at the iconic {venue}. It's the perfect way to spend your time.",
        "Seeking adventure in {city}? Look no further than this captivating {kind} event at {venue}. Fun and excitement are guaranteed!",
        "Connect with fellow enthusiasts at this lively {kind} event in {city}, taking place at the welcoming {venue}. Share your passion and make new friends.",
        "Unwind and enjoy a delightful {kind} experience in {city} at the picturesque {venue}. Relax and take in the atmosphere.",
        "Expand your horizons with this insightful {kind} event in {city}, hosted at the distinguished {venue}. Learn something new and be inspired.",
        "Step into a world of wonder at this extraordinary {kind} event in {city}, held at the historic {venue}. Prepare to be amazed!",
    )
    # Pick the template first, then fill it in; only the chosen one is built.
    chosen = random.choice(templates)
    return chosen.format(kind=kind, city=city, venue=venue)

# City names, built once outside the loop: `locations` does not change
# while events are being generated.
cities = list(locations)

events = []

# Ids run from 1 to 4999 inclusive (the range end is exclusive).
for i in range(1, 5000):
    city = random.choice(cities)
    venue = random.choice(locations[city])
    event_type = random.choice(event_types)
    start, end = random_date()
    if not (start and end):
        # random_date() signals an empty date window with (None, None).
        continue

    event = {
        "id": str(i),
        "title": f"{city} {event_type} Event {i}",
        "category": event_type,
        "description": generate_random_description(event_type, city, venue),
        "city": city,
        "location": {
            "venue": venue,
            "address": f"{venue}, {city}, Veneto"
        },
        "start_date": start,
        "end_date": end,
        "url": f"{base_url}{i}"
    }
    events.append(event)

# The file holds a single top-level "events" array.
output = {
    "events": events
}

# ensure_ascii=False keeps accented characters readable in the UTF-8 file.
with open("../dataset/veneto_events.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"Generated veneto_events.json with {len(events)} events")


Generated veneto_events.json with 4999 events


#### Geocoding Veneto Events 🌍

Geocode synthetic event data for Veneto using the OpenStreetMap Nominatim API.

**Workflow:**
- Load events from `veneto_events.json`
- Clean addresses (venue, city, region, country)
- Geocode each event (with fallback strategies)
- Rate limit to respect API usage
- Add coordinates to each event
- Save results to `veneto_events_geocoded_structured.json`

In [None]:
import json
import asyncio
import httpx
from tqdm.asyncio import tqdm_asyncio
import threading

# Held by geocode_event while it increments and reports success_counter.
counter_lock = threading.Lock()
# Running count of events geocoded successfully; geocode_event updates it
# through a `global` declaration.
success_counter = 0  # Global counter variable


async def async_geocode_structured(venue, city, region="Veneto", country="Italy"):
    """Geocode a venue through Nominatim's structured search, with fallbacks.

    Up to four queries are sent in order, stopping at the first one that
    returns a non-empty result:

    1. venue + city + region + country
    2. city + region + country
    3. venue + city + country
    4. venue + region + country

    The venue is passed as the ``street`` parameter.  After every attempt
    that yields no coordinates the coroutine sleeps for one second before
    trying the next query.

    Args:
        venue: Venue name.
        city: City name.
        region: Value for the ``state`` parameter.
        country: Value for the ``country`` parameter.

    Returns:
        ``(lat, lon)`` as floats taken from the first result, or
        ``(None, None)`` when every attempt fails or comes back empty.
    """
    import logging  # local import: the cell's import block does not include it

    logger = logging.getLogger(__name__)

    base_url = "https://nominatim.openstreetmap.org/search"
    headers = {'User-Agent': 'convert_to_geo/1.0'}

    params_list = [
        {'street': venue, 'city': city, 'state': region, 'country': country, 'format': 'json', 'limit': 1},
        {'city': city, 'state': region, 'country': country, 'format': 'json', 'limit': 1},
        {'street': venue, 'city': city, 'country': country, 'format': 'json', 'limit': 1},
        {'street': venue, 'state': region, 'country': country, 'format': 'json', 'limit': 1}
    ]

    async with httpx.AsyncClient() as client_http:
        for params in params_list:
            try:
                response = await client_http.get(base_url, params=params, headers=headers, timeout=10)
                response.raise_for_status()
                data = response.json()
                if data:
                    return float(data[0]['lat']), float(data[0]['lon'])
            except (httpx.HTTPError, ValueError, KeyError, IndexError, TypeError) as exc:
                # Best effort: a transport/HTTP error, invalid JSON or a
                # response without the expected list-of-dicts shape only
                # fails this attempt.  Letting it propagate would abort the
                # gather() that drives the whole run.
                logger.debug("Geocoding attempt failed for %r: %s", params, exc)
            await asyncio.sleep(1)

    return None, None


async def process_events_geocoding(events):
    """Add ``latitude`` and ``longitude`` to every event's ``location`` in place.

    Each event with a non-empty venue and city is geocoded through
    ``async_geocode_structured``; at most five lookups run at the same time.
    Events missing either value, and events whose lookup fails, get ``None``
    for both coordinates.  A progress line is printed on every 100th
    success, and when the success count equals the number of events.

    Args:
        events: List of event dicts; each is mutated in place.  An event
            without a ``location`` key gets an empty one created.
    """
    # NOTE(review): five concurrent lookups can exceed the 1 request/second
    # cap in the public Nominatim usage policy - confirm this is acceptable
    # for the server being queried.
    semaphore = asyncio.Semaphore(5)
    total = len(events)

    async def geocode_event(event):
        global success_counter
        # setdefault (rather than get) so the coordinate writes below cannot
        # raise KeyError for an event that has no "location" entry.
        location = event.setdefault('location', {})
        venue = location.get('venue', '').strip()
        city = event.get('city', '').strip()

        if not (venue and city):
            location['latitude'] = None
            location['longitude'] = None
            return

        async with semaphore:
            lat, lon = await async_geocode_structured(venue, city)
            if lat is not None and lon is not None:
                with counter_lock:
                    success_counter += 1
                    # Print progress only every 100 successes
                    if success_counter % 100 == 0 or success_counter == total:
                        print(f"Geocoding success {success_counter} / {total}")
            location['latitude'] = lat
            location['longitude'] = lon

    await tqdm_asyncio.gather(*(geocode_event(event) for event in events))


async def main():
    """Geocode the generated events and save the enriched copy.

    Reads ``veneto_events.json``, passes its ``events`` list to
    ``process_events_geocoding`` (which mutates the events in place), then
    writes the whole document to ``veneto_events_geocoded_structured.json``.
    """
    source_file = '../dataset/veneto_events.json'
    target_file = '../dataset/veneto_events_geocoded_structured.json'

    with open(source_file, 'r', encoding='utf-8') as src:
        document = json.load(src)

    await process_events_geocoding(document['events'])

    with open(target_file, 'w', encoding='utf-8') as dst:
        json.dump(document, dst, ensure_ascii=False, indent=2)

    print("Geocoding complete and saved to veneto_events_geocoded_structured.json")


# Run in your Jupyter notebook cell with:
await main()


  2%|▏         | 111/4999 [04:01<3:04:29,  2.26s/it]

Geocoding success 100 / 4999


  4%|▍         | 221/4999 [07:59<3:18:46,  2.50s/it]

Geocoding success 200 / 4999


  6%|▌         | 296/4999 [10:29<2:42:57,  2.08s/it]

### List the models supported by fastembed

In [None]:
from fastembed import TextEmbedding

# Ask fastembed for its supported models; each entry is a dict.
models_info = TextEmbedding.list_supported_models()

# Keep only the value stored under each entry's 'model' key.
model_names = [entry['model'] for entry in models_info]

# Last expression of the cell, so the notebook displays the list.
model_names


In [None]:
%pip install qdrant_client --upgrade
%pip install qdrant-client[fastembed] --upgrade
%pip install ipywidgets --upgrade
%pip install fastembed huggingface_hub --upgrade



### Qdrant Vector ingestion

This section uploads events into a Qdrant vector database.

Steps:
- Load environment variables from `.env`
- Load geocoded events from `veneto_events_geocoded_structured.json`
- Check or create the `veneto_events` collection in Qdrant 
- Upsert all event points to the Qdrant collection

#### Embedding on description

In [5]:
from qdrant_client import QdrantClient, models
import os
from dotenv import load_dotenv
import json
from tqdm import tqdm
from uuid import uuid4
import hashlib
from fastembed import TextEmbedding, SparseTextEmbedding

# -------------------------
# Load environment variables
# -------------------------
load_dotenv(dotenv_path="../.env")
QDRANT_SERVER = os.getenv("QDRANT_SERVER")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

DENSE_MODEL_NAME = os.getenv("DENSE_MODEL_NAME")
SPARSE_MODEL_NAME = os.getenv("SPARSE_MODEL_NAME")

# os.getenv returns None for a missing variable, so check every required
# setting here and fail with a message that names it.
if not QDRANT_SERVER or not QDRANT_API_KEY:
    raise ValueError("QDRANT_SERVER or QDRANT_API_KEY not defined in .env file")

# Without this check a missing model name reaches the embedding
# constructors as None and fails there with a far less helpful error.
if not DENSE_MODEL_NAME or not SPARSE_MODEL_NAME:
    raise ValueError("DENSE_MODEL_NAME or SPARSE_MODEL_NAME not defined in .env file")

# -------------------------
# Initialize clients
# -------------------------
dense_embedding_model = TextEmbedding(DENSE_MODEL_NAME)
sparse_embedding_model = SparseTextEmbedding(SPARSE_MODEL_NAME)

# Names of the two vector slots stored on every point of the collection.
dense_vector_name = "dense_vector"
sparse_vector_name = "sparse_vector"
COLLECTION_NAME = "veneto_events"

# NOTE(review): the timeout is presumably in seconds, which makes 200000
# roughly 55 hours - confirm that such a long limit is intended.
client = QdrantClient(url=QDRANT_SERVER, api_key=QDRANT_API_KEY, timeout=200000)

def verify_qdrant_connection(client):
    """Report whether *client* can complete a ``get_collections()`` call.

    Any exception raised by the call is printed and turned into ``False``;
    nothing is re-raised.

    Returns:
        ``True`` if the call succeeded, ``False`` otherwise.
    """
    try:
        client.get_collections()
    except Exception as e:
        print(f"Qdrant connection error: {e}")
        return False
    else:
        return True

# Stop the cell here when the probe call fails; verify_qdrant_connection
# has already printed the underlying error.
if not verify_qdrant_connection(client):
    raise ConnectionError("Failed to connect to Qdrant server")

# -------------------------
# Utility Functions
# -------------------------
def calculate_hash(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# -------------------------
# Load dataset
# -------------------------
# Read the geocoded dataset; a document without an "events" key yields an
# empty list instead of an error.
with open("../dataset/veneto_events_geocoded_structured.json", "r", encoding="utf-8") as file:
    events_data = json.loads(file.read())
events = events_data.get("events", [])

# -------------------------
# Create collection if not exists
# -------------------------
# Embed one throwaway sentence purely to learn the dense vector size.
example_text = "Test for embedding dimension calculation."
example_emb = list(dense_embedding_model.passage_embed([example_text]))[0]
dense_dim = len(example_emb)

# The collection stores one named dense vector (cosine distance) and one
# named sparse vector per point; it is only created when it is missing.
if not client.collection_exists(COLLECTION_NAME):
    dense_params = models.VectorParams(size=dense_dim, distance=models.Distance.COSINE)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={dense_vector_name: dense_params},
        sparse_vectors_config={sparse_vector_name: models.SparseVectorParams()},
    )

# Create payload indexes (safe if repeats)
for field_name, field_schema in (
    ("id", "keyword"),
    ("location", "geo"),
    ("start_date", "datetime"),
    ("end_date", "datetime"),
):
    client.create_payload_index(COLLECTION_NAME, field_name, field_schema)

# -------------------------
# Insert / Update events in batches
# -------------------------
# Events are embedded and uploaded BATCH_SIZE at a time.
BATCH_SIZE = 32
inserted, updated, skipped_unchanged = 0, 0, 0

for start in tqdm(range(0, len(events), BATCH_SIZE)):
    batch = events[start : start + BATCH_SIZE]
    texts = [event.get("description", "") for event in batch]

    # Both lists line up with `batch`, so index i refers to the same event.
    dense_embeddings = list(dense_embedding_model.passage_embed(texts))
    sparse_embeddings = list(sparse_embedding_model.passage_embed(texts))

    points = []
    for i, event in enumerate(batch):
        event_id = event.get("id")
        if not event_id:
            continue
        text = texts[i]
        # The hash covers the description only; it decides whether a stored
        # point is considered unchanged.
        chunk_hash = calculate_hash(text)

        # Look for a point already stored under this event id.
        existing_points, _ = client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=models.Filter(
                must=[models.FieldCondition(key="id", match=models.MatchValue(value=event_id))]
            ),
            limit=1,
        )

        if existing_points:
            existing_point = existing_points[0]
            existing_hash = existing_point.payload.get("hash", "")
            if existing_hash == chunk_hash:
                skipped_unchanged += 1
                continue
            # Description changed: remove the stale point; the replacement
            # is uploaded below under a fresh random id.
            client.delete(
                collection_name=COLLECTION_NAME,
                points_selector=models.PointIdsList(points=[existing_point.id]),
            )
            updated += 1
        else:
            inserted += 1

        loc = event.get("location", {})
        latitude = loc.get("latitude")
        longitude = loc.get("longitude")
        loc_geo = {}
        # Events whose geocoding failed carry latitude/longitude = None.
        # Checking the values (not just key presence) keeps those Nones out
        # of the lat/lon fields read by the "location" geo index.
        if latitude is not None and longitude is not None:
            loc_geo = {"lat": latitude, "lon": longitude}

        location_payload = {**loc, **loc_geo}  # Merges original location dict with lat/lon keys

        payload = {**event, "location": location_payload, "hash": chunk_hash}

        points.append(
            models.PointStruct(
                id=str(uuid4()),
                vector={
                    dense_vector_name: dense_embeddings[i].tolist(),
                    sparse_vector_name: models.SparseVector(
                        indices=list(sparse_embeddings[i].indices),
                        values=list(sparse_embeddings[i].values),
                    ),
                },
                payload=payload,
            )
        )

    # A batch made up entirely of unchanged events has nothing to upload.
    if points:
        client.upsert(collection_name=COLLECTION_NAME, points=points, wait=True)

# -------------------------
# Final report
# -------------------------
# Summarise what the ingestion loop did, then show the collection state.
print("\n✅ Operation completed:")
for label, count in (
    ("Inserted", inserted),
    ("Updated", updated),
    ("Skipped (unchanged)", skipped_unchanged),
):
    print(f"- {label}: {count}")
print(f"Collection info: {client.get_collection(COLLECTION_NAME)}")


  dense_embedding_model = TextEmbedding(DENSE_MODEL_NAME)
100%|██████████| 16/16 [02:22<00:00,  8.93s/it]


✅ Operation completed:
- Inserted: 499
- Updated: 0
- Skipped (unchanged): 0
Collection info: status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=499 points_count=499 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors={'dense_vector': VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse_vector': SparseVectorParams(index=None, modifier=None)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold




### Shift event dates from August to September and renumber event IDs

In [1]:
import json
from datetime import datetime

# Load the JSON file. The geocoding step writes it as UTF-8 with
# ensure_ascii=False, so decode it explicitly instead of relying on the
# platform's default encoding.
with open('../dataset/veneto_events_geocoded_structured.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to move August timestamps one month forward (August -> September)
def change_month(date_str):
    """Shift an August timestamp forward by one month; leave others unchanged.

    Args:
        date_str: Timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    Returns:
        The timestamp in the same format.  August dates are moved forward
        by 31 days: Aug 1-30 land on the same day of September, and Aug 31
        lands on Oct 1 because September has no 31st day.

    Raises:
        ValueError: If ``date_str`` does not match the expected format.
    """
    from datetime import timedelta  # local: this cell only imports `datetime`

    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    if dt.month == 8:
        # Adding August's length, rather than replacing the month number,
        # cannot raise for Aug 31 and keeps August timestamps in their
        # original order.
        dt += timedelta(days=31)
    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")

# Update each event: shift its dates and renumber ids consecutively,
# starting at start_id.
# NOTE(review): "url" is left untouched, so it keeps the old number (the
# event renumbered to id 201 still points at .../veneto-events/1) - confirm
# that this is intended.
start_id = 201
for new_id, event in enumerate(data['events'], start=start_id):
    event['start_date'] = change_month(event['start_date'])
    event['end_date'] = change_month(event['end_date'])
    event['id'] = str(new_id)

# Write the updated JSON to a new file. UTF-8 with ensure_ascii=False
# matches how the other dataset files in this notebook are written and
# keeps accented characters readable.
with open('../dataset/veneto_events_upd_geocoded_structured.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

# If you want to print the updated json as well
print(json.dumps(data, indent=2))


{
  "events": [
    {
      "id": "201",
      "title": "Cavarzere Conference Event #1",
      "category": "Conference",
      "description": "Unwind and enjoy a delightful conference experience in Cavarzere at the picturesque Museo Civico. Relax and take in the atmosphere.",
      "city": "Cavarzere",
      "location": {
        "venue": "Museo Civico",
        "address": "Museo Civico, Cavarzere, Veneto",
        "latitude": 45.1360987,
        "longitude": 12.0812854
      },
      "start_date": "2025-09-28T13:47:00Z",
      "end_date": "2025-09-28T19:47:00Z",
      "url": "https://example.com/veneto-events/1"
    },
    {
      "id": "202",
      "title": "Mestre Tour Event #2",
      "category": "Tour",
      "description": "Unwind and enjoy a delightful tour experience in Mestre at the picturesque Centro Culturale Candiani. Relax and take in the atmosphere.",
      "city": "Mestre",
      "location": {
        "venue": "Centro Culturale Candiani",
        "address": "Centro Cultu