##### Event Data Generator 🏟️

Generate synthetic event data for the Veneto region.

- Random cities, venues, and event types
- Realistic dates and descriptions
- Output: `veneto_events.json`

In [2]:
import json
import random
from datetime import datetime, timedelta

# Lookup table: place name -> list of venue names available in that place.
# The generator loop first draws a place uniformly from the keys and then a
# venue uniformly from that place's list, so every place is equally likely
# no matter how many venues it lists.
# NOTE(review): the table is meant to be Veneto-only, but a few entries look
# out of region or duplicated (e.g. "Borgo Valsugana"; "Scrovegni Chapel" vs
# "Cappella degli Scrovegni") — confirm before trusting geocoding results.
locations = {
    "Venice": [
        "Teatro La Fenice", "Piazza San Marco", "Lido di Venezia", "Murano Glass Museum",
        "Palazzo Ducale", "Gallerie dell'Accademia", "Rialto Bridge", "Ca' Rezzonico",
        "Basilica di San Marco", "Peggy Guggenheim Collection", "Isola di Burano",
        "Scuola Grande di San Rocco", "San Giorgio Maggiore", "Campo Santa Margherita",
        "Jewish Ghetto", "Palazzo Contarini del Bovolo", "Fondazione Querini Stampalia",
        "Ca' d'Oro", "Venetian Arsenal", "Scala Contarini del Bovolo", "Church of San Zaccaria",
        "Palazzo Mocenigo", "Palazzo Grimani", "Campo Santa Maria Formosa", "Sant’Elena Park",
        "Palazzo Querini Stampalia", "Isola di Torcello", "Ghetto Nuovo Synagogue"
    ],
    "Padua": [
        "Stadio Euganeo", "Prato della Valle", "Palazzo della Ragione", "Orto Botanico",
        "Cappella degli Scrovegni", "Basilica di Sant'Antonio", "Scrovegni Chapel",
        "Museo del Precinema", "Palazzo Zabarella", "Church of the Eremitani", "Palazzo Moroni",
        "Botanical Garden of Padua (UNESCO site)", "Civic Museum", "Piazza dei Frutti",
        "Piazza delle Erbe", "Cattedrale di Padova", "Museo Diocesano", "Museo Antoniano",
        "Museo Bottacin", "Palazzo Liviano", "Museo della Padova Ebraica", "Abbazia di Santa Giustina"
    ],
    "Verona": [
        "Arena di Verona", "Verona Exhibition Center", "Piazza Bra", "Juliet's House",
        "Castelvecchio", "Giardino Giusti", "Torre dei Lamberti", "Basilica of San Zeno",
        "Ponte Pietra", "Palazzo Barbieri", "Santa Anastasia Church", "Palazzo della Gran Guardia",
        "Roman Theatre", "Museum of Castelvecchio", "Ponte Scaligero", "Piazza delle Erbe",
        "Piazza dei Signori", "Giardino dell’Arnesa", "Arche Scaligere", "Chiesa di San Fermo Maggiore",
        "Chiesa di San Bernardino", "Palazzo Maffei", "Palazzo Canossa", "Museo Lapidario Maffeiano"
    ],
    "Vicenza": [
        "Piazza dei Signori", "Teatro Olimpico", "Villa La Rotonda", "Basilica Palladiana",
        "Museo Civico di Palazzo Chiericati", "Giardino Salvi", "Palazzo Thiene",
        "Villa Valmarana ai Nani", "Porta Castello", "Santa Corona Church", "Palazzo Chiericati Museum",
        "Palazzo Barbaran da Porto", "Villa Caldogno", "Palazzo Porto", "San Lorenzo Church",
        "Villa Trissino", "Villa Almerico Capra 'La Rotonda'", "Villa Bissari", "Rotonda di Vicenza",
        "Ponte San Michele", "Museo Naturalistico Archeologico", "Palazzo Bonin Longare"
    ],
    "Treviso": [
        "Piazza dei Signori", "Teatro Mario Del Monaco", "Fontane di Treviso", "Villa Revedin",
        "Museo di Santa Caterina", "Isola della Pescheria", "Duomo di Treviso",
        "Museo Bailo", "Porta San Tommaso", "Palazzo dei Trecento", "Parco degli Alberi Parlanti",
        "Villa Pisani", "Church of San Francesco", "Palazzo Bomben", "Villa Giovannelli Colonna",
        "Fontana delle Tette", "Villa Margherita", "Museo Luigi Bailo", "Chiesa di San Nicolò",
        "Villa Corner"
    ],
    "Belluno": [
        "Piazza dei Martiri", "Palazzo dei Rettori", "Museo Civico", "Basilica Cattedrale di San Martino",
        "Palazzo Fulcis", "Porta Dojona", "Chiesa di Santo Stefano", "Parco Nazionale Dolomiti Bellunesi",
        "Ponte della Vittoria", "Torre Civica"
    ],
    "Rovigo": [
        "Piazza Vittorio Emanuele II", "Palazzo Roverella", "Tempio della Beata Vergine del Soccorso",
        "Accademia dei Concordi", "Museo dei Grandi Fiumi", "Palazzo Roncale", "Torre Donà",
        "Chiesa della Beata Vergine del Soccorso", "Teatro Sociale", "Castello di Rovigo"
    ],
    "Chioggia": [
        "Cattedrale di Santa Maria Assunta", "Museo Civico della Laguna Sud", "Corso del Popolo",
        "Torre dell'Orologio", "Mercato del Pesce", "Isola di Pellestrina", "Sottomarina Beach",
        "Ponte Vigo", "Chiesa di San Domenico", "Laguna del Lusenzo"
    ],
    "Mestre": [
        "Parco San Giuliano", "Piazza Ferretto", "Church of San Lorenzo", "Parco Albanese",
        "Villa Querini", "Forte Marghera", "Centro Culturale Candiani", "Torre dell'Orologio",
        "Teatro Toniolo", "Galleria Matteotti"
    ],
    "Jesolo": [
        "Jesolo Beach", "Sea Life Aquarium", "Piazza Mazzini", "Pineta Beach",
        "Aqualandia Waterpark", "Parco Zoo Punta Verde", "Tropicarium Park", "Piazza Brescia",
        "Laguna del Mort", "Museo Civico di Jesolo"
    ],
    "Bassano del Grappa": [
        "Ponte degli Alpini", "Museo Civico", "Piazza Garibaldi", "Castello degli Ezzelini",
        "Villa Angarano", "Museo della Grappa", "Chiesa di San Giovanni Battista",
        "Palazzo Agostinelli", "Villa Recoaro", "Teatro Remondini"
    ],
    "Asolo": [
        "Castello della Regina", "Villa Freya", "Piazza Garibaldi", "Rocca di Asolo",
        "Museo Civico", "Cathedral of Asolo", "Villa Barbaro", "Church of Santa Caterina",
        "Palazzo della Ragione", "Villa Maser"
    ],
    "Cortina d'Ampezzo": [
        "Funivia Faloria", "Basilica Minore dei Santi Filippo e Giacomo", "Museo d'Arte Moderna Mario Rimoldi",
        "Lago di Pianozes", "Tofana di Mezzo", "Stadio Olimpico del Ghiaccio", "Passo Giau",
        "Monte Cristallo", "Golf Club Cortina", "Parco Naturale delle Dolomiti d'Ampezzo"
    ],
    "Caorle": [
        "Duomo di Santo Stefano", "Santuario della Madonna dell'Angelo", "Spiaggia di Levante",
        "Spiaggia di Ponente", "Scogliera Viva", "Museo Liturgico", "Centro Storico",
        "Luna Park", "Parco del Pescatore", "Porto Peschereccio"
    ],
    "Conegliano": [
        "Castello di Conegliano", "Duomo di Conegliano", "Museo Civico", "Piazza Cima",
        "Teatro Accademia", "Via XX Settembre", "Chiesa di San Pio X", "Parco Mozart",
        "Villa Gera", "Museo degli Alpini"
    ],
    "Adria": [
        "Museo Archeologico Nazionale", "Cattedrale dei Santi Pietro e Paolo", "Teatro Comunale",
        "Piazza Garibaldi", "Chiesa di Santa Maria Assunta della Tomba", "Museo della Cattedrale",
        "Villa Mecenati", "Ponte Castello", "Parco delle Rimembranze", "Palazzo Bocchi"
    ],
    "Este": [
        "Castello di Este", "Roman Amphitheatre ruins", "Duomo di Santa Tecla", "Museo Nazionale Atestino",
        "Piazza Maggiore", "Villa Contarini", "Basilica di Sant'Andrea", "Giardini del Castello",
        "Porta Vecchia", "Chiesa di San Martino"
    ],
    "Cittadella": [
        "City Walls of Cittadella", "Piazza Pierobon", "Torre di Malta", "Palazzo Pretorio",
        "Church of San Donato", "Museo della Città", "Teatro Sociale", "Porta Bassano",
        "Porta Padova", "Palazzo della Loggia"
    ],
    "Arquà Petrarca": [
        "Casa del Petrarca", "Chiesa di Santa Maria Assunta", "Piazza Petrarca",
        "Medieval village center", "Villa Pisani", "Parco della Rimembranza", "Oratorio della Santissima Trinità",
        "Fontana Petrarca", "Museo Petrarchesco", "Monte Ventolone"
    ],
    "Borgo Valsugana": [
        "Castel Telvana", "Ponte Vecchio", "Chiesa di San Rocco", "Museo della Grande Guerra",
        "Parco della Pace", "Palazzo Ceschi", "Via Roma", "Villa Angeli", "Piazza Degasperi", "Museo degli Spaventapasseri"
    ],
    "Feltre": [
        "Castello di Feltre", "Piazza Maggiore", "Cattedrale di San Pietro Apostolo",
        "Palazzo della Magnifica Comunità", "Museo Civico", "Teatro de la Sena", "Porta Imperiale",
        "Chiesa di San Giacomo", "Parco della Birreria", "Villa Pasole"
    ],
    "San Donà di Piave": [
        "Cittadella Medicea", "Piazza Indipendenza", "Church of San Giovanni",
        "Municipal Library", "Villa Ancillotto", "Museo della Bonifica", "Parco Fluviale"
    ],
    "Dolo": [
        "Villa Widmann", "Canale di Dolo", "Ponte di Dolo", "Chiesa di San Pio X",
        "Villa Jacur", "Piazza Cantiere", "Museo del Naviglio"
    ],
    "Monselice": [
        "Rocca di Monselice", "Villa Duodo", "Santuario delle Sette Chiese", "Piazza Mazzini",
        "Museo Rocca", "Castello di Monselice", "Chiesa di San Paolo"
    ],
    "Portogruaro": [
        "Duomo di Portogruaro", "Palazzo degli Asburgo", "Museo Nazionale Concordiese",
        "Piazza della Repubblica", "Chiesa di Sant'Andrea", "Mulini di Sant'Andrea", "Villa Comunale"
    ],
    "San Vito di Cadore": [
        "Chiesa di San Vito", "Museo Ladino Cadorino", "Lago di Antorno",
        "Piani di Pezzè", "Monte Antelao", "Parco NeveSole", "Piazza Medaglie d'Oro"
    ],
    "Campolongo Maggiore": [
        "Villa Manin", "Chiesa di San Giovanni Battista", "Parco delle Risorgive",
        "Villa Venier Contarini", "Piazza Municipio", "Museo della Civiltà Contadina"
    ],
    "Pedavena": [
        "Birrificio Pedavena", "Villa Pasolini dall’Onda", "Piazza Martiri",
        "Chiesa di San Antonio", "Parco della Birreria", "Museo Civico di Pedavena"
    ],
    "Marostica": [
        "Castello Superiore", "Castello Inferiore", "Piazza degli Scacchi", "Museo dei Costumi",
        "Chiesa di Sant'Antonio Abate", "Parco della Rimembranza", "Porta Vicentina"
    ],
    "Valeggio sul Mincio": [
        "Parco Giardino Sigurtà", "Borghetto sul Mincio", "Castello Scaligero",
        "Ponte Visconteo", "Chiesa di San Marco Evangelista", "Museo della Pesca"
    ],
    "Abano Terme": [
        "Parco Urbano Termale", "Museo Villa Bassi Rathgeb", "Duomo di San Lorenzo",
        "Montirone Park", "Piazza del Sole e della Pace", "Chiesa del Sacro Cuore"
    ],
    "Lazise": [
        "Castello Scaligero", "Lungolago Marconi", "Chiesa di San Nicolò", "Piazza Vittorio Emanuele II",
        "Museo del Castello", "Villa Pergolana"
    ],
    "Peschiera del Garda": [
        "Fortezza di Peschiera", "Santuario della Madonna del Frassino", "Museo della Pesca",
        "Porta Verona", "Piazza Ferdinando di Savoia", "Parco Catullo"
    ],
    "Malcesine": [
        "Castello Scaligero di Malcesine", "Funivia Malcesine-Monte Baldo", "Palazzo dei Capitani",
        "Chiesa di Santo Stefano", "Museo del Lago", "Piazza Statuto"
    ],
    "Cavarzere": [
        "Duomo di San Mauro", "Museo Civico", "Piazza Vittorio Emanuele II", "Villa Zennaro",
        "Chiesa di San Bartolomeo", "Parco della Rimembranza"
    ],
    "Noale": [
        "Rocca dei Tempesta", "Duomo di Noale", "Piazza Castello", "Museo Civico di Noale",
        "Chiesa di San Giovanni Battista", "Parco della Bujega"
    ],
    "Mirano": [
        "Villa Morosini XXV Aprile", "Piazza Martiri della Libertà", "Duomo di San Michele Arcangelo",
        "Parco Belvedere", "Museo del Paesaggio", "Villa Erizzo"
    ],
    "Spinea": [
        "Villa Simion", "Parco Nuove Gemme", "Chiesa di San Vito e Modesto", "Piazza Municipio",
        "Museo della Civiltà Contadina", "Villa Loredan"
    ],
    "Mogliano Veneto": [
        "Villa Condulmer", "Piazza Caduti", "Chiesa di Santa Maria Assunta", "Parco Arcobaleno",
        "Museo della Bonifica", "Villa Stucky"
    ],
    "Castelfranco Veneto": [
        "Castello di Castelfranco", "Duomo di Castelfranco", "Teatro Accademico", "Piazza Giorgione",
        "Museo Casa Giorgione", "Villa Revedin Bolasco"
    ],
    "San Bonifacio": [
        "Abbazia di Villanova", "Duomo di San Bonifacio", "Piazza Costituzione", "Museo Civico",
        "Villa Gritti", "Parco della Rimembranza"
    ],
    "Legnago": [
        "Teatro Salieri", "Museo Fioroni", "Duomo di San Martino", "Piazza Garibaldi",
        "Castello di Legnago", "Parco Comunale"
    ],
    "Schio": [
        "Duomo di San Pietro", "Fabbrica Alta", "Parco della Fabbrica Alta", "Museo Civico di Schio",
        "Piazza Alessandro Rossi", "Villa Rossi"
    ],
    "Valdagno": [
        "Villa Valle", "Parco delle Traine", "Roman Villa of Valdagno", "Church of San Clemente",
        "Villa Cerchiari", "Museo Civico di Valdagno"
    ],
    "Vittorio Veneto": [
        "Piazza del Popolo", "Villa Papadopoli", "Duomo di San Tiziano", "Museo della Battaglia",
        "Teatro Da Ponte", "Parco della Vittoria"
    ],
    "Bardolino": [
        "Lake Garda promenade", "Church of San Zeno", "Olive Oil Museum", "Piazza Matteotti",
        "Archaeological Museum", "Parco Baia delle Sirene"
    ],
}

# Closed set of categories; the generator draws one per event and stores it
# as the event's "category" (and lower-cased inside the description text).
event_types = [
    "Music", "Sport", "Food & Drink", "Arts & Crafts", "Theatre", "Tour", "Workshop",
    "Festival", "Conference", "Exhibition"
]

# Prefix for each event's "url" field; the numeric event id is appended to it.
base_url = "https://example.com/veneto-events/"

def random_date(now=None, end_date_limit=None):
    """Return a random ``(start, end)`` pair of ISO-8601 timestamps ending in 'Z'.

    The start falls on a random day between ``now`` and ``end_date_limit``
    with a random time of day (so on the first day it may be earlier than
    ``now``); the event lasts 1-10 hours. Both ends are clamped to
    ``end_date_limit`` so that ``start <= end <= end_date_limit`` always holds.

    Args:
        now: Naive datetime, interpreted as UTC, used as the lower bound.
            Defaults to the current UTC wall-clock time.
        end_date_limit: Naive datetime, interpreted as UTC, that no event may
            exceed. Defaults to 2025-08-30 00:00.

    Returns:
        A ``(start, end)`` tuple of strings formatted ``YYYY-MM-DDTHH:MM:SSZ``,
        or ``(None, None)`` when ``now`` is already past ``end_date_limit``.
    """
    if now is None:
        # Local import: the file-level import only provides datetime/timedelta.
        from datetime import timezone

        # The output is suffixed with 'Z', so the clock must really be UTC;
        # tzinfo is dropped again to stay comparable with the naive limit.
        now = datetime.now(timezone.utc).replace(tzinfo=None)
    if end_date_limit is None:
        end_date_limit = datetime(2025, 8, 30)

    days_difference = (end_date_limit - now).days
    if days_difference < 0:
        return None, None

    random_days = random.randint(0, days_difference)
    random_hours = random.randint(0, 23)
    random_minutes = random.randint(0, 59)
    start_date = (now + timedelta(days=random_days)).replace(
        hour=random_hours, minute=random_minutes, second=0, microsecond=0
    )
    # The random time of day can push the start past the limit when the draw
    # lands on the limit's own date; clamp so that end can never precede start.
    start_date = min(start_date, end_date_limit)

    duration_hours = random.randint(1, 10)
    end_date = min(start_date + timedelta(hours=duration_hours), end_date_limit)

    return (
        start_date.isoformat(timespec='seconds') + 'Z',
        end_date.isoformat(timespec='seconds') + 'Z',
    )

def generate_random_description(event_type, city, venue):
    """Return one randomly chosen marketing blurb for an event.

    Args:
        event_type: Category name; it appears lower-cased in the text.
        city: Name of the hosting city, inserted verbatim.
        venue: Name of the venue, inserted verbatim.

    Returns:
        One of ten template sentences filled in with the given values.
    """
    kind = event_type.lower()
    blurbs = (
        f"Prepare for an unforgettable {kind} experience in the stunning city of {city}, held at the magnificent {venue}. This event promises to be a highlight of the season!",
        f"Dive into the world of {kind} at this exciting gathering in {city}. Located at the renowned {venue}, it's an event you won't want to miss.",
        f"Explore the vibrant {kind} scene in {city} with this special event at {venue}. Get ready for a day filled with discovery and enjoyment.",
        f"A unique {kind} opportunity awaits you in {city}. Join us at the charming {venue} for an event designed to inspire and entertain.",
        f"Immerse yourself in the rich culture of {city} with this engaging {kind} event at the iconic {venue}. It's the perfect way to spend your time.",
        f"Seeking adventure in {city}? Look no further than this captivating {kind} event at {venue}. Fun and excitement are guaranteed!",
        f"Connect with fellow enthusiasts at this lively {kind} event in {city}, taking place at the welcoming {venue}. Share your passion and make new friends.",
        f"Unwind and enjoy a delightful {kind} experience in {city} at the picturesque {venue}. Relax and take in the atmosphere.",
        f"Expand your horizons with this insightful {kind} event in {city}, hosted at the distinguished {venue}. Learn something new and be inspired.",
        f"Step into a world of wonder at this extraordinary {kind} event in {city}, held at the historic {venue}. Prepare to be amazed!",
    )
    return random.choice(blurbs)

events = []

# The set of cities never changes, so build the candidate list once instead
# of on every iteration.
cities = list(locations.keys())

# range(1, 500) yields ids 1..499, i.e. at most 499 events.
for i in range(1, 500):
    city = random.choice(cities)
    venue = random.choice(locations[city])
    event_type = random.choice(event_types)
    start, end = random_date()
    if not (start and end):
        # random_date() signals "no valid date window" with (None, None).
        continue
    events.append({
        "id": str(i),
        "title": f"{city} {event_type} Event #{i}",
        "category": event_type,
        "description": generate_random_description(event_type, city, venue),
        "city": city,
        "location": {
            "venue": venue,
            "address": f"{venue}, {city}, Veneto"
        },
        "start_date": start,
        "end_date": end,
        "url": f"{base_url}{i}"
    })

if not events:
    # Make the empty result visible instead of silently writing an empty dataset.
    print("Warning: no events were generated because random_date() returned no dates "
          "(its date window has probably already passed)")

output = {
    "events": events
}

with open("../dataset/veneto_events.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print(f"Generated veneto_events.json with {len(events)} events")


Generated veneto_events.json with 499 events


#### Geocoding Veneto Events 🌍

Geocode synthetic event data for Veneto using the OpenStreetMap Nominatim API.

**Workflow:**
- Load events from `veneto_events.json`
- Clean addresses (venue, city, region, country)
- Geocode each event (with fallback strategies)
- Rate limit to respect API usage
- Add coordinates to each event
- Save results to `veneto_events_geocoded_structured.json`

In [None]:
import json
import asyncio
import httpx
from tqdm.asyncio import tqdm_asyncio
import threading

# Guards the increment-and-print of success_counter inside geocode_event.
counter_lock = threading.Lock()
# Running total of events that received coordinates; it is module-level state,
# so it is not reset when the geocoding cell is run again.
success_counter = 0  # Global counter variable

async def async_geocode_structured(venue, city, region="Veneto", country="Italy"):
    """Geocode a venue through Nominatim's structured search, with fallbacks.

    Queries are tried in order until one returns a result: venue+city+region,
    city+region only, venue+city, venue+region. A one-second pause follows
    every request, successful or not, to pace traffic to the public API.

    Args:
        venue: Venue name, sent as the ``street`` field.
        city: City name.
        region: Sent as the ``state`` field.
        country: Country name.

    Returns:
        ``(lat, lon)`` as floats for the first query that yields a result, or
        ``(None, None)`` if every query fails or comes back empty.
    """
    base_url = "https://nominatim.openstreetmap.org/search"
    headers = {'User-Agent': 'convert_to_geo/1.0'}

    # NOTE(review): Nominatim documents `street` as "housenumber streetname" and
    # offers a separate `amenity` field for POI names — confirm whether venue
    # names should be sent as `amenity` instead.
    # NOTE(review): the city-only query runs second, so a city centroid is
    # returned before the remaining venue-based queries are tried — confirm
    # this ordering is intended.
    params_list = [
        {'street': venue, 'city': city, 'state': region, 'country': country, 'format': 'json', 'limit': 1},
        {'city': city, 'state': region, 'country': country, 'format': 'json', 'limit': 1},
        {'street': venue, 'city': city, 'country': country, 'format': 'json', 'limit': 1},
        {'street': venue, 'state': region, 'country': country, 'format': 'json', 'limit': 1}
    ]

    async with httpx.AsyncClient() as client_http:
        for params in params_list:
            coords = None
            try:
                response = await client_http.get(base_url, params=params, headers=headers, timeout=10)
                response.raise_for_status()
                data = response.json()
                if data:
                    coords = float(data[0]['lat']), float(data[0]['lon'])
            except (httpx.HTTPError, ValueError, KeyError, IndexError, TypeError):
                # Best effort: a transport/HTTP error or a malformed payload
                # just means "no result from this query"; try the next one.
                coords = None

            # Pause after every request, including successful ones, so that
            # consecutive lookups do not hit the API back to back.
            await asyncio.sleep(1)

            if coords is not None:
                return coords

    return None, None


async def process_events_geocoding(events):
    """Geocode every event in place, adding latitude/longitude to its location.

    Each event's ``location`` dict receives ``latitude`` and ``longitude``
    keys; both are ``None`` when the venue or city is missing or when
    geocoding returns no result. Progress is shown through tqdm.

    Args:
        events: List of event dicts; mutated in place.
    """
    # One request in flight at a time: the stated goal is to respect the
    # geocoding API's rate limits, which parallel requests work against.
    semaphore = asyncio.Semaphore(1)

    async def geocode_event(event):
        global success_counter

        # Make sure there is a dict to write the coordinates into, even for
        # events that arrive without a usable 'location' entry.
        location = event.get('location')
        if not isinstance(location, dict):
            location = event['location'] = {}

        # `or ''` tolerates explicit null values in the input JSON.
        venue = (location.get('venue') or '').strip()
        city = (event.get('city') or '').strip()

        lat = lon = None
        if venue and city:
            async with semaphore:
                lat, lon = await async_geocode_structured(venue, city)
            if lat is not None and lon is not None:
                with counter_lock:
                    success_counter += 1
                    print(f"Geocoding success {success_counter}")

        location['latitude'] = lat
        location['longitude'] = lon

    await tqdm_asyncio.gather(*(geocode_event(event) for event in events))


async def main():
    """Read the generated events, geocode them in place, and write the result.

    Input and output are JSON files under ``../dataset``; the output keeps the
    input structure with the coordinates added by process_events_geocoding.
    """
    source_file = '../dataset/veneto_events.json'
    target_file = '../dataset/veneto_events_geocoded_structured.json'

    with open(source_file, 'r', encoding='utf-8') as src:
        payload = json.load(src)

    # Mutates the event dicts inside `payload`.
    await process_events_geocoding(payload['events'])

    with open(target_file, 'w', encoding='utf-8') as dst:
        json.dump(payload, dst, ensure_ascii=False, indent=2)

    print("Geocoding complete and saved to veneto_events_geocoded_structured.json")

# In a Jupyter notebook cell the coroutine is awaited directly at top level:
await main()

# Or if top-level await is not supported:
# import asyncio
# asyncio.run(main())


  0%|          | 1/499 [00:02<19:39,  2.37s/it]

Geocoding success 1


  0%|          | 2/499 [00:03<12:43,  1.54s/it]

Geocoding success 2


  1%|          | 3/499 [00:05<14:41,  1.78s/it]

Geocoding success 3


  1%|          | 4/499 [00:06<11:59,  1.45s/it]

Geocoding success 4


  1%|          | 5/499 [00:08<13:56,  1.69s/it]

Geocoding success 5


  1%|          | 6/499 [00:09<11:32,  1.40s/it]

Geocoding success 6


  1%|▏         | 7/499 [00:12<15:49,  1.93s/it]

Geocoding success 7


  2%|▏         | 8/499 [00:14<16:17,  1.99s/it]

Geocoding success 8


  2%|▏         | 9/499 [00:15<13:34,  1.66s/it]

Geocoding success 9


  2%|▏         | 9/499 [00:16<14:49,  1.82s/it]


CancelledError: 

Geocoding success 10
Geocoding success 11
Geocoding success 12
Geocoding success 13
Geocoding success 14
Geocoding success 15
Geocoding success 16
Geocoding success 17
Geocoding success 18
Geocoding success 19
Geocoding success 20
Geocoding success 21
Geocoding success 22
Geocoding success 23
Geocoding success 24
Geocoding success 25
Geocoding success 26
Geocoding success 27
Geocoding success 28
Geocoding success 29
Geocoding success 30
Geocoding success 31
Geocoding success 32
Geocoding success 33
Geocoding success 34
Geocoding success 35
Geocoding success 36
Geocoding success 37
Geocoding success 38
Geocoding success 39
Geocoding success 40
Geocoding success 41
Geocoding success 42
Geocoding success 43
Geocoding success 44
Geocoding success 45
Geocoding success 46
Geocoding success 47
Geocoding success 48
Geocoding success 49
Geocoding success 50
Geocoding success 51
Geocoding success 52
Geocoding success 53
Geocoding success 54
Geocoding success 55
Geocoding success 56
Geocoding suc

### Check the supported fastembed models

In [None]:
from fastembed import TextEmbedding

# Ask fastembed for its supported text-embedding models; the code below
# treats each entry as a dict with a 'model' key.
models_info = TextEmbedding.list_supported_models()

# Keep only the model identifiers.
model_names = [model['model'] for model in models_info]

# Last expression of the cell, so the notebook displays the list.
model_names


In [None]:
%pip install qdrant_client --upgrade
%pip install qdrant-client[fastembed] --upgrade
%pip install ipywidgets --upgrade
%pip install fastembed huggingface_hub --upgrade



### Qdrant Vector ingestion

This section uploads events into a Qdrant vector database.

Steps:
- Load environment variables from `.env`
- Load geocoded events from `veneto_events_geocoded_structured.json`
- Check or create the `veneto_events` collection in Qdrant 
- Upsert all event points to the Qdrant collection

#### Embedding on description

In [None]:
from qdrant_client import QdrantClient, models
import os
from dotenv import load_dotenv
import json
from tqdm import tqdm
from uuid import uuid4
import hashlib
from fastembed import TextEmbedding, SparseTextEmbedding

# -------------------------
# Load environment variables
# -------------------------
load_dotenv(dotenv_path="../.env")
QDRANT_SERVER = os.getenv("QDRANT_SERVER")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

DENSE_MODEL_NAME = os.getenv("DENSE_MODEL_NAME")
SPARSE_MODEL_NAME = os.getenv("SPARSE_MODEL_NAME")

if not QDRANT_SERVER or not QDRANT_API_KEY:
    raise ValueError("QDRANT_SERVER or QDRANT_API_KEY not defined in .env file")

# Fail fast with a clear message rather than handing None to the embedding
# constructors below.
if not DENSE_MODEL_NAME or not SPARSE_MODEL_NAME:
    raise ValueError("DENSE_MODEL_NAME or SPARSE_MODEL_NAME not defined in .env file")

# -------------------------
# Initialize clients
# -------------------------
dense_embedding_model = TextEmbedding(DENSE_MODEL_NAME)
sparse_embedding_model = SparseTextEmbedding(SPARSE_MODEL_NAME)

# Names of the two named vectors stored on every point in the collection.
dense_vector_name = "dense_vector"
sparse_vector_name = "sparse_vector"
COLLECTION_NAME = "veneto_events"

# NOTE(review): the timeout is very large (presumably seconds) — confirm it is intended.
client = QdrantClient(url=QDRANT_SERVER, api_key=QDRANT_API_KEY, timeout=200000)

def verify_qdrant_connection(client):
    """Report whether the Qdrant server answers a collections listing.

    Args:
        client: Object exposing ``get_collections()``.

    Returns:
        True when the call succeeds; False when it raises, in which case the
        error is printed rather than propagated.
    """
    try:
        client.get_collections()
    except Exception as err:
        print(f"Qdrant connection error: {err}")
        return False
    else:
        return True

# Abort before any dataset or collection work if the server is unreachable.
if not verify_qdrant_connection(client):
    raise ConnectionError("Failed to connect to Qdrant server")

# -------------------------
# Utility Functions
# -------------------------
def calculate_hash(text: str) -> str:
    """Return the hex-encoded SHA-256 digest of *text* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# -------------------------
# Load dataset
# -------------------------
with open("../dataset/veneto_events_geocoded_structured.json", "r", encoding="utf-8") as file:
    events_data = json.load(file)
events = events_data.get("events", [])

# -------------------------
# Create collection if not exists
# -------------------------
# Embed one throwaway sentence to discover the dense vector size.
example_text = "Test for embedding dimension calculation."
[example_emb] = dense_embedding_model.passage_embed([example_text])
dense_dim = len(example_emb)

if not client.collection_exists(COLLECTION_NAME):
    dense_params = models.VectorParams(size=dense_dim, distance=models.Distance.COSINE)
    sparse_params = models.SparseVectorParams()
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={dense_vector_name: dense_params},
        sparse_vectors_config={sparse_vector_name: sparse_params},
    )

# Create payload indexes (safe if repeats)
for field_name, field_schema in (
    ("id", "keyword"),
    ("location", "geo"),
    ("start_date", "datetime"),
    ("end_date", "datetime"),
):
    client.create_payload_index(COLLECTION_NAME, field_name, field_schema)

# -------------------------
# Insert / Update events in batches
# -------------------------
BATCH_SIZE = 32
inserted, updated, skipped_unchanged = 0, 0, 0

for start in tqdm(range(0, len(events), BATCH_SIZE)):
    batch = events[start : start + BATCH_SIZE]

    # Pass 1: decide which events need (re)indexing *before* computing any
    # embeddings, so unchanged events cost only a lookup.
    pending = []  # (event, text, chunk_hash, point_id)
    for event in batch:
        event_id = event.get("id")
        if not event_id:
            # Without an id the event cannot be matched against stored points.
            continue
        text = event.get("description", "")
        # NOTE(review): the hash covers only the description, so changes to
        # other fields (dates, location) are not detected — confirm intended.
        chunk_hash = calculate_hash(text)

        existing_points, _ = client.scroll(
            collection_name=COLLECTION_NAME,
            scroll_filter=models.Filter(
                must=[models.FieldCondition(key="id", match=models.MatchValue(value=event_id))]
            ),
            limit=1,
        )

        if existing_points:
            existing_point = existing_points[0]
            if existing_point.payload.get("hash", "") == chunk_hash:
                skipped_unchanged += 1
                continue
            # Reuse the stored point id: the upsert below then overwrites the
            # point in place, with no window where it is deleted but not yet
            # re-inserted.
            point_id = existing_point.id
            updated += 1
        else:
            point_id = str(uuid4())
            inserted += 1

        pending.append((event, text, chunk_hash, point_id))

    if not pending:
        continue

    # Pass 2: embed only the texts that actually changed or are new.
    texts = [text for _, text, _, _ in pending]
    dense_embeddings = list(dense_embedding_model.passage_embed(texts))
    sparse_embeddings = list(sparse_embedding_model.passage_embed(texts))

    points = []
    for (event, _, chunk_hash, point_id), dense, sparse in zip(
        pending, dense_embeddings, sparse_embeddings
    ):
        loc = event.get("location", {})
        loc_geo = {}
        lat, lon = loc.get("latitude"), loc.get("longitude")
        # Only add the lat/lon pair used by the "location" geo index when
        # geocoding produced real coordinates (failed lookups store None).
        if lat is not None and lon is not None:
            loc_geo = {"lat": lat, "lon": lon}

        location_payload = {**loc, **loc_geo}  # Merges original location dict with lat/lon keys

        payload = {**event, "location": location_payload, "hash": chunk_hash}

        points.append(
            models.PointStruct(
                id=point_id,
                vector={
                    dense_vector_name: dense.tolist(),
                    sparse_vector_name: models.SparseVector(
                        indices=list(sparse.indices),
                        values=list(sparse.values),
                    ),
                },
                payload=payload,
            )
        )

    client.upsert(collection_name=COLLECTION_NAME, points=points, wait=True)

# -------------------------
# Final report
# -------------------------
print("\n✅ Operation completed:")
print(f"- Inserted: {inserted}")
print(f"- Updated: {updated}")
print(f"- Skipped (unchanged): {skipped_unchanged}")
print(f"Collection info: {client.get_collection(COLLECTION_NAME)}")


  dense_embedding_model = TextEmbedding(DENSE_MODEL_NAME)
100%|██████████| 16/16 [01:42<00:00,  6.39s/it]


✅ Operation completed:
- Inserted: 499
- Updated: 0
- Skipped (unchanged): 0
Collection info: status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=501 points_count=501 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors={'dense_vector': VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'sparse_vector': SparseVectorParams(index=None, modifier=None)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold


