In [None]:
import datetime
import logging
import sqlite3
from pathlib import Path

import msgpack_numpy as msgpack
import pandas as pd
import plotly.express as px
import umap

from crossfilter.core.schema import SchemaColumns as C
from crossfilter.data_ingestion.lightroom import lightroom_parser
from crossfilter.data_ingestion.lightroom.ingest_lightroom_catalogs import (
    find_lightroom_catalogs,
    main,
    load_clip_embeddings_from_sqlite,
    compute_umap_projection,
)

logger = logging.getLogger(__name__)

In [None]:
def timestamp_str_to_tz(timestamp_str: str | None) -> pd.Timestamp | None:
    """Convert timestamp string to UTC, handling various timezone formats."""
    if pd.isna(timestamp_str) or timestamp_str is None:
        return pd.NaT

    try:
        # Parse with pandas, which handles ISO8601 formats including timezones
        parsed = pd.to_datetime(timestamp_str, format="ISO8601")
        return parsed.tz
    except (ValueError, TypeError, pd.errors.OutOfBoundsDatetime) as e:
        logger.warning(f"Failed to parse timestamp '{timestamp_str}': {e}")
        return pd.NaT

In [None]:
# Machine-specific location of the Lightroom catalog to inspect.
catalog_path = Path("/Users/thad/personal/lightroom/Lightroom Catalog-v13-3.lrcat")
# Parse the catalog with the project's Lightroom parser. The result is indexed
# by column name in later cells, so it is presumably a pandas DataFrame.
df = lightroom_parser.parse_lightroom_catalog(catalog_path=catalog_path)

In [None]:
# Inspect the raw timestamp column. Uses the schema constant (as the cell that
# builds the "tz" column does) rather than a hard-coded column-name string.
df[C.TIMESTAMP_MAYBE_TIMEZONE_AWARE]

In [None]:
# Add a "tz" column holding, for each row, the value timestamp_str_to_tz
# returns for that row's timestamp. Note: this mutates `df` in place.
df["tz"] = df[C.TIMESTAMP_MAYBE_TIMEZONE_AWARE].map(timestamp_str_to_tz)

In [None]:
# Count each distinct "tz" value (missing values included via dropna=False)
# and take the label at position 1 of the result's index. value_counts sorts
# by descending count by default, so this is the second most frequent value.
df["tz"].value_counts(dropna=False).index[1]

In [None]:
# Show the rows whose "tz" equals a fixed UTC-07:00 offset.
# timedelta(hours=-7) is the same value as the normalized form
# timedelta(days=-1, seconds=61200). `datetime` comes from the import cell.
df[df["tz"] == datetime.timezone(datetime.timedelta(hours=-7))]

In [None]:
# Machine-specific path to the SQLite file that is passed to
# load_clip_embeddings_from_sqlite below.
db_path = Path("/Users/thad/personal/lightroom_embedding_vectors.sqlite")

In [None]:
# with sqlite3.connect(db_path) as conn:
#         # Load embeddings table
#         df = pd.read_sql(
#             """SELECT * FROM embeddings WHERE type_index = "CLIP_HF_EMBEDDINGS" """, conn
#         )


# msgpack.unpackb(df["embedding_msgpack"].iloc[0])

In [None]:
# Load the CLIP embeddings from the SQLite database at db_path.
# NOTE(review): this rebinds `df`, replacing the catalog frame loaded earlier;
# re-running the earlier timestamp/tz cells after this one would operate on
# this object instead. Consider a distinct name.
df = load_clip_embeddings_from_sqlite(db_path)

# Preview the first rows of the loaded result.
df.head()

In [None]:
# Compute the UMAP projection for the loaded embeddings.
# NOTE(review): the plotting cell indexes this result with [0], so despite the
# name `new_df` it appears to be a tuple/sequence whose first item is the
# frame — confirm against compute_umap_projection.
new_df = compute_umap_projection(df)

In [None]:
# Display the raw return value of compute_umap_projection for inspection.
new_df

In [None]:
# Plot the haversine UMAP projection on an OpenStreetMap base layer.
# `lat` takes the latitude column and `lon` the longitude column; the two
# were previously swapped, which transposes every point on the map.
fig = px.scatter_map(
    new_df[0],  # first item of compute_umap_projection's result
    lat="CLIP_UMAP_HAVERSINE_LATITUDE",
    lon="CLIP_UMAP_HAVERSINE_LONGITUDE",
    map_style="open-street-map",
)

fig