In [2]:
# %% [markdown]
# # Explore and Merge LAP Coffee Datasets (with PM2.5 and Parks)
# 
# This notebook will:
# 1. Load all GeoPackages
# 2. Inspect structure and date ranges
# 3. Merge daily and static datasets (including PM2.5 and parks)
# 4. Keep only nearest park per address
# 5. Ensure full daily coverage from 2025-01-01
# 6. Deduplicate rating/user_ratings_total/place_id columns
# 7. Save final CSV for dbt

# %% [markdown]
# ## 1️⃣ Import libraries
import geopandas as gpd
import pandas as pd
from pathlib import Path

# %% [markdown]
# ## 2️⃣ Define file paths
# Directory holding every processed GeoPackage read by this notebook.
data_dir = Path("/Users/tolgasabanoglu/Desktop/github/which-lap-coffee-should-i-visit/data/processed")

# Dataset key -> GeoPackage file name inside `data_dir`.
# NOTE(review): an earlier comment on the "open_bars" entry referred to
# 'lap_locations_with_all_bars.gpkg', but the file name actually used is
# 'lap_locations_with_open_bars.gpkg' — confirm which file is current.
_gpkg_file_names = {
    "airquality": "lap_locations_pm25_daily.gpkg",  # PM2.5
    "elevation": "lap_locations_elevation.gpkg",
    "weather": "lap_locations_historical_weather.gpkg",
    "ndvi": "lap_locations_ndvi_daily.gpkg",
    "nightlights": "lap_locations_nightlights_daily.gpkg",
    "open_bars": "lap_locations_with_open_bars.gpkg",
    "parks": "lap_locations_with_park_counts.gpkg",  # Parks
}

# Resolve each file name against `data_dir`, keeping the key order above.
gpkg_files = {
    dataset_key: data_dir / file_name
    for dataset_key, file_name in _gpkg_file_names.items()
}

# %% [markdown]
# ## 3️⃣ Load GeoPackages
# Read the "lap_coffee" layer of every configured GeoPackage into `gdfs`,
# keyed by dataset name. Loading is best-effort: a file that cannot be read is
# reported, left out of `gdfs`, and removed from `gpkg_files`.
gdfs = {}
# Iterate over a snapshot of the items: `gpkg_files` is pruned inside the loop,
# and deleting from a dict while iterating its live view raises
# "RuntimeError: dictionary changed size during iteration", which would abort
# the whole load on the first unreadable file.
for name, path in list(gpkg_files.items()):
    print(f"Loading {name} from {path} ...")
    try:
        gdfs[name] = gpd.read_file(path, layer="lap_coffee")
        print(f"{name}: {gdfs[name].shape[0]} rows, columns: {list(gdfs[name].columns)}\n")
    except Exception as e:
        print(f"❌ ERROR: Could not load {name} from {path}. Skipping this file. Error: {e}\n")
        # Drop the failed entry so `gpkg_files` only lists datasets that loaded.
        # Later steps guard on `name in gdfs`, which never contains a failed key.
        del gpkg_files[name]


# %% [markdown]
# ## 4️⃣ Check date ranges
# Report the min/max of each date-like column present in every loaded dataset.
# Each entry is (label printed, column inspected); a dataset may have neither.
_date_range_reports = (
    ("Date range:", "date"),
    ("Weather date range:", "weather_date"),
)
for name, gdf in gdfs.items():
    print(f"### {name}")
    for report_label, date_column in _date_range_reports:
        if date_column not in gdf.columns:
            continue
        column_values = gdf[date_column]
        print(report_label, column_values.min(), "-", column_values.max())
    print("\n")

# %% [markdown]
# ## 5️⃣ Normalize date columns
# Give every daily dataset a `date` column formatted as a 'YYYY-MM-DD' string.
# Maps dataset key -> the column its dates are read from. For "weather" the
# source is `weather_date`, which is kept alongside the new `date` column.
_date_source_columns = {
    "weather": "weather_date",
    "airquality": "date",
    "ndvi": "date",
    "nightlights": "date",
}
for key, source_column in _date_source_columns.items():
    # Skip datasets that failed to load or lack the expected source column.
    if key not in gdfs or source_column not in gdfs[key].columns:
        continue
    parsed_dates = pd.to_datetime(gdfs[key][source_column])
    gdfs[key]["date"] = parsed_dates.dt.strftime('%Y-%m-%d')


# %% [markdown]
# ## 7️⃣ Merge daily datasets (airquality, weather, ndvi, nightlights)
# Using "airquality" key for the initial merge (PM2.5).
# Start from a copy of the PM2.5 frame and left-join the other daily datasets
# onto it, one row per (name, lat, lon, date). Note that "airquality" is
# indexed directly, so it must have loaded; the other datasets are optional.
daily_merged = gdfs["airquality"].copy()

_daily_join_keys = ["name", "lat", "lon", "date"]
for name in ("weather", "ndvi", "nightlights"):
    if name not in gdfs:
        continue  # dataset was not loaded; nothing to join
    # `geometry` and `address` are dropped from the right-hand side when present.
    merge_df = gdfs[name].drop(columns=["geometry", "address"], errors="ignore")
    # Left columns keep their names; clashing right columns get a `_<name>` suffix.
    daily_merged = daily_merged.merge(
        merge_df,
        how="left",
        on=_daily_join_keys,
        suffixes=("", f"_{name}"),
    )

# %% [markdown]
# ## 8️⃣ Merge static datasets (elevation, parks, open_bars)
# Only includes elevation, parks, and open_bars.
# Left-join the per-location (undated) datasets onto the daily frame, matching
# on (name, lat, lon) so each matched value repeats on every daily row.
_static_join_keys = ["name", "lat", "lon"]
for name in ("elevation", "parks", "open_bars"):
    if name not in gdfs:
        continue  # dataset was not loaded; nothing to join
    # `geometry` and `address` are dropped from the right-hand side when present.
    merge_df = gdfs[name].drop(columns=["geometry", "address"], errors="ignore")
    # Left columns keep their names; clashing right columns get a `_<name>` suffix.
    daily_merged = daily_merged.merge(
        merge_df,
        how="left",
        on=_static_join_keys,
        suffixes=("", f"_{name}"),
    )

# %% [markdown]
# ## 9️⃣ Clean duplicate metadata columns
# Drop every column whose name contains one of these substrings. The merges
# above suffix clashing right-hand columns with `_<dataset>`, so this removes
# copies such as `rating_<dataset>` while keeping the unsuffixed originals.
_duplicate_markers = ("rating_", "user_ratings_total_", "place_id_")

duplicate_cols = []
for column_name in daily_merged.columns:
    if any(marker in column_name for marker in _duplicate_markers):
        duplicate_cols.append(column_name)

if len(duplicate_cols) > 0:
    print("Removing duplicate columns:", duplicate_cols)
    daily_merged = daily_merged.drop(columns=duplicate_cols, errors="ignore")

# Prefix the surviving metadata columns with `cafe_`; names missing from the
# frame are ignored by `rename`.
rename_map = {
    "rating": "cafe_rating",
    "user_ratings_total": "cafe_user_ratings_total",
    "place_id": "cafe_place_id",
}
daily_merged = daily_merged.rename(columns=rename_map)

# %% [markdown]
# ## 🔟 Generate geometry column
# Rebuild the geometry column as points from the lon/lat columns
# (x = lon, y = lat) and tag the frame with CRS EPSG:4326.
_cafe_points = gpd.points_from_xy(daily_merged["lon"], daily_merged["lat"])
daily_merged = gpd.GeoDataFrame(daily_merged, geometry=_cafe_points, crs="EPSG:4326")

# %% [markdown]
# ## 1️⃣1️⃣ Ensure full date coverage per café (2025-01-01 onward)
# Build one row per (café, day) from 2025-01-01 through the latest date in the
# data, then left-join the merged measurements onto that grid. Grid rows with
# no matching measurement keep their keys and get NaN in every other column;
# measurement rows dated outside the grid are not carried over.
daily_merged["date"] = pd.to_datetime(daily_merged["date"])
all_dates = pd.date_range("2025-01-01", daily_merged["date"].max())

# One row per distinct café; its positional index doubles as the café id.
cafes = daily_merged[["name", "lat", "lon", "address"]].drop_duplicates().reset_index(drop=True)

# Cartesian product café id × date, then attach the café attributes by id.
full_index = pd.MultiIndex.from_product([cafes.index, all_dates], names=["cafe_idx", "date"])
full_df = (
    pd.DataFrame(index=full_index)
    .reset_index()
    .join(cafes, on="cafe_idx")
    .drop(columns="cafe_idx")
)

# Both `date` columns are already datetime here (`date_range` output on the
# grid side, the conversion above on the data side), so no re-parsing is needed.
_grid_keys = ["name", "lat", "lon", "address", "date"]
daily_merged = full_df.merge(daily_merged, how="left", on=_grid_keys)

print(f"✅ Expanded dataset covers {len(all_dates)} days × {len(cafes)} cafes = {len(all_dates) * len(cafes):,} rows")

# %% [markdown]
# ## 1️⃣2️⃣ Convert date to string (for dbt / CSV)
# Serialise the datetime `date` column back to 'YYYY-MM-DD' text for the CSV.
_iso_dates = daily_merged["date"].dt.strftime('%Y-%m-%d')
daily_merged["date"] = _iso_dates

# %% [markdown]
# ## 1️⃣3️⃣ Save final merged dataset
# Write the merged frame as CSV (without the index) into the processed-data
# directory, next to the input GeoPackages.
output_csv = Path(data_dir, "lap_locations_final_merged.csv")
daily_merged.to_csv(output_csv, index=False)
print(f"✅ Merged dataset saved: {output_csv}")

# %% [markdown]
# ## 1️⃣4️⃣ Quick summary
# Print the final column list, the row count, and the first rows as a sanity check.
_final_columns = list(daily_merged.columns)
_row_count = len(daily_merged)
print("Columns:", _final_columns)
print("Number of rows:", _row_count)
print("Sample rows:\n", daily_merged.head())


Loading airquality from /Users/tolgasabanoglu/Desktop/github/which-lap-coffee-should-i-visit/data/processed/lap_locations_pm25_daily.gpkg ...
airquality: 4752 rows, columns: ['name', 'address', 'lat', 'lon', 'date', 'pm25_aod_proxy', 'geometry']

Loading elevation from /Users/tolgasabanoglu/Desktop/github/which-lap-coffee-should-i-visit/data/processed/lap_locations_elevation.gpkg ...
elevation: 16 rows, columns: ['name', 'address', 'lat', 'lon', 'rating', 'user_ratings_total', 'place_id', 'elevation_m', 'geometry']

Loading weather from /Users/tolgasabanoglu/Desktop/github/which-lap-coffee-should-i-visit/data/processed/lap_locations_historical_weather.gpkg ...
weather: 4768 rows, columns: ['weather_date', 'temp_max', 'temp_min', 'precip_mm', 'name', 'address', 'lat', 'lon', 'rating', 'user_ratings_total', 'season', 'geometry']

Loading ndvi from /Users/tolgasabanoglu/Desktop/github/which-lap-coffee-should-i-visit/data/processed/lap_locations_ndvi_daily.gpkg ...
ndvi: 4752 rows, columns