In [11]:
# --- Forecast crawler ---
import os
from datetime import datetime, timezone
from time import sleep

import pandas as pd
import pytz
import requests

# The OpenWeather API key is read from the environment so the credential is
# never stored in the notebook or its exported copies. Set OPENWEATHER_API_KEY
# on the compute environment (or populate it from your platform's secret store)
# before running this cell.
API_KEY = os.environ.get("OPENWEATHER_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "OPENWEATHER_API_KEY is not set; export it before running the forecast crawler."
    )
BASE_URL_FORECAST = "https://api.openweathermap.org/data/2.5/forecast"

# Clean city list (India)
city_list = ["Delhi","Mumbai","Bangalore","Chennai","Kolkata","Hyderabad","Pune",
             "Jaipur","Lucknow","Surat","Kanpur","Nagpur","Indore","Bhopal",
             "Patna","Vadodara","Ludhiana","Agra","Varanasi"]

# One shared HTTP session so every city request reuses the same headers.
session = requests.Session()
session.headers.update({"User-Agent":"weather-crawler/1.0"})
timeout = 15  # seconds, passed to each session.get call

# A single crawl timestamp (Asia/Kolkata wall clock) is taken up front and
# stamped on every row produced by this run.
india_tz = pytz.timezone("Asia/Kolkata")
crawl_time_ist = datetime.now(india_tz).strftime("%Y-%m-%d %H:%M:%S")

def _format_unix(unix, tz):
    """Render unix seconds as a ``YYYY-MM-DD HH:MM:SS`` string in timezone ``tz``.

    Args:
        unix: Seconds since the epoch, or None when the API omitted the field.
        tz: tzinfo object the result should be expressed in.

    Returns:
        The formatted wall-clock string, or None when ``unix`` is None.
    """
    if unix is None:
        return None
    moment_utc = datetime.fromtimestamp(unix, tz=timezone.utc)
    return moment_utc.astimezone(tz).strftime("%Y-%m-%d %H:%M:%S")


def _forecast_rows(payload, requested_city, crawl_time, local_tz):
    """Flatten one forecast response into one record per forecast slot.

    Args:
        payload: Decoded JSON body of a successful forecast response.
        requested_city: City name that was queried; used when the payload has
            no city name of its own.
        crawl_time: Crawl timestamp string stamped on every record.
        local_tz: tzinfo used to render the city's sunrise/sunset times.

    Returns:
        A list of dicts, one per entry in the payload's ``list`` array (empty
        when that array is missing or null).
    """
    city_meta = payload.get("city", {}) or {}
    coord = city_meta.get("coord", {}) or {}

    # Sunrise/sunset are city-level unix timestamps, so format them once and
    # repeat them on every slot of this city.
    sunrise_local = _format_unix(city_meta.get("sunrise"), local_tz)
    sunset_local = _format_unix(city_meta.get("sunset"), local_tz)

    records = []
    for item in payload.get("list") or []:
        main = item.get("main", {}) or {}
        wind = item.get("wind", {}) or {}
        clouds = item.get("clouds", {}) or {}
        wx0 = (item.get("weather") or [{}])[0]
        rain = item.get("rain", {}) or {}
        snow = item.get("snow", {}) or {}

        records.append({
            # lineage / meta
            "crawl_time":        crawl_time,                      # when we crawled (IST)
            "city":              city_meta.get("name", requested_city),
            "city_lat":          coord.get("lat"),
            "city_lon":          coord.get("lon"),
            "timezone_offset_s": city_meta.get("timezone"),       # seconds from UTC; may be None

            # forecast timestamp: OpenWeather documents dt_txt as UTC, not
            # local time, so this column is NOT in the same zone as crawl_time
            # or the sunrise/sunset columns.
            "forecast_time":     item.get("dt_txt"),

            # weather metrics
            "temp_c":            main.get("temp"),
            "feels_like_c":      main.get("feels_like"),
            "temp_min_c":        main.get("temp_min"),
            "temp_max_c":        main.get("temp_max"),
            "pressure_hpa":      main.get("pressure"),
            "humidity_pct":      main.get("humidity"),
            "visibility_m":      item.get("visibility"),

            "wind_speed_ms":     wind.get("speed"),
            "wind_deg":          wind.get("deg"),
            "cloudiness_pct":    clouds.get("all"),

            "weather_main":      wx0.get("main"),
            "weather_desc":      wx0.get("description"),

            "pop":               item.get("pop"),          # probability of precipitation
            "rain_3h_mm":        rain.get("3h"),
            "snow_3h_mm":        snow.get("3h"),

            # sunrise/sunset from the city block
            "sunrise_ist":       sunrise_local,
            "sunset_ist":        sunset_local
        })
    return records


rows = []

for city in city_list:
    # NOTE(review): the name is sent without a country code; an ambiguous name
    # may resolve to a city outside India - confirm against the returned
    # city_lat/city_lon.
    params = {"q": city, "appid": API_KEY, "units": "metric"}
    try:
        resp = session.get(BASE_URL_FORECAST, params=params, timeout=timeout)
        if resp.status_code != 200:
            # Check the status before decoding: an error body is not
            # guaranteed to be JSON, so fall back to the raw text.
            try:
                detail = resp.json()
            except ValueError:
                detail = resp.text
            print(f"[WARN] Forecast failed for {city}: {detail}")
            continue

        # Records are built first and added in one step, so a city whose
        # payload fails to parse contributes no partial rows.
        rows.extend(_forecast_rows(resp.json(), city, crawl_time_ist, india_tz))

    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next city.
        print(f"[ERR] {city}: {e}")

    finally:
        # Pause after every attempt, including failed ones, so error responses
        # do not turn into back-to-back requests.
        sleep(0.3)

# Turn the collected per-slot forecast records into a table and preview the
# first ten rows.
df_fc = pd.DataFrame(data=rows)
display(df_fc.head(n=10))


StatementMeta(, 4e7270fc-d818-4c78-8fde-ed9f63bacc1b, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, d1241a64-6d73-4227-acab-2f3ab9b0549e)

In [12]:
import pandas as pd
from datetime import datetime
import pytz

# `rows` is the list of forecast record dicts built by the crawler cell.
df_weather = pd.DataFrame(rows)

# Current Asia/Kolkata timestamp (date and time) used to name this snapshot.
india_tz = pytz.timezone('Asia/Kolkata')
today_str = datetime.now(india_tz).strftime("%Y-%m-%d %H:%M:%S")

# Save to Lakehouse Files with the timestamp in the filename.
# NOTE(review): the name contains a space and colons; some path parsers
# (e.g. Hadoop-style paths) are known to handle colons poorly - confirm that
# downstream readers accept it before relying on this naming scheme.
file_path = f"/lakehouse/default/Files/forecast_data_bronze/weather_forecast {today_str}.csv"

# to_csv does not create missing folders, so make sure the bronze directory
# exists; this is a no-op when it is already there.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_weather.to_csv(file_path, index=False)

print(f"Saved: {file_path}")

StatementMeta(, 4e7270fc-d818-4c78-8fde-ed9f63bacc1b, 14, Finished, Available, Finished)

Saved: /lakehouse/default/Files/forecast_data_bronze/weather_forecast 2025-08-28 11:43:02.csv


In [13]:
import pandas as pd
# Read one specific bronze snapshot back into pandas and render it.
# The path is pinned to a single saved file, so this cell keeps showing that
# snapshot even after newer ones are written.
snapshot_path = "/lakehouse/default/Files/forecast_data_bronze/weather_forecast 2025-08-28 11:43:02.csv"
df = pd.read_csv(snapshot_path)
display(df)


StatementMeta(, 4e7270fc-d818-4c78-8fde-ed9f63bacc1b, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b4d20ebb-2451-4bc0-a591-1b10a547a9c4)