In [2]:
import pandas as pd
import datetime as dt
import requests
import os
from pathlib import Path
from typing import List, Dict, Any, Optional, Sequence

print("✓ Import thành công")

✓ Import thành công


## 1. Lấy thông tin Facilities (Metadata các trạm)

## Helper Functions (không dùng PySpark)

In [9]:
# Cấu hình API
DEFAULT_BASE_URL = os.environ.get("OPENELECTRICITY_API_URL", "https://api.openelectricity.org.au/v4")
FACILITIES_ENDPOINT = f"{DEFAULT_BASE_URL.rstrip('/')}/facilities/"

# Timezone mapping
FACILITY_TIMEZONES = {
    "NYNGAN": "Australia/Sydney",
    "GANNSF": "Australia/Melbourne",
}
DEFAULT_TIMEZONE = "Australia/Brisbane"

def get_facility_timezone(facility_code: str) -> str:
    """Lấy timezone của facility"""
    return FACILITY_TIMEZONES.get(facility_code.upper(), DEFAULT_TIMEZONE)

def load_api_key() -> str:
    """Load API key từ environment hoặc .env file"""
    # Thử các biến môi trường
    for key in ["OPENELECTRICITY_API_KEY", "OPEN_ELECTRICITY_API_KEY", "OPEN_NEM_PRIMARY"]:
        value = os.environ.get(key)
        if value:
            return value
    
    # Thử đọc từ .env file ở các vị trí khác nhau
    possible_env_paths = [
        Path.cwd().parent.parent.parent / ".env",
        Path.cwd().parent.parent.parent / "docker" / ".env",
        Path("/home/pvlakehouse/dlh-pv/.env"),
        Path("/home/pvlakehouse/dlh-pv/docker/.env"),
    ]
    
    for env_path in possible_env_paths:
        if env_path.exists():
            for line in env_path.read_text().splitlines():
                line = line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, value = line.split("=", 1)
                key = key.strip()
                value = value.strip().strip('"').strip("'")
                if key in ["OPENELECTRICITY_API_KEY", "OPEN_ELECTRICITY_API_KEY", "OPEN_NEM_PRIMARY", "OPEN_NEM_SECONDARY"]:
                    return value
    
    raise RuntimeError("Không tìm thấy API key. Cần set biến môi trường OPENELECTRICITY_API_KEY")

print("✓ Helper functions đã được định nghĩa")

✓ Helper functions đã được định nghĩa


In [14]:
def fetch_facilities_dataframe(
    networks: List[str] = None,
    statuses: List[str] = None,
    fueltechs: List[str] = None,
) -> pd.DataFrame:
    """Lấy danh sách facilities từ OpenElectricity API"""
    
    api_key = load_api_key()
    networks = networks or ["NEM", "WEM"]
    statuses = statuses or ["operating"]
    fueltechs = fueltechs or ["solar_utility"]
    
    # Build query parameters
    params = []
    for net in networks:
        params.append(("network_id", net))
    for status in statuses:
        params.append(("status_id", status))
    for fuel in fueltechs:
        params.append(("fueltech_id", fuel))
    
    # Request
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
    }
    
    response = requests.get(FACILITIES_ENDPOINT, headers=headers, params=params, timeout=60)
    response.raise_for_status()
    payload = response.json()
    
    # Parse facilities
    rows = []
    for facility in payload.get("data", []):
        unit_codes = [u.get("code") for u in facility.get("units", []) if u.get("code")]
        location = facility.get("location") or {}
        
        rows.append({
            "facility_code": facility.get("code"),
            "facility_name": facility.get("name"),
            "network_id": facility.get("network_id"),
            "network_region": facility.get("network_region"),
            "location_lat": location.get("latitude"),
            "location_lng": location.get("longitude"),
            "total_capacity_mw": facility.get("capacity_registered"),
            "unit_count": len(facility.get("units", [])),
            "unit_codes": ",".join(unit_codes) if unit_codes else None,
        })
    
    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.sort_values(["network_id", "facility_name"]).reset_index(drop=True)
    
    return df

print("✓ fetch_facilities_dataframe đã được định nghĩa")

✓ fetch_facilities_dataframe đã được định nghĩa


In [23]:
def fetch_facility_timeseries_dataframe(
    facility_codes: List[str],
    metrics: List[str] = None,
    interval: str = "1h",
    date_start: str = None,
    date_end: str = None,
) -> pd.DataFrame:
    """Lấy timeseries data của facility"""
    
    api_key = load_api_key()
    metrics = metrics or ["energy"]
    
    # Lấy metadata của facility để biết network
    headers = {"Authorization": f"Bearer {api_key}"}
    params = [("facility_code", code) for code in facility_codes]
    response = requests.get(FACILITIES_ENDPOINT, headers=headers, params=params, timeout=60)
    response.raise_for_status()
    facilities_payload = response.json()
    
    # Map facility code -> network
    facility_network = {}
    for fac in facilities_payload.get("data", []):
        facility_network[fac.get("code")] = fac.get("network_id")
    
    # Fetch timeseries cho từng facility
    all_rows = []
    for facility_code in facility_codes:
        network = facility_network.get(facility_code, "NEM")
        data_url = f"{DEFAULT_BASE_URL.rstrip('/')}/data/facilities/{network}"
        
        # Build params theo format API
        params_list = [
            ("interval", interval),
            ("facility_code", facility_code),
        ]
        for metric in metrics:
            params_list.append(("metrics", metric))
        
        if date_start:
            params_list.append(("date_start", date_start))
        if date_end:
            params_list.append(("date_end", date_end))
        
        response = requests.get(data_url, headers=headers, params=params_list, timeout=60)
        response.raise_for_status()
        data_payload = response.json()
        
        # Parse timeseries từ results
        for result_group in data_payload.get("data", []):
            for result in result_group.get("results", []):
                unit_code = result.get("columns", {}).get("unit_code")
                for row in result.get("data", []):
                    if len(row) >= 2:
                        all_rows.append({
                            "facility_code": facility_code,
                            "unit_code": unit_code,
                            "interval_start": row[0],
                            "energy": row[1],
                            "network_id": network,
                        })
    
    df = pd.DataFrame(all_rows)
    if not df.empty:
        df = df.sort_values("interval_start").reset_index(drop=True)
    
    return df

print("✓ fetch_facility_timeseries_dataframe đã được định nghĩa")

✓ fetch_facility_timeseries_dataframe đã được định nghĩa


In [29]:
def fetch_weather_dataframe(
    facility_code: str,
    facility_name: str,
    latitude: float,
    longitude: float,
    start_date: dt.date,
    end_date: dt.date,
    timezone: str = "UTC",
) -> pd.DataFrame:
    """Lấy weather data từ Open-Meteo API"""
    
    # Open-Meteo endpoints
    archive_url = "https://archive-api.open-meteo.com/v1/archive"
    forecast_url = "https://api.open-meteo.com/v1/forecast"
    
    # Weather variables
    weather_vars = [
        "shortwave_radiation", "direct_radiation", "diffuse_radiation", "direct_normal_irradiance",
        "temperature_2m", "dew_point_2m", "cloud_cover", "precipitation", 
        "wind_speed_10m", "wind_direction_10m", "pressure_msl"
    ]
    
    # Determine which endpoint to use
    today = dt.date.today()
    archive_cutoff = today - dt.timedelta(days=5)
    
    all_frames = []
    
    # Split into archive and forecast if needed
    if end_date < archive_cutoff:
        # All archive
        endpoint = archive_url
        chunks = [(start_date, end_date)]
    elif start_date > archive_cutoff:
        # All forecast
        endpoint = forecast_url
        chunks = [(start_date, end_date)]
    else:
        # Split: archive + forecast
        chunks = [
            (start_date, archive_cutoff),
            (archive_cutoff + dt.timedelta(days=1), end_date)
        ]
    
    for chunk_start, chunk_end in chunks:
        if chunk_end < archive_cutoff:
            endpoint = archive_url
        else:
            endpoint = forecast_url
        
        params = {
            "latitude": f"{latitude:.5f}",
            "longitude": f"{longitude:.5f}",
            "hourly": ",".join(weather_vars),
            "start_date": chunk_start.isoformat(),
            "end_date": chunk_end.isoformat(),
            "timezone": timezone,
        }
        
        response = requests.get(endpoint, params=params, timeout=120)
        response.raise_for_status()
        payload = response.json()
        
        hourly = payload.get("hourly", {})
        if hourly:
            df_chunk = pd.DataFrame(hourly)
            if "time" in df_chunk.columns:
                df_chunk = df_chunk.rename(columns={"time": "date"})
            df_chunk["facility_code"] = facility_code
            df_chunk["facility_name"] = facility_name
            all_frames.append(df_chunk)
    
    if not all_frames:
        return pd.DataFrame()
    
    df = pd.concat(all_frames, ignore_index=True)
    return df

print("✓ fetch_weather_dataframe đã được định nghĩa")

✓ fetch_weather_dataframe đã được định nghĩa


## Kiểm tra API Key

In [10]:
# Kiểm tra xem có API key không
try:
    api_key = load_api_key()
    print(f"✓ Đã tìm thấy API key: {api_key[:10]}...")
except RuntimeError as e:
    print(f"❌ Lỗi: {e}")
    print("\nHướng dẫn:")
    print("1. Kiểm tra file .env ở thư mục gốc project")
    print("2. Hoặc set biến môi trường: export OPENELECTRICITY_API_KEY='your_key'")
    env_path = Path.cwd().parent.parent.parent / ".env"
    print(f"\nĐường dẫn .env file: {env_path}")
    print(f"File tồn tại: {env_path.exists()}")

✓ Đã tìm thấy API key: oe_3ZaLhV4...


In [15]:
# Lấy danh sách facilities
facilities_df = fetch_facilities_dataframe(
    networks=["NEM", "WEM"],
    statuses=["operating"],
    fueltechs=["solar_utility"]
)

print(f"Tổng số trạm: {len(facilities_df)}")
print(f"\nCác cột: {list(facilities_df.columns)}")
facilities_df.head()

Tổng số trạm: 118

Các cột: ['facility_code', 'facility_name', 'network_id', 'network_region', 'location_lat', 'location_lng', 'total_capacity_mw', 'unit_count', 'unit_codes']


Unnamed: 0,facility_code,facility_name,network_id,network_region,location_lat,location_lng,total_capacity_mw,unit_count,unit_codes
0,ADP,Adelaide Desalination,NEM,SA1,,,,3,"ADPPV1,ADPPV3,ADPPV2"
1,ALDGASF,Aldoga,NEM,QLD1,,,,1,ALDGASF1
2,AVLSF,Avonlie,NEM,NSW1,,,,1,AVLSF1
3,BAKING,Baking Board,NEM,QLD1,,,,1,BAKING1
4,BANNSP,Bannerton,NEM,VIC1,,,,1,BANN1


In [12]:
# Xem thông tin chi tiết
print("Thông tin chi tiết của facilities:")
facilities_df.info()

Thông tin chi tiết của facilities:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   facility_code      118 non-null    object
 1   facility_name      118 non-null    object
 2   network_id         118 non-null    object
 3   network_region     118 non-null    object
 4   location_lat       0 non-null      object
 5   location_lng       0 non-null      object
 6   total_capacity_mw  0 non-null      object
 7   unit_count         118 non-null    int64 
 8   unit_codes         118 non-null    object
dtypes: int64(1), object(8)
memory usage: 8.4+ KB


In [13]:
# Lưu facilities vào CSV
output_dir = Path("sample_data")
output_dir.mkdir(exist_ok=True)

facilities_csv = output_dir / "facilities.csv"
facilities_df.to_csv(facilities_csv, index=False)
print(f"Đã lưu facilities vào: {facilities_csv}")

Đã lưu facilities vào: sample_data/facilities.csv


## 2. Chọn 1 trạm mẫu và lấy Timeseries (theo giờ địa phương)

In [16]:
# Chọn trạm NYNGAN làm mẫu (có timezone Australia/Sydney)
SAMPLE_FACILITY = "NYNGAN"
sample_timezone = get_facility_timezone(SAMPLE_FACILITY)

print(f"Trạm mẫu: {SAMPLE_FACILITY}")
print(f"Timezone: {sample_timezone}")

# Lấy thông tin chi tiết của trạm
sample_facility_info = facilities_df[facilities_df['facility_code'] == SAMPLE_FACILITY]
print(f"\nThông tin trạm:")
sample_facility_info

Trạm mẫu: NYNGAN
Timezone: Australia/Sydney

Thông tin trạm:


Unnamed: 0,facility_code,facility_name,network_id,network_region,location_lat,location_lng,total_capacity_mw,unit_count,unit_codes
76,NYNGAN,Nyngan,NEM,NSW1,,,,1,NYNGAN1


In [37]:
# Lấy dữ liệu timeseries 1 tháng (30 ngày) gần nhất
# API sẽ trả về theo giờ địa phương của trạm
end_date = dt.datetime.now()
start_date = end_date - dt.timedelta(days=30)

print(f"Lấy timeseries từ {start_date.date()} đến {end_date.date()}")

timeseries_df = fetch_facility_timeseries_dataframe(
    facility_codes=[SAMPLE_FACILITY],
    metrics=["energy"],  # Năng lượng sản xuất
    interval="1h",
    date_start=start_date.strftime("%Y-%m-%dT%H:%M:%S"),
    date_end=end_date.strftime("%Y-%m-%dT%H:%M:%S"),
)

print(f"\nSố dòng timeseries: {len(timeseries_df)}")
print(f"Các cột: {list(timeseries_df.columns)}")
timeseries_df.head(10)

Lấy timeseries từ 2025-10-16 đến 2025-11-15

Số dòng timeseries: 719
Các cột: ['facility_code', 'unit_code', 'interval_start', 'energy', 'network_id']

Số dòng timeseries: 719
Các cột: ['facility_code', 'unit_code', 'interval_start', 'energy', 'network_id']


Unnamed: 0,facility_code,unit_code,interval_start,energy,network_id
0,NYNGAN,NYNGAN1,2025-10-16T17:00:00+10:00,19.1251,NEM
1,NYNGAN,NYNGAN1,2025-10-16T18:00:00+10:00,3.2916,NEM
2,NYNGAN,NYNGAN1,2025-10-16T19:00:00+10:00,3.1248,NEM
3,NYNGAN,NYNGAN1,2025-10-16T20:00:00+10:00,2.8644,NEM
4,NYNGAN,NYNGAN1,2025-10-16T21:00:00+10:00,0.0,NEM
5,NYNGAN,NYNGAN1,2025-10-16T22:00:00+10:00,0.0,NEM
6,NYNGAN,NYNGAN1,2025-10-16T23:00:00+10:00,0.0,NEM
7,NYNGAN,NYNGAN1,2025-10-17T00:00:00+10:00,0.0,NEM
8,NYNGAN,NYNGAN1,2025-10-17T01:00:00+10:00,0.0,NEM
9,NYNGAN,NYNGAN1,2025-10-17T02:00:00+10:00,0.0,NEM


In [22]:
# Debug: Xem raw response từ API
import json

api_key = load_api_key()
headers = {"Authorization": f"Bearer {api_key}"}
data_url = f"{DEFAULT_BASE_URL.rstrip('/')}/data/facilities/NEM"

params_list = [
    ("interval", "1h"),
    ("facility_code", SAMPLE_FACILITY),
    ("metrics", "energy"),
    ("date_start", start_date.strftime("%Y-%m-%dT%H:%M:%S")),
    ("date_end", end_date.strftime("%Y-%m-%dT%H:%M:%S")),
]

response = requests.get(data_url, headers=headers, params=params_list, timeout=60)
print(f"Status: {response.status_code}")
print(f"\nResponse sample:")
data = response.json()
print(json.dumps(data, indent=2)[:2000])

Status: 200

Response sample:
{
  "version": "4.3.0",
  "created_at": "2025-11-15T20:28:33+11:00",
  "success": true,
  "error": null,
  "data": [
    {
      "network_code": "NEM",
      "metric": "energy",
      "unit": "MWh",
      "interval": "1h",
      "date_start": "2025-11-08T17:00:00+10:00",
      "date_end": "2025-11-15T16:00:00+10:00",
      "groupings": [],
      "results": [
        {
          "name": "energy_NYNGAN1",
          "date_start": "2025-11-08T17:00:00+10:00",
          "date_end": "2025-11-15T16:00:00+10:00",
          "columns": {
            "unit_code": "NYNGAN1"
          },
          "data": [
            [
              "2025-11-08T17:00:00+10:00",
              18.8269
            ],
            [
              "2025-11-08T18:00:00+10:00",
              17.2807
            ],
            [
              "2025-11-08T19:00:00+10:00",
              10.54
            ],
            [
              "2025-11-08T20:00:00+10:00",
              10.54
           

In [39]:
# Xem thông tin chi tiết timeseries
print("Thông tin chi tiết của timeseries:")
timeseries_df.info()
print("\nMô tả thống kê:")
timeseries_df.describe()

Thông tin chi tiết của timeseries:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719 entries, 0 to 718
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   facility_code   719 non-null    object 
 1   unit_code       719 non-null    object 
 2   interval_start  719 non-null    object 
 3   energy          719 non-null    float64
 4   network_id      719 non-null    object 
dtypes: float64(1), object(4)
memory usage: 28.2+ KB

Mô tả thống kê:


Unnamed: 0,energy
count,719.0
mean,11.058329
std,20.421444
min,0.0
25%,0.0
50%,0.6252
75%,14.20835
max,100.3438


In [40]:
# Lưu timeseries vào CSV
timeseries_csv = output_dir / f"facility_timeseries_{SAMPLE_FACILITY}.csv"
timeseries_df.to_csv(timeseries_csv, index=False)
print(f"Đã lưu timeseries vào: {timeseries_csv}")

Đã lưu timeseries vào: sample_data/facility_timeseries_NYNGAN.csv


## 3. Lấy dữ liệu OpenMeteo Weather (theo giờ UTC)

In [28]:
# Debug: Xem toàn bộ thông tin facility
print("Thông tin đầy đủ của NYNGAN:")
print(sample_facility_info.to_dict('records')[0])

# Nếu không có tọa độ từ API, dùng tọa độ cứng (tra từ Google Maps)
# NYNGAN Solar Farm: -31.5669, 147.1994
latitude = sample_facility_info['location_lat'].values[0]
longitude = sample_facility_info['location_lng'].values[0]

if latitude is None or longitude is None:
    print("\n⚠️ API không trả về tọa độ, sử dụng tọa độ cứng cho NYNGAN")
    latitude = -31.5669
    longitude = 147.1994

facility_name = sample_facility_info['facility_name'].values[0]

print(f"\nTọa độ trạm {SAMPLE_FACILITY}:")
print(f"  Latitude: {latitude}")
print(f"  Longitude: {longitude}")
print(f"  Name: {facility_name}")

Thông tin đầy đủ của NYNGAN:
{'facility_code': 'NYNGAN', 'facility_name': 'Nyngan', 'network_id': 'NEM', 'network_region': 'NSW1', 'location_lat': None, 'location_lng': None, 'total_capacity_mw': None, 'unit_count': 1, 'unit_codes': 'NYNGAN1'}

⚠️ API không trả về tọa độ, sử dụng tọa độ cứng cho NYNGAN

Tọa độ trạm NYNGAN:
  Latitude: -31.5669
  Longitude: 147.1994
  Name: Nyngan


In [38]:
# Lấy dữ liệu thời tiết 1 tháng (30 ngày) gần nhất (theo UTC)
weather_start = dt.date.today() - dt.timedelta(days=30)
weather_end = dt.date.today()

print(f"Lấy dữ liệu thời tiết từ {weather_start} đến {weather_end} (UTC)")

weather_df = fetch_weather_dataframe(
    facility_code=SAMPLE_FACILITY,
    facility_name=facility_name,
    latitude=latitude,
    longitude=longitude,
    start_date=weather_start,
    end_date=weather_end,
    timezone="UTC",  # Lấy theo UTC như yêu cầu
)

print(f"\nSố dòng weather: {len(weather_df)}")
print(f"Các cột: {list(weather_df.columns)}")
weather_df.head(10)

Lấy dữ liệu thời tiết từ 2025-10-16 đến 2025-11-15 (UTC)

Số dòng weather: 744
Các cột: ['date', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'temperature_2m', 'dew_point_2m', 'cloud_cover', 'precipitation', 'wind_speed_10m', 'wind_direction_10m', 'pressure_msl', 'facility_code', 'facility_name']

Số dòng weather: 744
Các cột: ['date', 'shortwave_radiation', 'direct_radiation', 'diffuse_radiation', 'direct_normal_irradiance', 'temperature_2m', 'dew_point_2m', 'cloud_cover', 'precipitation', 'wind_speed_10m', 'wind_direction_10m', 'pressure_msl', 'facility_code', 'facility_name']


Unnamed: 0,date,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,temperature_2m,dew_point_2m,cloud_cover,precipitation,wind_speed_10m,wind_direction_10m,pressure_msl,facility_code,facility_name
0,2025-10-16T00:00,661.0,442.0,219.0,586.2,27.2,7.1,42,0.0,16.4,15,1019.5,NYNGAN,Nyngan
1,2025-10-16T01:00,808.0,622.0,186.0,722.6,29.2,11.7,52,0.0,16.9,358,1018.6,NYNGAN,Nyngan
2,2025-10-16T02:00,886.0,686.0,200.0,750.2,30.8,14.0,45,0.0,17.9,338,1016.7,NYNGAN,Nyngan
3,2025-10-16T03:00,900.0,750.0,150.0,823.0,32.1,13.3,0,0.0,18.2,335,1015.2,NYNGAN,Nyngan
4,2025-10-16T04:00,734.0,571.0,163.0,670.5,32.4,12.6,58,0.0,21.2,320,1013.7,NYNGAN,Nyngan
5,2025-10-16T05:00,656.0,537.0,119.0,726.2,32.6,11.1,82,0.0,16.6,318,1013.1,NYNGAN,Nyngan
6,2025-10-16T06:00,447.0,293.0,154.0,503.1,32.6,11.2,55,0.0,14.5,305,1012.8,NYNGAN,Nyngan
7,2025-10-16T07:00,254.0,139.0,115.0,355.2,31.3,12.1,67,0.1,14.2,344,1013.2,NYNGAN,Nyngan
8,2025-10-16T08:00,74.0,29.0,45.0,160.5,30.0,12.8,81,0.0,10.9,27,1013.2,NYNGAN,Nyngan
9,2025-10-16T09:00,2.0,0.0,2.0,0.0,27.1,13.4,61,0.0,11.3,17,1013.4,NYNGAN,Nyngan


In [41]:
# Xem thông tin chi tiết weather
print("Thông tin chi tiết của weather data:")
weather_df.info()
print("\nMô tả thống kê (chỉ các cột số):")
weather_df.describe()

Thông tin chi tiết của weather data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      744 non-null    object 
 1   shortwave_radiation       744 non-null    float64
 2   direct_radiation          744 non-null    float64
 3   diffuse_radiation         744 non-null    float64
 4   direct_normal_irradiance  744 non-null    float64
 5   temperature_2m            744 non-null    float64
 6   dew_point_2m              744 non-null    float64
 7   cloud_cover               744 non-null    int64  
 8   precipitation             744 non-null    float64
 9   wind_speed_10m            744 non-null    float64
 10  wind_direction_10m        744 non-null    int64  
 11  pressure_msl              744 non-null    float64
 12  facility_code             744 non-null    object 
 13  facility_name             74

Unnamed: 0,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,temperature_2m,dew_point_2m,cloud_cover,precipitation,wind_speed_10m,wind_direction_10m,pressure_msl
count,744.0,744.0,744.0,744.0,744.0,744.0,744.0,744.0,744.0,744.0,744.0
mean,291.123656,214.580645,76.543011,310.617339,21.751882,6.034274,46.071237,0.018683,12.990726,171.534946,1011.082258
std,358.1462,303.154308,101.128682,375.926886,7.12633,5.62149,40.940018,0.144296,7.035544,100.906758,4.572027
min,0.0,0.0,0.0,0.0,6.5,-9.5,0.0,0.0,0.8,1.0,1000.5
25%,0.0,0.0,0.0,0.0,16.275,1.9,0.0,0.0,7.2,82.75,1007.575
50%,49.5,11.0,28.5,62.65,21.35,6.4,44.0,0.0,11.8,188.0,1011.9
75%,608.0,384.0,104.25,674.125,26.7,10.4,93.0,0.0,17.7,244.0,1014.6
max,1093.0,990.0,486.0,1030.3,40.4,17.7,100.0,2.0,39.0,360.0,1022.0


In [42]:
# Lưu weather data vào CSV
weather_csv = output_dir / f"weather_{SAMPLE_FACILITY}_utc.csv"
weather_df.to_csv(weather_csv, index=False)
print(f"Đã lưu weather data vào: {weather_csv}")

Đã lưu weather data vào: sample_data/weather_NYNGAN_utc.csv


## 4. Tổng hợp và kiểm tra dữ liệu

In [43]:
print("=" * 60)
print("TỔNG HỢP DỮ LIỆU ĐÃ LẤY")
print("=" * 60)
print(f"\n1. FACILITIES (Metadata các trạm):")
print(f"   - Số trạm: {len(facilities_df)}")
print(f"   - File: {facilities_csv}")
print(f"   - Kích thước: {facilities_csv.stat().st_size / 1024:.2f} KB")

print(f"\n2. FACILITY TIMESERIES (Trạm {SAMPLE_FACILITY} - giờ địa phương):")
print(f"   - Số dòng: {len(timeseries_df)}")
print(f"   - File: {timeseries_csv}")
print(f"   - Kích thước: {timeseries_csv.stat().st_size / 1024:.2f} KB")
if len(timeseries_df) > 0:
    print(f"   - Khoảng thời gian: {timeseries_df['interval_start'].min()} → {timeseries_df['interval_start'].max()}")

print(f"\n3. WEATHER DATA (Trạm {SAMPLE_FACILITY} - giờ UTC):")
print(f"   - Số dòng: {len(weather_df)}")
print(f"   - File: {weather_csv}")
print(f"   - Kích thước: {weather_csv.stat().st_size / 1024:.2f} KB")
if len(weather_df) > 0 and 'date' in weather_df.columns:
    print(f"   - Khoảng thời gian: {weather_df['date'].min()} → {weather_df['date'].max()}")

print("\n" + "=" * 60)
print("Tất cả file CSV đã được lưu trong thư mục: sample_data/")
print("=" * 60)

TỔNG HỢP DỮ LIỆU ĐÃ LẤY

1. FACILITIES (Metadata các trạm):
   - Số trạm: 118
   - File: sample_data/facilities.csv
   - Kích thước: 4.89 KB

2. FACILITY TIMESERIES (Trạm NYNGAN - giờ địa phương):
   - Số dòng: 719
   - File: sample_data/facility_timeseries_NYNGAN.csv
   - Kích thước: 35.74 KB
   - Khoảng thời gian: 2025-10-16T17:00:00+10:00 → 2025-11-15T16:00:00+10:00

3. WEATHER DATA (Trạm NYNGAN - giờ UTC):
   - Số dòng: 744
   - File: sample_data/weather_NYNGAN_utc.csv
   - Kích thước: 60.10 KB
   - Khoảng thời gian: 2025-10-16T00:00 → 2025-11-15T23:00

Tất cả file CSV đã được lưu trong thư mục: sample_data/


## 5. Hiển thị mẫu dữ liệu cuối cùng

In [34]:
print("Mẫu FACILITIES (5 dòng đầu):")
display(facilities_df.head())

Mẫu FACILITIES (5 dòng đầu):


Unnamed: 0,facility_code,facility_name,network_id,network_region,location_lat,location_lng,total_capacity_mw,unit_count,unit_codes
0,ADP,Adelaide Desalination,NEM,SA1,,,,3,"ADPPV1,ADPPV3,ADPPV2"
1,ALDGASF,Aldoga,NEM,QLD1,,,,1,ALDGASF1
2,AVLSF,Avonlie,NEM,NSW1,,,,1,AVLSF1
3,BAKING,Baking Board,NEM,QLD1,,,,1,BAKING1
4,BANNSP,Bannerton,NEM,VIC1,,,,1,BANN1


In [35]:
print(f"Mẫu TIMESERIES - {SAMPLE_FACILITY} (10 dòng đầu):")
display(timeseries_df.head(10))

Mẫu TIMESERIES - NYNGAN (10 dòng đầu):


Unnamed: 0,facility_code,unit_code,interval_start,energy,network_id
0,NYNGAN,NYNGAN1,2025-11-08T17:00:00+10:00,18.8269,NEM
1,NYNGAN,NYNGAN1,2025-11-08T18:00:00+10:00,17.2807,NEM
2,NYNGAN,NYNGAN1,2025-11-08T19:00:00+10:00,10.54,NEM
3,NYNGAN,NYNGAN1,2025-11-08T20:00:00+10:00,10.54,NEM
4,NYNGAN,NYNGAN1,2025-11-08T21:00:00+10:00,0.0,NEM
5,NYNGAN,NYNGAN1,2025-11-08T22:00:00+10:00,0.0,NEM
6,NYNGAN,NYNGAN1,2025-11-08T23:00:00+10:00,0.0,NEM
7,NYNGAN,NYNGAN1,2025-11-09T00:00:00+10:00,0.0,NEM
8,NYNGAN,NYNGAN1,2025-11-09T01:00:00+10:00,0.0,NEM
9,NYNGAN,NYNGAN1,2025-11-09T02:00:00+10:00,0.0,NEM


In [36]:
print(f"Mẫu WEATHER - {SAMPLE_FACILITY} UTC (10 dòng đầu):")
display(weather_df.head(10))

Mẫu WEATHER - NYNGAN UTC (10 dòng đầu):


Unnamed: 0,date,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,temperature_2m,dew_point_2m,cloud_cover,precipitation,wind_speed_10m,wind_direction_10m,pressure_msl,facility_code,facility_name
0,2025-11-08T00:00,738.0,516.0,222.0,640.0,29.2,13.4,71,0.0,17.4,276,1005.0,NYNGAN,Nyngan
1,2025-11-08T01:00,952.0,835.0,117.0,919.3,30.8,8.5,0,0.0,22.7,245,1005.1,NYNGAN,Nyngan
2,2025-11-08T02:00,1042.0,946.0,96.0,986.8,31.5,11.8,0,0.0,23.8,241,1004.6,NYNGAN,Nyngan
3,2025-11-08T03:00,1032.0,936.0,96.0,981.2,32.1,10.2,7,0.0,26.0,251,1003.6,NYNGAN,Nyngan
4,2025-11-08T04:00,948.0,811.0,137.0,906.8,31.8,9.9,32,0.0,28.2,259,1003.2,NYNGAN,Nyngan
5,2025-11-08T05:00,790.0,603.0,187.0,769.1,30.9,9.7,38,0.0,28.7,258,1003.6,NYNGAN,Nyngan
6,2025-11-08T06:00,612.0,470.0,142.0,745.4,29.9,8.3,28,0.0,26.9,254,1004.0,NYNGAN,Nyngan
7,2025-11-08T07:00,411.0,324.0,87.0,729.1,28.3,7.0,2,0.0,26.2,249,1005.1,NYNGAN,Nyngan
8,2025-11-08T08:00,189.0,136.0,53.0,571.1,26.2,6.8,8,0.0,24.3,238,1005.9,NYNGAN,Nyngan
9,2025-11-08T09:00,21.0,9.0,12.0,88.0,23.3,7.4,9,0.0,20.1,226,1007.3,NYNGAN,Nyngan
