In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timezone
import pytz

print("✓ Import thành công")

✓ Import thành công


## 1. Load dữ liệu từ CSV

In [2]:
# Đường dẫn data
data_dir = Path("sample_data")

# Load facilities
facilities_df = pd.read_csv(data_dir / "facilities.csv")
print(f"Facilities: {len(facilities_df)} dòng")

# Load timeseries
timeseries_df = pd.read_csv(data_dir / "facility_timeseries_NYNGAN.csv")
print(f"Timeseries: {len(timeseries_df)} dòng")

# Load weather
weather_df = pd.read_csv(data_dir / "weather_NYNGAN_utc.csv")
print(f"Weather: {len(weather_df)} dòng")

Facilities: 118 dòng
Timeseries: 719 dòng
Weather: 744 dòng


## 2. Transform Weather Data: UTC → Local Time

In [3]:
# NYNGAN timezone: Australia/Sydney
FACILITY_TIMEZONE = "Australia/Sydney"
local_tz = pytz.timezone(FACILITY_TIMEZONE)
utc_tz = pytz.UTC

# Convert UTC time to local time
weather_df['date_utc'] = pd.to_datetime(weather_df['date'])
weather_df['date_utc'] = weather_df['date_utc'].dt.tz_localize('UTC')
weather_df['date_local'] = weather_df['date_utc'].dt.tz_convert(FACILITY_TIMEZONE)

print("\nMẫu transform:")
print(weather_df[['date', 'date_utc', 'date_local']].head())

# Extract date và time components (local)
weather_df['date_only'] = weather_df['date_local'].dt.date
weather_df['hour'] = weather_df['date_local'].dt.hour

print(f"\n✓ Weather data đã convert sang {FACILITY_TIMEZONE}")


Mẫu transform:
               date                  date_utc                date_local
0  2025-10-16T00:00 2025-10-16 00:00:00+00:00 2025-10-16 11:00:00+11:00
1  2025-10-16T01:00 2025-10-16 01:00:00+00:00 2025-10-16 12:00:00+11:00
2  2025-10-16T02:00 2025-10-16 02:00:00+00:00 2025-10-16 13:00:00+11:00
3  2025-10-16T03:00 2025-10-16 03:00:00+00:00 2025-10-16 14:00:00+11:00
4  2025-10-16T04:00 2025-10-16 04:00:00+00:00 2025-10-16 15:00:00+11:00

✓ Weather data đã convert sang Australia/Sydney


## 3. Tính Irradiance (kWh/m²/hour)

In [4]:
# Convert shortwave_radiation từ W/m² sang kWh/m²/hour
# W/m² trong 1 giờ = Wh/m² / 1000 = kWh/m²
weather_df['irr_kwh_m2_hour'] = weather_df['shortwave_radiation'] / 1000.0

print("Thống kê irradiance:")
print(weather_df['irr_kwh_m2_hour'].describe())

print("\nMẫu dữ liệu:")
print(weather_df[['date_local', 'shortwave_radiation', 'irr_kwh_m2_hour']].head(10))

Thống kê irradiance:
count    744.000000
mean       0.291124
std        0.358146
min        0.000000
25%        0.000000
50%        0.049500
75%        0.608000
max        1.093000
Name: irr_kwh_m2_hour, dtype: float64

Mẫu dữ liệu:
                 date_local  shortwave_radiation  irr_kwh_m2_hour
0 2025-10-16 11:00:00+11:00                661.0            0.661
1 2025-10-16 12:00:00+11:00                808.0            0.808
2 2025-10-16 13:00:00+11:00                886.0            0.886
3 2025-10-16 14:00:00+11:00                900.0            0.900
4 2025-10-16 15:00:00+11:00                734.0            0.734
5 2025-10-16 16:00:00+11:00                656.0            0.656
6 2025-10-16 17:00:00+11:00                447.0            0.447
7 2025-10-16 18:00:00+11:00                254.0            0.254
8 2025-10-16 19:00:00+11:00                 74.0            0.074
9 2025-10-16 20:00:00+11:00                  2.0            0.002


## 4. Transform Timeseries Data

In [5]:
# Parse interval_start (đã có timezone +10:00)
timeseries_df['interval_datetime'] = pd.to_datetime(timeseries_df['interval_start'])

# Convert to timezone-aware và normalize về local timezone
timeseries_df['interval_local'] = timeseries_df['interval_datetime'].dt.tz_convert(FACILITY_TIMEZONE)

# Extract date và time
timeseries_df['date_only'] = timeseries_df['interval_local'].dt.date
timeseries_df['hour'] = timeseries_df['interval_local'].dt.hour

# Rename energy to energy_mwh
timeseries_df['energy_mwh'] = timeseries_df['energy']

print("Timeseries transformed:")
print(timeseries_df[['facility_code', 'interval_start', 'interval_local', 'date_only', 'hour', 'energy_mwh']].head())

Timeseries transformed:
  facility_code             interval_start            interval_local  \
0        NYNGAN  2025-10-16T17:00:00+10:00 2025-10-16 18:00:00+11:00   
1        NYNGAN  2025-10-16T18:00:00+10:00 2025-10-16 19:00:00+11:00   
2        NYNGAN  2025-10-16T19:00:00+10:00 2025-10-16 20:00:00+11:00   
3        NYNGAN  2025-10-16T20:00:00+10:00 2025-10-16 21:00:00+11:00   
4        NYNGAN  2025-10-16T21:00:00+10:00 2025-10-16 22:00:00+11:00   

    date_only  hour  energy_mwh  
0  2025-10-16    18     19.1251  
1  2025-10-16    19      3.2916  
2  2025-10-16    20      3.1248  
3  2025-10-16    21      2.8644  
4  2025-10-16    22      0.0000  


## 5. Làm sạch dữ liệu

In [6]:
# Remove null values
print("Trước khi làm sạch:")
print(f"  Weather: {len(weather_df)} dòng")
print(f"  Timeseries: {len(timeseries_df)} dòng")

# Weather: remove null irradiance
weather_df_clean = weather_df.dropna(subset=['irr_kwh_m2_hour']).copy()

# Timeseries: remove null energy
timeseries_df_clean = timeseries_df.dropna(subset=['energy_mwh']).copy()

# Remove negative values
weather_df_clean = weather_df_clean[weather_df_clean['irr_kwh_m2_hour'] >= 0]
timeseries_df_clean = timeseries_df_clean[timeseries_df_clean['energy_mwh'] >= 0]

print("\nSau khi làm sạch:")
print(f"  Weather: {len(weather_df_clean)} dòng")
print(f"  Timeseries: {len(timeseries_df_clean)} dòng")

Trước khi làm sạch:
  Weather: 744 dòng
  Timeseries: 719 dòng

Sau khi làm sạch:
  Weather: 744 dòng
  Timeseries: 719 dòng


## 6. Join Facilities + Timeseries + Weather

In [None]:
# Join timeseries với facilities
fact_df = timeseries_df_clean.merge(
    facilities_df[['facility_code', 'facility_name', 'total_capacity_mw', 'network_region']],
    on='facility_code',
    how='left'
)

print(f"Sau join với facilities: {len(fact_df)} dòng")

# Fix: Nếu total_capacity_mw là null, dùng giá trị hardcode
# NYNGAN Solar Farm có công suất 102 MW (theo thông tin công khai)
NYNGAN_CAPACITY_MW = 102.0

fact_df['total_capacity_mw'] = fact_df['total_capacity_mw'].fillna(NYNGAN_CAPACITY_MW)
print(f"⚠️ Đã điền capacity cho NYNGAN: {NYNGAN_CAPACITY_MW} MW")

# Join với weather data (by date_only + hour)
fact_df = fact_df.merge(
    weather_df_clean[[
        'date_only', 'hour', 'irr_kwh_m2_hour', 
        'shortwave_radiation', 'temperature_2m', 'wind_speed_10m',
        'cloud_cover', 'precipitation'
    ]],
    on=['date_only', 'hour'],
    how='inner'
)

print(f"Sau join với weather: {len(fact_df)} dòng")

# Tạo các key dimensions
fact_df['date_key'] = pd.to_datetime(fact_df['date_only']).dt.strftime('%Y%m%d').astype(int)
fact_df['time_key'] = fact_df['hour'] * 100  # e.g., 1400 for 14:00
fact_df['facility_key'] = fact_df['facility_code']

print("\nFact table columns:")
print(fact_df.columns.tolist())

print("\nMẫu fact table:")
fact_df.head()

Sau join với facilities: 719 dòng
Sau join với weather: 719 dòng

Fact table columns:
['facility_code', 'unit_code', 'interval_start', 'energy', 'network_id', 'interval_datetime', 'interval_local', 'date_only', 'hour', 'energy_mwh', 'facility_name', 'total_capacity_mw', 'network_region', 'irr_kwh_m2_hour', 'shortwave_radiation', 'temperature_2m', 'wind_speed_10m', 'cloud_cover', 'precipitation', 'date_key', 'time_key', 'facility_key']

Mẫu fact table:


Unnamed: 0,facility_code,unit_code,interval_start,energy,network_id,interval_datetime,interval_local,date_only,hour,energy_mwh,...,network_region,irr_kwh_m2_hour,shortwave_radiation,temperature_2m,wind_speed_10m,cloud_cover,precipitation,date_key,time_key,facility_key
0,NYNGAN,NYNGAN1,2025-10-16T17:00:00+10:00,19.1251,NEM,2025-10-16 17:00:00+10:00,2025-10-16 18:00:00+11:00,2025-10-16,18,19.1251,...,NSW1,0.254,254.0,31.3,14.2,67,0.1,20251016,1800,NYNGAN
1,NYNGAN,NYNGAN1,2025-10-16T18:00:00+10:00,3.2916,NEM,2025-10-16 18:00:00+10:00,2025-10-16 19:00:00+11:00,2025-10-16,19,3.2916,...,NSW1,0.074,74.0,30.0,10.9,81,0.0,20251016,1900,NYNGAN
2,NYNGAN,NYNGAN1,2025-10-16T19:00:00+10:00,3.1248,NEM,2025-10-16 19:00:00+10:00,2025-10-16 20:00:00+11:00,2025-10-16,20,3.1248,...,NSW1,0.002,2.0,27.1,11.3,61,0.0,20251016,2000,NYNGAN
3,NYNGAN,NYNGAN1,2025-10-16T20:00:00+10:00,2.8644,NEM,2025-10-16 20:00:00+10:00,2025-10-16 21:00:00+11:00,2025-10-16,21,2.8644,...,NSW1,0.0,0.0,25.5,12.2,7,0.0,20251016,2100,NYNGAN
4,NYNGAN,NYNGAN1,2025-10-16T21:00:00+10:00,0.0,NEM,2025-10-16 21:00:00+10:00,2025-10-16 22:00:00+11:00,2025-10-16,22,0.0,...,NSW1,0.0,0.0,23.8,9.7,11,0.0,20251016,2200,NYNGAN


## 7. Tính toán Metrics theo công thức DAX

### 7.1. Total Energy (MWh)

In [8]:
# Total Energy (MWh) = SUM(fact_solar_environmental[energy_mwh])
total_energy_mwh = fact_df['energy_mwh'].sum()

print(f"Total Energy (MWh): {total_energy_mwh:,.2f}")

Total Energy (MWh): 7,950.94


### 7.2. Performance Ratio - PR (%)

In [9]:
# PR (%) = (Yf / Yr) * 100
# Yf = (E * 1000) / PkW  (kWh/kWp)
# Yr = SUM(irr_kwh_m2_hour)  (kWh/m²)

E = total_energy_mwh

# PkW = total capacity in kW
PkW = fact_df['total_capacity_mw'].iloc[0] * 1000  # NYNGAN capacity

# Yf = kWh / kWp
Yf = (E * 1000) / PkW

# Yr = total irradiance kWh/m²
Yr = fact_df['irr_kwh_m2_hour'].sum()

# PR
if Yr > 0:
    PR = (Yf / Yr) * 100
else:
    PR = None

print(f"\nPerformance Ratio Calculation:")
print(f"  E (Total Energy): {E:,.2f} MWh")
print(f"  PkW (Capacity): {PkW:,.2f} kW")
print(f"  Yf (Specific Yield): {Yf:,.2f} kWh/kWp")
print(f"  Yr (Reference Yield): {Yr:,.2f} kWh/m²")
print(f"  PR (Performance Ratio): {PR:.2f}%" if PR else "  PR: N/A")


Performance Ratio Calculation:
  E (Total Energy): 7,950.94 MWh
  PkW (Capacity): nan kW
  Yf (Specific Yield): nan kWh/kWp
  Yr (Reference Yield): 209.73 kWh/m²
  PR (Performance Ratio): nan%


### 7.3. Capacity Factor Calendar - CF Calendar (%)

In [10]:
# CF Calendar (%) = (E / (P * H)) * 100
# H = distinct dates * distinct hours

E = total_energy_mwh
P = fact_df['total_capacity_mw'].iloc[0]  # MW

# Calendar hours (all possible hours in the period)
num_distinct_dates = fact_df['date_key'].nunique()
num_distinct_hours = fact_df['time_key'].nunique()
H_calendar = num_distinct_dates * num_distinct_hours

CF_calendar = (E / (P * H_calendar)) * 100 if H_calendar > 0 else None

print(f"\nCapacity Factor (Calendar) Calculation:")
print(f"  E (Total Energy): {E:,.2f} MWh")
print(f"  P (Capacity): {P:,.2f} MW")
print(f"  Distinct Dates: {num_distinct_dates}")
print(f"  Distinct Hours: {num_distinct_hours}")
print(f"  H (Calendar Hours): {H_calendar}")
print(f"  CF Calendar: {CF_calendar:.2f}%" if CF_calendar else "  CF Calendar: N/A")


Capacity Factor (Calendar) Calculation:
  E (Total Energy): 7,950.94 MWh
  P (Capacity): nan MW
  Distinct Dates: 31
  Distinct Hours: 24
  H (Calendar Hours): 744
  CF Calendar: nan%


### 7.4. Capacity Factor Observed - CF Observed (%)

In [11]:
# CF Observed (%) = (E / (P * H)) * 100
# H = distinct count of (date_key & time_key) combinations

E = total_energy_mwh
P = fact_df['total_capacity_mw'].iloc[0]  # MW

# Observed hours (actual data points)
fact_df['datetime_key'] = fact_df['date_key'].astype(str) + '_' + fact_df['time_key'].astype(str)
H_observed = fact_df['datetime_key'].nunique()

CF_observed = (E / (P * H_observed)) * 100 if H_observed > 0 else None

print(f"\nCapacity Factor (Observed) Calculation:")
print(f"  E (Total Energy): {E:,.2f} MWh")
print(f"  P (Capacity): {P:,.2f} MW")
print(f"  H (Observed Hours): {H_observed}")
print(f"  CF Observed: {CF_observed:.2f}%" if CF_observed else "  CF Observed: N/A")


Capacity Factor (Observed) Calculation:
  E (Total Energy): 7,950.94 MWh
  P (Capacity): nan MW
  H (Observed Hours): 719
  CF Observed: nan%


### 7.5. Specific Energy Yield - SEY (kWh/kWp)

In [12]:
# SEY (kWh/kWp) = (Total Energy * 1000) / (Total Capacity * 1000)

E = total_energy_mwh
PkW = fact_df['total_capacity_mw'].iloc[0] * 1000

SEY = (E * 1000) / PkW if PkW > 0 else None

print(f"\nSpecific Energy Yield Calculation:")
print(f"  E (Total Energy): {E:,.2f} MWh = {E*1000:,.2f} kWh")
print(f"  PkW (Capacity): {PkW:,.2f} kW")
print(f"  SEY (Specific Energy Yield): {SEY:.2f} kWh/kWp" if SEY else "  SEY: N/A")


Specific Energy Yield Calculation:
  E (Total Energy): 7,950.94 MWh = 7,950,938.60 kWh
  PkW (Capacity): nan kW
  SEY: N/A


## 8. Tổng hợp tất cả Metrics

In [None]:
# Tạo summary metrics
metrics_summary = pd.DataFrame([
    {
        'Metric': 'Total Energy',
        'Value': total_energy_mwh,
        'Unit': 'MWh',
        'Description': 'Tổng năng lượng sản xuất'
    },
    {
        'Metric': 'Performance Ratio (PR)',
        'Value': PR,
        'Unit': '%',
        'Description': 'Tỷ lệ hiệu suất so với lý thuyết'
    },
    {
        'Metric': 'Capacity Factor (Calendar)',
        'Value': CF_calendar,
        'Unit': '%',
        'Description': 'Hệ số công suất (theo lịch)'
    },
    {
        'Metric': 'Capacity Factor (Observed)',
        'Value': CF_observed,
        'Unit': '%',
        'Description': 'Hệ số công suất (dữ liệu thực tế)'
    },
    {
        'Metric': 'Specific Energy Yield (SEY)',
        'Value': SEY,
        'Unit': 'kWh/kWp',
        'Description': 'Năng lượng riêng trên công suất'
    },
])

print("\n" + "="*70)
print("TỔNG HỢP METRICS - TRẠM NYNGAN")
print("="*70)
print(f"Khoảng thời gian: {fact_df['date_only'].min()} → {fact_df['date_only'].max()}")
print(f"Số giờ dữ liệu: {len(fact_df)}")
print(f"Công suất lắp đặt: {P:.2f} MW")
print("="*70)
print()
display(metrics_summary)

## 9. Lưu kết quả

In [None]:
# Lưu fact table
output_fact = data_dir / "fact_solar_environmental.csv"
fact_df.to_csv(output_fact, index=False)
print(f"✓ Đã lưu fact table: {output_fact}")
print(f"  Số dòng: {len(fact_df)}")
print(f"  Kích thước: {output_fact.stat().st_size / 1024:.2f} KB")

# Lưu metrics summary
output_metrics = data_dir / "metrics_summary.csv"
metrics_summary.to_csv(output_metrics, index=False)
print(f"\n✓ Đã lưu metrics summary: {output_metrics}")

## 10. Visualize một vài metrics

In [None]:
import matplotlib.pyplot as plt

# Aggregate daily
daily_df = fact_df.groupby('date_only').agg({
    'energy_mwh': 'sum',
    'irr_kwh_m2_hour': 'sum',
}).reset_index()

# Plot
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Energy production
axes[0].plot(daily_df['date_only'], daily_df['energy_mwh'], marker='o', linewidth=1.5)
axes[0].set_title('Daily Energy Production', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Energy (MWh)')
axes[0].grid(True, alpha=0.3)

# Irradiance
axes[1].plot(daily_df['date_only'], daily_df['irr_kwh_m2_hour'], marker='o', color='orange', linewidth=1.5)
axes[1].set_title('Daily Solar Irradiance', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Irradiance (kWh/m²)')
axes[1].set_xlabel('Date')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.xticks(rotation=45)
plt.show()

print(f"\n✓ Đã visualize daily trends")

## 11. Chi tiết fact table

In [None]:
print("Columns trong fact table:")
print(fact_df.columns.tolist())

print("\nData types:")
fact_df.info()

print("\nMẫu dữ liệu (10 dòng đầu):")
display(fact_df.head(10))