# Import

In [1]:
import requests
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from datetime import datetime
from astral import LocationInfo
from astral.sun import sun
from pysolar.solar import get_altitude
import pytz

from dateutil.relativedelta import relativedelta

from tqdm import tqdm


# Variable

In [None]:
ELECTIC_API_KEY = "pz0pOPI21dJbBbf8RfnUWeTk5RTxNA1FDvip8vmv"
SERIES_ID = "EBA.CAL-ALL.D.H"  

START_DATE = "2020-01-01"
END_DATE = "2025-11-19"
YEAR = 2025

RENEWABLE = ["SUN", "WND"]

LAT = 36.7783
LON = -119.4179
TIME_ZONE_SEASON = 'US/Pacific'
TIME_ZONE_WATHER = "America/Los_Angeles"
CITY = LocationInfo(name="California", region="USA", timezone=TIME_ZONE_SEASON, latitude=LAT, longitude=LON)
tzinfo = pytz.timezone(TIME_ZONE_SEASON)

SOLAR = {
    2020: 34.95,
    2021: 37.75,
    2022: 42.27,
    2023: 50.48,
    2024: 56.27,
    2025: 65.43,
}

WIND = {
    2020: 2400,
    2021: 2491,
    2022: 2465,
    2023: 2514,
    2024: 2544,
    2025: 2593,
}

# Season

In [3]:
def get_season(d):
    m = d.month
    day = d.day
    if (m == 12 and day >= 21) or (m <= 3 and (m < 3 or (m == 3 and day <= 19))):
        return "Winter"
    elif (m == 3 and day >= 20) or (m < 6) or (m == 6 and day <= 20):
        return "Spring"
    elif (m == 6 and day >= 21) or (m < 9) or (m == 9 and day <= 21):
        return "Summer"
    else:
        return "Autumn"

In [None]:
def process_seasonal_data():
    period = pd.date_range(start=f"{START_DATE}", end=f"{END_DATE}", freq='h')
    season_df = pd.DataFrame({'period': period})

    season_df["date"] = season_df["period"].dt.normalize()

    season_df['day_of_year'] = season_df['date'].dt.dayofyear
    season_df['month'] = season_df['date'].dt.month

    season_df['sin_doy'] = np.sin(2 * np.pi * season_df['day_of_year'] / 365)
    season_df['cos_doy'] = np.cos(2 * np.pi * season_df['day_of_year'] / 365)

    season_df['season'] = season_df['date'].apply(get_season)

    season_df["time"] = season_df["period"].dt.strftime("%H:%M")
    
    sunrise_list, sunset_list, zenith_list = [], [], []

    for ts in tqdm(season_df['date']):
        s = sun(CITY.observer, date=ts.date(), tzinfo=tzinfo)
        sunrise_list.append(s['sunrise'].isoformat())
        sunset_list.append(s['sunset'].isoformat())

        local_noon = datetime(ts.year, ts.month, ts.day, 12, 0, 0, tzinfo=tzinfo)
        zenith_angle = 90 - get_altitude(LAT, LON, local_noon)
        zenith_list.append(zenith_angle)

    season_df['sunrise'] = sunrise_list
    season_df['sunset'] = sunset_list
    season_df['solar_zenith_noon_deg'] = zenith_list
    
    season_df['sunrise_time'] = season_df['sunrise'].apply(lambda x: x.split("T")[1].split(".")[0])
    season_df['sunset_time'] = season_df['sunset'].apply(lambda x: x.split("T")[1].split(".")[0])

    season_df['sunrise_time_h'] = season_df['sunrise'].apply(lambda x: int(x.split("T")[1].split(":")[0]))
    season_df['sunrise_time_m'] = season_df['sunrise'].apply(lambda x: int(x.split("T")[1].split(":")[1]))

    season_df['sunset_time_h'] = season_df['sunset'].apply(lambda x: int(x.split("T")[1].split(":")[0]))
    season_df['sunset_time_m'] = season_df['sunset'].apply(lambda x: int(x.split("T")[1].split(":")[1]))

    season_df['date'] = season_df['date'].dt.normalize()

    season_df["season"] = season_df["season"].map({
        "Winter": 1,
        "Spring": 2,
        "Summer": 3,
        "Autumn": 4
    })

    return season_df

In [None]:
season_df = process_seasonal_data()

feature_season = ['time', 'date', 'day_of_year', 'sin_doy', 'cos_doy', 'season', 'solar_zenith_noon_deg', 'sunrise_time_h', 'sunrise_time_m', 'sunset_time_h', 'sunset_time_m']
final_season_df = season_df[feature_season]

final_season_df

Unnamed: 0,time,date,day_of_year,sin_doy,cos_doy,season,solar_zenith_noon_deg,sunrise_time_h,sunrise_time_m,sunset_time_h,sunset_time_m
0,00:00,2020-01-01,1,0.017213,0.999852,1,59.774996,7,10,16,51
1,01:00,2020-01-01,1,0.017213,0.999852,1,59.774996,7,10,16,51
2,02:00,2020-01-01,1,0.017213,0.999852,1,59.774996,7,10,16,51
3,03:00,2020-01-01,1,0.017213,0.999852,1,59.774996,7,10,16,51
4,04:00,2020-01-01,1,0.017213,0.999852,1,59.774996,7,10,16,51
...,...,...,...,...,...,...,...,...,...,...,...
51572,20:00,2025-11-18,322,-0.674444,0.738326,4,56.230396,6,39,16,46
51573,21:00,2025-11-18,322,-0.674444,0.738326,4,56.230396,6,39,16,46
51574,22:00,2025-11-18,322,-0.674444,0.738326,4,56.230396,6,39,16,46
51575,23:00,2025-11-18,322,-0.674444,0.738326,4,56.230396,6,39,16,46


# Wather

In [9]:
hourly_wather_vars = [
    # Solar radiation
    "shortwave_radiation",
    "direct_radiation",
    "diffuse_radiation",
    "direct_normal_irradiance",

    # Cloud details
    "cloudcover",
    "cloudcover_low",
    "cloudcover_mid",
    "cloudcover_high",

    # Atmosphere
    "temperature_2m",
    "relativehumidity_2m",
    "dewpoint_2m",
    "surface_pressure",
    "vapour_pressure_deficit",

    # Wind (turbine levels + gust)
    "windspeed_10m",
    "winddirection_10m",
    "windspeed_100m",
    "winddirection_100m",
    "windgusts_10m",

    # Precipitation
    "precipitation",
    "rain",
    "snowfall",
]

In [None]:
def call_wather_data():
    wather_params = {
        "latitude": LAT,
        "longitude": LON,
        "timezone": TIME_ZONE_WATHER,
        "start_date": START_DATE,
        "end_date": END_DATE,
        "hourly": ",".join(hourly_wather_vars)
    }
    
    wather_url = "https://archive-api.open-meteo.com/v1/archive"
    wather_response = requests.get(wather_url, params=wather_params)
    wather_data = wather_response.json()

    if "error" in wather_data:
        print("Error:", wather_data["reason"])
    else:
        wather_hourly_df = pd.DataFrame(wather_data["hourly"])

    wather_hourly_df["date"] = pd.to_datetime(wather_hourly_df["time"]).dt.normalize()
    wather_hourly_df["time"] = pd.to_datetime(wather_hourly_df["time"]).dt.strftime("%H:%M")
    
    return wather_hourly_df

In [None]:
wather_hourly_df = call_wather_data()
wather_hourly_df

Unnamed: 0,time,shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,temperature_2m,...,vapour_pressure_deficit,windspeed_10m,winddirection_10m,windspeed_100m,winddirection_100m,windgusts_10m,precipitation,rain,snowfall,date
0,00:00,0.0,0.0,0.0,0.0,89,0,0,89,5.4,...,0.14,2.6,56,3.6,217,8.3,0.0,0.0,0.0,2020-01-01
1,01:00,0.0,0.0,0.0,0.0,99,0,0,99,3.9,...,0.08,4.0,63,5.5,169,10.1,0.0,0.0,0.0,2020-01-01
2,02:00,0.0,0.0,0.0,0.0,71,0,0,71,4.0,...,0.10,4.8,48,5.1,129,12.6,0.0,0.0,0.0,2020-01-01
3,03:00,0.0,0.0,0.0,0.0,61,0,0,61,3.6,...,0.08,3.3,41,3.2,297,11.9,0.0,0.0,0.0,2020-01-01
4,04:00,0.0,0.0,0.0,0.0,82,0,0,44,3.4,...,0.07,3.8,49,2.2,90,11.9,0.0,0.0,0.0,2020-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51595,19:00,0.0,0.0,0.0,0.0,3,0,0,2,10.1,...,0.21,5.9,45,8.5,36,8.6,0.0,0.0,0.0,2025-11-19
51596,20:00,0.0,0.0,0.0,0.0,4,0,0,4,9.4,...,0.15,4.9,54,6.4,72,9.0,0.0,0.0,0.0,2025-11-19
51597,21:00,0.0,0.0,0.0,0.0,99,0,0,99,8.8,...,0.12,6.8,58,8.3,90,11.2,0.0,0.0,0.0,2025-11-19
51598,22:00,0.0,0.0,0.0,0.0,95,0,1,95,8.4,...,0.10,6.0,57,10.0,99,13.0,0.0,0.0,0.0,2025-11-19


# Electic Genaration

In [13]:
def fetch_eia_data(start_date, end_date, api_key, respondents=["CISO"], fueltypes=["SUN","WND"]):
    dfs = []
    current_start = pd.to_datetime(start_date)
    final_end = pd.to_datetime(end_date)

    while current_start <= final_end:
        current_end = min(current_start + relativedelta(months=3) - pd.Timedelta(days=1), final_end)
        
        elec_url = (
            "https://api.eia.gov/v2/electricity/rto/fuel-type-data/data/"
            f"?api_key={api_key}"
            "&frequency=hourly"
            "&data[0]=value"
        )
        
        for r in respondents:
            elec_url += f"&facets[respondent][]={r}"
        for f in fueltypes:
            elec_url += f"&facets[fueltype][]={f}"
        
        elec_url += f"&start={current_start.strftime('%Y-%m-%dT%H')}"
        elec_url += f"&end={current_end.strftime('%Y-%m-%dT%H')}"
        
        resp = requests.get(elec_url)
        resp.raise_for_status()
        data = resp.json()
        
        if "response" in data and "data" in data["response"]:
            df = pd.DataFrame(data["response"]["data"])
            dfs.append(df)
        
        current_start = current_end + pd.Timedelta(hours=1)

    full_df = pd.concat(dfs, ignore_index=True)
    
    return full_df

In [None]:
def electic_data_preprocess(elec_res_df, solar, wind):
    elec_res_df["period"] = pd.to_datetime(elec_res_df["period"])

    elec_res_df["date"] = elec_res_df["period"].dt.normalize()

    elec_res_df["year"] = elec_res_df["period"].dt.year
    elec_res_df["month"] = elec_res_df["period"].dt.month
    elec_res_df["day"] = elec_res_df["period"].dt.day

    elec_res_df["time"] = elec_res_df["period"].dt.strftime("%H:%M")
    
    elec_renewable = elec_res_df[elec_res_df["fueltype"].isin(RENEWABLE)]
    elec_renewable["fueltype"] = elec_renewable["fueltype"].map({
        "SUN": 1,
        "WND": 2
    })
    
    elec_renewable["solar_count"] = elec_renewable["year"].map(solar)
    elec_renewable["wind_turbine_count"] = elec_renewable["year"].map(wind)
    
    elec_renewable = elec_renewable.dropna()
    
    return elec_renewable

In [None]:
elec_res_df = fetch_eia_data(START_DATE, END_DATE, ELECTIC_API_KEY)
elec_renewable = electic_data_preprocess(elec_res_df, SOLAR, WIND)

In [32]:
feature_req = ["time", "date", "day", "month", "year", "fueltype", "type-name", "solar_count", "wind_turbine_count", "value"]
final_reg_df = elec_renewable[feature_req].copy() 
final_reg_df

Unnamed: 0,time,date,day,month,year,fueltype,type-name,solar_count,wind_turbine_count,value
0,00:00,2020-03-31,31,3,2020,1,Solar,34.95,2400,6676
1,00:00,2020-03-31,31,3,2020,2,Wind,34.95,2400,2712
2,23:00,2020-03-30,30,3,2020,1,Solar,34.95,2400,7032
3,23:00,2020-03-30,30,3,2020,2,Wind,34.95,2400,2140
4,22:00,2020-03-30,30,3,2020,1,Solar,34.95,2400,8007
...,...,...,...,...,...,...,...,...,...,...
103144,02:00,2025-09-08,8,9,2025,2,Wind,65.43,2593,1666
103145,01:00,2025-09-08,8,9,2025,1,Solar,65.43,2593,15468
103146,01:00,2025-09-08,8,9,2025,2,Wind,65.43,2593,1646
103147,00:00,2025-09-08,8,9,2025,1,Solar,65.43,2593,16671


# Merge

In [44]:
merged_df = final_reg_df.merge(
    wather_hourly_df,
    on=["date", "time"],
    how="left"
)

merged_df = merged_df.merge(
    final_season_df,
    on=["date", "time"],
    how="left"
)

merged_df = merged_df.sort_values(by=["date", "time"]).reset_index(drop=True)
merged_df

Unnamed: 0,time,date,day,month,year,fueltype,type-name,solar_count,wind_turbine_count,value,...,snowfall,day_of_year,sin_doy,cos_doy,season,solar_zenith_noon_deg,sunrise_time_h,sunrise_time_m,sunset_time_h,sunset_time_m
0,00:00,2020-01-01,1,1,2020,1,Solar,34.95,2400,1403,...,0.0,1,0.017213,0.999852,1,59.774996,7,10,16,51
1,00:00,2020-01-01,1,1,2020,2,Wind,34.95,2400,225,...,0.0,1,0.017213,0.999852,1,59.774996,7,10,16,51
2,01:00,2020-01-01,1,1,2020,1,Solar,34.95,2400,-35,...,0.0,1,0.017213,0.999852,1,59.774996,7,10,16,51
3,01:00,2020-01-01,1,1,2020,2,Wind,34.95,2400,222,...,0.0,1,0.017213,0.999852,1,59.774996,7,10,16,51
4,02:00,2020-01-01,1,1,2020,1,Solar,34.95,2400,-30,...,0.0,1,0.017213,0.999852,1,59.774996,7,10,16,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103120,22:00,2025-11-18,18,11,2025,2,Wind,65.43,2593,1398,...,0.0,322,-0.674444,0.738326,4,56.230396,6,39,16,46
103121,23:00,2025-11-18,18,11,2025,1,Solar,65.43,2593,10192,...,0.0,322,-0.674444,0.738326,4,56.230396,6,39,16,46
103122,23:00,2025-11-18,18,11,2025,2,Wind,65.43,2593,1158,...,0.0,322,-0.674444,0.738326,4,56.230396,6,39,16,46
103123,00:00,2025-11-19,19,11,2025,1,Solar,65.43,2593,10026,...,0.0,323,-0.661635,0.749826,4,56.458286,6,40,16,45


In [45]:
merged_df.isna().sum()

time                        0
date                        0
day                         0
month                       0
year                        0
fueltype                    0
type-name                   0
solar_count                 0
wind_turbine_count          0
value                       0
shortwave_radiation         0
direct_radiation            0
diffuse_radiation           0
direct_normal_irradiance    0
cloudcover                  0
cloudcover_low              0
cloudcover_mid              0
cloudcover_high             0
temperature_2m              0
relativehumidity_2m         0
dewpoint_2m                 0
surface_pressure            0
vapour_pressure_deficit     0
windspeed_10m               0
winddirection_10m           0
windspeed_100m              0
winddirection_100m          0
windgusts_10m               0
precipitation               0
rain                        0
snowfall                    0
day_of_year                 0
sin_doy                     0
cos_doy   

# Post Process

In [None]:
def final_post_process(merged_df):
    merged_df["time"] = merged_df["time"].str.slice(0, 2).astype(int)

    merged_df["sin_time"] = np.sin(2 * np.pi * merged_df["time"] / 24)
    merged_df["cos_time"] = np.cos(2 * np.pi * merged_df["time"] / 24)

    merged_df["day_of_month_sin"] = np.sin(2 * np.pi * merged_df["day"] / 31)
    merged_df["day_of_month_cos"] = np.cos(2 * np.pi * merged_df["day"] / 31)

    merged_df["month_of_year_sin"] = np.sin(2 * np.pi * merged_df["month"] / 12)
    merged_df["month_of_year_cos"] = np.cos(2 * np.pi * merged_df["month"] / 12)
    
    merged_df['date'] = merged_df['date'].astype(str)
    df_cleaned = merged_df.dropna(subset=['value'])
    df_cleaned['value'] = df_cleaned['value'].astype(int)
    
    return df_cleaned

# Datatype Config

In [None]:
df_cleaned = final_post_process(merged_df)

In [48]:
for i in list(df_cleaned.columns):
  print(i,", Type: " + df_cleaned[i].dtype.name)

time , Type: int64
date , Type: object
day , Type: int32
month , Type: int32
year , Type: int32
fueltype , Type: int64
type-name , Type: object
solar_count , Type: float64
wind_turbine_count , Type: int64
value , Type: int64
shortwave_radiation , Type: float64
direct_radiation , Type: float64
diffuse_radiation , Type: float64
direct_normal_irradiance , Type: float64
cloudcover , Type: int64
cloudcover_low , Type: int64
cloudcover_mid , Type: int64
cloudcover_high , Type: int64
temperature_2m , Type: float64
relativehumidity_2m , Type: int64
dewpoint_2m , Type: float64
surface_pressure , Type: float64
vapour_pressure_deficit , Type: float64
windspeed_10m , Type: float64
winddirection_10m , Type: int64
windspeed_100m , Type: float64
winddirection_100m , Type: int64
windgusts_10m , Type: float64
precipitation , Type: float64
rain , Type: float64
snowfall , Type: float64
day_of_year , Type: int32
sin_doy , Type: float64
cos_doy , Type: float64
season , Type: int64
solar_zenith_noon_deg , T

# Upload to hugging face

In [49]:
from huggingface_hub import login
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import HfApi

In [50]:
repo_id = "wachawich/REG-Forecasting_v2"
dataset = Dataset.from_pandas(df_cleaned)

In [None]:
login("YOUR_HF_TOKEN")

In [51]:
dataset.push_to_hub(repo_id, private=False)

Creating parquet from Arrow format: 100%|██████████| 104/104 [00:00<00:00, 496.06ba/s]
Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (0 / 1)                :  16%|█▌        |  674kB / 4.19MB,  337kB/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing Files (0 / 1)                :  32%|███▏      | 1.35MB / 4.19MB,  306kB/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing Files (0 / 1)                :  48%|████▊     | 2.02MB / 4.19MB,  280kB/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing Files (0 / 1)                :  64%|██████▍   | 2.69MB / 4.19MB,  281kB/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing Files (0 / 1)                :  80%|████████  | 3.37MB / 4.19MB,  264kB/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing Files (0 / 1)                :  97%|█████████▋| 4.04MB / 4.19MB,  330kB/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing Files (1 / 1)

CommitInfo(commit_url='https://huggingface.co/datasets/wachawich/REG-Forecasting_v2/commit/b6b6294e697e670f07edbef9ac80bc44befb41da', commit_message='Upload dataset', commit_description='', oid='b6b6294e697e670f07edbef9ac80bc44befb41da', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/wachawich/REG-Forecasting_v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='wachawich/REG-Forecasting_v2'), pr_revision=None, pr_num=None)

## Test Load

In [52]:
dss = load_dataset("wachawich/REG-Forecasting_v2")
df = dss["train"].to_pandas()
df.isna().sum()

Generating train split: 100%|██████████| 103125/103125 [00:00<00:00, 619114.59 examples/s]


time                        0
date                        0
day                         0
month                       0
year                        0
fueltype                    0
type-name                   0
solar_count                 0
wind_turbine_count          0
value                       0
shortwave_radiation         0
direct_radiation            0
diffuse_radiation           0
direct_normal_irradiance    0
cloudcover                  0
cloudcover_low              0
cloudcover_mid              0
cloudcover_high             0
temperature_2m              0
relativehumidity_2m         0
dewpoint_2m                 0
surface_pressure            0
vapour_pressure_deficit     0
windspeed_10m               0
winddirection_10m           0
windspeed_100m              0
winddirection_100m          0
windgusts_10m               0
precipitation               0
rain                        0
snowfall                    0
day_of_year                 0
sin_doy                     0
cos_doy   

In [53]:
df

Unnamed: 0,time,date,day,month,year,fueltype,type-name,solar_count,wind_turbine_count,value,...,sunrise_time_h,sunrise_time_m,sunset_time_h,sunset_time_m,sin_time,cos_time,day_of_month_sin,day_of_month_cos,month_of_year_sin,month_of_year_cos
0,0,2020-01-01,1,1,2020,1,Solar,34.95,2400,1403,...,7,10,16,51,0.000000,1.000000,0.201299,0.979530,0.5,0.866025
1,0,2020-01-01,1,1,2020,2,Wind,34.95,2400,225,...,7,10,16,51,0.000000,1.000000,0.201299,0.979530,0.5,0.866025
2,1,2020-01-01,1,1,2020,1,Solar,34.95,2400,-35,...,7,10,16,51,0.258819,0.965926,0.201299,0.979530,0.5,0.866025
3,1,2020-01-01,1,1,2020,2,Wind,34.95,2400,222,...,7,10,16,51,0.258819,0.965926,0.201299,0.979530,0.5,0.866025
4,2,2020-01-01,1,1,2020,1,Solar,34.95,2400,-30,...,7,10,16,51,0.500000,0.866025,0.201299,0.979530,0.5,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103120,22,2025-11-18,18,11,2025,2,Wind,65.43,2593,1398,...,6,39,16,46,-0.500000,0.866025,-0.485302,-0.874347,-0.5,0.866025
103121,23,2025-11-18,18,11,2025,1,Solar,65.43,2593,10192,...,6,39,16,46,-0.258819,0.965926,-0.485302,-0.874347,-0.5,0.866025
103122,23,2025-11-18,18,11,2025,2,Wind,65.43,2593,1158,...,6,39,16,46,-0.258819,0.965926,-0.485302,-0.874347,-0.5,0.866025
103123,0,2025-11-19,19,11,2025,1,Solar,65.43,2593,10026,...,6,40,16,45,0.000000,1.000000,-0.651372,-0.758758,-0.5,0.866025
