In [2]:
import pandas as pd

# Load the raw forecast weather export (2024-2049 according to the file name).
raw_forecast_path = '../Data/raw/forecast weather/forecast_weather_2024-2049_raw.csv'
df = pd.read_csv(raw_forecast_path)

In [3]:
def extract_year(date_str):
    """Return the year encoded in a '/'-separated date string as an int.

    The year is taken from the text after the last '/'. A two-character
    year is expanded by prefixing '20' (so '24' becomes 2024); any other
    length is converted as-is.

    Raises:
        ValueError: if the year text cannot be parsed by ``int``.
    """
    year_text = date_str.rsplit('/', 1)[-1]
    # Two-character years are assumed to belong to the 2000s.
    full_year_text = '20' + year_text if len(year_text) == 2 else year_text
    return int(full_year_text)

def classify_season(date_str):
    """Return the season label for a '/'-separated date string.

    The month is read from the second '/'-separated field, i.e. the
    function assumes the date is written day/month/year -- TODO confirm
    against the raw CSV.

    Mapping used by this notebook:
        12, 1, 2  -> 'Winter'
        3 .. 9    -> 'Summer'
        10, 11    -> 'Autumn'

    Raises:
        IndexError: if the string has no second '/'-separated field.
        ValueError: if the month field is not an integer, or is not in
            1..12. Previously such values were silently labelled
            'Autumn', which would hide a month/day mix-up in the input.
    """
    month = int(date_str.split('/')[1])  # second field is the month
    if month in (12, 1, 2):
        return 'Winter'
    if 3 <= month <= 9:
        return 'Summer'
    if month in (10, 11):
        return 'Autumn'
    # Anything else is not a calendar month; fail loudly instead of guessing.
    raise ValueError(
        f"Month must be in 1..12, got {month} (from date string {date_str!r})"
    )

In [4]:
# Derive the calendar year and the season label from each row's 'time' string.
time_column = df['time']
df['year'] = time_column.apply(extract_year)
df['season'] = time_column.apply(classify_season)

In [5]:
# Average the weather fields within each (state, year, season) group.
group_keys = ['state', 'year', 'season']
averaged_fields = [
    'temperature_2m_min (°C)',
    'temperature_2m_max (°C)',
    'precipitation_sum (mm)',
    'snowfall_sum (cm)',
]
df = (
    df.groupby(group_keys)
    .agg(dict.fromkeys(averaged_fields, 'mean'))  # same 'mean' for every field
    .reset_index()
    .fillna(0)  # missing averages are written out as 0
)

# Drop the trailing " (unit)" part of the column names,
# e.g. 'temperature_2m_min (°C)' -> 'temperature_2m_min'.
df.columns = df.columns.str.replace(r' \(\D+\)', '', regex=True)

# Keep only the years 2024 through 2028 (both ends included).
df = df[df['year'].between(2024, 2028)]

In [7]:
# Keep only the forecast rows whose state also appears in the cleaned
# historical weather data, then write the result to the clean data folder.
historical_weather = pd.read_excel('../Data/clean/historical_weather.xlsx')
states = historical_weather[['state']].drop_duplicates().reset_index(drop=True)
merged_df = df.merge(states, on='state', how='inner')

merged_df.to_excel('../Data/clean/forecast_weather.xlsx', index=False)