**Riyadh data preprocessing**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# STEP 1: Read the cleaned weather CSV into a DataFrame
data = pd.read_csv(r'D:\Projects\quantum forecasting\dataset\riyadh data\cleaned_weather_data.csv')

# STEP 2: Build a calendar date from the year and the day-of-year columns.
# January 1st of each row's year, then shifted by (julian_day - 1) days
# (presumably julian_day is 1-based, so day 1 maps to January 1st).
year_start = pd.to_datetime(data['year'].astype(str), format='%Y')
day_offset = pd.to_timedelta(data['julian_day'] - 1, unit='D')
data['date'] = year_start + day_offset

# STEP 3: Discard the site identifier and the Fahrenheit temperature columns,
# which are not used anywhere further down this script
data = data.drop(columns=["site_id", "max_temp_fahrenheit", "min_temp_fahrenheit"])

# STEP 4: Feature engineering
# Pull the daily extremes out of the frame, then store their midpoint and
# spread in their place.
daily_max = data.pop("max_temp_celsius")
daily_min = data.pop("min_temp_celsius")
data["mean_temp_celsius"] = (daily_max + daily_min) / 2
data["temp_range"] = daily_max - daily_min

# STEP 5: Seasonality encoding - keep and add more time features
# Number of days in each row's year. The Gregorian leap-year rule is applied
# to the 'year' column so that every leap year present in the data gets 366,
# not only a hard-coded list of years.
years = data["year"]
is_leap_year = (years % 4 == 0) & ((years % 100 != 0) | (years % 400 == 0))
data["year_days"] = np.where(is_leap_year, 366, 365)

# Map the day of the year onto the unit circle so that the end of one year
# sits next to the start of the following one.
day_angle = 2 * np.pi * data["julian_day"] / data["year_days"]
data["sin_day"] = np.sin(day_angle)
data["cos_day"] = np.cos(day_angle)

# Additional calendar features derived from the date column
data['month'] = data['date'].dt.month
data['weekday'] = data['date'].dt.weekday  # Monday=0 ... Sunday=6

# The raw day-of-year and the helper column are redundant once the cyclical
# encoding exists.
data.drop(columns=["julian_day", "year_days"], inplace=True)

# STEP 6: Keep only rows without any missing value
data = data.dropna()

# STEP 7: Chronological split - years up to and including 2014 are used for
# training, later years for testing
is_train_row = data["year"] <= 2014
train_data = data.loc[is_train_row].copy()
test_data = data.loc[~is_train_row].copy()

# STEP 8: Model inputs are every column except the split key, the target
# and the date
non_feature_cols = {"year", "solar_radiation_langley", "date"}
feature_cols = [col for col in data.columns if col not in non_feature_cols]

# The cyclical day encodings are passed through unscaled; every other
# feature goes through the scaler
feature_cols_no_scale = ['sin_day', 'cos_day']
feature_cols_to_scale = [col for col in feature_cols if col not in feature_cols_no_scale]

# STEP 9 / STEP 10: Standardise features and target with separate scalers.
# Both scalers are fitted on the training split only and then applied to the
# test split.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

train_scaled_array = scaler_X.fit_transform(train_data[feature_cols_to_scale])
train_target_scaled = scaler_y.fit_transform(train_data[['solar_radiation_langley']])

test_scaled_array = scaler_X.transform(test_data[feature_cols_to_scale])
test_target_scaled = scaler_y.transform(test_data[['solar_radiation_langley']])

# STEP 11: Write the scaled values into copies of the splits, leaving
# train_data / test_data themselves untouched
train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

for scaled_frame, source_frame, scaled_features, scaled_target in (
    (train_data_scaled, train_data, train_scaled_array, train_target_scaled),
    (test_data_scaled, test_data, test_scaled_array, test_target_scaled),
):
    # Scaled feature columns overwrite the raw ones
    scaled_frame[feature_cols_to_scale] = scaled_features
    # sin_day and cos_day keep their original values
    scaled_frame[feature_cols_no_scale] = source_frame[feature_cols_no_scale]
    # Scaled target overwrites the raw target
    scaled_frame['solar_radiation_langley'] = scaled_target

# STEP 12: Write both processed splits to CSV for Informer; the 'date'
# column is kept in the output for indexing and the DataFrame index is not
# written
for processed_frame, output_path in (
    (train_data_scaled, r'D:\Projects\quantum forecasting\dataset\train_data.csv'),
    (test_data_scaled, r'D:\Projects\quantum forecasting\dataset\test_data.csv'),
):
    processed_frame.to_csv(output_path, index=False)


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# STEP 1: Reload the raw cleaned dataset and show its column names
raw_csv_path = r'D:\Projects\quantum forecasting\dataset\riyadh data\cleaned_weather_data.csv'
data = pd.read_csv(raw_csv_path)
data.columns

Index(['site_id', 'year', 'julian_day', 'day_length_seconds',
       'precipitation_mm', 'solar_radiation_langley',
       'snow_water_equivalent_mm', 'max_temp_celsius', 'min_temp_celsius',
       'vapor_pressure_pa', 'max_temp_fahrenheit', 'min_temp_fahrenheit'],
      dtype='object')