In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Folder holding the raw CSV files; edit this one constant to run elsewhere
DATA_DIR = r'C:\Users\torst\anaconda3\O4\Datat\Energy and weather datasets'

# Read the Energy and Weather datasets from CSV files
energy_dataset = pd.read_csv(rf'{DATA_DIR}\energy_dataset.csv')
weather_dataset = pd.read_csv(rf'{DATA_DIR}\weather_features.csv')

# Remove unnecessary weather features (reassign instead of mutating in place)
weather_features_to_drop = ['weather_main', 'weather_description', 'pressure', 'rain_1h', 'rain_3h', 'snow_3h', 'weather_id']
weather_dataset = weather_dataset.drop(columns=weather_features_to_drop)

# Remove specific energy generation types
energy_generation_to_drop = ['generation fossil coal-derived gas', 'generation fossil peat', 'generation geothermal', 
                             'generation fossil oil shale', 'forecast wind offshore eday ahead', 
                             'generation hydro pumped storage aggregated', 'generation marine', 
                             'generation wind offshore']
energy_dataset = energy_dataset.drop(columns=energy_generation_to_drop)

# Average the weather readings that share a timestamp and rename the key so it
# matches the energy frame. numeric_only=True keeps the aggregation working when
# the weather file still contains text columns (pandas >= 2.0 raises on them
# instead of silently dropping them).
weather_dataset_mean = (
    weather_dataset
    .groupby('dt_iso', as_index=False)
    .mean(numeric_only=True)
    .rename(columns={'dt_iso': 'time'})
)

# Merge the energy and weather datasets on their shared 'time' column.
# NOTE(review): both keys are whatever read_csv produced (no date parsing is
# requested above), so rows match only when the timestamp text is identical.
merged_dataset = pd.merge(energy_dataset, weather_dataset_mean, on='time', how='inner')


In [12]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

# Fill missing values with 0. The result gets its own name so the merged frame
# is left untouched and this cell can be re-run without errors.
filled_dataset = merged_dataset.fillna(0)

# Target: absolute gap between the actual and the day-ahead price
price_actual = filled_dataset['price actual'].values
price_day_ahead = filled_dataset['price day ahead'].values
price_discrep = abs(price_actual - price_day_ahead)

time_stamps = filled_dataset['time']

# Features that are fed to the model as lagged values
generationList = ["generation biomass", "generation fossil gas", "generation fossil hard coal",
                  "generation fossil oil", "generation hydro run-of-river and poundage",
                  "generation hydro water reservoir", "generation nuclear", "generation other",
                  "generation other renewable", "generation solar", "generation waste",
                  "generation wind onshore", "total load actual", "temp", "temp_min", "temp_max",
                  "humidity", "wind_speed", "wind_deg", "clouds_all"]

# Columns that are never model inputs: the price columns define the target,
# 'time' is only a label, and two generation columns are discarded outright.
excluded_columns = ['price actual', 'price day ahead', 'time',
                    'generation hydro pumped storage consumption',
                    'generation fossil brown coal/lignite']

# Split the frame into the to-be-lagged features and everything else
newDataset = filled_dataset[generationList]
current_features = filled_dataset.drop(columns=excluded_columns + generationList)

# Lag the features so row i holds the values from LAG_HOURS rows earlier.
# A positive shift looks into the past; the previous shift(-24) pulled values
# from 24 rows in the future, which leaks information into the model.
# NOTE(review): assumes one row per hour in chronological order - confirm.
LAG_HOURS = 24
newDataset_24h_ago = newDataset.shift(LAG_HOURS)

# Combine lagged and current columns on the shared row index. The two frames
# have no column in common, so no '_x'/'_y' suffixes appear (attaching 'time'
# to both sides before merging produced 'time_x'/'time_y' and broke the later
# drop("time")). The first LAG_HOURS rows have no lagged values and are removed.
merged_24_dataset = newDataset_24h_ago.join(current_features).iloc[LAG_HOURS:]

# Visualize histogram
merged_24_dataset.hist(bins=100, figsize=(15, 15))
plt.show()

# Remove the same leading rows from the target so X and y stay aligned
price_discrep = price_discrep[LAG_HOURS:]

# Split the data for training and validation
X = merged_24_dataset
y = price_discrep

X_train, X_val, y_train, y_val = train_test_split(
    X, y, shuffle=True, test_size=0.5, random_state=42
)

degrees = [1, 2]

for degree in degrees:
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([
        ("polynomial_features", polynomial_features),
        ("linear_regression", linear_regression)
    ])

    # Evaluate using cross-validation. cross_val_score fits the pipeline itself
    # on every fold, so no separate fit on the full training set is needed.
    scores = cross_val_score(pipeline, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
    score_mean = scores.mean()
    
    print(f"Degree={degree:4d}, Mean Score={score_mean:4.2f}, Polynomial Features: {polynomial_features}")
    for i, score in enumerate(scores):
        print(f"CV fold {i}  =>  Score = {score:.2}")

# Set up preprocessing pipeline for polynomial regression
poly_scaler = Pipeline([
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
    ("std_scaler", StandardScaler())
])

# Fit the expansion and scaling on the training half only, then reuse the
# fitted transform for the validation half
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)

# Define function for training
def Train(X_train, y_train, X_val, y_val, n_epochs, verbose=False):
    """Fit an SGD regressor repeatedly and record its error after every round.

    One SGDRegressor is created with a constant learning rate, no penalty and
    warm_start=True, then fit() is called n_epochs times on the training data.
    After each call the mean squared error on the training and validation sets
    is stored.

    NOTE(review): max_iter=10 together with tol=-inf presumably means every
    fit() call performs ten passes over the data, so one "epoch" here would be
    ten solver passes - confirm against the scikit-learn documentation.

    Parameters:
        X_train, y_train: features and targets used for fitting.
        X_val, y_val: features and targets used only for evaluation.
        n_epochs: number of fit() calls to perform.
        verbose: when true, print both errors after every round.

    Returns:
        (train_errors, val_errors): two lists with one MSE value per round.
    """
    print(f"Training... n_epochs={n_epochs}")

    regressor = SGDRegressor(
        learning_rate="constant",
        eta0=0.005,
        penalty=None,
        max_iter=10,
        tol=-float("inf"),
        warm_start=True,
        early_stopping=False,
        random_state=42,
    )

    train_errors = []
    val_errors = []

    for epoch in range(n_epochs):
        # warm_start=True is set above so this call does not start from scratch
        regressor.fit(X_train, y_train)

        mse_train = mean_squared_error(y_train, regressor.predict(X_train))
        mse_val = mean_squared_error(y_val, regressor.predict(X_val))

        train_errors.append(mse_train)
        val_errors.append(mse_val)

        if not verbose:
            continue
        print(f"Epoch={epoch:4d}, MSE Train={mse_train:4.2f}, MSE Validation={mse_val:4.2f}")

    return train_errors, val_errors

# Train on the scaled polynomial features and keep both error curves
n_epochs = 500
train_errors, val_errors = Train(X_train_poly_scaled, y_train, X_val_poly_scaled, y_val, n_epochs, True)

# Visualize histogram of the weather dataset
weather_dataset.hist(bins=50, figsize=(20, 15))
plt.show()

# Compute correlation matrix. numeric_only=True skips text columns such as
# 'time'; without it pandas >= 2.0 raises instead of ignoring them.
corr_matrix = energy_dataset.corr(numeric_only=True)
sorted_corr = corr_matrix["price day ahead"].sort_values(ascending=False)

# Use the timestamp columns as index and merge the datasets on that index
energy_dataset = energy_dataset.set_index('time')
weather_dataset = weather_dataset.set_index('dt_iso')
merged_dataset = pd.merge(energy_dataset, weather_dataset, left_index=True, right_index=True)
merged_dataset = merged_dataset.reset_index()

# Display first few rows of datasets
# NOTE(review): only the last expression of a cell is rendered, so these two
# calls show nothing unless moved to the end of their own cells.
energy_dataset.head()
weather_dataset.head()

# Visualize histogram of weather dataset
weather_dataset.hist(bins=50, figsize=(20, 15))
plt.show()

# Downsample weather dataset (keep every fifth row) and merge
# NOTE(review): presumably meant to keep one reading per timestamp; whether it
# does depends on the row order of the weather file - confirm.
downsampled_weather = weather_dataset.iloc[::5, :]
downsampled_merged_dataset = pd.merge(energy_dataset, downsampled_weather, left_index=True, right_index=True)
downsampled_merged_dataset = downsampled_merged_dataset.reset_index()

# Compute correlation matrix for downsampled dataset (numeric columns only)
corr_matrix_downsampled = downsampled_merged_dataset.corr(numeric_only=True)
sorted_corr_downsampled = corr_matrix_downsampled["price day ahead"].sort_values(ascending=False)
downsampled_merged_dataset.describe()

# Compute correlation matrix for merged dataset (numeric columns only)
corr_matrix_merged = merged_dataset.corr(numeric_only=True)
sorted_corr_merged = corr_matrix_merged["price day ahead"].sort_values(ascending=False)
merged_dataset.describe()

# Visualize histogram of downsampled merged dataset
downsampled_merged_dataset.hist(bins=50, figsize=(20, 15))
plt.show()



Ranked correlation matrix for merged dataset:
price day ahead                                1.000000
price actual                                   0.732155
generation fossil hard coal                    0.671596
generation fossil gas                          0.640895
generation hydro pumped storage consumption    0.600460
generation fossil brown coal/lignite           0.567905
total load forecast                            0.474649
total load actual                              0.473869
forecast wind onshore day ahead                0.428874
generation other renewable                     0.428078
generation wind onshore                        0.424899
generation waste                               0.368036
generation hydro run-of-river and poundage     0.294718
generation fossil oil                          0.292793
wind_deg                                       0.161827
wind_speed                                     0.124252
generation biomass                             0.108945
te