In [9]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from statsmodels.tsa.stattools import adfuller

# Load dataset
data = pd.read_csv('../datasets/CropSDEData/METEO_DEKADS_NUTS2_NL.csv')

# Feature Selection
features = ['TAVG', 'VPRES', 'WSPD', 'RELH']
target = 'PREC'

# Drop rows with missing values
data = data.dropna(subset=features + [target])

# Prepare data
X = data[features]
y = data[target]

# Ensure stationarity of the target variable
if adfuller(y)[1] > 0.05:
    print("Target variable is non-stationary. Applying differencing...")
    y = y.diff().dropna()
    X = X.iloc[1:]  # Align X with y after differencing

# Align X and y to ensure consistent lengths
if len(X) > len(y):
    X = X.iloc[:len(y)]
elif len(y) > len(X):
    y = y.iloc[:len(X)]

# Scale the Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Maximum Likelihood Estimation (MLE) for Vasicek Model
def vasicek_mle(params, data):
    a, b, sigma = params
    dt = 1  # Assuming daily intervals
    X = data

    # Vasicek model residuals
    residuals = X[1:] - (X[:-1] + a * (b - X[:-1]) * dt)
    
    # Log likelihood
    log_likelihood = -np.sum(0.5 * np.log(2 * np.pi * sigma**2 * dt) + (residuals**2 / (2 * sigma**2 * dt)))
    return -log_likelihood  # Negative for minimization

# Initial guesses and bounds for MLE
initial_guess = [0.1, np.mean(y), 0.1]
bounds = [(1e-5, None), (None, None), (1e-5, None)]

# Perform MLE for Vasicek Model
res_mle = minimize(vasicek_mle, initial_guess, args=(y.values,), method='L-BFGS-B', bounds=bounds)

if res_mle.success:
    a_mle, b_mle, sigma_mle = res_mle.x
    print("\nEstimated Vasicek Parameters using Maximum Likelihood Estimation (MLE):")
    print(f"Alpha (a): {a_mle}, Beta (b): {b_mle}, Sigma: {sigma_mle}")
else:
    print("\nMLE failed to converge.")
    print(f"Message: {res_mle.message}")
    exit()

# Scale Vasicek Parameters for Stability
a_mle /= 10
b_mle /= 10
sigma_mle /= 10

# Simplified NN Model with Batch Normalization and Reduced Layers
class VasicekNN(nn.Module):
    def __init__(self, input_size, alpha, beta, sigma):
        super(VasicekNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.batchnorm1 = nn.BatchNorm1d(32)
        self.fc2 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

        # Initialize weights using Vasicek parameters
        nn.init.normal_(self.fc1.weight, mean=beta, std=sigma)
        nn.init.constant_(self.fc1.bias, alpha)
        nn.init.normal_(self.fc2.weight, mean=beta, std=sigma)
        nn.init.constant_(self.fc2.bias, alpha)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Feature Engineering
X['TAVG_VPRES'] = X['TAVG'] * X['VPRES']
X['WSPD_RELH'] = X['WSPD'] * X['RELH']
X_scaled = scaler.fit_transform(X)

# Update Dataset Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Initialize and Train the Neural Network
model = VasicekNN(X_train_tensor.shape[1], a_mle, b_mle, sigma_mle)

# Training loop with early stopping
epochs = 500
early_stopping_patience = 50
best_loss = np.inf
patience_counter = 0

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    scheduler.step()

    if loss.item() < best_loss:
        best_loss = loss.item()
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter > early_stopping_patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

    if (epoch + 1) % 50 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Evaluate the Model
model.eval()
with torch.no_grad():
    y_pred_test = model(X_test_tensor).numpy()

test_mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

print(f"\nNeural Network Test MSE: {test_mse}")
print(f"Neural Network Test R^2 Score: {test_r2}")

# Compare with Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print(f"\nLinear Regression Test MSE: {lr_mse}")
print(f"Linear Regression Test R^2 Score: {lr_r2}")

if test_mse < lr_mse:
    print("\nNeural Network outperforms Linear Regression.")
else:
    print("\nLinear Regression outperforms Neural Network.")


Estimated Vasicek Parameters using Maximum Likelihood Estimation (MLE):
Alpha (a): 0.9685389235036493, Beta (b): 1.920622930269423, Sigma: 1.6015054801592965


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['TAVG_VPRES'] = X['TAVG'] * X['VPRES']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['WSPD_RELH'] = X['WSPD'] * X['RELH']


Epoch [50/500], Loss: 31.2401
Early stopping at epoch 61

Neural Network Test MSE: 30.228676616409523
Neural Network Test R^2 Score: -10.607941905036531

Linear Regression Test MSE: 2.0847388140520513
Linear Regression Test R^2 Score: 0.19945132405980537

Linear Regression outperforms Neural Network.
