In [3]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Load dataset
data = pd.read_csv('../datasets/CropSDEData/METEO_DEKADS_NUTS2_NL.csv')

# Feature Selection
features = ['TAVG', 'VPRES', 'WSPD', 'RELH']
target = 'PREC'

# Drop rows with missing values
data = data.dropna(subset=features + [target])

# Prepare data
X = data[features]
y = data[target]

# Maximum Likelihood Estimation (MLE) for Vasicek Model
def vasicek_mle(params, data):
    a, b, sigma = params
    dt = 1  # Assuming daily intervals
    X = data

    # Vasicek model residuals
    residuals = X[1:] - (X[:-1] + a * (b - X[:-1]) * dt)
    
    # Log likelihood
    log_likelihood = -np.sum(0.5 * np.log(2 * np.pi * sigma**2 * dt) + (residuals**2 / (2 * sigma**2 * dt)))
    return -log_likelihood  # Negative for minimization

# Initial guess for MLE parameters
initial_guess = [0.1, np.mean(y), 0.1]

# Estimate Vasicek parameters using MLE
res_mle = minimize(vasicek_mle, initial_guess, args=(y.values,), method='L-BFGS-B', 
                   bounds=[(1e-5, None), (None, None), (1e-5, None)])

# Extract MLE parameters
a_mle, b_mle, sigma_mle = res_mle.x

# Add Vasicek parameters as features
X['alpha_mle'] = a_mle
X['beta_mle'] = b_mle
X['sigma_mle'] = sigma_mle

# Feature Engineering: Add Interaction Terms
X['TAVG_VPRES'] = X['TAVG'] * X['VPRES']
X['WSPD_RELH'] = X['WSPD'] * X['RELH']

# Scaling the Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Baseline Model: Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print(f"Linear Regression Baseline MSE: {lr_mse}")
print(f"Linear Regression R^2 Score: {lr_r2}")

# Neural Network Model
nn_model = MLPRegressor(hidden_layer_sizes=(128, 64, 32), max_iter=1000, activation='relu', random_state=42,
                        learning_rate_init=0.001, alpha=0.001)
nn_model.fit(X_train, y_train)
y_pred_nn = nn_model.predict(X_test)
nn_mse = mean_squared_error(y_test, y_pred_nn)
nn_r2 = r2_score(y_test, y_pred_nn)

print(f"\nNeural Network MSE: {nn_mse}")
print(f"Neural Network R^2 Score: {nn_r2}")

# Comparison
if nn_mse < lr_mse:
    print("\nNeural Network outperforms Linear Regression.")
else:
    print("\nLinear Regression outperforms Neural Network.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['alpha_mle'] = a_mle
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['beta_mle'] = b_mle
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sigma_mle'] = sigma_mle
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

Linear Regression Baseline MSE: 2.084738814052051
Linear Regression R^2 Score: 0.19945132405980548

Neural Network MSE: 2.0356118668173675
Neural Network R^2 Score: 0.2183162832079818

Neural Network outperforms Linear Regression.
