In [7]:
import numpy as np
import pandas as pd
from scipy.stats import chi2

In [8]:
# Function to perform OU and GBM process fitting and rank-based GOF test
def perform_ou_gbm_rank_gof_test(df, yield_column):
    # Extract the YIELD column for testing
    yield_data = df[yield_column].dropna()

    # Step 1: Fitting the Ornstein-Uhlenbeck (OU) process
    mu_ou = np.mean(yield_data)  # Long-term mean set to the mean of the actual data
    theta_ou = 0.15  # Mean reversion rate (how fast the process reverts to the mean)
    sigma_ou = 0.1  # Volatility
    T = len(yield_data)  # Number of time points from the actual data

    # Simulate OU Process for trend
    ou_trend = np.zeros(T)
    ou_trend[0] = yield_data.iloc[0]  # Start from the first actual data point
    for t in range(1, T):
        ou_trend[t] = ou_trend[t - 1] + theta_ou * (mu_ou - ou_trend[t - 1]) + sigma_ou * np.random.normal()

    # Step 2: Fitting the Geometric Brownian Motion (GBM) process
    mu_gbm = 0.02  # Drift
    sigma_gbm = 0.1  # Volatility

    gbm_trend = np.zeros(T)
    gbm_trend[0] = yield_data.iloc[0]
    for t in range(1, T):
        gbm_trend[t] = gbm_trend[t - 1] * np.exp((mu_gbm - 0.5 * sigma_gbm ** 2) + sigma_gbm * np.random.normal())

    # Combine OU and GBM into a 2D array for comparison at each time step
    simulated_data = np.vstack((ou_trend, gbm_trend))

    # Step 3: Calculate ranks for the observed data compared to the simulated data
    ranks = np.zeros(T)

    # For each time step, rank the observed value compared to the simulated values
    for t in range(T):
        combined_simulations = np.append(simulated_data[:, t], yield_data[t])  # Include the observed value
        sorted_simulations = np.sort(combined_simulations)  # Sort them
        ranks[t] = np.where(sorted_simulations == yield_data[t])[0][0] + 1  # Find the rank of the observed value

    # Step 4: Calculate the test statistic based on the rank frequencies
    M = simulated_data.shape[0] + 1  # Total number of simulated values + 1 for observed
    expected_frequency = (M + 1) / 2  # Expected rank is the middle of the distribution

    # Chi-square test statistic
    chi_square_stat = np.sum((ranks - expected_frequency) ** 2 / expected_frequency)

    # Degrees of freedom
    df = T - 1  # One degree of freedom for each time point minus one
    p_value_rank = 1 - chi2.cdf(chi_square_stat, df)

    # Output the results
    rank_gof_result = {
        'Chi-Square Statistic': chi_square_stat,
        'P-Value': p_value_rank,
        'Result': "Reject H0" if p_value_rank < 0.05 else "Fail to reject H0"
    }

    print(rank_gof_result)

In [9]:
# Load the datasets
file_path_nuts0 = '../datasets/CropSDEData/YIELD_NUTS0_NL.csv'
file_path_nuts2 = '../datasets/CropSDEData/YIELD_NUTS2_NL_transposed.csv'  # Updated NUTS2 dataset
file_path_mcyfs = '../datasets/CropSDEData/YIELD_PRED_MCYFS_NUTS0_NL.csv'

nuts0_df = pd.read_csv(file_path_nuts0)
nuts2_df = pd.read_csv(file_path_nuts2)
mcyfs_df = pd.read_csv(file_path_mcyfs)

In [10]:
# Run tests for NUTS0 dataset
print("\n--- NUTS0 Dataset (National Level) ---")
perform_ou_gbm_rank_gof_test(nuts0_df, yield_column='YIELD')



--- NUTS0 Dataset (National Level) ---
{'Chi-Square Statistic': 53.5, 'P-Value': 1.0, 'Result': 'Fail to reject H0'}


In [11]:
# Run tests for NUTS2 dataset
print("\n--- NUTS2 Dataset (Regional Level) ---")
perform_ou_gbm_rank_gof_test(nuts2_df, yield_column='yield')


--- NUTS2 Dataset (Regional Level) ---
{'Chi-Square Statistic': 348.0, 'P-Value': 1.0, 'Result': 'Fail to reject H0'}


In [12]:
# Run tests for MCYFS Predicted Data
print("\n--- MCYFS Predicted Data ---")
perform_ou_gbm_rank_gof_test(mcyfs_df, yield_column='YIELD_PRED')


--- MCYFS Predicted Data ---
{'Chi-Square Statistic': 147.5, 'P-Value': 1.0, 'Result': 'Fail to reject H0'}


When the result of the Goodness of Fit (GOF) test is to fail to reject 𝐻0, it means that the model's predictions are consistent with the observed data, according to the statistical test.

- Null Hypothesis (𝐻0): The model fits the observed data well. In this case, the combined Ornstein-Uhlenbeck process and Geometric Brownian Motion fit the actual crop yield data.
- Fail to Reject 𝐻0: The statistical test does not find enough evidence to conclude that the model is a poor fit. Therefore, the model seems to fit the data reasonably well.