In [12]:
import numpy as np
import pandas as pd
from scipy.stats import chi2
import matplotlib.pyplot as plt

In [13]:
# Load the dataset
file_path = '../datasets/CropSDEData/YIELD_NUTS0_NL.csv'
data = pd.read_csv(file_path)

# Extract the YIELD column for testing
yield_data = data['YIELD']

# Step 1: Fitting the Ornstein-Uhlenbeck (OU) process
mu_ou = np.mean(yield_data)  # Long-term mean set to the mean of the actual data
theta_ou = 0.15  # Mean reversion rate (how fast the process reverts to the mean)
sigma_ou = 0.1  # Volatility
T = len(yield_data)  # Number of time points from the actual data

# Simulate OU Process for trend
ou_trend = np.zeros(T)
ou_trend[0] = yield_data.iloc[0]  # Start from the first actual data point
for t in range(1, T):
    ou_trend[t] = ou_trend[t - 1] + theta_ou * (mu_ou - ou_trend[t - 1]) + sigma_ou * np.random.normal()


In [14]:
# Step 2: Fitting the Geometric Brownian Motion (GBM) process
mu_gbm = 0.02  # Drift
sigma_gbm = 0.1  # Volatility

gbm_trend = np.zeros(T)
gbm_trend[0] = yield_data.iloc[0]
for t in range(1, T):
    gbm_trend[t] = gbm_trend[t - 1] * np.exp((mu_gbm - 0.5 * sigma_gbm ** 2) + sigma_gbm * np.random.normal())

# Combine OU and GBM into a 2D array for comparison at each time step
simulated_data = np.vstack((ou_trend, gbm_trend))

In [15]:
# Step 3: Calculate ranks for the observed data compared to the simulated data
ranks = np.zeros(T)

# For each time step, rank the observed value compared to the simulated values
for t in range(T):
    combined_simulations = np.append(simulated_data[:, t], yield_data[t])  # Include the observed value
    sorted_simulations = np.sort(combined_simulations)  # Sort them
    ranks[t] = np.where(sorted_simulations == yield_data[t])[0][0] + 1  # Find the rank of the observed value

In [16]:
# Step 4: Calculate the test statistic based on the rank frequencies
M = simulated_data.shape[0] + 1  # Total number of simulated values + 1 for observed
expected_frequency = (M + 1) / 2  # Expected rank is the middle of the distribution

# Chi-square test statistic
chi_square_stat = np.sum((ranks - expected_frequency) ** 2 / expected_frequency)

# Degrees of freedom
df = T - 1  # One degree of freedom for each time point minus one
p_value_rank = 1 - chi2.cdf(chi_square_stat, df)

# Output the results
rank_gof_result = {
    'Chi-Square Statistic': chi_square_stat,
    'P-Value': p_value_rank,
    'Result': "Reject H0" if p_value_rank < 0.05 else "Fail to reject H0"
}

print(rank_gof_result)

{'Chi-Square Statistic': 55.0, 'P-Value': 1.0, 'Result': 'Fail to reject H0'}
