In [1]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller, kpss

In [2]:
# Helpers and entry point for running ADF and KPSS stationarity tests
def _report_adf(values, title, series_label, alpha=0.05):
    """Run the ADF test on ``values`` and print statistic, p-value, critical values and verdict.

    The ADF null hypothesis is non-stationarity (unit root), so a p-value
    below ``alpha`` is reported as "stationary".
    """
    adf_result = adfuller(values)
    print(title)
    print(f"Test Statistic: {adf_result[0]}")
    print(f"p-value: {adf_result[1]}")
    print(f"Critical Values: {adf_result[4]}")  # index 4 is the critical-value dict

    if adf_result[1] < alpha:
        print(f"Result: Reject H0 (The {series_label} is stationary according to the ADF test)")
    else:
        print(f"Result: Fail to reject H0 (The {series_label} is non-stationary according to the ADF test)")


def _report_kpss(values, title, series_label, alpha=0.05):
    """Run the level-stationarity KPSS test on ``values`` and print the outcome.

    The KPSS null hypothesis is stationarity (the opposite of ADF), so a
    p-value below ``alpha`` is reported as "non-stationary".
    """
    kpss_stat, kpss_p_value, _kpss_lags, kpss_crit = kpss(values, regression='c')
    print(title)
    print(f"Test Statistic: {kpss_stat}")
    print(f"p-value: {kpss_p_value}")
    print(f"Critical Values: {kpss_crit}")

    if kpss_p_value < alpha:
        print(f"Result: Reject H0 (The {series_label} is non-stationary according to the KPSS test)")
    else:
        print(f"Result: Fail to reject H0 (The {series_label} is stationary according to the KPSS test)")


def perform_stationarity_tests(df, yield_column, crop_column=None, region_column=None, crop_name=None, region_code=None, alpha=0.05):
    """Print ADF and KPSS stationarity results for a yield series, before and after first differencing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the yield column and, optionally, crop/region columns.
        Rows are used in their existing order; the function does not sort them
        (assumes they are already chronological -- TODO confirm for each dataset).
    yield_column : str
        Name of the column holding the values to test. Missing values are dropped.
    crop_column, crop_name : optional
        When both are given, only rows where ``df[crop_column] == crop_name`` are kept.
    region_column, region_code : optional
        When both are given, only rows where ``df[region_column] == region_code`` are kept.
        The crop and region filters are applied independently of each other.
    alpha : float, default 0.05
        Significance level used for the printed reject / fail-to-reject verdicts.
        NOTE: statsmodels reports KPSS p-values from a look-up table and emits an
        InterpolationWarning when the statistic falls outside it, so the printed
        KPSS p-value may be a bound rather than an exact value.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If fewer than two non-missing observations remain after filtering
        (for example because the crop name or region code matched no rows).
    """
    # Apply each filter on its own so that a partial filter (crop only, or
    # region only) is honoured instead of being silently ignored.
    filtered_data = df
    if crop_column is not None and crop_name is not None:
        filtered_data = filtered_data[filtered_data[crop_column] == crop_name]
    if region_column is not None and region_code is not None:
        filtered_data = filtered_data[filtered_data[region_column] == region_code]

    # Extract yield values and drop missing data
    yield_values = filtered_data[yield_column].dropna().values

    # Fail early with a readable message rather than an opaque error from
    # inside statsmodels; matching is exact, so a case mismatch yields no rows.
    if len(yield_values) < 2:
        raise ValueError(
            f"Only {len(yield_values)} non-missing value(s) in column {yield_column!r} "
            f"after filtering (crop_name={crop_name!r}, region_code={region_code!r}); "
            "at least 2 are required. Check the filter values (matching is case-sensitive)."
        )

    dataset_label = crop_name if crop_name else 'Dataset'

    # ADF and KPSS tests before differencing
    print(f"=== ADF and KPSS Tests Before Differencing for {dataset_label} ===")
    _report_adf(yield_values, "ADF Test:", "series", alpha)
    _report_kpss(yield_values, "\nKPSS Test:", "series", alpha)

    # First-difference the series; the leading NaN produced by diff() is dropped
    differenced_yield = pd.Series(yield_values).diff().dropna()

    # ADF and KPSS tests after differencing
    print(f"\n\n=== ADF and KPSS Tests After Differencing for {dataset_label} ===")
    _report_adf(differenced_yield, "ADF Test (Differenced Series):", "differenced series", alpha)
    _report_kpss(differenced_yield, "\nKPSS Test (Differenced Series):", "differenced series", alpha)

In [3]:
# Locations of the three yield CSVs, relative to the notebook's working directory
file_path_nuts0 = '../datasets/CropSDEData/YIELD_NUTS0_NL.csv'
file_path_nuts2 = '../datasets/CropSDEData/YIELD_NUTS2_NL_transposed.csv'  # updated (transposed) NUTS2 dataset
file_path_mcyfs = '../datasets/CropSDEData/YIELD_PRED_MCYFS_NUTS0_NL.csv'

# Read each CSV into its own DataFrame, in the same order as the paths above
nuts0_df, nuts2_df, mcyfs_df = [
    pd.read_csv(csv_path)
    for csv_path in (file_path_nuts0, file_path_nuts2, file_path_mcyfs)
]

In [4]:
# Stationarity report for the NUTS0 (national-level) frame: crop 'spring barley', region code 'NL'
print("\n--- NUTS0 Dataset (National Level) ---")
perform_stationarity_tests(
    nuts0_df,
    yield_column='YIELD',
    crop_column='CROP',
    crop_name='spring barley',
    region_column='IDREGION',
    region_code='NL',
)


--- NUTS0 Dataset (National Level) ---
=== ADF and KPSS Tests Before Differencing for spring barley ===
ADF Test:
Test Statistic: -1.565882427117624
p-value: 0.5007055725466372
Critical Values: {'1%': -3.584828853223594, '5%': -2.9282991495198907, '10%': -2.6023438271604937}
Result: Fail to reject H0 (The series is non-stationary according to the ADF test)

KPSS Test:
Test Statistic: 1.0035689037553381
p-value: 0.01
Critical Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}
Result: Reject H0 (The series is non-stationary according to the KPSS test)


=== ADF and KPSS Tests After Differencing for spring barley ===
ADF Test (Differenced Series):
Test Statistic: -8.157366913787719
p-value: 9.34144324233832e-13
Critical Values: {'1%': -3.584828853223594, '5%': -2.9282991495198907, '10%': -2.6023438271604937}
Result: Reject H0 (The differenced series is stationary according to the ADF test)

KPSS Test (Differenced Series):
Test Statistic: 0.36294261884444906
p-value: 0.093128

look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result, kpss_p_value, kpss_lags, kpss_crit = kpss(yield_values, regression='c')


In [5]:
# Stationarity report for the NUTS2 (regional-level) frame: crop 'Spring barley', region code 'NL13'
print("\n--- NUTS2 Dataset (Regional Level) ---")
perform_stationarity_tests(
    nuts2_df,
    yield_column='yield',
    crop_column='CROP',
    crop_name='Spring barley',
    region_column='IDREGION',
    region_code='NL13',
)


--- NUTS2 Dataset (Regional Level) ---
=== ADF and KPSS Tests Before Differencing for Spring barley ===
ADF Test:
Test Statistic: -4.45370309659887
p-value: 0.00023819402487446027
Critical Values: {'1%': -3.7377092158564813, '5%': -2.9922162731481485, '10%': -2.635746736111111}
Result: Reject H0 (The series is stationary according to the ADF test)

KPSS Test:
Test Statistic: 0.636400637719211
p-value: 0.019327214752799003
Critical Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}
Result: Reject H0 (The series is non-stationary according to the KPSS test)


=== ADF and KPSS Tests After Differencing for Spring barley ===
ADF Test (Differenced Series):
Test Statistic: -6.773897286508758
p-value: 2.6000096543302015e-09
Critical Values: {'1%': -3.7883858816542486, '5%': -3.013097747543462, '10%': -2.6463967573696143}
Result: Reject H0 (The differenced series is stationary according to the ADF test)

KPSS Test (Differenced Series):
Test Statistic: 0.5000000000000004
p-value: 0

In [6]:
# Stationarity report for the MCYFS predicted-yield frame: crop 'Spring barley', region code 'NL'
print("\n--- MCYFS Predicted Data ---")
perform_stationarity_tests(
    mcyfs_df,
    yield_column='YIELD_PRED',
    crop_column='CROP',
    crop_name='Spring barley',
    region_column='IDREGION',
    region_code='NL',
)



--- MCYFS Predicted Data ---
=== ADF and KPSS Tests Before Differencing for Spring barley ===
ADF Test:
Test Statistic: -2.521624673389196
p-value: 0.11030106918270954
Critical Values: {'1%': -3.503514579651927, '5%': -2.893507960466837, '10%': -2.583823615311909}
Result: Fail to reject H0 (The series is non-stationary according to the ADF test)

KPSS Test:
Test Statistic: 0.40768464720382724
p-value: 0.07384282448110895
Critical Values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}
Result: Fail to reject H0 (The series is stationary according to the KPSS test)


=== ADF and KPSS Tests After Differencing for Spring barley ===
ADF Test (Differenced Series):
Test Statistic: -10.297548362569644
p-value: 3.4395285170456132e-18
Critical Values: {'1%': -3.4968181663902103, '5%': -2.8906107514600103, '10%': -2.5822770483285953}
Result: Reject H0 (The differenced series is stationary according to the ADF test)

KPSS Test (Differenced Series):
Test Statistic: 0.09395166763108724
p-va

look-up table. The actual p-value is greater than the p-value returned.

  kpss_diff_result, kpss_diff_p_value, kpss_diff_lags, kpss_diff_crit = kpss(differenced_yield, regression='c')
