In [8]:
import pandas as pd
from statsmodels.stats.diagnostic import acorr_ljungbox

In [9]:
# Function to perform the Box-Ljung test
def perform_box_ljung_test(df, yield_column, crop_column=None, region_column=None, crop_name=None, region_code=None, lags=10, alpha=0.05):
    """Run a Ljung-Box autocorrelation test on one yield series and print the result.

    The series is taken from ``df[yield_column]`` after optional filtering and
    after dropping missing values. Rows are used in the order they appear in
    ``df``; no sorting is done here, so the caller is responsible for passing
    the observations in chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the yield values and, optionally, crop/region labels.
    yield_column : str
        Name of the column containing the values to test.
    crop_column, crop_name : optional
        When both are given, only rows where ``df[crop_column] == crop_name``
        are kept.
    region_column, region_code : optional
        When both are given, only rows where
        ``df[region_column] == region_code`` are kept.
    lags : int, default 10
        The single lag at which the Ljung-Box statistic is reported.
    alpha : float, default 0.05
        Significance level used to label the outcome.

    Returns
    -------
    None
        The one-row result table (``lb_stat``, ``lb_pvalue``, ``Result``) is
        printed, not returned.

    Raises
    ------
    ValueError
        If no non-missing values remain after filtering, or if the series has
        ``lags`` or fewer observations.
    """
    filtered_data = df

    # Apply the crop and region filters independently. Requiring all four
    # arguments at once meant that a crop-only or region-only request silently
    # tested the whole, unfiltered frame. ``is not None`` (rather than
    # truthiness) keeps falsy-but-valid values such as a region code of 0.
    if crop_column is not None and crop_name is not None:
        filtered_data = filtered_data[filtered_data[crop_column] == crop_name]
    if region_column is not None and region_code is not None:
        filtered_data = filtered_data[filtered_data[region_column] == region_code]

    # Extract yield values
    yield_values = filtered_data[yield_column].dropna().to_numpy()

    # Fail early with a readable message instead of an obscure error (or a
    # meaningless statistic) from the test routine.
    n_obs = len(yield_values)
    if n_obs == 0:
        raise ValueError(
            f"No non-missing values in column {yield_column!r} after filtering "
            f"(crop={crop_name!r}, region={region_code!r}); check the labels."
        )
    if n_obs <= lags:
        raise ValueError(
            f"Need more than {lags} observations to test lag {lags}, "
            f"but only {n_obs} are available."
        )

    # Perform the Box-Ljung test for autocorrelation at the requested lag
    box_ljung_test = acorr_ljungbox(yield_values, lags=[lags], return_df=True)

    # Only one lag was requested, so the result has exactly one row
    box_ljung_p_value = box_ljung_test['lb_pvalue'].iloc[0]

    # Hypothesis result
    box_ljung_result = "Reject H0" if box_ljung_p_value < alpha else "Fail to reject H0"

    # Add result to the dataframe
    box_ljung_test['Result'] = box_ljung_result

    # Display the results
    print(box_ljung_test)

In [10]:
# Locations of the three yield CSV files (relative paths).
file_path_nuts0, file_path_nuts2, file_path_mcyfs = (
    '../datasets/CropSDEData/YIELD_NUTS0_NL.csv',
    '../datasets/CropSDEData/YIELD_NUTS2_NL_transposed.csv',
    '../datasets/CropSDEData/YIELD_PRED_MCYFS_NUTS0_NL.csv',
)

# Read each file into its own DataFrame, in the same order as the paths above.
nuts0_df, nuts2_df, mcyfs_df = (
    pd.read_csv(csv_path)
    for csv_path in (file_path_nuts0, file_path_nuts2, file_path_mcyfs)
)

In [11]:
# Ljung-Box test on the NUTS0 frame, restricted to one crop and one region.
# NOTE(review): the crop label is lowercase here ('spring barley') but
# capitalised in the NUTS2 and MCYFS cells; the recorded output shows a
# result, so the filter presumably matched -- confirm against the CSV.
print("\n--- NUTS0 Dataset (National Level) ---")
perform_box_ljung_test(
    nuts0_df,
    yield_column='YIELD',
    crop_column='CROP',
    region_column='IDREGION',
    crop_name='spring barley',
    region_code='NL',
)


--- NUTS0 Dataset (National Level) ---
      lb_stat     lb_pvalue     Result
10  152.70255  1.035534e-27  Reject H0


In [12]:
# Ljung-Box test on the NUTS2 frame, restricted to one crop and region NL13.
print("\n--- NUTS2 Dataset (Regional Level) ---")
perform_box_ljung_test(
    nuts2_df,
    yield_column='yield',
    crop_column='CROP',
    region_column='IDREGION',
    crop_name='Spring barley',
    region_code='NL13',
)


--- NUTS2 Dataset (Regional Level) ---
     lb_stat  lb_pvalue             Result
10  6.441436   0.776913  Fail to reject H0


In [13]:
# Ljung-Box test on the MCYFS predicted-yield column for one crop and region.
print("\n--- MCYFS Predicted Data ---")
perform_box_ljung_test(
    mcyfs_df,
    yield_column='YIELD_PRED',
    crop_column='CROP',
    region_column='IDREGION',
    crop_name='Spring barley',
    region_code='NL',
)



--- MCYFS Predicted Data ---
      lb_stat     lb_pvalue     Result
10  50.652616  2.024012e-07  Reject H0


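For the NUTS0 (national) series and the MCYFS predicted series the test rejects H0 (p < 0.05), which indicates significant autocorrelation up to lag 10 (i.e., the observations are not independently distributed). For the NUTS2 series (region NL13) the test fails to reject H0 (p ≈ 0.78), so there is no significant evidence of autocorrelation in that series.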