In [1]:
import pandas as pd
from scipy.stats import shapiro, jarque_bera

In [2]:
# Run the Shapiro-Wilk and Jarque-Bera normality tests on one yield column
def perform_normality_tests(df, yield_column, crop_column=None, region_column=None, crop_name=None, region_code=None, alpha=0.05):
    """Test a yield column for normality and print a summary table.

    The rows of ``df`` are optionally narrowed to one crop and/or one region,
    missing yield values are dropped, and both the Shapiro-Wilk and the
    Jarque-Bera tests are run on what remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    yield_column : str
        Name of the column holding the values to test.
    crop_column, region_column : str, optional
        Names of the columns to filter on.
    crop_name, region_code : optional
        Values to keep in ``crop_column`` / ``region_column``. A filter is
        applied whenever its value is not ``None``; the crop and region
        filters are independent, so either one may be used on its own.
    alpha : float, default 0.05
        Significance level; a test reports "Reject H0" when its p-value is
        strictly below ``alpha``.

    Returns
    -------
    None
        The result table (one column per test; rows "Test Statistic",
        "P-Value", "Result") is printed, not returned.

    Raises
    ------
    ValueError
        If a filter value is given without its column name, or if fewer than
        three non-missing observations remain after filtering.
    """
    filtered_data = df

    # Apply each filter on its own so that supplying only a crop (or only a
    # region) narrows the data instead of being silently ignored.
    for column, value, value_arg, column_arg in (
        (crop_column, crop_name, "crop_name", "crop_column"),
        (region_column, region_code, "region_code", "region_column"),
    ):
        if value is None:
            continue
        if column is None:
            raise ValueError(
                f"{value_arg}={value!r} was given without {column_arg}; "
                "cannot tell which column to filter on."
            )
        filtered_data = filtered_data[filtered_data[column] == value]

    # Extract yield values and drop missing data
    yield_values = filtered_data[yield_column].dropna()

    # Fail loudly on an (almost) empty sample: a mistyped crop name or region
    # code would otherwise surface as an opaque SciPy error or as NaN
    # p-values, and NaN < alpha is False, i.e. a bogus "Fail to reject H0".
    min_observations = 3  # smallest sample size Shapiro-Wilk accepts
    if len(yield_values) < min_observations:
        raise ValueError(
            f"Only {len(yield_values)} non-missing value(s) in column "
            f"{yield_column!r} after filtering (crop_name={crop_name!r}, "
            f"region_code={region_code!r}); at least {min_observations} "
            "are required."
        )

    # Perform the Shapiro-Wilk test
    shapiro_stat, shapiro_p_value = shapiro(yield_values)

    # Perform the Jarque-Bera test
    jb_stat, jb_p_value = jarque_bera(yield_values)

    # Hypothesis results
    shapiro_result = "Reject H0" if shapiro_p_value < alpha else "Fail to reject H0"
    jb_result = "Reject H0" if jb_p_value < alpha else "Fail to reject H0"

    # Collect results: outer keys become columns, inner keys become rows
    test_results = {
        'Shapiro-Wilk Test': {
            'Test Statistic': shapiro_stat,
            'P-Value': shapiro_p_value,
            'Result': shapiro_result
        },
        'Jarque-Bera Test': {
            'Test Statistic': jb_stat,
            'P-Value': jb_p_value,
            'Result': jb_result
        }
    }

    # Convert to DataFrame for display
    results_df = pd.DataFrame(test_results)
    print(results_df)

In [3]:
# Load the three yield CSV files, one DataFrame per file.

# NUTS0 yields
file_path_nuts0 = '../datasets/CropSDEData/YIELD_NUTS0_NL.csv'
nuts0_df = pd.read_csv(file_path_nuts0)

# NUTS2 yields (transposed file)
file_path_nuts2 = '../datasets/CropSDEData/YIELD_NUTS2_NL_transposed.csv'
nuts2_df = pd.read_csv(file_path_nuts2)

# MCYFS predicted yields
file_path_mcyfs = '../datasets/CropSDEData/YIELD_PRED_MCYFS_NUTS0_NL.csv'
mcyfs_df = pd.read_csv(file_path_mcyfs)

In [4]:
# NUTS0 dataset: normality tests on YIELD for 'spring barley' in region 'NL'
print("\n--- NUTS0 Dataset (National Level) ---")
perform_normality_tests(
    nuts0_df,
    yield_column='YIELD',
    crop_column='CROP',
    crop_name='spring barley',
    region_column='IDREGION',
    region_code='NL',
)


--- NUTS0 Dataset (National Level) ---
                Shapiro-Wilk Test   Jarque-Bera Test
Test Statistic           0.959025           2.644891
P-Value                  0.086368           0.266483
Result          Fail to reject H0  Fail to reject H0


In [5]:
# NUTS2 dataset: normality tests on yield for 'Spring barley' in region 'NL13'
print("\n--- NUTS2 Dataset (Regional Level) ---")
perform_normality_tests(
    nuts2_df,
    yield_column='yield',
    crop_column='CROP',
    crop_name='Spring barley',
    region_column='IDREGION',
    region_code='NL13',
)


--- NUTS2 Dataset (Regional Level) ---
                Shapiro-Wilk Test   Jarque-Bera Test
Test Statistic           0.969481           0.737914
P-Value                  0.631947           0.691455
Result          Fail to reject H0  Fail to reject H0


In [6]:
# MCYFS predictions: normality tests on YIELD_PRED for 'Spring barley' in region 'NL'
print("\n--- MCYFS Predicted Data ---")
perform_normality_tests(
    mcyfs_df,
    yield_column='YIELD_PRED',
    crop_column='CROP',
    crop_name='Spring barley',
    region_column='IDREGION',
    region_code='NL',
)


--- MCYFS Predicted Data ---
               Shapiro-Wilk Test   Jarque-Bera Test
Test Statistic          0.965383           4.116284
P-Value                 0.007621           0.127691
Result                 Reject H0  Fail to reject H0
