In [None]:
import os
import time
import pandas as pd
import pymannkendall as mk
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Root folder holding the input parquet files and root folder for the results.
base_input_path = r"Z:\Thesis\Data\ML_Data"
base_output_path = r"Z:\Thesis\Data\Met\stat_analysis"

# Country names exactly as they appear in directory and file names.
countries = [
    "Bahrain",
    "Yemen",
    "Kuwait",
    "Oman",
    "Qatar",
    "Saudi_Arabia",
    "United_Arab_Emirates",
]

# 1981-1988, the single year 1993, and 2016-2023 (range ends are exclusive).
years = [*range(1981, 1989), 1993, *range(2016, 2024)]

# Sub-folder names; each one selects one of the two variable tables below.
folders = ["surface", "pressure"]

# Long variable name (used in file names) -> short name (used as the data
# column name). Insertion order is the processing order.
surface_variables = dict([
    ('2m_temperature', 't2m'),
    ('2m_dewpoint_temperature', 'd2m'),
    ('10m_u_component_of_wind', 'u10'),
    ('10m_v_component_of_wind', 'v10'),
    ('surface_solar_radiation_downwards', 'ssrd'),
    ('surface_thermal_radiation_downwards', 'strd'),
    ('evaporation', 'e'),
    ('potential_evaporation', 'pev'),
    ('sea_surface_temperature', 'sst'),
    ('soil_temperature_level_1', 'stl1'),
    ('soil_temperature_level_2', 'stl2'),
    ('soil_temperature_level_3', 'stl3'),
    ('soil_temperature_level_4', 'stl4'),
    ('volumetric_soil_water_layer_1', 'swvl1'),
    ('volumetric_soil_water_layer_2', 'swvl2'),
    ('volumetric_soil_water_layer_3', 'swvl3'),
    ('volumetric_soil_water_layer_4', 'swvl4'),
    ('leaf_area_index_high_vegetation', 'lai_hv'),
    ('leaf_area_index_low_vegetation', 'lai_lv'),
    ('surface_pressure', 'sp'),
    ('mean_sea_level_pressure', 'msl'),
    ('convective_available_potential_energy', 'cape'),
    ('geopotential', 'z'),
    ('instantaneous_10m_wind_gust', 'i10fg'),
    ('total_precipitation', 'tp'),
    ('k_index', 'kx'),
])

# Same long-name -> short-name mapping for the per-level variables.
pressure_variables = dict([
    ('geopotential', 'z'),
    ('relative_humidity', 'r'),
    ('temperature', 't'),
    ('u_component_of_wind', 'u'),
    ('v_component_of_wind', 'v'),
    ('vertical_velocity', 'w'),
    ('vorticity', 'vo'),
])

# Levels to analyse, kept as strings because they are embedded in output
# file names; they are converted to float when filtering the data.
pressure_levels = "1000 925 850 700 500 300 200 100 50 10".split()

# Set to True to also save a PNG line plot next to every result file.
# Plotting is off by default, so only the parquet results are written.
save_trend_plots = False


def _save_trend_plot(frame, abbreviation, title, plot_file_path):
    """Save a line plot of column *abbreviation* against 'time' as a PNG."""
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=frame['time'], y=frame[abbreviation])
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel(abbreviation)
    plt.savefig(plot_file_path)
    plt.close()


def _analyze_and_save(frame, abbreviation, file_path, output_folder_path,
                      year, country, folder, level=None):
    """Run Mann-Kendall and Sen's slope on one series and store the result.

    Args:
        frame: DataFrame holding the column *abbreviation* (and 'time').
        abbreviation: Name of the column to test.
        file_path: Input file the data came from; used in log messages only.
        output_folder_path: Existing directory that receives the result file.
        year, country, folder: Identify the run in log messages and titles.
        level: Pressure level as a string, or None for surface data. When
            given, it is added to the result table, file name and messages.

    Any exception raised by the trend tests is logged and the series is
    skipped, so a single bad variable does not stop the whole run.
    """
    if level is None:
        where = ""
        title_where = ""
        stem = abbreviation
    else:
        where = f" at level {level}"
        title_where = f" at Level {level}"
        stem = f"{abbreviation}_level_{level}"

    try:
        series = frame[abbreviation]
        mk_result = mk.original_test(series)
        sens_slope = mk.sens_slope(series)
    except Exception as e:
        tqdm.write(f"Error processing variable {abbreviation}{where} in file {file_path}: {e}")
        return

    # Build the single-row result table; 'Level' only exists for pressure data.
    results = {'Parameter': [abbreviation]}
    if level is not None:
        results['Level'] = [level]
    results.update({
        'MK Trend': [mk_result.trend],
        'MK p-value': [mk_result.p],
        'MK Tau': [mk_result.Tau],
        'Sens Slope': [sens_slope.slope],
        'Sens Intercept': [sens_slope.intercept],
    })
    results_pq_path = os.path.join(output_folder_path, f"{stem}_MK_SS_results.parquet")
    pd.DataFrame(results).to_parquet(results_pq_path, index=False)

    if save_trend_plots:
        _save_trend_plot(
            frame,
            abbreviation,
            f'{abbreviation} Trend Analysis{title_where} ({year} - {country} - {folder})',
            os.path.join(output_folder_path, f"{stem}_trend_plot.png"),
        )

    tqdm.write(f"Processed {abbreviation}{where} for {year}/{country}/{folder}")


def _process_file(file_path, output_folder_path, abbreviation, year, country, folder):
    """Load one parquet file and analyse its variable (per level if pressure).

    Missing files, unreadable files and files lacking a required column are
    logged and skipped.
    """
    if not os.path.exists(file_path):
        tqdm.write(f"File not found: {file_path}")
        return

    tqdm.write(f"Processing file: {file_path}")

    try:
        data = pd.read_parquet(file_path)
    except Exception as e:
        tqdm.write(f"Error reading file {file_path}: {e}")
        return

    if 'time' not in data.columns:
        tqdm.write(f"Time column missing in file {file_path}")
        return

    # Unparseable timestamps become NaT and are dropped.
    data['time'] = pd.to_datetime(data['time'], errors='coerce')
    # The trend tests treat the values as an ordered sequence, so put the rows
    # in chronological order first. A stable sort keeps the file's original
    # order among rows that share a timestamp.
    data = data.dropna(subset=['time']).sort_values('time', kind='stable')

    if folder != "pressure":
        _analyze_and_save(data, abbreviation, file_path, output_folder_path,
                          year, country, folder)
        return

    # Without this guard a pressure file lacking 'level' would raise KeyError
    # and abort the entire run instead of just skipping the file.
    if 'level' not in data.columns:
        tqdm.write(f"Level column missing in file {file_path}")
        return

    for level in pressure_levels:
        level_data = data[data['level'] == float(level)]

        if level_data.empty:
            tqdm.write(f"No data for level {level} in file {file_path}")
            continue

        _analyze_and_save(level_data, abbreviation, file_path, output_folder_path,
                          year, country, folder, level=level)


# Start timer
start_time = time.time()

# Create output directory if it doesn't exist
os.makedirs(base_output_path, exist_ok=True)

# Iterate over years, countries, and folders
for year in tqdm(years, desc="Years"):
    for country in tqdm(countries, desc=f"Countries ({year})"):
        for folder in tqdm(folders, desc=f"Folders ({year}, {country})"):
            input_folder_path = os.path.join(base_input_path, str(year), country, folder)
            output_folder_path = os.path.join(base_output_path, str(year), country, folder)
            os.makedirs(output_folder_path, exist_ok=True)

            # Get variables based on the folder type
            variables = surface_variables if folder == "surface" else pressure_variables

            # Perform analysis for each variable
            for full_name, abbreviation in tqdm(variables.items(), desc=f"Variables ({folder})"):
                file_name = f"{year}_{country}_{folder}_{full_name}.parquet"
                file_path = os.path.join(input_folder_path, file_name)
                _process_file(file_path, output_folder_path, abbreviation,
                              year, country, folder)

# End Timer
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\nTotal execution time: {elapsed_time:.2f} seconds")
