<a href="https://colab.research.google.com/github/yessimkhanova/cmip6_models_evaluation/blob/main/cmip6_models_errors_against_meteo_stations_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code is calculating RMSE, MAE, MBE errors

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from google.colab import drive
drive.mount('/content/drive')


# Define a function to calculate error metrics for a single station
def calculate_error_metrics(insitu_data, model_data):
    # Reshape insitu and model data to long format
    insitu_data = insitu_data.melt(id_vars=['Year'], var_name='Month', value_name='Temperature_insitu')
    model_data = model_data.melt(id_vars=['Year'], var_name='Month', value_name='Temperature_model')

    # Merge insitu and model data on 'Year' and 'Month' columns
    merged_data = pd.merge(insitu_data, model_data, on=['Year', 'Month'])
    if merged_data.empty:
        raise ValueError("Merged data is empty. Check data consistency.")

    # Calculate error metrics
    rmse = np.sqrt(mean_squared_error(merged_data['Temperature_insitu'], merged_data['Temperature_model']))
    mae = mean_absolute_error(merged_data['Temperature_insitu'], merged_data['Temperature_model'])
    mbe = np.mean(merged_data['Temperature_model'] - merged_data['Temperature_insitu'])
    return rmse, mae, mbe

# Directories
insitu_directory = "/content/drive/My Drive/koppen_maps/insitu/temperature/"
base_dir = "/content/drive/My Drive/koppen_maps/models_errors/models_pixelValue/"
output_directory = "/content/drive/My Drive/koppen_maps/models_errors/temperature_error/"

# Ensure the output directory exists
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Validate all model directories
insitu_files = {file for file in os.listdir(insitu_directory) if file.endswith('.csv')}
expected_file_count = 34

for model_folder in os.listdir(base_dir):
    model_directory = os.path.join(base_dir, model_folder, "temperature/")
    if not os.path.isdir(model_directory):
        continue

    # Get the list of .csv files in the model directory
    model_files = {file for file in os.listdir(model_directory) if file.endswith('.csv')}

    # Check the file count
    if len(model_files) != expected_file_count:
        raise ValueError(f"{model_folder} contains {len(model_files)} .csv files, but {expected_file_count} are required.")

    # Check for missing or extra files
    missing_files = insitu_files - model_files
    extra_files = model_files - insitu_files

    if missing_files:
        raise FileNotFoundError(f"{model_folder} is missing .csv files: {missing_files}")
    if extra_files:
        raise ValueError(f"{model_folder} contains extra .csv files: {extra_files}")

print("All model directories validated successfully.")

# Calculate Metrics
RMSE, MAE, MBE = {}, {}, {}

for model_folder in os.listdir(base_dir):
    model_directory = os.path.join(base_dir, model_folder, "temperature/")
    if not os.path.isdir(model_directory):
        continue

    # Dictionaries to store metrics for the current model
    rmse_dict, mae_dict, mbe_dict = {}, {}, {}

    for filename in insitu_files:
        try:
            # Load insitu and model data
            insitu_data = pd.read_csv(os.path.join(insitu_directory, filename))
            model_data = pd.read_csv(os.path.join(model_directory, filename))

            # Calculate error metrics
            rmse, mae, mbe = calculate_error_metrics(insitu_data, model_data)

            # Store results
            station_name = filename.split('.')[0]
            rmse_dict[station_name] = rmse
            mae_dict[station_name] = mae
            mbe_dict[station_name] = mbe
        except Exception as e:
            print(f"Error processing {filename} in {model_folder}: {e}")
            raise  # Stop execution if there's a critical issue

    # Store metrics in global dictionaries
    RMSE[model_folder] = rmse_dict
    MAE[model_folder] = mae_dict
    MBE[model_folder] = mbe_dict

# Save Metrics to CSV Files
for metric, data in [("RMSE", RMSE), ("MAE", MAE), ("MBE", MBE)]:
    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_directory, f"temperature_{metric.lower()}.csv"))
    print(f"{metric} DataFrame saved successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All model directories validated successfully.
RMSE DataFrame saved successfully!
MAE DataFrame saved successfully!
MBE DataFrame saved successfully!


In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from google.colab import drive
drive.mount('/content/drive')


# Define a function to calculate error metrics for a single station
def calculate_error_metrics(insitu_data, model_data):
    # Reshape insitu and model data to long format
    insitu_data = insitu_data.melt(id_vars=['Year'], var_name='Month', value_name='Precipitation_insitu')
    model_data = model_data.melt(id_vars=['Year'], var_name='Month', value_name='Precipitation_model')

    # Merge insitu and model data on 'Year' and 'Month' columns
    merged_data = pd.merge(insitu_data, model_data, on=['Year', 'Month'])
    if merged_data.empty:
        raise ValueError("Merged data is empty. Check data consistency.")

    # Calculate error metrics
    rmse = np.sqrt(mean_squared_error(merged_data['Precipitation_insitu'], merged_data['Precipitation_model']))
    mae = mean_absolute_error(merged_data['Precipitation_insitu'], merged_data['Precipitation_model'])
    mbe = np.mean(merged_data['Precipitation_model'] - merged_data['Precipitation_insitu'])
    return rmse, mae, mbe

# Directories
insitu_directory = "/content/drive/My Drive/koppen_maps/insitu/precipitation/"
base_dir = "/content/drive/My Drive/koppen_maps/models_errors/models_pixelValue/"
output_directory = "/content/drive/My Drive/koppen_maps/models_errors/precipitation_error/"

# Ensure the output directory exists
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Validate all model directories
insitu_files = {file for file in os.listdir(insitu_directory) if file.endswith('.csv')}
expected_file_count = 34

for model_folder in os.listdir(base_dir):
    model_directory = os.path.join(base_dir, model_folder, "precipitation/")
    if not os.path.isdir(model_directory):
        continue

    # Get the list of .csv files in the model directory
    model_files = {file for file in os.listdir(model_directory) if file.endswith('.csv')}

    # Check the file count
    if len(model_files) != expected_file_count:
        raise ValueError(f"{model_folder} contains {len(model_files)} .csv files, but {expected_file_count} are required.")

    # Check for missing or extra files
    missing_files = insitu_files - model_files
    extra_files = model_files - insitu_files

    if missing_files:
        raise FileNotFoundError(f"{model_folder} is missing .csv files: {missing_files}")
    if extra_files:
        raise ValueError(f"{model_folder} contains extra .csv files: {extra_files}")

print("All model directories validated successfully.")

# Calculate Metrics
RMSE, MAE, MBE = {}, {}, {}

for model_folder in os.listdir(base_dir):
    model_directory = os.path.join(base_dir, model_folder, "precipitation/")
    if not os.path.isdir(model_directory):
        continue

    # Dictionaries to store metrics for the current model
    rmse_dict, mae_dict, mbe_dict = {}, {}, {}

    for filename in insitu_files:
        try:
            # Load insitu and model data
            insitu_data = pd.read_csv(os.path.join(insitu_directory, filename))
            model_data = pd.read_csv(os.path.join(model_directory, filename))

            # Calculate error metrics
            rmse, mae, mbe = calculate_error_metrics(insitu_data, model_data)

            # Store results
            station_name = filename.split('.')[0]
            rmse_dict[station_name] = rmse
            mae_dict[station_name] = mae
            mbe_dict[station_name] = mbe
        except Exception as e:
            print(f"Error processing {filename} in {model_folder}: {e}")
            raise  # Stop execution if there's a critical issue

    # Store metrics in global dictionaries
    RMSE[model_folder] = rmse_dict
    MAE[model_folder] = mae_dict
    MBE[model_folder] = mbe_dict

# Save Metrics to CSV Files
for metric, data in [("RMSE", RMSE), ("MAE", MAE), ("MBE", MBE)]:
    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_directory, f"precip_{metric.lower()}.csv"))
    print(f"{metric} DataFrame saved successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All model directories validated successfully.
RMSE DataFrame saved successfully!
MAE DataFrame saved successfully!
MBE DataFrame saved successfully!


Best fit model based on temperature RMSE

In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Read the CSV file into a Pandas DataFrame
file_path = '/content/drive/My Drive/koppen_maps/models_errors/temperature_error/temperature_rmse.csv'
data = pd.read_csv(file_path, index_col=0)

# Find the indices of the 3 lowest values for each station
best_models_indices = data.apply(lambda x: x.argsort().head(3), axis=1)

# Retrieve the corresponding models for the best four values
best_models = pd.DataFrame(columns=['Best Model 1', 'Best Model 2', 'Best Model 3'])

for index, row in best_models_indices.iterrows():
    best_models.loc[index] = data.columns[row].values

# Output the results
#print("Best models for each station based on RMSE:")
#print(best_models)

modelscore = {}
for (st, r) in best_models.iterrows():
    for i in range(3):
        m = r.array[i]
        if m not in modelscore:
            modelscore[m] = 0
        modelscore[m] += 3 - i

# Sort the dictionary items by value in descending order
sorted_modelscore = sorted(modelscore.items(), key=lambda x: x[1], reverse=True)

# Display the sorted modelscore as a column
for model, score in sorted_modelscore:
    print(f"{model}: {score}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GISS-E2-1-G: 83
MIROC-ES2L: 55
EC-Earth3-Veg-LR: 19
UKESM1-0-LL: 7
INM-CM5-0: 7
MRI-ESM2-0: 5
TaiESM1: 5
CMCC-CM2-SR5: 5
MIROC6: 4
KIOST-ESM: 4
BCC-CSM2-MR: 2
HadGEM3-GC31-LL: 2
GFDL-ESM4: 2
CESM2-WACCM: 2
MPI-ESM1-2-HR: 1
CNRM-ESM2-1: 1


Best fit model based on precipitation RMSE

In [8]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Read the CSV file into a Pandas DataFrame
file_path = '/content/drive/My Drive/koppen_maps/models_errors/precipitation_error/precipitation_rmse.csv'
data = pd.read_csv(file_path, index_col=0)

# Find the indices of the 3 lowest values for each station
best_models_indices = data.apply(lambda x: x.argsort().head(3), axis=1)

# Retrieve the corresponding models for the best four values
best_models = pd.DataFrame(columns=['Best Model 1', 'Best Model 2', 'Best Model 3'])

for index, row in best_models_indices.iterrows():
    best_models.loc[index] = data.columns[row].values

# Output the results
#print("Best models for each station based on RMSE:")
#print(best_models)

modelscore = {}
for (st, r) in best_models.iterrows():
    for i in range(3):
        m = r.array[i]
        if m not in modelscore:
            modelscore[m] = 0
        modelscore[m] += 3 - i

# Sort the dictionary items by value in descending order
sorted_modelscore = sorted(modelscore.items(), key=lambda x: x[1], reverse=True)

# Display the sorted modelscore as a column
for model, score in sorted_modelscore:
    print(f"{model}: {score}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MIROC-ES2L: 48
FGOALS-g3: 33
UKESM1-0-LL: 23
GISS-E2-1-G: 22
INM-CM5-0: 13
MIROC6: 9
INM-CM4-8: 8
NorESM2-LM: 7
ACCESS-CM2: 6
MPI-ESM1-2-LR: 5
CanESM5: 4
KACE-1-0-G: 4
HadGEM3-GC31-LL: 3
TaiESM1: 3
IPSL-CM6A-LR: 3
CESM2: 3
CMCC-CM2-SR5: 3
BCC-CSM2-MR: 2
CNRM-CM6-1: 1
GFDL-ESM4: 1
MRI-ESM2-0: 1
EC-Earth3: 1
NorESM2-MM: 1


Best fit model based on temperature MAE

In [9]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Read the CSV file into a Pandas DataFrame
file_path = '/content/drive/My Drive/koppen_maps/models_errors/temperature_error/temperature_mae.csv'
data = pd.read_csv(file_path, index_col=0)

# Find the indices of the 3 lowest values for each station
best_models_indices = data.apply(lambda x: x.argsort().head(3), axis=1)

# Retrieve the corresponding models for the best four values
best_models = pd.DataFrame(columns=['Best Model 1', 'Best Model 2', 'Best Model 3'])

for index, row in best_models_indices.iterrows():
    best_models.loc[index] = data.columns[row].values

# Output the results
#print("Best models for each station based on RMSE:")
#print(best_models)

modelscore = {}
for (st, r) in best_models.iterrows():
    for i in range(3):
        m = r.array[i]
        if m not in modelscore:
            modelscore[m] = 0
        modelscore[m] += 3 - i

# Sort the dictionary items by value in descending order
sorted_modelscore = sorted(modelscore.items(), key=lambda x: x[1], reverse=True)

# Display the sorted modelscore as a column
for model, score in sorted_modelscore:
    print(f"{model}: {score}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GISS-E2-1-G: 81
INM-CM5-0: 35
MIROC-ES2L: 23
EC-Earth3-Veg-LR: 17
UKESM1-0-LL: 8
KIOST-ESM: 8
TaiESM1: 5
CMCC-CM2-SR5: 5
CESM2-WACCM: 4
ACCESS-CM2: 3
BCC-CSM2-MR: 2
MIROC6: 2
ACCESS-ESM1-5: 2
NorESM2-LM: 2
MRI-ESM2-0: 2
CNRM-ESM2-1: 2
HadGEM3-GC31-LL: 1
INM-CM4-8: 1
GFDL-ESM4: 1


Best fit model based on precipitation MAE

In [10]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Read the CSV file into a Pandas DataFrame
file_path = '/content/drive/My Drive/koppen_maps/models_errors/precipitation_error/precipitation_mae.csv'
data = pd.read_csv(file_path, index_col=0)

# Find the indices of the 3 lowest values for each station
best_models_indices = data.apply(lambda x: x.argsort().head(3), axis=1)

# Retrieve the corresponding models for the best four values
best_models = pd.DataFrame(columns=['Best Model 1', 'Best Model 2', 'Best Model 3'])

for index, row in best_models_indices.iterrows():
    best_models.loc[index] = data.columns[row].values

# Output the results
#print("Best models for each station based on RMSE:")
#print(best_models)

modelscore = {}
for (st, r) in best_models.iterrows():
    for i in range(3):
        m = r.array[i]
        if m not in modelscore:
            modelscore[m] = 0
        modelscore[m] += 3 - i

# Sort the dictionary items by value in descending order
sorted_modelscore = sorted(modelscore.items(), key=lambda x: x[1], reverse=True)

# Display the sorted modelscore as a column
for model, score in sorted_modelscore:
    print(f"{model}: {score}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MIROC-ES2L: 48
GISS-E2-1-G: 35
FGOALS-g3: 34
INM-CM5-0: 21
UKESM1-0-LL: 16
ACCESS-CM2: 11
NorESM2-LM: 7
KACE-1-0-G: 6
INM-CM4-8: 5
MIROC6: 5
HadGEM3-GC31-LL: 4
CanESM5: 4
TaiESM1: 3
CNRM-CM6-1: 2
CESM2-WACCM: 1
IPSL-CM6A-LR: 1
BCC-CSM2-MR: 1


Best fit model based on temperature MBE

In [11]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Read the CSV file into a Pandas DataFrame
file_path = '/content/drive/My Drive/koppen_maps/models_errors/temperature_error/temperature_mbe.csv'
data = pd.read_csv(file_path, index_col=0)

# Find the indices of the 3 lowest values for each station
best_models_indices = data.apply(lambda x: x.argsort().head(3), axis=1)

# Retrieve the corresponding models for the best four values
best_models = pd.DataFrame(columns=['Best Model 1', 'Best Model 2', 'Best Model 3'])

for index, row in best_models_indices.iterrows():
    best_models.loc[index] = data.columns[row].values

# Output the results
#print("Best models for each station based on RMSE:")
#print(best_models)

modelscore = {}
for (st, r) in best_models.iterrows():
    for i in range(3):
        m = r.array[i]
        if m not in modelscore:
            modelscore[m] = 0
        modelscore[m] += 3 - i

# Sort the dictionary items by value in descending order
sorted_modelscore = sorted(modelscore.items(), key=lambda x: x[1], reverse=True)

# Display the sorted modelscore as a column
for model, score in sorted_modelscore:
    print(f"{model}: {score}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GFDL-CM4: 60
MPI-ESM1-2-LR: 42
INM-CM4-8: 30
NESM3: 23
MIROC6: 22
KIOST-ESM: 10
EC-Earth3: 6
BCC-CSM2-MR: 6
FGOALS-g3: 3
NorESM2-MM: 2


Best fit model based on precipitation MBE

In [12]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Read the CSV file into a Pandas DataFrame
file_path = '/content/drive/My Drive/koppen_maps/models_errors/precipitation_error/precipitation_mbe.csv'
data = pd.read_csv(file_path, index_col=0)

# Find the indices of the 3 lowest values for each station
best_models_indices = data.apply(lambda x: x.argsort().head(3), axis=1)

# Retrieve the corresponding models for the best four values
best_models = pd.DataFrame(columns=['Best Model 1', 'Best Model 2', 'Best Model 3'])

for index, row in best_models_indices.iterrows():
    best_models.loc[index] = data.columns[row].values

# Output the results
#print("Best models for each station based on RMSE:")
#print(best_models)

modelscore = {}
for (st, r) in best_models.iterrows():
    for i in range(3):
        m = r.array[i]
        if m not in modelscore:
            modelscore[m] = 0
        modelscore[m] += 3 - i

# Sort the dictionary items by value in descending order
sorted_modelscore = sorted(modelscore.items(), key=lambda x: x[1], reverse=True)

# Display the sorted modelscore as a column
for model, score in sorted_modelscore:
    print(f"{model}: {score}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
KACE-1-0-G: 57
IPSL-CM6A-LR: 35
FGOALS-g3: 24
HadGEM3-GC31-LL: 20
GISS-E2-1-G: 15
MPI-ESM1-2-HR: 12
NorESM2-LM: 8
NESM3: 6
CanESM5: 6
MIROC-ES2L: 5
GFDL-ESM4: 3
MPI-ESM1-2-LR: 3
KIOST-ESM: 2
GFDL-CM4: 2
CMCC-CM2-SR5: 2
UKESM1-0-LL: 1
BCC-CSM2-MR: 1
ACCESS-ESM1-5: 1
INM-CM4-8: 1
