#### Import necessary libraries for data handling and visualization.

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.ticker as mticker

#### Load the CSV files with data from the CEO validation campaigns (ERP 4000 points, 1st validation 835 points, 2nd validation 3467 points)

In [4]:
# Semicolon-delimited CEO exports: the ERP set first, then the two
# validation rounds.
erp4k = pd.read_csv(
    '/home/sepal-user/eSBAE_CIV/erp4K_final.csv',
    delimiter=';',
)
pts835 = pd.read_csv(
    '/home/sepal-user/eSBAE_CIV/835_final.csv',
    delimiter=';',
)
pts3k = pd.read_csv(
    '/home/sepal-user/eSBAE_CIV/3K_final.csv',
    delimiter=';',
)

#### Merge pts835 and pts3k into a new dataframe, {merged_national} = 4302 points, which could be used as a training dataset

In [5]:
# Stack the two validation rounds row-wise (original row indices are kept).
merged_national = pd.concat([pts835, pts3k], axis=0)
# Row count of the stacked table.
len(merged_national.index)

4302

#### Load the country 1 km grid CSV or GPKG file with all the points and columns, including the strata (kmeans), chg_prob, etc.
#### For Ivory Coast we have 325631 points 

In [None]:
# Read the classified national grid from the GeoPackage file.
df320k = gpd.read_file('/home/sepal-user/module_results/esbae/Cote_Ivoire_MRV/cote_ivoire_all_classified_20231114.gpkg')
# Number of grid points loaded.
len(df320k)

In [None]:
# Save the same table as Parquet alongside the GeoPackage
# (presumably for faster reloads later; confirm with the author).
df320k.to_parquet('/home/sepal-user/module_results/esbae/Cote_Ivoire_MRV/cote_ivoire_all_classified_20231114.parquet')

In [None]:
# Show every column name of the national grid table as a plain list.
list(df320k.columns)

In [None]:
# Target column holding the classes (e.g. REDD+ activities) for which
# area estimates are wanted.
column_of_interest = 'ocs_2020'

In [None]:
# Merge the full grid columns (point_id, kmeans_multi) with the ~3,000 validated points (pts3k)

In [None]:
# Left join: keep every grid point with its stratum; only validated points
# receive a class label, all others get NaN in the target column.
df_merged = df320k[['point_id', 'kmeans_multi']].merge(
    pts3k[['point_id', column_of_interest]],
    on='point_id',
    how='left',
)

In [None]:
# Distinct stratum ids and the number of grid points in each.
np.unique(df_merged['kmeans_multi'], return_counts=True)

#### Perform the area calculation using the stratum column. In this case the column is called kmeans_multi. Use the merged dataframe (national grid points + CEO validated points)

In [None]:
def calculate_areas(df_merged, kozak_column, target_column, total_area, z_score):
    """Estimate per-class areas and confidence intervals from a stratified sample.

    Rows of ``df_merged`` whose ``target_column`` is not missing form the
    interpreted sample; all rows together define the stratum weights
    (share of rows per stratum).  For every class found in the sample, the
    class proportion inside each stratum is scaled by that stratum's area.

    Only strata that contain at least one interpreted row are evaluated, so
    strata without any interpreted row contribute nothing to the totals.

    Args:
        df_merged: DataFrame with one row per grid point, holding the stratum
            column and the (partly missing) target column.  Not modified.
        kozak_column: Name of the stratum column.
        target_column: Name of the column holding the interpreted classes.
        total_area: Area represented by all rows of ``df_merged``; results
            are expressed in the same unit.
        z_score: Multiplier applied to the standard error to obtain the
            confidence-interval half-width (per stratum and in total).

    Returns:
        DataFrame indexed by class with the columns
        ``area_stratum_<s>`` / ``ci_stratum_<s>`` for each evaluated stratum,
        plus ``area_total``, ``ci_total`` and ``relative CI`` (percent).
    """
    # Labelled rows are the sample; the untouched full frame gives the weights.
    sample = df_merged[df_merged[target_column].notna()]
    categories = sample[target_column].unique()
    print(categories)

    # Stratum area and the sample labels inside each stratum do not depend on
    # the class, so compute them once instead of once per class.
    n_total = len(df_merged)
    strata = []
    for stratum in sample[kozak_column].unique():
        if pd.isna(stratum):
            continue
        weight = (df_merged[kozak_column] == stratum).sum() / n_total
        labels = sample.loc[sample[kozak_column] == stratum, target_column]
        strata.append((stratum, weight * total_area, labels))

    results = {}
    for category in categories:
        print(f' Calculating stats for {category}')
        n_hits = (sample[target_column] == category).sum()
        print(f'There are {n_hits} entries of {category} in {target_column}.')

        row = {}
        area_total, variance_total = 0.0, 0.0
        for stratum, stratum_area, labels in strata:
            # 1.0 where the interpreted point belongs to this class, else 0.0
            hits = (labels == category).to_numpy(dtype=float)

            area = hits.mean() * stratum_area

            # Standard error of the class proportion, scaled to stratum area.
            # NOTE(review): this is the population variance (ddof=0), as in
            # the previous implementation; the unbiased estimator for
            # stratified samples divides by n - 1. Confirm which is wanted.
            se = np.sqrt(np.var(hits) / len(hits)) * stratum_area

            area_total += area
            variance_total += se ** 2  # strata are independent: variances add

            row[f'area_stratum_{stratum}'] = area
            # use the caller's z-score so stratum and total CIs are consistent
            row[f'ci_stratum_{stratum}'] = z_score * se

        ci_total = z_score * np.sqrt(variance_total)
        row['area_total'] = area_total
        row['ci_total'] = ci_total
        # avoid dividing by zero when the class has no area in any stratum
        row['relative CI'] = ci_total / area_total * 100 if area_total else np.nan
        results[category] = row

    return pd.DataFrame.from_dict(results, orient='index')

In [None]:
# total_area is the number of grid points, so areas come out in grid cells;
# z_score=1.645 is the multiplier applied to the standard errors.
calculate_areas(
    df_merged=df_merged,
    kozak_column='kmeans_multi',
    target_column=column_of_interest,
    total_area=len(df_merged),
    z_score=1.645,
)