# Comparison of Zonal Stats (from different processing methods)

This notebook is an exploratory analysis that compares statistics for different processing extents and methods. Primarily, it explores differences between the partially processed and the fully processed (wall-to-wall) TML data for specific regions. It additionally compares these with the zonal statistics produced by the Data API / GFW backend.

In [None]:
import os
import rasterio as rs
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd 
import pandas as pd
import fiona

from rasterio.plot import show
from rasterio.plot import show_hist
from osgeo import gdal

from numpy.ma import masked_array

## Comparison of statistics for full vs partial processing extent

## Area sampled (per land cover class)

In [None]:
def _area_by_class(stats):
    '''
    Summarises sampled, total and not-sampled hectares per ESA land cover class.
    Rows for the 'No Data (flag)' class are removed from the result.
    '''
    columns = ['country', 'admin', 'esa_class', 'esa_id', 'esa_sampled_ha', 'esa_total_ha']
    # the area columns repeat across rows (presumably one row per tree cover
    # class - TODO confirm), so de-duplicate before summing
    area = stats[columns].drop_duplicates(keep='first', ignore_index=True)
    area['esa_not_sampled'] = area['esa_total_ha'] - area['esa_sampled_ha']
    # aggregate only the numeric area columns; summing the text columns is meaningless
    area = (area.groupby('esa_class')[['esa_sampled_ha', 'esa_total_ha', 'esa_not_sampled']]
                .sum()
                .reset_index())
    return area[area.esa_class != 'No Data (flag)']


def compare_lc_sampled(country):
    '''
    Plots, per ESA land cover class, the hectares sampled and not sampled
    for the full and the partial processing extent of a country.

    Reads comparisons/full_processing_area.csv,
    comparisons/{country}_statistics_full.csv and
    comparisons/{country}_statistics_partial.csv.

    The partial data is aligned to the land cover classes of the full data:
    classes that only exist in the partial data are dropped, classes that only
    exist in the full data are plotted as 0 ha for the partial extent.
    A message is printed in both cases.
    '''

    # get the full processing extent for the country
    full_proc = pd.read_csv('comparisons/full_processing_area.csv')
    extent_full = full_proc[full_proc.country == country]

    # calculate the area sampled for the full and the partial extent
    full = _area_by_class(pd.read_csv(f'comparisons/{country}_statistics_full.csv'))
    partial = _area_by_class(pd.read_csv(f'comparisons/{country}_statistics_partial.csv'))

    print(f'Processing extent for {country}:')
    print(f'Calculated extent sampled (full): {round(full.esa_sampled_ha.sum(),1)} ha')
    # .item() requires exactly one row for the country in the extent table
    print(f'Actual extent sampled (full): {extent_full.full_area_ha.item()} ha')

    # check for differences in lccs sampled, in both directions
    classes = full.esa_class.tolist()
    partial_classes = partial.esa_class.tolist()
    only_partial = [item for item in partial_classes if item not in classes]
    only_full = [item for item in classes if item not in partial_classes]
    if only_partial:
        print(f'{only_partial} caused shape mismatch and was dropped from the partial dataset.')
    if only_full:
        print(f'{only_full} missing from the partial dataset and plotted as 0 ha.')

    # align partial row-for-row with full so both bar groups share positions
    partial = (partial.set_index('esa_class')
                      .reindex(classes, fill_value=0)
                      .reset_index())

    # create the position of the grouped bars
    width = 0.4
    pos1 = np.arange(len(full))
    pos2 = pos1 + width

    plt.figure(figsize=(15,9))

    # stacked bars: sampled first, not sampled stacked to its right
    bars1 = full.esa_sampled_ha
    bars2 = full.esa_not_sampled
    bars3 = partial.esa_sampled_ha
    bars4 = partial.esa_not_sampled

    plt.barh(pos1, bars1, width, color="gold", edgecolor='white', label='sampled (full)')
    plt.barh(pos1, bars2, width, left=bars1, color="darkslateblue", edgecolor='white', label='not sampled (full)')
    plt.barh(pos2, bars3, width, color="palegoldenrod", edgecolor='white', label='sampled (partial)')
    plt.barh(pos2, bars4, width, left=bars3, color="slateblue", edgecolor='white', label='not sampled (partial)')

    plt.title(f'Land Cover Sampled in {country}')
    plt.xlabel('Tree Cover (ha)')
    # centre each tick between the full and the partial bar of a class
    plt.yticks(pos1 + width / 2, classes)
    plt.ticklabel_format(useOffset=False, style='plain', axis='x')
    plt.grid(axis='x', linestyle='-', linewidth=.2)
    plt.legend(loc='lower right');

    return None
    

In [None]:
# Plot sampled vs. not-sampled hectares per land cover class, one country per cell.
compare_lc_sampled('El Salvador')

In [None]:
compare_lc_sampled('Costa Rica')

In [None]:
compare_lc_sampled('Belize')

In [None]:
compare_lc_sampled('Panama')

In [None]:
compare_lc_sampled('Honduras')

In [None]:
compare_lc_sampled('Nicaragua')

In [None]:
compare_lc_sampled('Guatemala')

## Total hectares of tree cover

In [None]:
def compare_lc_totalha(country):
    '''
    Plots the difference (full minus partial) in TML tree cover hectares
    per ESA land cover class for a country.

    A land cover class that is absent from one of the two extents is treated
    as 0 ha for that extent, so its difference is still drawn.
    Bars are green where full >= partial and red where full < partial.
    '''

    value_cols = ['tof_ha', 'hans_ha']

    full = pd.read_csv(f'comparisons/{country}_statistics_full.csv')
    # NOTE(review): compare_lc_sampled reads '{country}_statistics_partial.csv'
    # for the partial extent; confirm which file name is intended here.
    partial = pd.read_csv(f'comparisons/{country}_statistics.csv')

    # sum only the hectare columns per land cover class
    full_sum = full.groupby('esa_class')[value_cols].sum()
    partial_sum = partial.groupby('esa_class')[value_cols].sum()

    # fill_value=0 keeps classes that exist in only one extent (otherwise NaN)
    diverge_sum = full_sum.sub(partial_sum, fill_value=0)
    colors = ['red' if x < 0 else 'green' for x in diverge_sum.tof_ha]

    plt.figure(figsize=(15,7))
    plt.hlines(y=diverge_sum.index,
               xmin=0,
               xmax=diverge_sum.tof_ha,
               alpha=0.7,
               linewidth=7,
               colors=colors)

    plt.ticklabel_format(useOffset=False, style='plain', axis='x')
    plt.grid(axis='x', linestyle='-', linewidth=.2)
    plt.title(f'Difference in TML tree cover (hectares) per land cover class: {country}')
    plt.xlabel('tree cover (ha)');

    return None

In [None]:
# Plot the full-minus-partial difference in tree cover hectares, one country per cell.
compare_lc_totalha('Panama')

In [None]:
compare_lc_totalha('Guatemala')

In [None]:
compare_lc_totalha('El Salvador')

In [None]:
compare_lc_totalha('Nicaragua')

In [None]:
compare_lc_totalha('Belize')

In [None]:
compare_lc_totalha('Honduras')

In [None]:
def compare_lc_mean(country):
    '''
    Plots the mean of the tof_mean column per ESA land cover class for the
    full and the partial processing extent of a country, side by side.

    Only land cover classes present in both extents are shown (inner merge).
    The mean is an unweighted average over the rows of each class.
    '''

    full = pd.read_csv(f'comparisons/{country}_statistics_full.csv')
    # NOTE(review): compare_lc_sampled reads '{country}_statistics_partial.csv'
    # for the partial extent; confirm which file name is intended here.
    partial = pd.read_csv(f'comparisons/{country}_statistics.csv')

    # select the numeric column before aggregating: averaging the text columns
    # of the statistics table is an error in recent pandas versions
    full_mean = full.groupby('esa_class')[['tof_mean']].mean()
    partial_mean = partial.groupby('esa_class')[['tof_mean']].mean()
    # merge suffixes: tof_mean_x is the full extent, tof_mean_y the partial one
    mean_merged = full_mean.merge(partial_mean, on='esa_class')

    plt.figure(figsize=(12,8))

    width = 0.4
    pos1 = np.arange(len(mean_merged))
    pos2 = pos1 + width

    plt.barh(pos1, mean_merged.tof_mean_x, width, color='gold', edgecolor='white', label='Full')
    plt.barh(pos2, mean_merged.tof_mean_y, width, color='palegoldenrod', edgecolor='white', label='Partial')

    plt.xlabel('% Tree Cover')
    # centre each tick between the two bars of a class
    plt.yticks(pos1 + width / 2, mean_merged.index.values)
    plt.title(f'Mean Tree Cover per Land Cover Class: {country}')
    plt.grid(axis='x', linestyle='-', linewidth=.3)
    plt.legend();

    return None

## Forest Cover Compliant

In [None]:
def create_regional_csv(list_of_countries, region, extent):
    '''
    Stacks the per-country statistics tables of a region into one csv.

    Reads comparisons/{country}_statistics_{extent}.csv for every country,
    concatenates the tables in the given order with a fresh index, and
    writes the result to comparisons/{region}_{extent}.csv.
    '''

    country_tables = [
        pd.read_csv(f'comparisons/{name}_statistics_{extent}.csv')
        for name in list_of_countries
    ]

    combined = pd.concat(country_tables, ignore_index=True)
    combined.to_csv(f'comparisons/{region}_{extent}.csv', index=False)

    return None

In [None]:
# central america full processing
# writes comparisons/central_america_full.csv
create_regional_csv(['Belize', 
                     'Honduras', 
                     'Guatemala', 
                     'El Salvador', 
                     'Costa Rica', 
                     'Nicaragua', 
                     'Panama'], 'central_america', 'full')

In [None]:
# central america partial processing
# writes comparisons/central_america_partial.csv
create_regional_csv(['Belize', 
                     'Honduras', 
                     'Guatemala', 
                     'El Salvador', 
                     'Costa Rica', 
                     'Nicaragua', 
                     'Panama'], 'central_america', 'partial')

In [None]:
# west africa full processing (missing Cape Verde, Ghana, Mali, Nigeria)
# writes comparisons/west_africa_full.csv
create_regional_csv(['Benin',
                    'Burkina Faso',
                    'Ivory Coast',
                    'Gambia',
                    'Guinea',
                    'Liberia',
                    'Mauritania',
                    'Niger',
                    'Senegal',
                    'Sierra Leone',
                    'Togo'], 'west_africa', 'full')

In [None]:
def _percent_over_under(country_df, esa_ids, under10_label):
    '''
    Returns (% hectares above 10% tree cover, % hectares below 10% tree cover)
    for the rows of country_df whose esa_id is in esa_ids.
    Returns (nan, nan) when there are no hectares to normalise by.
    '''
    subset = country_df[country_df.esa_id.isin(esa_ids)]
    ha_per_class = subset.groupby('tree_cover_class')['tof_ha'].sum()
    # look the class up by label, not by row position, so a missing class
    # cannot shift another class into its place
    under10 = ha_per_class.get(under10_label, 0)
    over10 = ha_per_class.drop(under10_label, errors='ignore').sum()
    total = under10 + over10
    if total == 0:
        return np.nan, np.nan
    return over10 / total * 100, under10 / total * 100


def _label_segments(x_labels, centers, values):
    '''Writes the rounded percentage in the middle of each bar segment.'''
    for xpos, ypos, yval in zip(x_labels, centers, values):
        if np.isnan(yval):
            # nothing to label for a country without data
            continue
        plt.text(xpos, ypos, f'{round(yval)}%', ha="center", va="center")


def _plot_compliance_panel(position, x_labels, over10, under10, title, rotation):
    '''Draws one stacked bar panel (>10% at the bottom, <10% on top).'''
    plt.subplot(1, 2, position)
    plt.bar(x_labels, over10, color="seagreen", label='>10% tree cover', capsize=4)
    plt.bar(x_labels, under10, bottom=over10, color="honeydew", label='<10% tree cover')

    _label_segments(x_labels, over10 / 2, over10)
    _label_segments(x_labels, over10 + under10 / 2, under10)

    plt.xlabel(' ')
    plt.xticks(rotation=rotation)
    plt.ylabel('% Land')
    plt.ticklabel_format(style='plain', axis='y')
    plt.title(title)
    plt.legend(loc='lower right')


def compare_forest_cover_compliant(region, extent, figsize, rotation, under10_label='0-9'):
    '''
    Plots, per country of a region, the share of agricultural and urban land
    (in hectares of the tof_ha column) above and below 10% tree cover.

    region, extent: select the input file comparisons/{region}_{extent}.csv
    figsize: matplotlib figure size tuple
    rotation: rotation of the country labels on the x axis
    under10_label: value of tree_cover_class that identifies the <10% class

    Countries are plotted in alphabetical order so that plots of different
    extents can be compared bar by bar.
    '''

    region_df = pd.read_csv(f'comparisons/{region}_{extent}.csv')

    # ESA land cover ids treated as agriculture / urban
    ag_ids = [10.0, 11.0, 12.0, 20.0, 30.0, 40.0]
    urban_ids = [190.0]

    x_labels, ag_bar1, ag_bar2, urban_bar1, urban_bar2 = [],[],[],[],[]

    for country in sorted(region_df.country.dropna().unique()):

        country_df = region_df[region_df.country == country]
        ag_over10, ag_under10 = _percent_over_under(country_df, ag_ids, under10_label)
        urban_over10, urban_under10 = _percent_over_under(country_df, urban_ids, under10_label)

        x_labels.append(country)
        ag_bar1.append(ag_over10) # >10% is on the bottom
        ag_bar2.append(ag_under10) # <10% is on the top
        urban_bar1.append(urban_over10)
        urban_bar2.append(urban_under10)

    # convert to array in order to position the data labels arithmetically
    ag_bar1 = np.asarray(ag_bar1)
    ag_bar2 = np.asarray(ag_bar2)
    urban_bar1 = np.asarray(urban_bar1)
    urban_bar2 = np.asarray(urban_bar2)

    plt.figure(figsize=figsize)

    _plot_compliance_panel(1, x_labels, ag_bar1, ag_bar2,
                           f'% Agricultural Land Meeting \n 10% Forest Cover Criteria \n ({extent})',
                           rotation)
    _plot_compliance_panel(2, x_labels, urban_bar1, urban_bar2,
                           f'% Urban Land Meeting \n 10% Forest Cover Criteria \n ({extent})',
                           rotation)
    plt.tight_layout();

In [None]:
# arguments: region, extent, figure size, rotation of the x tick labels
compare_forest_cover_compliant('central_america', 'partial', (14,7), 0)

In [None]:
compare_forest_cover_compliant('central_america', 'full', (14,7), 0)

In [None]:
compare_forest_cover_compliant('west_africa', 'full', (14,7), 55)

### Conclusions
**Area sampled (per land cover class)**
- The 'No Data (flag)' land cover class is removed.
- We expect and see that the dark yellow bars are higher and the dark purple bars are smaller, indicating an increase in the sampling area during full processing. The most significant increases are in the tree cover (broadleaved/deciduous, broadleaved/evergreen) classes, as these were intentionally omitted in the initial processing extent.
- Additional labels for some land cover classes (same ESA label but different ESA ID number) were identified. This could result in a higher count of hectares for the sampled and total area for a land cover class if these labels were not previously identified.
- The no data flag is removed. If a land cover class does not appear in the visualization (a warning will print), the country did not have data for that land cover class in the initial processing, but does include that land cover class in the full processing.

**Total hectares of tree cover (per land cover class)**
- We expect and see the greatest differences in the tree cover (broadleaved/deciduous, broadleaved/evergreen) class. This is logical given the initial processing extent intentionally omitted these classes.
- We see an increase in the total hectares on cropland, which could be a result of the additional encoding/labeling of ESA ID's in that land cover category.
- Note: hectares of tree cover are not weighted by the percentage of tree cover within that hectare. A hectare with 10% tree cover is counted the same as a hectare with 90% tree cover.

**Forest cover compliant (per country)**
- In Central America, there's a fluctuation of ~1% between the partial and full processing.


# Comparison with GFW Pipeline

## Compare boundaries
Get shapefile from geostore through RW API and run through the pipeline, compare to downloaded zip from GADM. Since geostore only returns a single admin, compare the results for a single admin across three different countries (DRC, El Salvador, South Sudan).

In [None]:
# Read the Resource Watch API token from a local confuse config file.
# NOTE(review): `confuse` is not imported in the notebook's import cell, and the
# config path below is specific to one machine.
config = confuse.Configuration('sentinel-tree-cover')
config.set_file('/Users/jessica.ertel/sentinel-tree-cover/jessica-config.yaml')
api_token = config['rw']['token']

In [None]:
def get_gjson(country):

    '''
    Takes in a country to request admin 1 shapefile from geostore.
    Saves the geostore object (single admin shapefile) as a geojson.

    The country name is resolved to an ISO alpha-3 code with pycountry and
    the Resource Watch geostore endpoint /v2/geostore/admin/{iso}/1 is queried
    using the module-level api_token. The 'attributes' object of the response
    is written to '{country}_adminboundaries.geojson' in the working directory
    (the file is named after the country, not after the returned admin).

    Raises LookupError if pycountry does not know the country name and
    requests.HTTPError if the API answers with an error status.
    '''
    # json is not imported at module level in this notebook
    import json

    # use pycountry to get country ISO
    match = pycountry.countries.get(name = country)
    if match is None:
        raise LookupError(f"pycountry has no country named '{country}'; "
                          f"try pycountry.countries.search_fuzzy('{country}')")
    iso = match.alpha_3

    # Get geostore object by ISO endpoint and GADM admin boundary 1
    rw_url = (f'https://api.resourcewatch.org/v2/geostore/admin/{iso}/1')
    my_headers = {'Authorization': 'Bearer ' + str(api_token)}
    # timeout so an unresponsive API cannot block the notebook indefinitely
    response = requests.get(url=rw_url, headers=my_headers, timeout=60)
    print(response)
    # fail here with the HTTP error instead of a KeyError on the payload below
    response.raise_for_status()

    # save response as geojson
    data = response.json()
    geojson = data['data']['attributes']
    geojson['geojson']['crs'] = 'epsg:4326'

    # check which admin was returned
    print(geojson['info']['name'])

    with open(f'{country}_adminboundaries.geojson', 'w', encoding='utf-8') as f:
        json.dump(geojson, f)

    return None

In [None]:
# NOTE(review): get_gjson writes '<country>_adminboundaries.geojson', while the
# cells further down read '<admin>_adminboundaries.geojson' - presumably the
# files were renamed by hand; confirm.
# admin is Central Equatoria
get_gjson('South Sudan')

In [None]:
# admin is Bas-Uélé
get_gjson('Congo, The Democratic Republic of the')

In [None]:
# admin is Ahuachapán
get_gjson('El Salvador')

In [None]:
# add NAME_1 column so geojsons are compatible with tml analysis pipeline reqs
# each cell overwrites the geojson in place and re-reads it to check the result
geostore_salvador = gpd.read_file('Ahuachapán_adminboundaries.geojson')
geostore_salvador['NAME_1'] = 'Ahuachapán'
geostore_salvador.to_file(f'Ahuachapán_adminboundaries.geojson', driver='GeoJSON')
check = gpd.read_file('Ahuachapán_adminboundaries.geojson')
check.head()

In [None]:
# same NAME_1 fix for the DRC admin
geostore_drc = gpd.read_file('Bas-Uélé_adminboundaries.geojson')
geostore_drc['NAME_1'] = 'Bas-Uélé'
geostore_drc.to_file(f'Bas-Uélé_adminboundaries.geojson', driver='GeoJSON')
check = gpd.read_file('Bas-Uélé_adminboundaries.geojson')
check.head()

In [None]:
# same NAME_1 fix for the South Sudan admin
geostore_ssudan = gpd.read_file('Central Equatoria_adminboundaries.geojson')
geostore_ssudan['NAME_1'] = 'Central Equatoria'
geostore_ssudan.to_file(f'Central Equatoria_adminboundaries.geojson', driver='GeoJSON')
check = gpd.read_file('Central Equatoria_adminboundaries.geojson')
check.head()

In [None]:
def gadm_vs_geostore(country, admin, gadm_admin=None, gadm_csv=None):

    '''
    Takes in a country and admin in order to import and filter the statistics
    to enable side by side comparison of differences.

    country: used to locate statistics/{country}_statistics_full.csv
    admin: used to locate statistics/{admin}_statistics_full_tmlonly.csv
    gadm_admin: spelling of the admin in the GADM statistics, when it differs
        from `admin` (e.g. no accents); defaults to `admin`
    gadm_csv: path of the GADM statistics file; defaults to the country file

    Returns a dataframe with one row per GADM land cover class and the columns
    esa_id, esa_class, tof_mean_gadm, tof_mean_geostore and diff
    (gadm minus geostore, rounded to 3 decimals). Rows are matched on esa_id;
    a class without a geostore value gets NaN.
    '''
    if gadm_admin is None:
        gadm_admin = admin
    if gadm_csv is None:
        gadm_csv = f'statistics/{country}_statistics_full.csv'

    # for gadm, import country stats and filter df to admin
    gadm = pd.read_csv(gadm_csv)
    gadm = gadm[gadm.admin == gadm_admin]
    gadm = gadm[['esa_id', 'esa_class', 'tof_mean']]
    gadm = gadm.drop_duplicates(keep='first', ignore_index=True).rename(columns={'tof_mean': 'tof_mean_gadm'})

    # for geostore, import admin stats
    geostore = pd.read_csv(f'statistics/{admin}_statistics_full_tmlonly.csv')
    geostore = geostore[['esa_id', 'tof_mean']]
    geostore = geostore.drop_duplicates(keep='first', ignore_index=True).rename(columns={'tof_mean': 'tof_mean_geostore'})

    # compare tof mean for geostore and gadm; match on the land cover id rather
    # than on row position, since the two tables need not list the same classes
    # (assumes one tof_mean per esa_id in each table - TODO confirm)
    comb = gadm.merge(geostore, on='esa_id', how='left')
    comb['diff'] = (comb.tof_mean_gadm - comb.tof_mean_geostore).round(3)

    return comb


In [None]:
# side-by-side tof_mean per land cover class: GADM boundary vs geostore boundary
gadm_vs_geostore('El Salvador', 'Ahuachapán')

In [None]:
gadm_vs_geostore('South Sudan', 'Central Equatoria')

In [None]:
# required different approach due to issue with accents on admin name:
# the GADM statistics spell the admin 'Bas-Uele', the geostore file 'Bas-Uélé'
# for gadm, import country stats and filter df to admin
gadm = pd.read_csv('statistics/DRC_statistics_full_tmlonly.csv')
gadm = gadm[gadm.admin == 'Bas-Uele']
gadm = gadm[['esa_id', 'esa_class', 'tof_mean']]
gadm = gadm.drop_duplicates(keep='first', ignore_index=True).rename(columns={'tof_mean': 'tof_mean_gadm'})

# for geostore, import admin stats
geostore = pd.read_csv('statistics/Bas-Uélé_statistics_full_tmlonly.csv')
geostore = geostore[['esa_id', 'esa_class','tof_mean']]
geostore = geostore.drop_duplicates(keep='first', ignore_index=True).rename(columns={'tof_mean': 'tof_mean_geostore'})

# compare tof mean for geostore and gadm; match rows on the land cover id
# instead of row position so differing class lists cannot misalign the values
comb = gadm.merge(geostore[['esa_id', 'tof_mean_geostore']], on='esa_id', how='left')
comb['diff'] = (comb.tof_mean_gadm - comb.tof_mean_geostore).round(3)
comb

## Compare tree cover by land cover class
GFW and TML both use the 2015 ESA land cover product for land cover classifications; however, GFW aggregates to IPCC land cover types. To compare, look at the total hectares of tree cover in agricultural areas for a country.

geostore IDs: 
- south sudan: 566e8323abaf1b7080b179bec5946ce6
- DRC: 2852c7accd29c848ed699cdff6bd275e
- el salvador: 8ea11cb2347e2e93ebc7e0ede15598ba


In [None]:
# Read the API token from a local confuse config file (same as the earlier config cell).
# NOTE(review): `confuse` is not imported in the notebook's import cell, and the
# config path below is specific to one machine.
config = confuse.Configuration('sentinel-tree-cover')
config.set_file('/Users/jessica.ertel/sentinel-tree-cover/jessica-config.yaml')
api_token = config['rw']['token']

In [None]:

# geostore ids of the three admins being compared
drc_id = '2852c7accd29c848ed699cdff6bd275e'
es_id = "8ea11cb2347e2e93ebc7e0ede15598ba"
ssudan_id = "566e8323abaf1b7080b179bec5946ce6"

# query the GFW Data API: sum of area (ha) per land cover class and tree cover decile
country_id = drc_id
url = f'https://data-api.globalforestwatch.org/dataset/wri_trees_in_mosaic_landscapes/v20220218/query/json?geostore_id={country_id}&geostore_origin=rw&sql=SELECT+sum%28area__ha%29+FROM+data+GROUP+BY+esa_land_cover_2015__class%2C+wri_trees_in_mosaic_landscapes__decile'
my_headers = {'Authorization': str(api_token)}
response = requests.get(url=url, headers=my_headers)
print(response)
# stop here on an HTTP error rather than failing on the payload below
response.raise_for_status()

# only the 'data' entry of the payload is needed; read it by key so the
# result does not depend on the order of the keys in the response
data = response.json()

# make into dataframe with updated columns
gfw = pd.DataFrame(data['data'])
gfw.rename(columns={"esa_land_cover_2015__class": "esa_class", 
                   "wri_trees_in_mosaic_landscapes__decile": "tree_cover_class"},
          inplace=True)

In [None]:
#Estimated: 226,175,961  
#Actual: 219,481,200 


In [None]:
# total area (ha) over all rows returned by the GFW query
sum(gfw.area__ha)

In [None]:
# this is the total ha trees in ag areas for DRC
gfw_ag = gfw[gfw.esa_class == 'Agriculture']
gfw_ag

In [None]:
# TML counterpart: hectares per tree cover class for the ESA ids
# 10, 11, 12, 20, 30, 40 (the ids this notebook treats as agriculture)
tml = pd.read_csv('statistics/DRC_statistics_full_tmlonly.csv')
tml_ag = tml[tml.esa_id.isin([10, 11, 12, 20, 30, 40])]
tml_ag = tml_ag[['tree_cover_class', 'tof_ha']]
tml_ag = tml_ag.groupby(by=['tree_cover_class']).sum().reset_index()
tml_ag

In [None]:
def esa_to_ipcc(df):

    '''
    Splits a statistics dataframe into IPCC land cover groups by esa_id.

    Returns a tuple of nine dataframes, each holding the rows of df whose
    esa_id belongs to the group, in this order: agriculture, forest,
    grassland, wetland, settlement, shrubland, sparse vegetation, bare, water.
    '''
    # ESA ids per IPCC group, listed in the order the groups are returned
    ipcc_groups = [
        [10, 11, 12, 20, 30, 40],                                         # agriculture
        [50, 60, 61, 62, 70, 71, 72, 80, 81, 82, 90, 100, 160, 170],      # forest
        [110, 130],                                                       # grassland
        [180],                                                            # wetland
        [190],                                                            # settlement
        [120, 121, 122],                                                  # shrubland
        [140, 150, 151, 152, 153],                                        # sparse vegetation
        [200, 201, 202],                                                  # bare
        [210],                                                            # water
    ]

    return tuple(df[df.esa_id.isin(ids)] for ids in ipcc_groups)

In [None]:
# NOTE(review): `drc` is not defined anywhere in the visible notebook - presumably
# the DRC statistics table (statistics/DRC_statistics_full_tmlonly.csv); confirm.
agriculture, forest, grassland, wetland, settlement, shrubland, sparse_veg, bare, water = esa_to_ipcc(drc)


In [None]:
# get the mean tree cover per land cover class

In [None]:
# get total ha per threshold per country
# NOTE(review): `df` is not assigned by any earlier top-level cell; this cell
# depends on a later or removed cell having run first.
df = df.groupby(by=['country', 'tree_cover_class']).sum().reset_index()
df = df[['country', 'tree_cover_class', 'tof_ha']] 

In [None]:
# agriculture rows of the DRC table (same ids as the agriculture group in esa_to_ipcc)
agriculture = drc[drc.esa_id.isin([10, 11, 12, 20, 20, 30, 40])]
agriculture[['admin', 'esa_id', 'esa_class', 'tree_cover_class']]

In [None]:
# in the 0-20 class total ha is 
# get total ha for TML agriculture
tml_agriculture = agriculture[['country', 'tree_cover_class', 'tof_ha']]
tml_agriculture.groupby('tree_cover_class').sum().reset_index()

In [None]:
# GFW rows for the aggregated 'Agriculture' class, for comparison with the TML totals
gfw_agriculture = gfw[gfw.esa_class == 'Agriculture']
gfw_agriculture

In [None]:
# get total ha for TML urban (settlement) land
tml_urban = settlement[['country', 'tree_cover_class', 'tof_ha']]
tml_urban.groupby('tree_cover_class').sum().reset_index()

## Compare resampling effects

**Processing Extents**   
DRC  
- Calculated from original TML tif: 229,218,208.13 ha
- Estimated area sampled (from TML pipeline): 226,175,961 ha  
- Extent Spreadsheet: 219,481,200 ha (difference from above -6,694,761 ha)
- GFW: 217,834,921.067 ha
- Wiki: 226,704,800 ha

El Salvador 
- Calculated from original TML tif: 2,101,131.25 ha
- Estimated area sampled (from TML pipeline): 1,957,471
- Extent Spreadsheet: 2,066,400 (difference from above 108,929 ha)
- GFW: 1,532,594.297 ha
- Wiki: 2,072,100 ha
 
 
South Sudan 
- Calculated from original TML tif: 62,429,864.11 ha
- Estimated area sampled (from TML pipeline): 60,928,916
- Extent Spreadsheet: 59,709,600 (difference from above -1,219,316 ha)
- GFW: 57,831,316.129 ha
- Wiki: 64,432,900 ha

https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area

### Option 1
Compare total area_ha from GFW stats to total pixels in TML country raster

In [None]:
# get the GFW stats for the selected geostore (country_id) and save as df
drc_id = '2852c7accd29c848ed699cdff6bd275e'
es_id = "8ea11cb2347e2e93ebc7e0ede15598ba"
ssudan_id = "566e8323abaf1b7080b179bec5946ce6"

# change country_id and re-run this cell to load another admin into `gfw`
country_id = es_id
url = f'https://data-api.globalforestwatch.org/dataset/wri_trees_in_mosaic_landscapes/v20220218/query/json?geostore_id={country_id}&geostore_origin=rw&sql=SELECT+sum%28area__ha%29+FROM+data+GROUP+BY+esa_land_cover_2015__class%2C+wri_trees_in_mosaic_landscapes__decile'
my_headers = {'Authorization': str(api_token)}
response = requests.get(url=url, headers=my_headers)
print(response)
# stop here on an HTTP error rather than failing on the payload below
response.raise_for_status()

# only the 'data' entry of the payload is needed; read it by key so the
# result does not depend on the order of the keys in the response
data = response.json()

# make into dataframe with updated columns
gfw = pd.DataFrame(data['data'])
gfw.rename(columns={"esa_land_cover_2015__class": "esa_class", 
                   "wri_trees_in_mosaic_landscapes__decile": "tree_cover_class"},
          inplace=True)

In [None]:
# get the sum of gfw pixels for El Salvador
# note this is not filtering out no data values
# (the value summed is the area__ha column of the current `gfw` dataframe)
es_gfw_count = round(sum(gfw.area__ha),3)
es_gfw_count

In [None]:
# get the sum of gfw pixels for South Sudan
# note this is not filtering out no data values
# NOTE(review): this reads the same `gfw` dataframe as the El Salvador cell;
# the query cell must be re-run with country_id = ssudan_id first.
ssudan_gfw_count = round(sum(gfw.area__ha),3)
ssudan_gfw_count

In [None]:
# count the pixels that are not 255 in the South Sudan TML raster;
# the context manager closes the raster once the band is read
with rs.open('South Sudan.tif') as src:
    ssudan = src.read(1)
ssudan_tml_count = np.sum(ssudan != 255)
# presumably 100 pixels of 10 m x 10 m per hectare - TODO confirm resolution
ssudan_tml_count / 100

In [None]:
# count the pixels that are not 255 in the El Salvador TML raster;
# the context manager closes the raster once the band is read
with rs.open('El Salvador.tif') as src:
    es = src.read(1)
es_tml_count = np.sum(es != 255)
# presumably 100 pixels of 10 m x 10 m per hectare - TODO confirm resolution
es_tml_count / 100

In [None]:
# NOTE(review): `drc_tml_count` is not assigned in any visible cell - presumably
# computed like the South Sudan / El Salvador counts from a DRC raster; confirm.
drc_tml_count / 100

### Option 2

In [None]:
## get GFW data from gfw-data-lake
#aws s3 cp s3://gfw-data-lake/esa_land_cover_2015/v2016/raster/epsg-4326/10/100000/class/geotiff/20N_100W.tif jessica.ertel/sentinel-tree-cover/notebooks/analysis/ --request-payer

# read band 1 of both tiles; the context managers close the files afterwards
with rs.open('20N_100W.tif') as src:
    tml1 = src.read(1)
with rs.open('20N_090W.tif') as src:
    tml2 = src.read(1)

plt.figure(figsize=(20,20))
fontsize = 18

# show the two tiles next to each other
# NOTE(review): the grid has three columns but only two are used
plt.subplot(1,3,1)
plt.imshow(tml1, cmap='Greens')
plt.title(f'20N_100W', fontsize=fontsize)
plt.axis('off')

plt.subplot(1,3,2)
plt.imshow(tml2, cmap='Greens')
plt.title(f'20N_090W', fontsize=fontsize)
plt.axis('off')
    

In [None]:
# get TML pixel count for each admin in Belize
# to do this, downloaded the 10x10 degree tiles from gfw-data-lake
# ran them through the pipeline as if they were Hansen data
# skipped resampling and merged multipolys to calculate pixel size

# only files that end in .tif (skips sidecar files such as *.tif.aux.xml)
admins = [x for x in natsorted(os.listdir("Belize/tml_clipped/")) if x.endswith(".tif")]

for i in admins:
    # read both rasters and close them before moving to the next admin
    # NOTE(review): `gfw` here replaces the `gfw` dataframe of earlier cells
    with rs.open(f'Belize/tml_clipped/{i}') as tml_src, rs.open(f'Belize/gfw_clipped/{i}') as gfw_src:
        tml = tml_src.read(1)
        gfw = gfw_src.read(1)
    # pixels that are not 255 are counted as data
    tml_count = np.sum(tml != 255)
    gfw_count = np.sum(gfw != 255)
    print(os.path.splitext(i)[0])
    print(f'TML pixel count: {tml_count}, GFW pixel count: {gfw_count}')
    print(f'Difference: {tml_count - gfw_count}')


## Removal of Median Bracketing for Statistics

In comparison with the statistics that use bracketing, removing the brackets results in lower total ha and average tree cover.

Bangladesh area  
Estimated: 13,521,943  
Actual: 8,405,739

In [None]:
# first compare the difference in statistics between the newly bracketed data and the old bracket
# for the 10m resolution tif (v1 and v4)

def compare_stats(filename):
    '''
    Prints the share of TML tree cover hectares above 10%, 20% and 30% tree
    cover and returns the hectares per land cover class.

    filename: path of a statistics csv with the columns esa_class,
        tree_cover_class, tof_ha and hans_ha

    Returns a dataframe with the columns esa_class, tof_ha and hans_ha
    (hectares summed per land cover class).
    '''

    ha_cols = ['tof_ha', 'hans_ha']
    df = pd.read_csv(filename)

    # create ha stats table; sum only the hectare columns so the text columns
    # of the statistics file do not end up in the result
    table = df.groupby('esa_class')[ha_cols].sum().reset_index()

    # ha totals per tree cover class
    stats = df.groupby('tree_cover_class')[ha_cols].sum().reset_index()
    total = stats.tof_ha.sum()

    # tree cover classes that fall below each threshold
    below_threshold = {
        10: ['0-9'],
        20: ['0-9', '10-19'],
        30: ['0-9', '10-19', '20-29'],
    }
    for threshold, excluded in below_threshold.items():
        over_ha = stats.loc[~stats.tree_cover_class.isin(excluded), 'tof_ha'].sum()
        over_perc = over_ha / total * 100
        print(f'ha >{threshold}%: {round(over_perc,2)}')

    return table



In [None]:
# v1: Belize statistics from statistics/ (file is read twice: raw table and ha summary)
v1 = pd.read_csv('statistics/Belize_statistics_full.csv')
v1_ha = compare_stats('statistics/Belize_statistics_full.csv')
v1_ha

In [None]:
# v4: Belize statistics from Belize/stats/
v4 = pd.read_csv('Belize/stats/Belize_statistics_full.csv')
v4_ha = compare_stats('Belize/stats/Belize_statistics_full.csv')
v4_ha

In [None]:
# sampled / total land cover area per admin for v1
# (de-duplicated first because the area columns repeat across rows)
v1 = v1[['admin', 'esa_class', 'esa_sampled_ha', 'esa_total_ha']]
v1 = v1.drop_duplicates()
v1 = v1.groupby('admin').sum().reset_index()
v1

In [None]:
# same per-admin area summary for v4
v4 = v4[['admin', 'esa_class', 'esa_sampled_ha', 'esa_total_ha']]
v4 = v4.drop_duplicates()
v4 = v4.groupby('admin').sum().reset_index()
v4

In [None]:
# difference in total area per admin; rows are paired by index position,
# which assumes both tables list the same admins - TODO confirm
v1.esa_total_ha - v4.esa_total_ha

In [None]:
 # create v1 avg table
df = pd.read_csv('statistics/Bangladesh_statistics_full.csv')
table = df[['admin', 'esa_class', 'tof_mean', 'hans_mean']]
table = table.drop_duplicates()
# average only the numeric columns: 'admin' is text and cannot be averaged
table = table.groupby('esa_class')[['tof_mean', 'hans_mean']].mean().reset_index()
table

In [None]:
 # create v4 avg table
df = pd.read_csv('statistics/Bangladesh_v4_statistics_full.csv')
table = df[['admin', 'esa_class', 'tof_mean', 'hans_mean']]
table = table.drop_duplicates()
# average only the numeric columns: 'admin' is text and cannot be averaged
table = table.groupby('esa_class')[['tof_mean', 'hans_mean']].mean().reset_index()
table

In [None]:
# load the two Bangladesh statistics versions being compared
v1 = pd.read_csv('statistics/Bangladesh_statistics_full.csv')

v5 = pd.read_csv('statistics/Bangladesh_statistics_full_statscheck.csv')



In [None]:
# threshold percentages and ha per land cover class for the first version
compare_stats('statistics/Bangladesh_statistics_full.csv')

In [None]:
# same summary for the statscheck version
compare_stats('statistics/Bangladesh_statistics_full_statscheck.csv')

In [None]:
# calculate the % land with >10% tree cover
zonals = pd.read_csv('statistics/centralamzonalstats.csv')
zonals

In [None]:
# presumably the equivalent table produced by the TML pipeline - TODO confirm
pipeline = pd.read_csv('statistics/central_am.csv')
pipeline

## Check Hansen histogram

In [None]:
# uses a Hansen 10m resampled tif
file = '/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/Bocas del Toro.tif'
ds = gdal.Open(file)
# read band 1 and show it together with the geotransform and projection
array = ds.GetRasterBand(1).ReadAsArray()
plt.imshow(array)
plt.colorbar()
print(ds.GetGeoTransform())
print(ds.GetProjection())

In [None]:
# drop the reference to the band array
array = None

In [None]:
# warp the open dataset to a 0.001 x 0.001 pixel size (in the units of the
# raster's coordinate system) and write it next to the original
resamp = gdal.Warp('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/Bocas del Toro_resamp.tif',
                    ds,
                    xRes=0.001,
                    yRes=0.001)
# always remember to close
ds = None
resamp = None

In [None]:
# original
ds = gdal.Open('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/Bocas del Toro.tif')
array = ds.GetRasterBand(1).ReadAsArray()

# resampled
resamp = rs.open('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/Bocas del Toro_resamp.tif')
resamp_array = resamp.read(1)

# histogram of the resampled raster on the left, the raster itself on the right
fig, (ax_hist, ax_img) = plt.subplots(1, 2, figsize=(14,7))
show_hist(resamp, alpha=0.3, ax=ax_hist)
image = ax_img.imshow(resamp_array)
fig.colorbar(image, ax=ax_img);

# close both datasets
resamp.close()
ds = None
resamp = None

In [None]:
# resampled
plt.figure(figsize=(10,7))
resamp = rs.open('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/Bocas del Toro_resamp.tif')
show_hist(resamp, alpha=0.3, title='Bocas del Toro')

In [None]:
# Bocas de Toro
plt.figure(figsize=(10,7))
#convert to array in order to get subset of values
toro = resamp.read(1)
# NOTE(review): the cut-off here is 13 although the variable is named under15
# and the other admins use 15 - confirm which is intended
under15 = toro[np.where(toro <= 13)]
show_hist(under15, alpha=0.5, title='Bocas del Toro')

In [None]:
# test other admins in Panama (the raster paths below point at Panama/hansen)
admin = 'Colón'
ds = gdal.Open(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}.tif')
resamp = gdal.Warp(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}_resamp.tif',
                    ds,
                    xRes=0.001,
                    yRes=0.001)

# always remember to close
ds = None
resamp = None

In [None]:
# histogram of the resampled raster for the current admin
plt.figure(figsize=(10,7))
resamp = rs.open(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}_resamp.tif')
show_hist(resamp, alpha=0.3, title=admin)

In [None]:
# zoom in on <10%
# (the subset actually kept is every value <= 15)
plt.figure(figsize=(10,7))
array = resamp.read(1)
under15 = array[np.where(array <= 15)]
show_hist(under15, alpha=0.5, title=admin)

In [None]:
# repeat the warp / histogram / zoom sequence for Los Santos
admin = 'Los Santos'
ds = gdal.Open(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}.tif')
resamp = gdal.Warp(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}_resamp.tif',
                    ds,
                    xRes=0.001,
                    yRes=0.001)

# always remember to close
ds = None
resamp = None

In [None]:
plt.figure(figsize=(10,7))
resamp = rs.open(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}_resamp.tif')
show_hist(resamp, alpha=0.3, title=admin)

In [None]:
# zoom in on <10%
plt.figure(figsize=(10,7))
array = resamp.read(1)
under15 = array[np.where(array <= 15)]
show_hist(under15, alpha=0.5, title=admin)

In [None]:
# repeat the warp / histogram / zoom sequence for Herrera
admin = 'Herrera'
ds = gdal.Open(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}.tif')
resamp = gdal.Warp(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}_resamp.tif',
                    ds,
                    xRes=0.001,
                    yRes=0.001)

# always remember to close
ds = None
resamp = None

In [None]:
plt.figure(figsize=(10,7))
resamp = rs.open(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Panama/hansen/{admin}_resamp.tif')
show_hist(resamp, alpha=0.3, title=admin)

In [None]:
# zoom in on <10%
plt.figure(figsize=(10,7))
array = resamp.read(1)
under15 = array[np.where(array <= 15)]
show_hist(under15, alpha=0.5, title=admin)

In [None]:
# original
coro = rs.open('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Belize/resampled_rasters/hansen/Corozal.tif')
show(coro);

In [None]:
# 1 ha resampled
coro_resamp = rs.open('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Belize/resampled_rasters/hansen/Corozal_resamp.tif')
show(coro_resamp);

In [None]:
# histogram of the resampled Corozal raster
show_hist(coro_resamp, alpha=0.3);

In [None]:
# inspect the raw band values
coro_resamp.read(1)

In [None]:
# zoom in on the values <10% and analyze curve
# NOTE: `coro` is rebound here from the open dataset to the band array
coro = coro_resamp.read(1)
under15 = coro[np.where(coro <= 15)]
under15

In [None]:
# histogram of the values <= 15 only
show_hist(under15, alpha=0.5, legend=False);

In [None]:
# country-level Belize raster: show band 1 with its geotransform and projection
ds = gdal.Open('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Belize/resampled_rasters/hansen/Belize.tif')
array = ds.GetRasterBand(1).ReadAsArray()
plt.imshow(array)
plt.colorbar()
print(ds.GetGeoTransform())
print(ds.GetProjection())

In [None]:
# warp the open dataset to a 0.001 x 0.001 pixel size and write it next to the original
belize_resamp = gdal.Warp('/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/Belize/resampled_rasters/hansen/Belize_resamp.tif',
                            ds,
                            xRes=0.001,
                            yRes=0.001)