# QA/QC for Pipeline Statistics

In [None]:
import numpy as np
import pandas as pd
import rasterio as rs
import rasterio.mask
import fiona
import geopandas as gpd


import sys
sys.path.append('scripts/')
import statistical_analyses as st

%load_ext autoreload
%autoreload 2

In [None]:
## Run stats check on each country
# This cell checks one country per run (currently Vietnam, 'full_tmlonly' output).
st.check_stats('Vietnam', 'full_tmlonly')

In [None]:
# Rows with TOF hectares but a mean of exactly zero; confirm they only
# occur in the 0-9% threshold.
df = pd.read_csv('statistics/Vietnam_statistics_full_tmlonly.csv')
has_area = df.tof_ha.gt(0)
zero_mean = df.tof_mean.eq(0)
df.loc[has_area & zero_mean]

In [None]:
# additional check for countries that used admin 2 boundaries
# Sampled ha per admin: esa_sampled_ha is de-duplicated per (admin, esa_id)
# before summing (presumably it repeats once per tree cover class -- confirm).
# Only the value column is aggregated, so the string/id columns are never summed.
admin_sampled = df[['admin', 'esa_id', 'esa_sampled_ha']].drop_duplicates()
admin_sampled = admin_sampled.groupby('admin')[['esa_sampled_ha']].sum()

# get total ha df: TOF hectares per admin, summed over every row
admin_tof = df.groupby('admin')[['tof_ha']].sum()

# A result of [True] means every admin's sampled ha equals its total TOF ha.
# NOTE(review): this is an exact equality test; if the hectare columns are
# floats, rounding in the sums could report False for negligible differences.
list(set(admin_sampled.esa_sampled_ha == admin_tof.tof_ha))

In [None]:
# Sampled hectares on rows that have no tof_mean value -- should be 0
missing_mean = df.tof_mean.isna()
df.loc[missing_mean, 'esa_sampled_ha'].sum()

## Validate total ha >10%

For a selection of countries in each region, manually calculate the hectares with >10% tree cover and compare the result to the output of the TML pipeline. An over- or underestimation of less than 5% is acceptable.
- Central America: (all countries included)
- South America: Paraguay, Suriname, Acre
- Caribbean: Caribbean
- West Africa: Ghana, Liberia
- East Africa: Madagascar, Somalia
- North/Central Africa: Lesotho, Republic of Congo
- Asia: Bangladesh, Myanmar, Vietnam

In [None]:
def over10_stats(region):
    '''
    Summarises, per country, how much sampled land has >10% tree cover.

    Reads 'statistics/{region}_statistics_full_tmlonly.csv' and returns a
    DataFrame with one row per country (sorted by country name) and columns:
      country      -- country name
      perc_over10  -- ha_over10 as a percentage of the sampled ha, rounded
                      to 2 decimals
      ha_over10    -- tof_ha summed over every tree cover class except the
                      first one
      km_over10    -- ha_over10 / 100 (100 ha = 1 square km)
    '''
    columns = ['country', 'perc_over10', 'ha_over10', 'km_over10']
    pipe = pd.read_csv(f'statistics/{region}_statistics_full_tmlonly.csv')

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0.
    rows = []
    for country in sorted(set(pipe.country)):
        country_df = pipe[pipe.country == country]

        # calculate total area sampled for the country; esa_sampled_ha is
        # de-duplicated per admin before it is summed. NaNs are propagated
        # (not skipped) so a missing sampled area shows up in the result.
        sampled = country_df[['country', 'admin', 'esa_sampled_ha']].drop_duplicates()
        ha_sampled = sampled.esa_sampled_ha.sum(skipna=False)

        # Calculate total TML ag ha >10%: groupby sorts the classes, and the
        # first one is skipped.
        # NOTE(review): assumes the lowest-sorting tree_cover_class is the
        # 0-9% bin and is present for every country -- confirm.
        class_totals = country_df.groupby('tree_cover_class')['tof_ha'].sum()
        ha_over10 = class_totals.iloc[1:].sum()

        rows.append({'country': country,
                     # % ag land with >10% cover
                     'perc_over10': round((ha_over10 / ha_sampled) * 100, 2),
                     'ha_over10': ha_over10,
                     # convert ha to square km
                     'km_over10': ha_over10 / 100})

    return pd.DataFrame(rows, columns=columns)
    

        

In [None]:
# use rasterio to perform clipping  
def clip(country):
    '''
    Crops 'checks/{country}.tif' to the geometries stored in
    'checks/{country}_adminboundaries.geojson' and writes the result to
    'checks/{country}-clipped.tif'. Pixels outside the geometries are
    filled with 255. Returns None.
    '''
    # collect every feature geometry from the boundary file
    with fiona.open(f'checks/{country}_adminboundaries.geojson', 'r') as features:
        geometries = []
        for feature in features:
            geometries.append(feature['geometry'])

    with rs.open(f'checks/{country}.tif') as src:
        clipped, clipped_transform = rs.mask.mask(src,
                                                  geometries,
                                                  crop=True,
                                                  nodata=255,
                                                  filled=True)
        # reuse the source metadata, adjusted to the cropped extent
        profile = src.meta
        profile.update(driver='GTiff',
                       height=clipped.shape[1],
                       width=clipped.shape[2],
                       transform=clipped_transform)

    with rs.open(f'checks/{country}-clipped.tif', 'w', **profile) as dest:
        dest.write(clipped)

In [None]:
def calculate(country):
    '''
    Quickly calculates the total hectares above 10%.
    Requires 'checks/{country}-clipped.tif'.

    Band 1 is split into 10x10-pixel blocks (trailing rows/columns that do
    not fill a whole block are dropped), pixels equal to 255 are masked out,
    and the blocks whose mean value is greater than 10 are counted.
    NOTE(review): treating one block as one hectare assumes 10 m pixels --
    confirm against the raster resolution.

    Returns the number of such blocks.
    '''
    # the context manager closes the dataset as soon as the band is read
    with rs.open(f'checks/{country}-clipped.tif') as src:
        x = src.read(1)

    # trim each dimension down to a multiple of 10 so it reshapes cleanly
    rows = x.shape[0] - (x.shape[0] % 10)
    cols = x.shape[1] - (x.shape[1] % 10)
    x = x[:rows, :cols]

    # 255 marks pixels to ignore; masking keeps them out of the block means
    x = np.ma.masked_array(x, mask=x == 255)
    x = np.reshape(x, (rows // 10, 10, cols // 10, 10))
    x = np.mean(x, axis=(1, 3))
    return np.sum(x > 10)

In [None]:
def compare(country):
    '''
    Compares quick clip/calculate stats to pipeline stats and prints the
    relative difference as a percentage of the pipeline value.

    Requires the statistics file that over10_stats reads when called with
    `country` as its region, and the clipped raster that calculate reads.
    Only the first row of the over10_stats table is used.
    Returns None.
    '''
    pipe_output = over10_stats(country).ha_over10.iloc[0]  # pipeline est of ha over 10%
    quick_output = calculate(country)  # ha over 10% from clipped raster

    if pipe_output == quick_output:
        print(f'{country} matches the quick estimate')
        return
    if pipe_output == 0:
        # a relative difference is undefined against a zero pipeline value
        print(f'{country} pipeline reports 0 ha, quick estimate is {quick_output} ha')
        return

    # magnitude only: the direction is carried by the wording of the message
    diff = abs(pipe_output - quick_output) / pipe_output
    if pipe_output > quick_output:
        print(f'{country} overestimates by {round(diff*100, 2)}%')
    else:
        print(f'{country} underestimates by {round(diff*100, 2)}%')

In [None]:
# Belize originally under estimated by 47%
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Belize')
calculate('Belize')
compare('Belize')

In [None]:
# Nicaragua originally overestimates by 7.02%
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Nicaragua')
calculate('Nicaragua')
compare('Nicaragua')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Acre')
calculate('Acre')
compare('Acre')

In [None]:
# Paraguay originally overestimates by 4.57%
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Paraguay')
calculate('Paraguay')
compare('Paraguay')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Suriname')
calculate('Suriname')
compare('Suriname')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Caribbean')
calculate('Caribbean')
compare('Caribbean')

In [None]:
# Ghana originally overestimates by 7.43%
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Ghana')
calculate('Ghana')
compare('Ghana')

In [None]:
# Liberia originally underestimates by -12.23%
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Liberia')
calculate('Liberia')
compare('Liberia')

In [None]:
# Madagascar originally overestimates by 7.37%
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Madagascar')
calculate('Madagascar')
compare('Madagascar')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Somalia')
calculate('Somalia')
compare('Somalia')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Lesotho')
calculate('Lesotho')
compare('Lesotho')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Republic of Congo')
calculate('Republic of Congo')
compare('Republic of Congo')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Bangladesh')
calculate('Bangladesh')
compare('Bangladesh')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Myanmar')
calculate('Myanmar')
compare('Myanmar')

In [None]:
# Clip the raster, then compare the quick estimate with the pipeline output.
# NOTE(review): the standalone calculate() return value is unused;
# compare() calls calculate() itself.
clip('Vietnam')
calculate('Vietnam')
compare('Vietnam')

### Central America

In [None]:
# Zonal statistics from GEE
# NOTE(review): the sorted frame is only displayed, not assigned, so
# `zonals` keeps the row order of the CSV.
zonals = pd.read_csv('statistics/centralamzonalstats.csv')
zonals.sort_values('Country')

In [None]:
# compare with pipeline output on 8/12
# Per-country >10% summary built from the 'central_am' statistics file.
aug12 = over10_stats(region='central_am')
aug12

In [None]:
# Relative difference between the pipeline and the GEE zonal estimate of the
# area with >10% cover, as a fraction of the pipeline value (2 decimals).
# Rows are matched on country name: `aug12` and `zonals` are built
# independently, so subtracting the raw columns would pair rows by index
# label rather than by country. Countries missing from `zonals` get NaN.
# NOTE(review): assumes both tables spell country names identically -- confirm.
zonal_km = zonals.set_index('Country')['AreaKmGTE10Percent']
aug12['diff'] = round(((aug12.km_over10 - aug12.country.map(zonal_km)) / aug12.km_over10), 2)

In [None]:
# Display the pipeline table, which now includes the 'diff' column.
aug12