# QA/QC for Pipeline Statistics

In [None]:
import numpy as np
import pandas as pd
import rasterio as rs
import rasterio.mask
import fiona
import geopandas as gpd


import sys
sys.path.append('scripts/')
import statistical_analyses as st

%load_ext autoreload
%autoreload 2

In [None]:
# Inspect the unique values in band 1 of the Ashfield raster.
# The context manager closes the dataset handle once the band has been read.
with rs.open('Australia/resampled_rasters/tof/Ashfield.tif') as src:
    raster = src.read(1)
np.unique(raster)

In [None]:
# Report when the Bankstown raster holds only a single unique value.
admin = 'Bankstown'
# The context manager closes the dataset handle once the band has been read.
with rs.open('Australia/resampled_rasters/tof/Bankstown.tif') as src:
    raster = src.read(1)
tof_vals = np.unique(raster)
if len(tof_vals) == 1:
    print(f'{admin} only contains value {tof_vals}, skipping processing...')

In [None]:
# Inspect the unique values in band 1 of the Ashmore and Cartier Islands raster.
# The context manager closes the dataset handle once the band has been read.
with rs.open('Australia/resampled_rasters/tof/Ashmore and Cartier Islands.tif') as src:
    raster = src.read(1)
np.unique(raster)

In [None]:
## Run stats check on each country
# check_stats lives in scripts/statistical_analyses.py (imported as `st`);
# the checks it performs are not visible in this notebook.
st.check_stats('Fiji', 'full_tmlonly')

In [None]:
# confirm mean value of 0 only happens in 0-9% threshold
# The filter lists rows that have tof_ha > 0 but a tof_mean of exactly 0.
df = pd.read_csv('statistics/Fiji_statistics_full_tmlonly.csv')
df[(df.tof_ha > 0) & (df.tof_mean == 0)]

In [None]:
# additional check for countries that used admin 2 boundaries
# sampled ha per admin: keep one row per (admin, esa_id, esa_sampled_ha),
# then sum within each admin
admin_sampled = (
    df[['admin', 'esa_id', 'esa_sampled_ha']]
    .drop_duplicates()
    .groupby('admin')
    .sum()
)[['esa_sampled_ha']]

# tof ha per admin, summed over every esa_id and tree cover class
admin_tof = (
    df[['admin', 'esa_id', 'tree_cover_class', 'tof_ha']]
    .groupby('admin')
    .sum()
)[['tof_ha']]

# distinct outcomes of the per-admin equality test between the two totals
list(set(admin_sampled.esa_sampled_ha == admin_tof.tof_ha))

In [None]:
# should be 0
# Sums esa_sampled_ha over the rows whose tof_mean is null.
df[df.tof_mean.isnull()]['esa_sampled_ha'].sum()

## Validate total ha >10%

For a selection of countries in each region, manually calculate the total ha with >10% tree cover and compare it to the output of the TML pipeline. An over- or underestimate of less than 5% is acceptable.
- Central America: (all countries included)
- South America: Paraguay, Suriname, Acre
- Caribbean: Caribbean
- West Africa: Ghana, Liberia
- East Africa: Madagascar, Somalia
- North/Central Africa: Lesotho, Republic of Congo
- Asia: Bangladesh, Myanmar, Vietnam

In [None]:
def over10_stats(region):
    """Summarise tree cover above 10% for every country in a region file.

    Reads ``statistics/{region}_statistics_full_tmlonly.csv`` and, for each
    country found in it, totals ``tof_ha`` over every tree cover class
    except ``'0-9'``.

    Args:
        region: Name used to build the statistics CSV path.

    Returns:
        DataFrame with one row per country (sorted by country name) and the
        columns ``country``, ``perc_over10`` (rounded to 2 decimals),
        ``ha_over10`` and ``km_over10``.
    """
    pipe = pd.read_csv(f'statistics/{region}_statistics_full_tmlonly.csv')

    rows = []
    for country in sorted(set(pipe.country)):

        country_df = pipe[pipe.country == country]

        # total area sampled for the country: each distinct
        # (country, admin, esa_sampled_ha) combination is counted once
        sampled = country_df[['country', 'admin', 'esa_sampled_ha']].drop_duplicates()
        ha_sampled = sum(sampled.esa_sampled_ha)

        # total TML ha per tree cover class
        ha = (country_df[['country', 'tree_cover_class', 'tof_ha']]
              .groupby(by=['country', 'tree_cover_class'])
              .sum()
              .reset_index())
        # Exclude the '0-9' class by label rather than by position, so a
        # country with no '0-9' rows does not lose its first real class.
        ha_over10 = sum(ha.tof_ha[ha.tree_cover_class != '0-9'])

        # % of sampled land with >10% cover
        perc_over10 = (ha_over10 / ha_sampled) * 100

        # convert ha to square km (100 ha = 1 km2)
        km_over10 = (ha_over10 / 100)

        rows.append({'country': country,
                     'perc_over10': round(perc_over10, 2),
                     'ha_over10': ha_over10,
                     'km_over10': km_over10})

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['country',
                                       'perc_over10',
                                       'ha_over10',
                                       'km_over10'])
    

        

In [None]:
# use rasterio to perform clipping  
def clip(country):
    '''
    Clip checks/{country}.tif to the geometries stored in
    checks/{country}_adminboundaries.geojson and write the result to
    checks/{country}-clipped.tif.
    The mask is applied with nodata=255, filled=True and crop=True.
    '''
    boundary_path = f'checks/{country}_adminboundaries.geojson'
    with fiona.open(boundary_path, 'r') as features:
        geometries = [feat['geometry'] for feat in features]

    with rs.open(f'checks/{country}.tif') as raster:
        clipped, clipped_transform = rs.mask.mask(
            raster, geometries, crop=True, nodata=255, filled=True)
        # start from the source metadata and override what the crop changed
        profile = raster.meta
        profile.update(driver='GTiff',
                       height=clipped.shape[1],
                       width=clipped.shape[2],
                       transform=clipped_transform)

    with rs.open(f'checks/{country}-clipped.tif', 'w', **profile) as out:
        out.write(clipped)

In [None]:
def calculate(country):
    '''
    Quickly calculates the total hectares above 10%
    Requires the clipped country tif ('checks/{country}-clipped.tif').

    Band 1 is tiled into 10x10 pixel blocks; the function returns the number
    of blocks whose mean value (ignoring pixels equal to 255) is greater
    than 10. Counting one block as one hectare presumably relies on 10 m
    pixels -- TODO confirm.
    '''
    # the context manager closes the dataset once the band has been read
    with rs.open(f'checks/{country}-clipped.tif') as src:
        x = src.read(1)

    # Trim each dimension down to a multiple of 10 so the array tiles cleanly.
    # The remainder is subtracted from the length instead of slicing with
    # x[:-(n % 10)], which would yield an empty array when n % 10 == 0.
    n_rows = x.shape[0] - (x.shape[0] % 10)
    n_cols = x.shape[1] - (x.shape[1] % 10)
    x = x[:n_rows, :n_cols]

    x = np.ma.masked_array(x, mask=x == 255) # this will construct a new boolean array, mem heavy
    x = np.reshape(x, (n_rows // 10, 10, n_cols // 10, 10))
    x = np.mean(x, axis=(1, 3))
    total = np.sum(x > 10)
    return total

In [None]:
def compare(country):
    '''
    Compares quick clip/calculate to pipeline stats (total ha >10%)
    Requires country statistics in wherever location over10_stats references

    Prints whether the pipeline figure over- or underestimates the quick
    calculation; the percentage is the absolute difference relative to the
    pipeline figure. Returns None.
    '''
    df = over10_stats(country)
    pipe_output = df.ha_over10[0] # gets pipeline est of ha over 10%
    quick_output = calculate(country) # calculates ha over 10% from clipped raster

    # a relative difference is undefined when the pipeline reports nothing
    if pipe_output == 0:
        print(f'{country} pipeline estimate is 0 ha; quick estimate is {quick_output} ha')
        return

    # report the magnitude only; the direction is carried by the wording
    diff = abs(pipe_output - quick_output) / pipe_output
    if pipe_output > quick_output:
        print(f'{country} overestimates by {round(diff*100, 2)}%')
    elif pipe_output < quick_output:
        print(f'{country} underestimates by {round(diff*100, 2)}%')
    else:
        print(f'{country} matches the quick estimate')

In [None]:
# Belize originally underestimated by 47%
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Belize')
calculate('Belize')
compare('Belize')

In [None]:
# Nicaragua originally overestimates by 7.02%
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Nicaragua')
calculate('Nicaragua')
compare('Nicaragua')

In [None]:
# Clip the Acre raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Acre')
calculate('Acre')
compare('Acre')

In [None]:
# Paraguay originally overestimates by 4.57%
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Paraguay')
calculate('Paraguay')
compare('Paraguay')

In [None]:
# Clip the Suriname raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Suriname')
calculate('Suriname')
compare('Suriname')

In [None]:
# Clip the Caribbean raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Caribbean')
calculate('Caribbean')
compare('Caribbean')

In [None]:
# Ghana originally overestimates by 7.43%
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Ghana')
calculate('Ghana')
compare('Ghana')

In [None]:
# Liberia originally underestimates by -12.23%
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Liberia')
calculate('Liberia')
compare('Liberia')

In [None]:
# Madagascar originally overestimates by 7.37%
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Madagascar')
calculate('Madagascar')
compare('Madagascar')

In [None]:
# Clip the Somalia raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Somalia')
calculate('Somalia')
compare('Somalia')

In [None]:
# Clip the Lesotho raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Lesotho')
calculate('Lesotho')
compare('Lesotho')

In [None]:
# Clip the Republic of Congo raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Republic of Congo')
calculate('Republic of Congo')
compare('Republic of Congo')

In [None]:
# Clip the Bangladesh raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Bangladesh')
calculate('Bangladesh')
compare('Bangladesh')

In [None]:
# Clip the Myanmar raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Myanmar')
calculate('Myanmar')
compare('Myanmar')

In [None]:
# Clip the Vietnam raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Vietnam')
calculate('Vietnam')
compare('Vietnam')

In [None]:
# Clip the Fiji raster, then compare the quick estimate with the pipeline output.
# NOTE: compare() calls calculate() itself, so the standalone calculate() call is redundant.
clip('Fiji')
calculate('Fiji')
compare('Fiji')

### Central America

In [None]:
# Zonal statistics from GEE
# sort_values returns a new frame, so the result has to be assigned to take
# effect. Resetting the index puts the rows in country order positionally,
# matching the 0..n-1, country-sorted index that over10_stats produces.
zonals = (
    pd.read_csv('statistics/centralamzonalstats.csv')
    .sort_values('Country')
    .reset_index(drop=True)
)
zonals

In [None]:
# compare with pipeline output on 8/12
# NOTE: this needs the one-argument over10_stats(region) definition; the
# over10_stats(country, adm) defined later in this notebook has no `region` parameter.
aug12 = over10_stats(region='central_am')
aug12

In [None]:
# Difference between the pipeline km and the GEE zonal km, as a fraction of the pipeline value.
# NOTE(review): the subtraction pairs rows by index label, not by country name --
# confirm that zonals rows are in the same country order as aug12.
aug12['diff'] = round(((aug12.km_over10 - zonals.AreaKmGTE10Percent) / aug12.km_over10),2)

In [None]:
# display the pipeline table, including the 'diff' column
aug12

### Indonesia

In [None]:
# Load the pipeline statistics tables for Kalimantan and Sumatra.
kal = pd.read_csv('statistics/Kalimantan_statistics_full_tmlonly.csv')
suma = pd.read_csv('statistics/Sumatra_statistics_full_tmlonly.csv')

In [None]:
# rows of the Kalimantan table whose admin is 'Kepulauan Riau'
kal[kal.admin == 'Kepulauan Riau']

In [None]:
# is there double counting for Kepulauan Riau?
# Pull the same admin out of both tables so the two sets of rows can be compared.
kep_kal = kal[kal.admin == 'Kepulauan Riau']
kep_suma = suma[suma.admin == 'Kepulauan Riau']

In [None]:
# add cropland (ag) analyses

def total_ha(country_df):
    """Total ``tof_ha`` above the '0-9' tree cover class for cropland and urban rows.

    Args:
        country_df: Statistics frame with ``esa_id``, ``country``, ``admin``,
            ``tree_cover_class`` and ``tof_ha`` columns.

    Returns:
        DataFrame with columns ``lcc`` and ``ha_over10`` and two rows,
        'Cropland' then 'Urban'.
    """
    # esa_id values grouped under each land cover label
    lcc_groups = [
        ('Cropland', [10.0, 11.0, 12.0, 20.0, 30.0, 40.0]),
        ('Urban', [190.0]),
    ]

    rows = []
    for lcc, esa_ids in lcc_groups:
        lcc_df = country_df[country_df.esa_id.isin(esa_ids)]
        lcc_df = lcc_df[['country', 'admin', 'tree_cover_class', 'tof_ha']]
        # everything except the lowest tree cover class counts as over 10%
        over10 = lcc_df[lcc_df.tree_cover_class != '0-9']
        rows.append({'lcc': lcc, 'ha_over10': sum(over10.tof_ha)})

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=['lcc', 'ha_over10'])

In [None]:
# cropland / urban ha over 10% for the Kepulauan Riau rows of the Kalimantan table
total_ha(kep_kal)

In [None]:
# cropland / urban ha over 10% for the Kepulauan Riau rows of the Sumatra table
total_ha(kep_suma)

## Compare statistics for countries processed by admin 2 boundary
(Rwanda, Kenya, Malawi, Ethiopia, Cameroon)

In [None]:
# first do normal check on all countries
# NOTE(review): the third argument (False here, True in the next cell) presumably
# toggles the admin 2 variant -- confirm against scripts/statistical_analyses.py.
st.check_stats('Malawi', 'full_tmlonly', False)

In [None]:
# NOTE(review): third argument True presumably selects the admin 2 variant of the
# check -- confirm against scripts/statistical_analyses.py.
st.check_stats('Malawi', 'full_tmlonly', True)

In [None]:
# check the nulls
# Loads the `statistics2` file for the country and sums esa_sampled_ha over the
# rows whose tof_mean is null.
country = 'Malawi'
adm2 = pd.read_csv(f'statistics/{country}_statistics2_full_tmlonly.csv')
adm2[adm2.tof_mean.isnull()].esa_sampled_ha.sum()

In [None]:
# check the ha w/ 0% mean cover
# ok if in the 0-9 tree cover class
# Lists rows with tof_mean == 0 but tof_ha > 0.
adm2[(adm2.tof_mean == 0) & (adm2.tof_ha > 0)]

In [None]:
def over10_stats(country, adm=1):
    """Summarise tree cover above 10% from a country's statistics file.

    Args:
        country: Name used to build the statistics CSV path; also written to
            the ``country`` column of the result.
        adm: 1 reads ``statistics/{country}_statistics_full_tmlonly.csv``,
            2 reads ``statistics/{country}_statistics2_full_tmlonly.csv``.
            Defaults to 1 so single-argument calls keep working.

    Returns:
        One-row DataFrame with columns ``country``, ``perc_over10`` (rounded
        to 2 decimals), ``ha_over10`` and ``km_over10``.

    Raises:
        ValueError: If ``adm`` is neither 1 nor 2.
    """
    if adm == 1:
        path = f'statistics/{country}_statistics_full_tmlonly.csv'
    elif adm == 2:
        path = f'statistics/{country}_statistics{str(adm)}_full_tmlonly.csv'
    else:
        # fail with a clear message instead of an unbound-variable error
        raise ValueError(f'adm must be 1 or 2, got {adm!r}')

    df = pd.read_csv(path)

    # total area sampled: each distinct (country, admin, esa_sampled_ha)
    # combination is counted once
    sampled = df[['country', 'admin', 'esa_sampled_ha']].drop_duplicates()
    ha_sampled = sum(sampled.esa_sampled_ha)

    # total TML ha per tree cover class
    ha = (df[['country', 'tree_cover_class', 'tof_ha']]
          .groupby(by=['country', 'tree_cover_class'])
          .sum()
          .reset_index())
    # Exclude the '0-9' class by label rather than by position, so a file
    # with no '0-9' rows does not lose its first real class.
    ha_over10 = sum(ha.tof_ha[ha.tree_cover_class != '0-9'])

    # % of sampled land with >10% cover
    perc_over10 = (ha_over10 / ha_sampled) * 100

    # convert ha to square km (100 ha = 1 km2)
    km_over10 = (ha_over10 / 100)

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame([{'country': country,
                          'perc_over10': round(perc_over10, 2),
                          'ha_over10': ha_over10,
                          'km_over10': km_over10}],
                        columns=['country',
                                 'perc_over10',
                                 'ha_over10',
                                 'km_over10'])

In [None]:
# adm=1 reads statistics/Rwanda_statistics_full_tmlonly.csv
over10_stats('Rwanda', 1)

In [None]:
# adm=2 reads statistics/Rwanda_statistics2_full_tmlonly.csv
over10_stats('Rwanda', 2)

In [None]:
# Hard-coded hectare totals -- presumably ha_over10 copied from the two
# over10_stats('Rwanda', ...) outputs; TODO confirm.
# The result is the difference as a percentage of ad2.
ad1 = 821651
ad2 = 826063
round(((ad2 - ad1) / ad2), 5) * 100

In [None]:
# adm=1 reads statistics/Ethiopia_statistics_full_tmlonly.csv
over10_stats('Ethiopia', 1)

In [None]:
# adm=2 reads statistics/Ethiopia_statistics2_full_tmlonly.csv
over10_stats('Ethiopia', 2)

In [None]:
# Hard-coded hectare totals -- presumably ha_over10 copied from the two
# over10_stats('Ethiopia', ...) outputs; TODO confirm.
# The result is the difference as a percentage of ad2.
ad1 = 55074804
ad2 = 55134155
round(((ad2 - ad1) / ad2), 5) * 100

In [None]:
# adm=1 reads statistics/Cameroon_statistics_full_tmlonly.csv
over10_stats('Cameroon', 1)

In [None]:
# adm=2 reads statistics/Cameroon_statistics2_full_tmlonly.csv
over10_stats('Cameroon', 2)

In [None]:
# Hard-coded hectare totals -- presumably ha_over10 copied from the two
# over10_stats('Cameroon', ...) outputs; TODO confirm.
# The result is the difference as a percentage of ad2.
ad1 = 36899770
ad2 = 36952314
round(((ad2 - ad1) / ad2), 5) * 100

In [None]:
# adm=1 reads statistics/Kenya_statistics_full_tmlonly.csv
over10_stats('Kenya', 1)

In [None]:
# adm=2 reads statistics/Kenya_statistics2_full_tmlonly.csv
over10_stats('Kenya', 2)

In [None]:
# Hard-coded hectare totals -- presumably ha_over10 copied from the two
# over10_stats('Kenya', ...) outputs; TODO confirm.
# The result is the difference as a percentage of ad2.
ad1 = 30585831
ad2 = 30679144
round(((ad2 - ad1) / ad2), 5) * 100

In [None]:
# adm=1 reads statistics/Malawi_statistics_full_tmlonly.csv
over10_stats('Malawi', 1)

In [None]:
# adm=2 reads statistics/Malawi_statistics2_full_tmlonly.csv
over10_stats('Malawi', 2)

In [None]:
# Hard-coded hectare totals -- presumably ha_over10 copied from the two
# over10_stats('Malawi', ...) outputs; TODO confirm.
# The result is the difference as a percentage of ad2.
ad1 = 3895504
ad2 = 3922918
round(((ad2 - ad1) / ad2), 5) * 100

## Compare w/ Justin's pipeline

In [None]:
# Turn the Nicaragua row of ttc_calcs.csv into a long table: one row per
# remaining column of that file, labelled as tree_cover_class.
# NOTE(review): the rename expects the transposed value column to be labelled 1,
# i.e. Nicaragua must be the row with index 1 in the CSV -- confirm.
justin = pd.read_csv('statistics/ttc_calcs.csv')
justin_nic = justin[justin.country == 'Nicaragua'].T.drop(index='country').reset_index()
justin_nic.rename(columns={1:'total_ha', 'index':'tree_cover_class'}, inplace=True)
justin_nic

In [None]:
# Nicaragua rows of GFWcomp_totalha.csv, renumbered from 0
# (reset_index keeps the previous index as an 'index' column).
jessica = pd.read_csv('statistics/GFWcomp_totalha.csv')
nic = jessica[jessica.country == 'Nicaragua'].reset_index()
nic

In [None]:
# how does this break down in each tree cover threshold
# Percent difference relative to nic.tof_ha; rows are paired by index label, so this
# assumes both tables list the thresholds in the same order -- TODO confirm.
((nic.tof_ha - justin_nic.total_ha) / nic.tof_ha) * 100

In [None]:
# how big is nicaragua? 13.037 million ha
# value is in hectares
total_area = 13037000 

In [None]:
# justin's total ha is closer to the actual
# Prints total_area minus the summed total_ha, then the same gap as a percent of total_area.
print(f'dif in total ha: {total_area - justin_nic.total_ha.sum()}, {round(((total_area - justin_nic.total_ha.sum())/total_area)*100, 2)}%')

In [None]:
# Prints total_area minus the summed nic.tof_ha, then the same gap as a percent of total_area.
print(f'dif in total ha: {total_area - nic.tof_ha.sum()}, {round(((total_area - nic.tof_ha.sum())/total_area)*100, 2)}%')


In [None]:
# Thanks! Remind me - is the reason you need the tifs at half-ha resolution that your plan
# is to resample half ha -> 30 m, because that fits the existing geotrellis workflow?


In [None]:
## This script resamples country tifs on s3 to half-hectare resolution.
## Hectare area is then calculated by counting the pixels in each decile and dividing by 2.

from osgeo import gdal
import os

def progress_cb(complete, message, cb_data):
    '''Print the percentage when the whole percent is a multiple of 10,
    otherwise print ``cb_data`` when it is a multiple of 3.'''
    whole_percent = int(complete*100)
    if whole_percent % 10 == 0:
        print(f'{complete*100:.0f}', end='', flush=True)
        return
    if whole_percent % 3 == 0:
        print(f'{cb_data}', end='', flush=True)


if __name__ == '__main__':
    # Resample every .tif in --src_path that is not yet present in --dst_path.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--src_path", dest = 'src_path')
    parser.add_argument("--dst_path", dest = 'dst_path')

    args = parser.parse_args()

    # Side length of a half-hectare pixel (sqrt(5000 m2), about 70.7 m) divided by 10;
    # presumably the source pixels are 10 m wide -- TODO confirm.
    divisor = (10000/2)**.5/10

    # List the destination once instead of once per source file.
    already_done = set(os.listdir(args.dst_path))

    for file in os.listdir(args.src_path):
        if not file.endswith(".tif") or file in already_done:
            continue
        print(file)
        rast_src = f"{args.src_path}/{file}"
        rast_dst = f"{args.dst_path}/{file}"

        # open src file get x/y
        rast_open = gdal.Open(rast_src, gdal.GA_ReadOnly)
        if rast_open is None:
            # gdal.Open returns None on failure unless GDAL exceptions are enabled
            print(f'could not open {rast_src}, skipping')
            continue
        x = rast_open.RasterXSize
        y = rast_open.RasterYSize
        rast_open = None  # release the source dataset handle before warping

        sizes = f'{int(x // divisor)} {int(y // divisor)}'
        opts = gdal.ParseCommandLine(f"-ot Byte -co COMPRESS=LZW -co BIGTIFF=YES -co NUM_THREADS=ALL_CPUS -ts {sizes} -r average -srcnodata 255 -wo NUM_THREADS=ALL_CPUS -multi")
        ds = gdal.Warp(rast_dst, rast_src, options =  opts, callback=progress_cb, callback_data='.')
        # dropping the reference flushes and closes the output dataset
        del ds