In [None]:
# Re-process statistics for previously processed countries.
# TODO: check whether uploading new files simply replaces what is currently there,
# or whether the old file should be renamed before the new ones are uploaded.

In [None]:
import functools
from time import time, strftime
import os
import os.path
import boto3
import confuse
import rasterio as rs
from rasterio.mask import mask
from rasterio.merge import merge
from rasterio.enums import Resampling

import numpy as np 
import numpy.ma as ma 
import geopandas as gpd 
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon
import pandas as pd
import pandas.api.types as ptypes
import fiona
from contextlib import contextmanager  
from skimage.transform import resize
import math
import requests
import urllib.request
from urllib.error import HTTPError
import osgeo
from osgeo import gdal
from osgeo import gdalconst
import glob
from copy import copy
from datetime import datetime
import psutil
import scipy

In [None]:
def timer(func):
    '''
    Decorator that reports how long each call to the wrapped function took.

    The wrapped function is called with the caller's arguments unchanged. Once it
    returns, the elapsed time (a timedelta) is printed together with the function's
    name, and the function's return value is handed back to the caller.
    '''

    @functools.wraps(func)
    def timed_call(*args, **kwargs):
        began = datetime.now()
        result = func(*args, **kwargs)
        elapsed = datetime.now() - began
        print(f'Completed {func.__name__!r} in {elapsed}.')
        return result

    return timed_call

In [None]:
def download_inputs(country, bucket_name, s3_folder, local_dir=None,
                    config_path='/Users/jessica.ertel/sentinel-tree-cover/jessica-config.yaml'):
    """
    Download every object stored under a folder of an s3 bucket.

    Also makes sure the `{country}/resampled_rasters/` directory exists, relative
    to the current working directory.

    Args:
        country: name of the country; used to build the local working directory
        bucket_name: the name of the s3 bucket
        s3_folder: the folder path in the s3 bucket
        local_dir: a relative or absolute directory path in the local file system.
            When None, each object is written to a local path equal to its s3 key.
        config_path: yaml file read with confuse; it must provide
            `aws.aws_access_key_id` and `aws.aws_secret_access_key`.

    Returns:
        None
    """
    # creates `{country}/` as well; exist_ok avoids the check-then-create race
    os.makedirs(f'{country}/resampled_rasters/', exist_ok=True)

    config = confuse.Configuration('sentinel-tree-cover')
    config.set_file(config_path)
    aws = config['aws']
    s3 = boto3.resource('s3',
                        aws_access_key_id=aws['aws_access_key_id'].as_str(),
                        aws_secret_access_key=aws['aws_secret_access_key'].as_str())

    bucket = s3.Bucket(bucket_name)

    for obj in bucket.objects.filter(Prefix=s3_folder):
        if local_dir is None:
            target = obj.key
        else:
            target = os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))

        # A key without a directory component has an empty dirname, and
        # os.makedirs('') raises, so only create the parent when there is one.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # keys ending in '/' are folder placeholders: nothing to download
        if obj.key.endswith('/'):
            continue
        bucket.download_file(obj.key, target)

    return None

In [None]:
def reshape_to_4d(raster):

    '''
    Crops a 2D array so that both dimensions are multiples of 10, then splits it
    into a grid of 10x10 blocks.

    Attributes
    ----------
    raster : array with at least .shape and 2D slicing (e.g. a band read from a GTiff)
        Array that will be reshaped. Rows and columns beyond the last full
        multiple of 10 are dropped.

    Returns
    -------
    Array of shape (rows // 10, 10, cols // 10, 10); element [i, a, j, b]
    corresponds to raster[i * 10 + a, j * 10 + b].
    '''

    block = 10

    # largest multiples of the block size that fit inside the raster
    n_rows = raster.shape[0] - raster.shape[0] % block
    n_cols = raster.shape[1] - raster.shape[1] % block

    # drop the partial blocks on the bottom/right edges before splitting
    trimmed = raster[:n_rows, :n_cols]
    blocks_shape = (trimmed.shape[0] // block, block, trimmed.shape[1] // block, block)

    return np.reshape(trimmed, blocks_shape)

In [None]:
# ESA land cover ids mapped to the class labels written to the `esa_class` column.
_ESA_LEGEND = {0: 'ESA No Data',
               10: 'Cropland, rainfed',
               11: 'Cropland, rainfed',
               12: 'Cropland, rainfed',
               20: 'Cropland, irrigated or post-flooding',
               30: 'Mosaic cropland / natural vegetation',
               40: 'Mosaic natural vegetation / cropland',
               50: 'Tree cover, broadleaved, evergreen',
               60: 'Tree cover, broadleaved, deciduous',
               61: 'Tree cover, broadleaved, deciduous',
               62: 'Tree cover, broadleaved, deciduous',
               70: 'Tree cover, needleleaved, evergreen',
               71: 'Tree cover, needleleaved, evergreen',
               72: 'Tree cover, needleleaved, evergreen',
               80: 'Tree cover, needleleaved, deciduous',
               81: 'Tree cover, needleleaved, deciduous',
               82: 'Tree cover, needleleaved, deciduous',
               90: 'Tree cover, mixed leaf type',
               100: 'Mosaic tree and shrub / herbaceous cover',
               110: 'Mosaic herbaceous cover / tree and shrub',
               120: 'Shrubland',
               121: 'Shrubland',
               122: 'Shrubland',
               130: 'Grassland',
               140: 'Lichens and mosses',
               150: 'Sparse vegetation',
               151: 'Sparse vegetation',
               152: 'Sparse vegetation',
               153: 'Sparse vegetation',
               160: 'Tree cover, flooded, fresh or brakish water',
               170: 'Tree cover, flooded, saline water',
               180: 'Shrub or herbaceous cover, flooded, fresh/saline/brakish water',
               190: 'Urban areas',
               200: 'Bare areas',
               201: 'Bare areas',
               202: 'Bare areas',
               210: 'Water bodies',
               220: 'Permanent snow and ice',
               255: 'No Data (flag)'}

# Column order of the statistics csv.
_STATS_COLUMNS = ['country', 'admin', 'esa_id', 'esa_class', 'esa_sampled_ha',
                  'esa_total_ha', 'tree_cover_class', 'tof_ha', 'tof_mean']

# (lower, upper) bounds of the tree cover bins: lower is inclusive, upper exclusive.
# The last upper bound is 101, otherwise the final bin would not include 100% hectares.
_TREE_COVER_BINS = [(lower, 101 if lower == 90 else lower + 10)
                    for lower in range(0, 100, 10)]


def _mean_tof_per_hectare(tof_blocks):
    '''
    Averages tree cover over each 10x10 block, ignoring no-data pixels (value 255).

    Returns a (means, sampled) pair of 2D arrays. `means` is float32 and only
    meaningful where the boolean `sampled` is True, i.e. where the block holds at
    least one valid pixel.
    '''
    # A plain boolean array is used instead of a masked array: np.ma.masked_equal
    # collapses the mask to the scalar `nomask` when no pixel equals 255, which
    # cannot be summed along axes.
    valid = tof_blocks != 255

    # small integer accumulators keep memory low; a block holds 100 pixels < 255
    counts = np.sum(valid, axis=(1, 3), dtype=np.uint16)
    sums = np.sum(np.where(valid, tof_blocks, 0), axis=(1, 3), dtype=np.uint16)

    sampled = counts > 0
    means = np.zeros(counts.shape, dtype=np.float32)
    # only divide where there is something to average (avoids 0 / 0)
    np.divide(sums, counts, out=means, where=sampled, dtype=np.float32)

    return means, sampled


def _mode_per_hectare(blocks):
    '''
    Collapses a (rows, 10, cols, 10) array to (rows, cols) by taking the most
    frequent value of every block. Ties go to the smallest value.
    '''
    best_count = np.zeros((blocks.shape[0], blocks.shape[2]), dtype=np.uint16)
    best_value = np.zeros(best_count.shape, dtype=blocks.dtype)

    # One vectorised pass per distinct value instead of one Python iteration per
    # block. np.unique is ascending and the comparison is strict, so on a tie the
    # smaller value is kept.
    for value in np.unique(blocks):
        count = np.sum(blocks == value, axis=(1, 3), dtype=np.uint16)
        better = count > best_count
        best_value[better] = value
        best_count[better] = count[better]

    return best_value


@timer
def calculate_stats_tml(country, extent, admin_files=('Northern.tif',)):

    '''
    Takes in a country and extent (full or partial) and produces zonal stats on tree cover.
    Writes a csv with statistics per administrative district, per land cover class and
    per tree cover threshold. Only produces statistics for TML data.

    For every admin raster, the tree cover (tof) and land cover (esa) bands are split
    into 10x10 pixel blocks (referred to as hectares). Each hectare gets the mean of
    its valid tree cover pixels and the most frequent land cover class. One row is
    written per admin, land cover class and tree cover bin.

    Attributes
    ----------
    country : str
        a string indicating the country files to import
    extent : str
        a string indicating the processing extent of the geotiff
    admin_files : iterable of str, str or None
        file names (present in both `{country}/resampled_rasters/tof` and
        `.../esa`) to process. Defaults to the single 'Northern.tif' admin; the
        file stems are appended to the output file name. Pass None to process
        every raster in the tof folder, in which case no suffix is added.

    Returns
    -------
    None. The csv is written to `{country}/stats/`.
    '''

    os.makedirs(f'{country}/stats', exist_ok=True)

    raster_dir = f'{country}/resampled_rasters'

    if admin_files is None:
        folder_contents = sorted(f for f in os.listdir(f'{raster_dir}/tof')
                                 if f != '.ipynb_checkpoints')
        suffix = ''
    else:
        if isinstance(admin_files, str):
            admin_files = [admin_files]
        folder_contents = list(admin_files)
        suffix = ''.join(f'_{os.path.splitext(f)[0]}' for f in folder_contents)

    # rows are collected in a list and turned into a dataframe once at the end
    records = []

    # iterate through the admins
    for counter, file in enumerate(folder_contents, start=1):
        print(file)
        admin = os.path.splitext(file)[0]

        # context managers make sure the datasets are closed after reading
        with rs.open(f'{raster_dir}/tof/{file}') as src:
            tof = src.read(1)
        with rs.open(f'{raster_dir}/esa/{file}') as src:
            esa = src.read(1)

        # mean tree cover per hectare, plus which hectares hold any valid pixel
        tof_mean_per_ha, sampled = _mean_tof_per_hectare(reshape_to_4d(tof))

        # Set each hectare to the mode (lcc that appears most often) to prevent
        # double counting. Every hectare is covered, not only those on the diagonal.
        esa = _mode_per_hectare(reshape_to_4d(esa))

        # release the full resolution array before the per-class work
        del tof

        if esa.shape != sampled.shape:
            raise ValueError(f'Tree cover and land cover rasters differ in size for {file}: '
                             f'{sampled.shape} vs {esa.shape} hectares.')

        for cover in np.unique(esa):
            print(cover)

            in_class = esa == cover

            # per-hectare means of this land cover class, restricted to sampled hectares
            class_means = tof_mean_per_ha[in_class & sampled]

            # total land cover and the part of it that has tree cover data
            lc_total = int(np.count_nonzero(in_class))
            lc_sampled = int(class_means.size)

            # check for erroneous values
            assert lc_sampled <= lc_total, f'Sampled area is greater than total area for land cover {cover} in {file}.'

            # If nothing was sampled the mean is undefined (NaN) and all bins are 0.
            # float64 accumulation keeps the rounded value clean in the csv.
            if lc_sampled == 0:
                tof_class_mean = np.nan
            else:
                tof_class_mean = float(np.round(class_means.mean(dtype=np.float64), 2))

            esa_id = int(cover)

            # iterate through the thresholds (0-10, 10-20, 20-30)
            for lower, upper in _TREE_COVER_BINS:
                tof_bin = int(np.count_nonzero((class_means >= lower) & (class_means < upper)))

                records.append({'country': country,
                                'admin': admin,
                                'esa_id': esa_id,
                                'esa_class': _ESA_LEGEND.get(esa_id, np.nan),
                                'esa_sampled_ha': lc_sampled,
                                'esa_total_ha': lc_total,
                                'tree_cover_class': f'{lower}-{upper - 1}',
                                'tof_ha': tof_bin,
                                'tof_mean': tof_class_mean})

            print(f'{cover} done.')

        del esa

        if counter % 3 == 0:
            print(f'{counter}/{len(folder_contents)} admins processed...')

    df = pd.DataFrame(records, columns=_STATS_COLUMNS)

    # reinforce datatypes
    df = df.astype({'esa_sampled_ha': 'float64',
                    'esa_total_ha': 'float64',
                    'tof_ha': 'int64',
                    'tof_mean': 'float64'})

    cols_to_check = ['esa_sampled_ha', 'esa_total_ha', 'tof_ha', 'tof_mean']
    assert all(ptypes.is_numeric_dtype(df[col]) for col in cols_to_check)

    df.to_csv(f'{country}/stats/{country}_statistics_{extent}_tmlonly{suffix}.csv', index=False)
    print('Analysis complete.')

    return None

In [None]:
@timer
def upload_dir(filename, bucket, object_name=None,
               config_path='/Users/jessica.ertel/sentinel-tree-cover/jessica-config.yaml'):
    """
    Upload a file to an S3 bucket.

    filename: path of the local file to upload
    bucket: name of the bucket to upload to
    object_name: S3 object name (key). If not specified, the base name of
        filename is used.
    config_path: yaml file read with confuse; it must provide
        aws.aws_access_key_id and aws.aws_secret_access_key.

    Returns None. Prints a confirmation once the upload has finished.
    """
    if object_name is None:
        object_name = os.path.basename(filename)

    config = confuse.Configuration('sentinel-tree-cover')
    config.set_file(config_path)
    aws = config['aws']
    session = boto3.Session(aws_access_key_id=aws['aws_access_key_id'].as_str(),
                            aws_secret_access_key=aws['aws_secret_access_key'].as_str())
    s3 = session.client('s3')

    # binary mode: the file content is streamed to s3 unchanged
    with open(filename, 'rb') as data:
        s3.upload_fileobj(data, bucket, object_name)

    print('Upload complete.')

    return None

In [None]:
@timer
def execute_pipe(country, extent):
    '''
    Runs the statistics step for one country and reports when it finished.

    Only calculate_stats_tml is executed. The download and upload steps are
    switched off and kept below as comments.

    Attributes
    ----------
    country : str
        passed through to calculate_stats_tml
    extent : str
        passed through to calculate_stats_tml and used in the final message
    '''
    # -- disabled: download the input rasters from s3 --
    # print(f'Started at: {datetime.now().strftime("%H:%M:%S")}')
    # print('Downloading input data...')
    # download_inputs(country,
    #                 'tof-output',
    #                 f'2020/analysis/2020-full/{country}/resampled_rasters/',
    #                 f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/{country}/resampled_rasters/')

    print('Calculating statistics...')
    calculate_stats_tml(country, extent)

    # -- disabled: upload the resulting csv to s3 --
    # print('Uploading files to s3...')
    # upload_dir(f'/Users/jessica.ertel/sentinel-tree-cover/notebooks/analysis/{country}/stats/{country}_statistics_full_tmlonly.csv',
    #            'tof-output',
    #            f'2020/analysis/2020-full/{country}/stats/{country}_statistics_full_tmlonly.csv')

    finished_at = datetime.now().strftime("%H:%M:%S")
    print(f'Finished {extent} processing at: {finished_at}')
    return None

In [None]:
# Run the pipeline for Fiji with the 'full' extent.
execute_pipe('Fiji', 'full')

In [None]:
# Spot check: read band 1 of the Northern tree cover raster and show its dimensions.
# The context manager closes the dataset once the band has been read.
with rs.open('Fiji/resampled_rasters/tof/Northern.tif') as src:
    tif = src.read(1)
tif.shape

In [None]:
# Distinct pixel values present in the raster currently held in `tif`.
np.unique(tif)

In [None]:
# Spot check: read band 1 of the Eastern2 land cover raster and show its dimensions.
# The context manager closes the dataset once the band has been read.
with rs.open('Fiji/resampled_rasters/esa/Eastern2.tif') as src:
    tif = src.read(1)
tif.shape