# Development: Building Standalone Training Samples
Within the new architecture, training samples will be built on the fly from standalone image "chips". These chips, however, will be precreated, static files that are stored separately from the source imagery and ground-truth from which they are derived. Simultaneously, the relevant features of every single chip will be stored in a catalog (csv/dataframe).

Author: Eric, Taufiq  
Date: Summer 2019  

In [None]:
! which gdal_translate

In [None]:
#imports
import os, sys
import shapely
import cartopy
import numpy as np
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
import descarteslabs as dl
import pandas as pd
import subprocess
import gdal

ULU_REPO = os.environ["ULU_REPO"]
sys.path.append(ULU_REPO+'/utils')
sys.path.append(ULU_REPO)
print(sys.path)
import util_vectors
import util_rasters
import util_chips_debug

In [None]:
# df_old = pd.read_csv('/data/phase_iv/chip_catalog.csv')
# df_old

### Set key variables

In [None]:
# tiling parameters (handed to dl.raster.dltiles_from_shape when chips are generated)
tile_resolution, tile_size, tile_pad = 5, 256, 32

# imagery selection: both values become part of the imagery/chip file paths
source = 's2'
processing_level = None

# band names recorded in the chip catalog, plus a short suffix naming them  (S2, Lx)
s2_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'alpha']
suffix = 'BGRNS1S2A'
resolution = tile_resolution  # Lx:15 S2:10

#s1_bands=['vv','vh']; s1_suffix='VVVH'  

resampling = 'bilinear'

# ground-truth identifiers used in the gt/ and chips/ file names
label_suffix, label_lot = 'aue', '0'

In [None]:
# select city
# loop through ground truth tiles
# for each tile:
# get locations of target pixels
# loop through list of locations of target pixels
# for each pixel:
# grab a window of imagery centered around target pixel
# save this as a separate "image chip" geotiff, with careful naming
# as each chip is saved, add an entry to a catalog dataframe
# save dataframe

In [None]:
import itertools
from multiprocessing import Process, cpu_count
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from datetime import datetime
import gdal


#
# CONFIG
#
# Default worker-process count for map_with_pool: leave one core free, but never
# go below 1 -- cpu_count()-1 is 0 on a single-core host and Pool(processes=0)
# raises ValueError.
MAX_POOL_PROCESSES=max(1,cpu_count()-1)
# Default thread count for map_with_threadpool.
MAX_THREADPOOL_PROCESSES=16



#
# METHODS
#
""" MAP METHODS
  Args:
    * map_function <function>: 
      a function to map over args list. the function should take a single argument.
      if multiple arguments are needed accept them as a single list or tuple
    * args_list <list>: the list of arguments to map over
    * max_processes <int>: number of processes
      - for map_with_pool defaults to the number of cpus minus 1
      - for map_with_threadpool defaults to 16
      - map_sequential ignores this argument as it doesn't actually do 
        any multiprocessing 
  Return:
    List of return values from map_function
  Notes:
    map_sequential does NOT multiprocess.  it can be used as a sequential drop-in 
    replacement for map_with_pool/threadpool.  this is useful for:
      - development 
      - debugging
      - benchmarking 
"""
def map_with_pool(map_function,args_list,max_processes=MAX_POOL_PROCESSES):
  """ Map map_function over args_list with a multiprocessing.Pool.

  Args:
    * map_function <function>: single-argument function to call per item
    * args_list <list>: sized collection of arguments, one per call
    * max_processes <int>: upper bound on worker processes; a falsy value
      falls back to MAX_POOL_PROCESSES
  Return:
    Whatever _run_pool returns for the pool (the mapped results); an empty
    list when args_list is empty.
  """
  n_jobs=len(args_list)
  if n_jobs==0:
    # Pool(processes=0) raises ValueError and there is nothing to map anyway
    return []
  limit=max_processes or MAX_POOL_PROCESSES
  # never request fewer than one worker, never more workers than jobs
  pool=Pool(processes=max(1,min(n_jobs,limit)))
  return _run_pool(pool,map_function,args_list)


def map_with_threadpool(map_function,args_list,max_processes=MAX_THREADPOOL_PROCESSES):
  """ Map map_function over args_list with a multiprocessing.pool.ThreadPool.

  Args:
    * map_function <function>: single-argument function to call per item
    * args_list <list>: sized collection of arguments, one per call
    * max_processes <int>: upper bound on worker threads; a falsy value
      falls back to MAX_THREADPOOL_PROCESSES
  Return:
    Whatever _run_pool returns for the pool (the mapped results); an empty
    list when args_list is empty.
  """
  n_jobs=len(args_list)
  if n_jobs==0:
    # ThreadPool(processes=0) raises ValueError and there is nothing to map anyway
    return []
  limit=max_processes or MAX_THREADPOOL_PROCESSES
  # never request fewer than one worker, never more workers than jobs
  pool=ThreadPool(processes=max(1,min(n_jobs,limit)))
  return _run_pool(pool,map_function,args_list)


def map_sequential(map_function,args_list,print_args=False,noisy=False,**dummy_kwargs):
  """ Sequential stand-in for map_with_pool/map_with_threadpool.

  Calls map_function on every item of args_list, one after another, in the
  current process.

  Args:
    * map_function <function>: single-argument function to call per item
    * args_list <list>: the arguments to map over
    * print_args <bool[False]>: print each argument before it is processed
    * noisy <bool[False]>: print a header, the index of each item and a footer
    * dummy_kwargs: accepted and ignored (e.g. max_processes), so the call
      signature matches the pool-based mappers
  Return:
    List of return values from map_function
  """
  def _say(enabled,template,*values):
    # print only when the corresponding flag is switched on
    if enabled:
      print(template.format(*values))

  _say(noisy,'multiprocessing(test):')
  results=[]
  for index,job_args in enumerate(args_list):
    _say(noisy,'\t{}...',index)
    _say(print_args,'\t{}',job_args)
    results.append(map_function(job_args))
  _say(noisy,'-'*25)
  return results





""" simple: vanilla multiprocessing
  Args:
    * function <function>: function. function can take multiple arguments 
    * args_list <list>: the list of argument lists
    * join <bool[True]>: join processes before return
  Return: 
    List of processes 
"""
def simple(function,args_list,join=True):
  """ Start one multiprocessing.Process per entry of args_list.

  Args:
    * function <function>: target run in every process
    * args_list <list>: one argument sequence per process (passed as Process args)
    * join <bool[True]>: wait for every process to finish before returning
  Return:
    List of the started Process objects
  """
  workers=[]
  for job_args in args_list:
    worker=Process(target=function,args=job_args)
    worker.start()
    workers.append(worker)
  if not join:
    return workers
  for worker in workers:
    worker.join()
  return workers






""" MPList
Run the above methods on map_function,args_list pairs where the map_function
changes for each new set of args in args_list
Args:
    pool_type<str>: 
        one of MPList.POOL|THREAD|SEQUENTIAL.  determines which map_function 
        and default max_processes to use. If not MPList.THREAD|SEQUENTIAL it 
        will default to MPList.POOL.
    max_processes<int>:
        if not passed will set default based on pool_type
    jobs<list>:
        list of (target,args,kwargs) tuples. Note: use the append method rather than
        creating (target,args,kwargs) tuples
        
"""
class MPList():
    """ Collects (target,args,kwargs) jobs and runs them through one of the
    module's map functions (map_with_pool, map_with_threadpool, map_sequential).

    Attributes:
        pool_type: one of MPList.POOL|THREAD|SEQUENTIAL (defaults to POOL)
        max_processes: worker limit; resolved to a default by run()
        jobs: list of (target,args,kwargs) tuples
        start_time, end_time: datetimes set by run(); None before the first run
        duration: str(end_time-start_time) set by run(); None before the first run
    """
    #
    # POOL TYPES
    #
    POOL='pool'
    THREAD='threading'
    SEQUENTIAL='sequential'
    

    #
    # PUBLIC
    #
    def __init__(self,pool_type=None,max_processes=None,jobs=None):
        self.pool_type=pool_type or self.POOL
        self.max_processes=max_processes
        self.jobs=jobs or []
        # defined up front so they can be read before run() without AttributeError
        self.start_time=None
        self.end_time=None
        self.duration=None

        
    def append(self,target,*args,**kwargs):
        """ Queue a call of target(*args,**kwargs). """
        self.jobs.append((target,args,kwargs))
        
    
    def run(self):
        """ Run all queued jobs and return the list of their results.

        Records start_time, end_time and duration. With no queued jobs it
        returns [] without creating a pool (a zero-worker pool raises ValueError).
        """
        self.start_time=datetime.now()
        if self.jobs:
            map_func,self.max_processes=self._map_func_max_processes()
            out=map_func(self._target,self.jobs,max_processes=self.max_processes)
        else:
            out=[]
        self.end_time=datetime.now()
        self.duration=str(self.end_time-self.start_time)
        return out
        

    def __len__(self):
        return len(self.jobs)
    
    
    #
    # INTERNAL
    #    
    def _map_func_max_processes(self):
        """ Pick the map function and worker limit that match pool_type. """
        if self.pool_type==MPList.THREAD:
            map_func=map_with_threadpool
            max_processes=self.max_processes or MAX_THREADPOOL_PROCESSES
        elif self.pool_type==MPList.SEQUENTIAL:
            map_func=map_sequential
            # map_sequential ignores the limit
            max_processes=False
        else:
            map_func=map_with_pool
            max_processes=self.max_processes or MAX_POOL_PROCESSES
        return map_func, max_processes
        
        
    def _target(self,args):
        """ Unpack one (target,args,kwargs) job and call it. """
        target,call_args,call_kwargs=args
        return target(*call_args,**call_kwargs)
        
    

#
# INTERNAL METHODS
#
def _stop_pool(pool,success=True):
  pool.close()
  pool.join()
  return success


def _map_async(pool,map_func,objects):
  try:
    return pool.map_async(map_func,objects)
  except KeyboardInterrupt:
    print("Caught KeyboardInterrupt, terminating workers")
    pool.terminate()
    return False
  else:
    print("Failure")
    return _stop_pool(pool,False)


def _run_pool(pool,map_function,args_list):
  """ Map map_function over args_list on the pool, shut the pool down, return the results.

  Return:
    The value of .get() on the pool's async result.
  Raises:
    KeyboardInterrupt: if the run is interrupted; the pool's workers are
    terminated before the interrupt is propagated.
  """
  out=_map_async(pool,map_function,args_list)
  if out is False:
    # _map_async terminated the workers after an interrupt; False has no .get()
    pool.join()
    raise KeyboardInterrupt
  try:
    _stop_pool(pool)
    return out.get()
  except KeyboardInterrupt:
    # do not leave workers running when the wait is interrupted
    pool.terminate()
    pool.join()
    raise

In [None]:
def arg_dict_decorator(func):
    """Adapt `func` so it is called with ONE dict that is expanded into keyword arguments.

    This lets a multi-argument function be used with the single-argument map
    helpers: `wrapped({'a': 1, 'b': 2})` calls `func(a=1, b=2)`.

    Args:
        func: the function to wrap.

    Returns:
        A one-argument function taking the keyword-argument dict and returning
        whatever `func` returns.
    """
    from functools import wraps  # local import keeps this cell self-contained

    # wraps copies __name__/__qualname__/__doc__ from func, so the wrapper is
    # reported under the wrapped function's name instead of "decorator"
    @wraps(func)
    def decorator(arg_dict):
        return func(**arg_dict)
    return decorator


def _update_chip_catalog(df_new, path_catalog, remove_duplicates=True):
    """Write df_new to the CSV at path_catalog, merging with rows already stored there."""
    if not os.path.isfile(path_catalog):
        # no catalog yet: write the new records directly
        df_new.to_csv(path_catalog,index=False,header=True)
        return
    df_old = pd.read_csv(path_catalog)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df_combo = pd.concat([df_old,df_new],ignore_index=False,verify_integrity=False,sort=False)
    if remove_duplicates:
        # existing rows come first, so keep='first' retains the older record per chip path
        df_combo.drop_duplicates(subset='path',keep='first',inplace=True)
    df_combo.to_csv(path_catalog,index=False,header=True)


@arg_dict_decorator
def generate_chips(image_suffix, place, data_root='/data/phase_iv/', 
                   label_suffix= label_suffix, label_lot= label_lot,
                   source= source,  bands=s2_bands,
                   resampling=resampling, processing_level=processing_level,
                   chip_radius=32,
                   remove_duplicates=True,
                   category_label={0:'Open Space',1:'Non-Residential',
                       2:'Residential Atomistic',3:'Residential Informal Subdivision',
                       4:'Residential Formal Subdivision',5:'Residential Housing Project',
                       6:'Roads',7:'Study Area',8:'Labeled Study Area',254:'No Data',255:'No Label'},
                   show_stats=False,
                   tile_start=None,
                   tile_stop=None
                  ):
    """Cut one image chip per labelled ground-truth pixel and record each chip in a CSV catalog.

    Because of @arg_dict_decorator the function is CALLED WITH A SINGLE DICT whose
    keys are the parameter names below.

    For every tile of the place's study area, the lulc/locale ground-truth rasters
    are loaded, the padded border is masked to 255, and for each remaining pixel
    whose lulc value is not 255 a (2*chip_radius+1)-pixel square window centred on
    that pixel is copied out of the tile's image with gdal.Translate. One catalog
    row is appended per chip; the rows are merged into
    data_root+place+'_chip_catalog.csv'.

    Reads the module-level tile_resolution, tile_size and tile_pad for the tiling call.

    Args:
        image_suffix: identifier of the source image (part of image and chip file names).
        place: city name as used in directory and file names.
        data_root: root directory, expected to end with a path separator.
        label_suffix, label_lot: ground-truth type and lot (parts of file names).
        source: imagery source identifier (part of file names).
        bands: band list stored in the catalog's 'bands' column.
        resampling: value stored in the catalog's 'resampling' column.
        processing_level: lower-cased str() of this names the imagery/chips subdirectory.
        chip_radius: half-width of the chip window in pixels.
        remove_duplicates: drop catalog rows with an already-present chip path.
        category_label: value->name mapping passed to the stats helper (never modified here).
        show_stats: print raster statistics for every tile that has valid pixels.
        tile_start, tile_stop: half-open tile index range; default is all tiles.

    Raises:
        ValueError: if the tile resolution is not one of 10, 5 or 2.
        RuntimeError: if gdal.Translate reports failure for a chip.
    """
    place_shapefile = data_root+place+'/'+place.title()+"_studyAreaEPSG4326.shp"
    shape = util_vectors.load_shape(place_shapefile)
    tiles = dl.raster.dltiles_from_shape(tile_resolution, tile_size, tile_pad, shape)

    tile_props = tiles['features'][0]['properties']
    resolution = int(tile_props['resolution'])
    pad = int(tile_props['pad'])

    # width of the zero-padded tile number in file names depends on the resolution
    zfill_by_resolution = {10:3, 5:4, 2:5}
    if resolution not in zfill_by_resolution:
        raise ValueError('bad resolution: '+str(resolution))
    zfill = zfill_by_resolution[resolution]

    if tile_start is None:
        tile_start = 0
    if tile_stop is None:
        tile_stop = len(tiles['features'])

    processing = str(processing_level).lower()
    res_tag = str(resolution)+'m'
    chip_size = chip_radius*2+1

    rows_list = []
    # loop through ground truth tiles
    for tile_id in range(tile_start, tile_stop):
        tile_tag = str(tile_id).zfill(zfill)
        # sample file name: /data/phase_iv/sitapur/gt/sitapur_aue0_5m_p32_tile0586_lulc.tif
        path_base = data_root+place+'/gt/'+place+'_'+label_suffix+label_lot+'_'+res_tag+\
            '_p'+str(pad)+'_tile'+tile_tag
        lulc,_,_,_,_ = util_rasters.load_geotiff(path_base+'_lulc.tif',dtype='uint8')
        locale,_,_,_,_ = util_rasters.load_geotiff(path_base+'_locale.tif',dtype='uint8')
        # 'erase' the padded border; guarded because raster[-0:] would select the WHOLE array
        if pad > 0:
            for raster in (lulc, locale):
                raster[:pad,:] = 255
                raster[-pad:,:] = 255
                raster[:,:pad] = 255
                raster[:,-pad:] = 255
        # get locations of target pixels
        rows, cols = np.where(lulc!=255)
        n_px = len(rows)
        if n_px == 0:
            continue
        if show_stats:
            util_rasters.stats_byte_raster(lulc, category_label, lulc=True, show=True)
        print(place + ' '+ image_suffix + ': valid pixels in tile'+tile_tag+':', n_px)
        # image path example: /data/phase_iv/sitapur/imagery/none/sitapur_s2_E_5m_p32_tile0006.tif
        path_image = data_root+place+'/imagery/'+processing+'/'+place+'_'+source+'_'+\
            image_suffix+'_'+res_tag+'_p'+str(pad)+'_tile'+tile_tag+'.tif'
        # everything in the chip name that does not change within a tile
        chip_prefix = data_root+place+'/chips/'+processing+'/'+\
            place+'_'+label_suffix+label_lot+'_'+source+'_'+image_suffix+'_'+res_tag+'_'+\
            't'+tile_tag+'_'
        # loop through locations of target pixels
        for row, col in zip(rows, cols):
            # chip coordinates are stored relative to the unpadded tile
            x = col-pad
            y = row-pad
            lulc_value = lulc[row,col]
            path_chip = chip_prefix+'x'+str(x).zfill(3)+'y'+str(y).zfill(3)+'_'+\
                'c'+str(lulc_value)+'.tif'
            # window of imagery centred on the target pixel: [xoff, yoff, xsize, ysize].
            # The Python binding is used instead of the gdal_translate CLI, which had issues.
            chip_ds = gdal.Translate(path_chip,path_image,
                                     srcWin=[col-chip_radius,row-chip_radius,chip_size,chip_size])
            if chip_ds is None:
                # do not catalog a chip that was never written
                raise RuntimeError('gdal.Translate failed: '+path_image+' -> '+path_chip)
            chip_ds = None  # release the dataset handle so the chip file is flushed and closed

            # as each chip is saved, add an entry to the catalog
            rows_list.append({
                'path': path_chip,
                'city': place,
                'gt_type': label_suffix,
                'gt_lot': label_lot,
                'locale': locale[row,col],
                'source': source,
                'image': image_suffix,
                'bands': bands,
                'resolution': resolution,
                'resampling': resampling,
                'processing': processing,
                'tile_id': tile_id,
                'x': x,
                'y': y,
                'lulc': lulc_value,
            })

    columns = ['path','city','gt_type','gt_lot','locale','source','image','bands',
               'resolution','resampling','processing','tile_id','x','y','lulc']
    df_new = pd.DataFrame(rows_list, columns=columns)
    # each city gets its own catalog CSV
    path_catalog = data_root+place+'_chip_catalog.csv'
    _update_chip_catalog(df_new, path_catalog, remove_duplicates)

In [None]:
# one job per city for image 'A'; each dict is expanded into generate_chips keyword arguments
arg_list1=[{'image_suffix':'A', 'place':city} for city in (
        'johannesburg',
        'kampala',
        'port_elizabeth',
        'kigali',
        'arusha',
        'nakuru',
         )]
arg_list1

In [None]:
# one job per city for image 'B'; each dict is expanded into generate_chips keyword arguments
arg_list2=[{'image_suffix':'B', 'place':city} for city in (
        'johannesburg',
        'kampala',
        'port_elizabeth',
        'kigali',
        'arusha',
        'nakuru',
         )]
arg_list2

In [None]:
%time map_with_threadpool(generate_chips,arg_list1,max_processes=32)

In [None]:
%time map_with_threadpool(generate_chips,arg_list2,max_processes=32)