## Core Workflow: Normalize training data
Purpose: The mean and standard deviation of the band values are computed over every pixel of each satellite image that covers the training-data AOI, and these statistics are used to normalize the training data. The resulting normalized band values are later used to train the model.
<br>
<br>
*Date: 10-31-2019*


### Import statements

In [1]:
import warnings
warnings.filterwarnings('ignore')
#
import os
import sys
import json
import itertools
import pickle
from pprint import pprint
#
import numpy as np
import shapely
from shapely.geometry import shape, Point
from shapely.geometry import mapping, Polygon
# import cartopy
import geojson
import fiona
import h5py
get_ipython().magic(u'matplotlib inline')
import matplotlib as mpl
import matplotlib.pyplot as plt

import gdal
from glob import glob

import jenkspy

import rasterio as rio
from rasterio.plot import show

import pandas as pd

import collections
from numpy import mean

import random
import statistics

import time

import descarteslabs as dl
print (sys.path)

['', '/opt/caffe/python', '/opt/caffe2/build', '/data/home/peter/notebooks/urban_heat', '/anaconda/envs/py36/lib/python36.zip', '/anaconda/envs/py36/lib/python3.6', '/anaconda/envs/py36/lib/python3.6/lib-dynload', '/anaconda/envs/py36/lib/python3.6/site-packages', '/anaconda/envs/py36/lib/python3.6/site-packages/IPython/extensions', '/data/home/peter/.ipython']


### Helper functions

In [None]:
def load_shape(place_shapefile):
    """Load the first feature of a shapefile as a single-polygon GeoJSON-like dict.

    Only the first feature in the file is read. For a MultiPolygon the member
    whose first ring has the most vertices is kept and the others are dropped.
    Only the first ring of the chosen polygon (the exterior ring in GeoJSON
    ordering) is copied; any further rings (holes) are ignored.

    Args:
        place_shapefile: path of a vector file that `fiona.open` can read.

    Returns:
        dict with keys 'type', 'properties', 'geometry' (always of type
        'Polygon' with one ring of [x, y] pairs) and 'bbox'
        ([xmin, ymin, xmax, ymax] of that ring).

    Raises:
        ValueError: if the file has no features, the geometry is neither a
            Polygon nor a MultiPolygon, or the selected ring is empty.
    """
    # The context manager closes the dataset; next(iter(...)) replaces the
    # deprecated Collection.next() method.
    with fiona.open(place_shapefile) as c:
        try:
            pol = next(iter(c))
        except StopIteration:
            raise ValueError(
                'no features found in {}'.format(place_shapefile)) from None

    geom_type = pol['geometry']['type']
    coordinates = pol['geometry']['coordinates']

    # Decide on the declared geometry type rather than on len(coordinates):
    # a Polygon with holes also has more than one top-level entry, and a
    # MultiPolygon may have exactly one member.
    if geom_type == 'MultiPolygon':
        # e.g. city='kampala': keep the member with the longest first ring
        # (max returns the first one on ties, like the original strict '>').
        polygon = max(coordinates, key=lambda rings: len(rings[0]))
    elif geom_type == 'Polygon':
        polygon = coordinates
    else:
        raise ValueError('unsupported geometry type: {}'.format(geom_type))

    # Index instead of unpacking so that (x, y, z) vertices are accepted too.
    ring = [[pt[0], pt[1]] for pt in polygon[0]]
    if not ring:
        raise ValueError('polygon in {} has no vertices'.format(place_shapefile))

    # Derive the bounding box from the data itself; seeding with +/-180 and
    # +/-90 only works for lon/lat coordinates.
    xs = [pt[0] for pt in ring]
    ys = [pt[1] for pt in ring]

    feature = {}
    feature['type'] = pol['type']
    feature['properties'] = pol['properties']
    feature['geometry'] = {'type': 'Polygon', 'coordinates': [ring]}
    feature['bbox'] = [min(xs), min(ys), max(xs), max(ys)]

    return feature

import itertools
from multiprocessing import Process, cpu_count
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from datetime import datetime


#
# CONFIG
#
# Leave one core free for the parent process, but never go below one worker:
# cpu_count()-1 is 0 on a single-core host and Pool(processes=0) raises ValueError.
MAX_POOL_PROCESSES=max(1,cpu_count()-1)
# Default worker count for thread pools.
MAX_THREADPOOL_PROCESSES=16



#
# METHODS
#
""" MAP METHODS
  Args:
    * map_function <function>: 
      a function to map over args list. the function should take a single argument.
      if multiple arguments are needed accept them as a single list or tuple
    * args_list <list>: the list of arguments to map over
    * max_processes <int>: number of processes
      - for map_with_pool defaults to the number of cpus minus 1
      - for map_with_threadpool defaults to 16
      - map_sequential ignores this argument as it doesn't actually do
        any multiprocessing
  Return:
    List of return values from map_function
  Notes:
    map_sequential does NOT multiprocess.  it can be used as a sequential drop-in 
    replacement for map_with_pool/threadpool.  this is useful for:
      - development 
      - debugging
      - benchmarking 
"""
def map_with_pool(map_function,args_list,max_processes=MAX_POOL_PROCESSES):
    """Map `map_function` over `args_list` using a multiprocessing.Pool.

    Args:
        map_function: single-argument callable applied to each item.
        args_list: sized collection of arguments (len() is taken of it).
        max_processes: upper bound on worker processes; the pool never has
            more workers than there are items.

    Returns:
        List of return values of `map_function`; [] when `args_list` is empty.
    """
    n_jobs=len(args_list)
    if n_jobs==0:
        # Pool(processes=0) raises ValueError; there is nothing to run anyway.
        return []
    # Clamp to at least one worker so a zero default/argument cannot break Pool().
    pool=Pool(processes=max(1,min(n_jobs,max_processes)))
    return _run_pool(pool,map_function,args_list)


def map_with_threadpool(map_function,args_list,max_processes=MAX_THREADPOOL_PROCESSES):
    """Map `map_function` over `args_list` using a ThreadPool.

    Args:
        map_function: single-argument callable applied to each item.
        args_list: sized collection of arguments (len() is taken of it).
        max_processes: upper bound on worker threads; the pool never has
            more workers than there are items.

    Returns:
        List of return values of `map_function`; [] when `args_list` is empty.
    """
    n_jobs=len(args_list)
    if n_jobs==0:
        # ThreadPool(processes=0) raises ValueError; there is nothing to run anyway.
        return []
    # Clamp to at least one worker so a zero argument cannot break ThreadPool().
    pool=ThreadPool(processes=max(1,min(n_jobs,max_processes)))
    return _run_pool(pool,map_function,args_list)


def map_sequential(map_function,args_list,print_args=False,noisy=False,**dummy_kwargs):
    """Apply `map_function` to each item of `args_list` in order, in this process.

    Args:
        map_function: single-argument callable.
        args_list: iterable of arguments.
        print_args: when true, print each argument before it is processed.
        noisy: when true, print a header, a per-item index and a closing rule.
        **dummy_kwargs: accepted and ignored (e.g. max_processes), so the
            signature lines up with the pool-based map functions.

    Returns:
        List of return values of `map_function`, in input order.
    """
    def _report(position, item):
        # Progress line first, then the argument itself, when enabled.
        if noisy:
            print('\t{}...'.format(position))
        if print_args:
            print('\t{}'.format(item))

    if noisy:
        print('multiprocessing(test):')

    results = []
    for position, item in enumerate(args_list):
        _report(position, item)
        results.append(map_function(item))

    if noisy:
        print('-' * 25)
    return results





""" simple: vanilla multiprocessing
  Args:
    * function <function>: function. function can take multiple arguments 
    * args_list <list>: the list of argument lists
    * join <bool[True]>: join processes before return
  Return: 
    List of processes 
"""
def simple(function,args_list,join=True):
    """Start one multiprocessing.Process per entry of `args_list`.

    Args:
        function: target callable run in each process.
        args_list: iterable of argument sequences, one per process.
        join: when true, wait for every process to finish before returning.

    Returns:
        List of the started Process objects, in input order.
    """
    workers = []
    for call_args in args_list:
        worker = Process(target=function, args=call_args)
        workers.append(worker)
        worker.start()

    if not join:
        return workers

    for worker in workers:
        worker.join()
    return workers






""" MPList
Run the above methods on map_function,args_list pairs where the map_function
changes for each new set of args in args_list
Args:
    pool_type<str>: 
        one of MPList.POOL|THREAD|SEQUENTIAL.  determines which map_function 
        and default max_processes to use. If not MPList.THREAD|SEQUENTIAL it 
        will default to MPList.POOL.
    max_processes<int>:
        if not passed will set default based on pool_type
    jobs<list>:
        list of (target,args,kwargs) tuples. Note: use the append method rather than
        creating (target,args,kwargs) tuples
        
"""
class MPList():
    """Collect (target, args, kwargs) jobs and run them through one map method.

    Attributes set by the constructor:
        pool_type: one of MPList.POOL, MPList.THREAD, MPList.SEQUENTIAL;
            anything else is treated like MPList.POOL when running.
        max_processes: worker limit; when falsy a default is chosen from
            pool_type at run time.
        jobs: list of (target, args, kwargs) tuples; use append() to add one.

    Attributes set by run(): start_time, end_time, duration (a string).
    """
    #
    # POOL TYPES
    #
    POOL='pool'
    THREAD='threading'
    SEQUENTIAL='sequential'


    #
    # PUBLIC
    #
    def __init__(self,pool_type=None,max_processes=None,jobs=None):
        self.pool_type=pool_type or self.POOL
        self.max_processes=max_processes
        self.jobs=jobs or []


    def append(self,target,*args,**kwargs):
        """Queue a call of target(*args, **kwargs)."""
        self.jobs.append((target,args,kwargs))


    def run(self):
        """Run every queued job and return the list of their return values.

        With no queued jobs this returns [] without building a pool: a pool
        sized min(0, max_processes) would be rejected with ValueError.
        """
        self.start_time=datetime.now()
        if self.jobs:
            map_func,self.max_processes=self._map_func_max_processes()
            out=map_func(self._target,self.jobs,max_processes=self.max_processes)
        else:
            out=[]
        self.end_time=datetime.now()
        self.duration=str(self.end_time-self.start_time)
        return out


    def __len__(self):
        return len(self.jobs)


    #
    # INTERNAL
    #
    def _map_func_max_processes(self):
        """Return the (map function, max_processes) pair for self.pool_type."""
        if self.pool_type==MPList.THREAD:
            map_func=map_with_threadpool
            max_processes=self.max_processes or MAX_THREADPOOL_PROCESSES
        elif self.pool_type==MPList.SEQUENTIAL:
            map_func=map_sequential
            # map_sequential ignores max_processes
            max_processes=False
        else:
            map_func=map_with_pool
            max_processes=self.max_processes or MAX_POOL_PROCESSES
        return map_func, max_processes


    def _target(self,args):
        """Unpack one (target, args, kwargs) job and call it."""
        target,args,kwargs=args
        return target(*args,**kwargs)
        
    

#
# INTERNAL METHODS
#
def _stop_pool(pool,success=True):
  pool.close()
  pool.join()
  return success


def _map_async(pool,map_func,objects):
  try:
    return pool.map_async(map_func,objects)
  except KeyboardInterrupt:
    print("Caught KeyboardInterrupt, terminating workers")
    pool.terminate()
    return False
  else:
    print("Failure")
    return _stop_pool(pool,False)


def _run_pool(pool,map_function,args_list):
    """Run `map_function` over `args_list` on `pool`, shut the pool down, return results.

    Args:
        pool: pool object handed to _map_async / _stop_pool.
        map_function: single-argument callable.
        args_list: iterable of arguments.

    Returns:
        List of results from the async map.

    Raises:
        KeyboardInterrupt: if _map_async reported an interrupt (it returns
            False and has already terminated the pool). Previously this case
            failed with AttributeError on `False.get()`.
    """
    out=_map_async(pool,map_function,args_list)
    if out is False:
        # Workers were terminated by _map_async; wait for them, then propagate
        # the interruption instead of calling .get() on a bool.
        pool.join()
        raise KeyboardInterrupt
    _stop_pool(pool)
    return out.get()


#------------------------Scene Normalization------------------------------

def normalize_scenes(scenes_list):
    """Append every pixel value of one scene's four bands to the global lists.

    Despite its name, `scenes_list` is a single scene id: it is passed
    straight to `dl.scenes.Scene.from_id`, and the pool maps this function
    over a list of ids.

    The red, green, blue and nir bands are read at resolution=50, cast to
    float and flattened in row-major order into the module-level lists
    `red_means`, `green_means`, `blue_means` and `nir_means`, which must
    exist before this is called.

    Any exception is printed and swallowed, so a failing scene contributes
    no pixels; 'correct' is printed on success. Returns None either way.
    """
    try:
        scene, ctx = dl.scenes.Scene.from_id(scenes_list)
        arr = scene.ndarray(bands="red green blue nir", ctx=ctx.assign(resolution=50),mask_alpha=False)

        # Extract and convert all four bands before touching any accumulator,
        # so a malformed array cannot leave the lists partially updated.
        red, green, blue, nir = (arr[i].astype(float) for i in range(4))

        # One extend per band replaces the nested per-pixel append loops;
        # ravel() walks the 2-D band row by row, the same order as before.
        # NOTE(review): assumes each band is 2-D (rows x cols) - confirm.
        for sink, band in ((red_means, red), (green_means, green),
                           (blue_means, blue), (nir_means, nir)):
            sink.extend(band.ravel())

        print('correct')
    except Exception as e:
        # Deliberate best effort: report the failure and continue with other scenes.
        print(e)

### Scene normalization

In [None]:
# Input CSV with the raw per-roof band values; change this to point at your file
path_data = 'band_values_train_data.csv'

# Load the CSV into a DataFrame
path_df = pd.read_csv(path_data, encoding='utf8')

# One tuple per row holding the selected columns. The column order matters:
# the normalization loop further down reads the fields by position.
img_info = path_df.loc[:, ['img_path', 'footprint_shapes', 'latitude', 'longitude', 'tile_id',
                           'roof_no', 'red_mean', 'green_mean', 'blue_mean', 'nir_mean',
                           'expected_albedo']].apply(tuple, axis=1)

# Display the loaded table as the cell output
path_df

In [None]:
# Scene (tile) ids that cover the training data, duplicates removed while
# keeping first-seen order. Read from path_df: `df` does not exist yet at this
# point (it is only created when the normalized table is built at the end).
scene_ids = list(dict.fromkeys(path_df['tile_id']))

In [None]:
red_means=[]
green_means=[]
blue_means=[]
nir_means=[]


%time out = map_with_threadpool(normalize_scenes,scene_ids,max_processes=32)

In [None]:
# Normalization constants: mean and sample standard deviation of every pixel
# value collected from the tiles that cover the training data, one pair per band.

# red
big_red_mean = mean(red_means)
big_red_sd = statistics.stdev(red_means)

# green
big_green_mean = mean(green_means)
big_green_sd = statistics.stdev(green_means)

# blue
big_blue_mean = mean(blue_means)
big_blue_sd = statistics.stdev(blue_means)

# near infrared
big_nir_mean = mean(nir_means)
big_nir_sd = statistics.stdev(nir_means)

### Normalize training data using the constants calculated above

In [None]:
# Output columns, filled row by row below.
norm_r_m = []
norm_g_m = []
norm_b_m = []
norm_n_m = []

# Raw (un-normalized) band means; these lists were appended to but never created.
raw_reds = []
raw_greens = []
raw_blues = []
raw_nirs = []

lat=[]
lon=[]
roofs = []

tile_ids = []

exp_values = []
img_path = []

footprint_shapes=[]

# img_info holds one tuple per training row, in this field order:
#   0 img_path, 1 footprint_shapes, 2 latitude, 3 longitude, 4 tile_id, 5 roof_no,
#   6 red_mean, 7 green_mean, 8 blue_mean, 9 nir_mean, 10 expected_albedo
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for Y, X in img_info.items():

    raw_red_mean=(X[6])
    raw_green_mean=(X[7])
    raw_blue_mean=(X[8])
    raw_nir_mean=(X[9])

    # z-score with the scene-wide constants; they are defined as big_*_mean
    # (the former big_*_m names did not exist).
    r = (raw_red_mean-big_red_mean)/big_red_sd
    g = (raw_green_mean-big_green_mean)/big_green_sd
    b = (raw_blue_mean-big_blue_mean)/big_blue_sd
    n = (raw_nir_mean-big_nir_mean)/big_nir_sd

    norm_r_m.append(r)
    norm_g_m.append(g)
    norm_b_m.append(b)
    norm_n_m.append(n)

    raw_reds.append(raw_red_mean)
    raw_greens.append(raw_green_mean)
    raw_blues.append(raw_blue_mean)
    raw_nirs.append(raw_nir_mean)

    img_path.append(X[0])
    roofs.append(X[5])
    footprint_shapes.append(X[1])
    tile_ids.append(X[4])
    lat.append(X[2])
    lon.append(X[3])
    exp_values.append(X[10])


# Store the results in a pandas DataFrame.
# NOTE(review): 'expected_albedo' is newly written here; exp_values was collected
# above but previously dropped from the output - confirm the column is wanted.
df = pd.DataFrame({ 'img_path': img_path, 'roof_id':roofs, 'footprint_shapes':footprint_shapes,'tile_ids':tile_ids,
                   'latitude':lat, 'longitude':lon,'norm_red_mean': norm_r_m,'norm_green_mean': norm_g_m,
                   'norm_blue_mean': norm_b_m,'norm_nir_mean': norm_n_m,
                   'raw_red_mean':raw_reds,'raw_green_mean': raw_greens,'raw_blue_mean': raw_blues,'raw_nir_mean': raw_nirs,
                   'expected_albedo': exp_values})

# Write the full results to CSV.
df.to_csv('normalized_training_data.csv',encoding='utf8')

----------------------------------------------------