## Core Workflow: Normalize prediction data
Purpose: The mean and standard deviation of the per-rooftop mean band values are calculated across all rooftops and used to normalize the prediction data. The normalization is done separately for each study area and each time period. For example, when normalizing the 2009 prediction data for LA, only the mean band values from the 2009 LA imagery are used.
<br>
*Date: 2019-10-31*



### Import statements

In [None]:
import warnings
warnings.filterwarnings('ignore')
#
import os
import sys
import json
import itertools
import pickle
from pprint import pprint
#
import numpy as np
import shapely
from shapely.geometry import shape, Point
from shapely.geometry import mapping, Polygon
# import cartopy
import geojson
import fiona
import gdal
import h5py
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler 
# import ogr, gdal
from glob import glob

import requests
import logging
import time

import pandas as pd

import collections

import rasterio as rio
from rasterio.plot import show
from numpy import mean

import random
import statistics

import descarteslabs as dl
from descarteslabs.vectors import FeatureCollection

# Print the module search path of the running interpreter
print(sys.path)

### Set input variables

In [None]:
# Area of interest (AOI) to process: 'LA' = Los Angeles, 'KC' = Kansas City
prediction_city = "LA"

In [None]:
# Set your input file here
path_data = 'band_values_NAIP_LA_cnty_2009_10-18.csv'

# Read the data to a Pandas Dataframe
path_df = pd.read_csv(path_data, encoding='utf8')

# Pack the columns needed for normalization into one tuple per rooftop row.
# The column order matters: the normalization cell reads the tuple fields by
# position (e.g. X[0] is roof_no, X[1] is img_id).
if prediction_city == 'LA':
    # LA rows carry a building id and a shape area
    img_info = path_df[['roof_no', 'img_id', 'footprint_shapes', 'total_pixels', 'bld_id', 'shp_area',
                        'raw_red_mean', 'raw_green_mean', 'raw_blue_mean', 'raw_nir_mean']].apply(tuple, axis=1)
elif prediction_city == 'KC':
    # KC rows carry the state instead, because KC is normalized per state
    img_info = path_df[['roof_no', 'img_id', 'footprint_shapes', 'total_pixels', 'state',
                        'raw_red_mean', 'raw_green_mean', 'raw_blue_mean', 'raw_nir_mean']].apply(tuple, axis=1)
else:
    # Fail early instead of leaving img_info undefined for the later cells
    raise ValueError(
        "Unsupported prediction_city {!r}; expected 'LA' or 'KC'".format(prediction_city))

# Preview the first rows of the input table
path_df.head()

### Empty lists to hold all variables

In [None]:
# Per-rooftop accumulators, one entry appended per input row.
# Every name is bound to its own fresh list (no aliasing between them).

# Rooftop / image identifiers, footprint shapes and pixel counts
roofs, imgs = [], []
footprint_shapes, total_pixels = [], []

# Normalized mean band values (red, green, blue, near-infrared)
norm_r_m, norm_g_m, norm_b_m, norm_n_m = [], [], [], []

# Raw mean band values (red, green, blue, near-infrared)
raw_reds, raw_greens, raw_blues, raw_nirs = [], [], [], []

# Filled only when processing Los Angeles
bld_ids, shp_areas = [], []

# Filled only when processing Kansas City
states = []

In [None]:
# Normalize the per-rooftop mean band values as (value - mean) / standard deviation.
# The mean and standard deviation are taken over the rooftops in path_df
# (for Kansas City: separately over the Missouri rows and the Kansas rows).
#
# NOTE: results are appended to the accumulator lists defined in an earlier
# cell. Re-run that cell before re-running this one, otherwise the lists
# (and therefore the output table) will contain duplicated rows.
if prediction_city == 'LA':
    # mean and standard deviation of the per-rooftop mean band values
    big_red_m = path_df.loc[:, "raw_red_mean"].mean()
    big_green_m = path_df.loc[:, "raw_green_mean"].mean()
    big_blue_m = path_df.loc[:, "raw_blue_mean"].mean()
    big_nir_m = path_df.loc[:, "raw_nir_mean"].mean()

    big_red_sd = path_df.loc[:, "raw_red_mean"].std()
    big_green_sd = path_df.loc[:, "raw_green_mean"].std()
    big_blue_sd = path_df.loc[:, "raw_blue_mean"].std()
    big_nir_sd = path_df.loc[:, "raw_nir_mean"].std()

    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    # Y is the row label (unused); X is the per-rooftop tuple with fields
    # (roof_no, img_id, footprint_shapes, total_pixels, bld_id, shp_area,
    #  raw_red_mean, raw_green_mean, raw_blue_mean, raw_nir_mean).
    for Y, X in img_info.items():

        # presumably each field is already a scalar, so mean() only converts
        # it to a numpy float -- TODO confirm against the input CSV
        raw_red_mean = mean(X[6])
        raw_green_mean = mean(X[7])
        raw_blue_mean = mean(X[8])
        raw_nir_mean = mean(X[9])

        r = (raw_red_mean - big_red_m) / big_red_sd
        g = (raw_green_mean - big_green_m) / big_green_sd
        b = (raw_blue_mean - big_blue_m) / big_blue_sd
        n = (raw_nir_mean - big_nir_m) / big_nir_sd

        norm_r_m.append(r)
        norm_g_m.append(g)
        norm_b_m.append(b)
        norm_n_m.append(n)

        raw_reds.append(raw_red_mean)
        raw_greens.append(raw_green_mean)
        raw_blues.append(raw_blue_mean)
        raw_nirs.append(raw_nir_mean)

        imgs.append(X[1])
        roofs.append(X[0])
        footprint_shapes.append(X[2])
        total_pixels.append(X[3])
        bld_ids.append(X[4])
        shp_areas.append(X[5])

    # Collect the results in a pandas DataFrame.
    df = pd.DataFrame({ 'roof_no': roofs, 'img_id':imgs, 'footprint_shapes':footprint_shapes,'total_pixels': total_pixels,
                       'bld_id':bld_ids, 'shp_area':shp_areas,
                       'norm_red_mean': norm_r_m,'norm_green_mean': norm_g_m,'norm_blue_mean': norm_b_m,
                       'norm_nir_mean': norm_n_m,
                      'raw_red_mean':raw_reds,'raw_green_mean': raw_greens,'raw_blue_mean': raw_blues,'raw_nir_mean': raw_nirs})

    # Write the full results to csv using the pandas library.
    df.to_csv('data_NAIP_LA_cnty_rf-norm_2009_10-24.csv',encoding='utf8')

elif prediction_city == 'KC':

    # Every row must belong to one of the two states that have their own
    # statistics. Checking up front avoids silently reusing the previous
    # row's normalized values for a row with an unexpected state, and avoids
    # leaving the accumulator lists half-filled.
    unexpected_states = set(path_df['state'].unique()) - {'mo', 'ks'}
    if unexpected_states:
        raise ValueError(
            "Unexpected value(s) in 'state' column: {!r}; expected 'mo' or 'ks'".format(unexpected_states))

    # normalization for Missouri part of Kansas city
    mo_df = path_df.loc[path_df['state'] == 'mo']
    # mean and standard deviation of the per-rooftop mean band values
    mo_big_red_m = mo_df.loc[:, "raw_red_mean"].mean()
    mo_big_green_m = mo_df.loc[:, "raw_green_mean"].mean()
    mo_big_blue_m = mo_df.loc[:, "raw_blue_mean"].mean()
    mo_big_nir_m = mo_df.loc[:, "raw_nir_mean"].mean()

    mo_big_red_sd = mo_df.loc[:, "raw_red_mean"].std()
    mo_big_green_sd = mo_df.loc[:, "raw_green_mean"].std()
    mo_big_blue_sd = mo_df.loc[:, "raw_blue_mean"].std()
    mo_big_nir_sd = mo_df.loc[:, "raw_nir_mean"].std()

    # normalization for Kansas part of Kansas city
    ks_df = path_df.loc[path_df['state'] == 'ks']
    # mean and standard deviation of the per-rooftop mean band values
    ks_big_red_m = ks_df.loc[:, "raw_red_mean"].mean()
    ks_big_green_m = ks_df.loc[:, "raw_green_mean"].mean()
    ks_big_blue_m = ks_df.loc[:, "raw_blue_mean"].mean()
    ks_big_nir_m = ks_df.loc[:, "raw_nir_mean"].mean()

    ks_big_red_sd = ks_df.loc[:, "raw_red_mean"].std()
    ks_big_green_sd = ks_df.loc[:, "raw_green_mean"].std()
    ks_big_blue_sd = ks_df.loc[:, "raw_blue_mean"].std()
    ks_big_nir_sd = ks_df.loc[:, "raw_nir_mean"].std()

    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    # Y is the row label (unused); X is the per-rooftop tuple with fields
    # (roof_no, img_id, footprint_shapes, total_pixels, state,
    #  raw_red_mean, raw_green_mean, raw_blue_mean, raw_nir_mean).
    for Y, X in img_info.items():

        state = X[4]
        # presumably each field is already a scalar, so mean() only converts
        # it to a numpy float -- TODO confirm against the input CSV
        raw_red_mean = mean(X[5])
        raw_green_mean = mean(X[6])
        raw_blue_mean = mean(X[7])
        raw_nir_mean = mean(X[8])

        # Use the statistics of the state the rooftop belongs to
        # (state values were validated before the loop).
        if state == 'mo':
            r = (raw_red_mean - mo_big_red_m) / mo_big_red_sd
            g = (raw_green_mean - mo_big_green_m) / mo_big_green_sd
            b = (raw_blue_mean - mo_big_blue_m) / mo_big_blue_sd
            n = (raw_nir_mean - mo_big_nir_m) / mo_big_nir_sd
        elif state == 'ks':
            r = (raw_red_mean - ks_big_red_m) / ks_big_red_sd
            g = (raw_green_mean - ks_big_green_m) / ks_big_green_sd
            b = (raw_blue_mean - ks_big_blue_m) / ks_big_blue_sd
            n = (raw_nir_mean - ks_big_nir_m) / ks_big_nir_sd

        norm_r_m.append(r)
        norm_g_m.append(g)
        norm_b_m.append(b)
        norm_n_m.append(n)

        raw_reds.append(raw_red_mean)
        raw_greens.append(raw_green_mean)
        raw_blues.append(raw_blue_mean)
        raw_nirs.append(raw_nir_mean)

        imgs.append(X[1])
        roofs.append(X[0])
        footprint_shapes.append(X[2])
        total_pixels.append(X[3])
        states.append(X[4])

    # Collect the results in a pandas DataFrame.
    df = pd.DataFrame({ 'roof_no': roofs, 'img_id':imgs, 'footprint_shapes':footprint_shapes,'total_pixels': total_pixels,
                       'state':states,
                       'norm_red_mean': norm_r_m,'norm_green_mean': norm_g_m,'norm_blue_mean': norm_b_m,
                       'norm_nir_mean': norm_n_m,
                      'raw_red_mean':raw_reds,'raw_green_mean': raw_greens,'raw_blue_mean': raw_blues,'raw_nir_mean': raw_nirs})

    # Write the full results to csv using the pandas library.
    df.to_csv('data_NAIP_KC_rf-norm_2018_10-27.csv',encoding='utf8')

else:
    # Fail loudly instead of silently producing no output
    raise ValueError(
        "Unsupported prediction_city {!r}; expected 'LA' or 'KC'".format(prediction_city))

----------------------------------------