## Core Workflow: Get NAIP imagery for given addresses
Purpose: Geocode addresses from an input CSV file. Find the matching polygons in the Microsoft Building Footprint data using the geocoded addresses. Use the shapes of those polygons to download NAIP imagery for the rooftops.
<br>
*Date: 2019-02-08*
<br>
*Author: Taufiq Rashid*


### Import statements

In [None]:
import warnings
warnings.filterwarnings('ignore')
#
import os
import sys
import json
import itertools
import pickle
from pprint import pprint
#
import numpy as np
import shapely
from shapely.geometry import shape, Point
from shapely.geometry import mapping, Polygon
import cartopy
import geojson
import fiona
import gdal
import h5py
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler 
import ogr, gdal

import requests
import logging
import time

import pandas as pd

import collections

import descarteslabs as dl

# Make the helper modules under $ULU_REPO/utils importable.
# A missing ULU_REPO environment variable raises KeyError here.
ULU_REPO = os.environ["ULU_REPO"]
sys.path.append('{}/utils'.format(ULU_REPO))
# Show the resulting module search path.
print(sys.path)

import bronco
import bronco_candidates

### Set key variables

In [None]:
# Study area; also used as a directory name and output file-name prefix.
place = 'san_diego'

# Imagery directory for the study area: <data_root><place>/
data_root = '/data/phase_iii/NAIP/'
data_path = '{}{}/'.format(data_root, place)

# Raster bands to request, and a short tag naming that band combination.
bands = ['red', 'green', 'blue', 'nir', 'alpha']
suffix = 'RGBNA'  # S2, Lx

# Appears in output file names as '<resolution>m'.
resolution = 1

### load GeoJSON file containing polygons

In [None]:
# Parse the building-footprint GeoJSON into `js`; the footprint search
# iterates over js['features'].
footprints_path = '/data/phase_iii/microsoft_footprints/California.geojson'
with open(footprints_path) as f:
    js = json.load(f)

### Supply the rooftop addresses and geocode them

In [None]:
### Adapted from python batch geocoding.py by Shane Lynn
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
# Attach the console handler only once: re-running this notebook cell would
# otherwise stack another StreamHandler and emit every message repeatedly.
if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    logger.addHandler(ch)

#------------------ CONFIGURATION -------------------------------

# Google API key. Prefer supplying it through the GOOGLE_API_KEY environment
# variable so the credential does not live in the notebook.
# SECURITY NOTE(review): the literal fallback below is kept only so existing
# runs keep working; it is a committed secret and should be rotated and removed.
API_KEY = os.environ.get('GOOGLE_API_KEY', 'AIzaSyDKrvutuGy1zFt3p4VLGA5Lq87AZYwmc2c')
# Backoff time sets how many minutes to wait between google pings when your API limit is hit
BACKOFF_TIME = 30
# Set your output file name here.
output_filename = 'geocded_addresses.csv'
# Set your input file here
input_filename = "ground_truth.csv"
# Specify the column name in your input data that contains addresses here
address_column_name = "Address"
# Return Full Google Results? If True, full JSON results from Google are included in output
RETURN_FULL_RESULTS = False

#------------------ DATA LOADING --------------------------------

# Read the data to a Pandas Dataframe
data = pd.read_csv(input_filename, encoding='utf8')

# Form a list of addresses for geocoding:
# Make a big list of all of the addresses to be processed.
addresses = data[address_column_name].tolist()

#------------------	FUNCTION DEFINITIONS ------------------------

def get_google_results(address, api_key=API_KEY, return_full_response=False):
    """Geocode a single address with the Google Geocoding API.

    Args:
        address: Address string to geocode.
        api_key: Google API key, sent as the ``key`` query parameter.
        return_full_response: When true, the decoded JSON response is
            included in the output under ``"response"``.

    Returns:
        dict with the keys ``formatted_address``, ``latitude``,
        ``longitude``, ``accuracy``, ``google_place_id``, ``type``,
        ``postcode`` (all ``None`` when the response has no results), plus
        ``input_string``, ``number_of_results`` and ``status``. Only the
        first result of the response is used.

    Raises:
        Any exception raised by ``requests.get`` or by decoding the JSON
        body is propagated to the caller.
    """
    # Pass the query as `params` so requests percent-encodes it. Building the
    # URL by string formatting left characters such as '#' and '&' in the
    # address unescaped, which truncated the query (dropping the key) or
    # injected extra parameters.
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params={"address": address, "key": api_key},
    )
    # Results will be in JSON format - convert to dict using requests functionality
    payload = response.json()
    # Treat a missing "results" entry the same as an empty one.
    matches = payload.get('results') or []

    if not matches:
        # No results or an error: every field is None.
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None,
        }
    else:
        answer = matches[0]
        geometry = answer.get('geometry') or {}
        location = geometry.get('location') or {}
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": geometry.get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types', [])),
            "postcode": ",".join(
                component['long_name']
                for component in answer.get('address_components', [])
                if 'postal_code' in component.get('types', [])
            ),
        }

    # Append some other details:
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = payload.get('status')
    if return_full_response:
        output['response'] = payload

    return output

# Create a list to hold results
results = []
# Go through each address in turn
for address in addresses:
    # While the address geocoding is not finished:
    geocoded = False
    while geocoded is not True:
        # Geocode the address with google
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with %s", address)
            logger.error("Skipping!")
            # Leave the retry loop right away. Falling through would read
            # `geocode_result`, which is either undefined (first address) or
            # still holds the previous address's result.
            break

        # If we're over the API limit, backoff for a while and try again later.
        if geocode_result['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60)  # BACKOFF_TIME is in minutes
            continue

        if geocode_result['status'] != 'OK':
            logger.warning("Error geocoding %s: %s", address, geocode_result['status'])
        logger.debug("Geocoded: %s: %s", address, geocode_result['status'])
        results.append(geocode_result)
        geocoded = True

    if not geocoded:
        # Address was skipped: nothing was appended, so there is no new
        # progress to report or checkpoint.
        continue

    # Print status every 100 addresses
    if len(results) % 100 == 0:
        logger.info("Completed %s of %s address", len(results), len(addresses))

    # Every 500 addresses, save progress to file(in case of a failure so you have something!)
    if len(results) % 500 == 0:
        # Same encoding as the final write below.
        pd.DataFrame(results).to_csv("{}_bak".format(output_filename), encoding='utf8')

# All done
logger.info("Finished geocoding all addresses")
# Write the full results to csv using the pandas library.
pd.DataFrame(results).to_csv(output_filename, encoding='utf8')

### store the geocoded addresses to a dataframe

In [None]:
# Geocoder output file to load.
output_filename = "geocded_addresses.csv"

# Load the geocoded rows into a DataFrame.
df = pd.read_csv(output_filename, encoding='utf8')

# One (longitude, latitude) tuple per row.
addresses = df.loc[:, ['longitude', 'latitude']].apply(tuple, axis=1)
addresses

### Batch processing for finding matching footprints

In [None]:
# Schema of the output shapefile: a polygon geometry with one integer
# attribute. It does not change between addresses, so define it once.
schema = {
    'geometry': 'Polygon',
    'properties': {'id': 'int'},
}

# `addresses` holds one (longitude, latitude) tuple per geocoded row.
for lon, lat in addresses:
    # Rows the geocoder could not resolve have no coordinates; skip them
    # instead of testing an invalid point against every footprint.
    if pd.isnull(lon) or pd.isnull(lat):
        print('Skipping address without coordinates')
        continue

    print('Searching matching polygon for: {} {}'.format(lon, lat))
    # construct point based on lat/long returned by geocoder
    point = Point(lon, lat)

    # check each polygon to see if it contains the point
    for feature in js['features']:
        # Use the fully-qualified shapely function: the bare name `shape` is
        # rebound at the end of this cell, which would break a re-run.
        polygon = shapely.geometry.shape(feature['geometry'])
        if polygon.contains(point):
            print('Found containing polygon: {}'.format(feature))

            # Write a new Shapefile. The file name is fixed, so each match
            # overwrites the previous one and only the last match remains.
            with fiona.open('my_shp.shp', 'w', 'ESRI Shapefile', schema) as c:
                ## If there are multiple geometries, put the "for" loop here
                c.write({
                    'geometry': mapping(polygon),
                    'properties': {'id': 123},
                })
            # As soon as you find the matching polygon, break out of the loop
            break
    else:
        # Loop finished without a break: no footprint contains this point.
        print('No containing polygon found for: {} {}'.format(lon, lat))

# NOTE: this rebinds `shape` (imported from shapely.geometry) to the loaded
# footprint; the cells below read shape['geometry'].
# NOTE(review): if no address matched, 'my_shp.shp' is missing or left over
# from an earlier run - confirm before relying on the result.
shape = bronco_candidates.load_shape('my_shp.shp')

### Search and acquire NAIP imagery 

In [None]:
# Product identifier passed to the metadata search.
product = u'762932f1db2b68d82f08b527ffe5a32d949dc8ec:usda:naip:rgbn:ca'

# Search metadata for scenes of that product intersecting the footprint
# geometry within calendar year 2009 (at most 10 hits).
feature_collection = dl.metadata.search(
    product=[product],
    start_time='2009-01-01',
    end_time='2009-12-31',
    limit=10,
    geom=shape['geometry'],
)

# Collect the scene ids in sorted order and show how many were found.
naip_ids = sorted([f['id'] for f in feature_collection['features']])
print('{} {}'.format(len(naip_ids), naip_ids))

### Download the imageries

In [None]:
# pull from api and save to file 
naip_dict = {}
naip_dict['A'] = [u'762932f1db2b68d82f08b527ffe5a32d949dc8ec:usda:naip:rgbn:ca:Brdf_006_034_092306_2136_RGBNN00L2_SD_4_0']

continue_index = 0

for naip_suffix, naip_imgs in naip_dict.iteritems():
    print naip_suffix, naip_imgs
    continue_index = 0
    naip_band_file =  data_path+place+'_naip_'+naip_suffix+'_0623'+'_'+str(resolution)+'m'
    print naip_band_file
    naip = dl.raster.raster(
            naip_imgs,
            bands=bands,
            data_type='UInt16',
            cutline=shape['geometry'],
            save=True,
            outfile_basename=naip_band_file)