## Core Workflow: Geocode addresses from the ground truth data
Purpose: Geocode the rooftop addresses in the training data using the Google Geocoding API. The latitude and longitude of each roof are then used to look up its geometry in the Microsoft building footprints data.   
<br>
*Date: 10/31/2019*


### Import statements

In [1]:
import warnings
warnings.filterwarnings('ignore')
#
import os
import sys
import json
import itertools
import pickle
from pprint import pprint
#
import numpy as np
import shapely
from shapely.geometry import shape, Point
from shapely.geometry import mapping, Polygon
# import cartopy
import geojson
import fiona
import h5py
get_ipython().magic(u'matplotlib inline')
import matplotlib as mpl
import matplotlib.pyplot as plt

import gdal
from glob import glob

import jenkspy

import rasterio as rio
from rasterio.plot import show

import pandas as pd

import collections
from numpy import mean

import random
import statistics

import time

print (sys.path)

['', '/opt/caffe/python', '/opt/caffe2/build', '/data/home/peter/notebooks/urban_heat', '/anaconda/envs/py36/lib/python36.zip', '/anaconda/envs/py36/lib/python3.6', '/anaconda/envs/py36/lib/python3.6/lib-dynload', '/anaconda/envs/py36/lib/python3.6/site-packages', '/anaconda/envs/py36/lib/python3.6/site-packages/IPython/extensions', '/data/home/peter/.ipython']


### Set key variables

In [None]:
# Set your input file here
input_filename = "ground_truth.csv"

# Specify the column name in your input data that contains addresses here
address_column_name = "Address"

# Read the data to a Pandas DataFrame.
# 'Zip' is forced to text: when every value is numeric pandas infers an integer
# column, which drops leading zeros (e.g. "02139" -> 2139) and makes the string
# concatenation below fail with a TypeError (str + int).
data = pd.read_csv(input_filename, encoding='utf8', dtype={'Zip': str})

# Format the input address column so that it can be interpreted easily by the
# geocoding API: "<Name>, <Street Address>, <City>, <State> <Zip>".
# A row with a missing component yields a missing (NaN) address.
data[address_column_name] = (
    data['Name'] + ', '
    + data['Street Address'] + ', '
    + data['City'] + ', '
    + data['State'] + ' '
    + data['Zip']
)

# Form a list of addresses for geocoding:
# make a big list of all of the addresses to be processed.
addresses = data[address_column_name].tolist()
addresses

### Helper function

In [2]:
#------------------	FUNCTION DEFINITIONS ------------------------

def get_google_results(address, api_key=None, return_full_response=False):
    """Geocode one address with the Google Geocoding API.

    Parameters
    ----------
    address : str
        Free-form address to geocode.
    api_key : str, optional
        Google API key. When omitted, the module-level ``API_KEY`` is looked up
        at call time (it is assigned in a later cell, so it cannot be bound as
        a default value when this function is defined).
    return_full_response : bool, optional
        When True, the decoded JSON response is attached under ``'response'``.

    Returns
    -------
    dict
        Fields of the first result (``formatted_address``, ``latitude``,
        ``longitude``, ``accuracy``, ``viewport``, ``bounds``,
        ``google_place_id``, ``type``, ``postcode``), all ``None`` when the
        response has no results, plus ``input_string``, ``number_of_results``
        and ``status`` (the response's top-level ``'status'`` field).

    Raises
    ------
    Whatever ``requests.get`` or ``Response.json`` raise (connection errors,
    non-JSON body); the caller is expected to handle them.
    """
    # NOTE(review): `requests` is not imported in the visible import cell;
    # make sure `import requests` has run before calling this function.
    if api_key is None:
        api_key = API_KEY

    # Set up your Geocoding url. The query string is built by requests so the
    # address is percent-encoded; formatting it into the URL by hand breaks on
    # characters such as '#', '&' or '+'.
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"

    # Ping google for the results:
    response = requests.get(geocode_url, params={"address": address, "key": api_key})
    # Results will be in JSON format - convert to dict using requests functionality
    results = response.json()
    matches = results.get('results') or []

    # if there's no results or an error, return empty results.
    # The keys mirror the success case so every record has the same columns.
    if not matches:
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "viewport": None,
            "bounds": None,
            "google_place_id": None,
            "type": None,
            "postcode": None,
        }
    else:
        # Only the first (best) match is used.
        answer = matches[0]
        geometry = answer.get('geometry') or {}
        location = geometry.get('location') or {}
        postcodes = [
            component['long_name']
            for component in answer.get('address_components') or []
            if 'postal_code' in (component.get('types') or [])
        ]
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": geometry.get('location_type'),
            "viewport": geometry.get('viewport'),
            "bounds": geometry.get('bounds'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types') or []),
            "postcode": ",".join(postcodes),
        }

    # Append some other details:
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results

    return output



### Supply the rooftop addresses and geocode them

In [None]:
### Adapted from python batch geocoding.py by Shane Lynn
import logging  # used below but missing from the notebook's import cell

logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
# create console handler (only once: re-running this cell would otherwise
# attach another handler and print every message multiple times)
if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    logger.addHandler(ch)

#------------------ CONFIGURATION -------------------------------

# Set your Google API key here.
API_KEY = 'put your API key here'
# Backoff time sets how many minutes to wait between google pings when your API limit is hit
BACKOFF_TIME = 30

# the output file containing the geocoded addresses
output_filename = "geocded_addresses.csv"

# Return Full Google Results? If True, full JSON results from Google are included in output
RETURN_FULL_RESULTS = False

# Create a list to hold results
results = []
# Go through each address in turn
for address in addresses:
    # Keep trying this address until it is geocoded or fails hard.
    geocoded = False
    while not geocoded:
        # Geocode the address with google
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with {}".format(address))
            logger.error("Skipping!")
            # Leave the retry loop without touching `geocode_result`: it is
            # either undefined (first address) or still holds the previous
            # address's result, which must not be appended again.
            break

        # If we're over the API limit, backoff for a while and try again later.
        if geocode_result['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60)  # BACKOFF_TIME is in minutes
            continue

        if geocode_result['status'] != 'OK':
            logger.warning("Error geocoding {}: {}".format(address, geocode_result['status']))
        logger.debug("Geocoded: {}: {}".format(address, geocode_result['status']))
        results.append(geocode_result)
        geocoded = True

    if not geocoded:
        # Address was skipped; nothing new to report or back up.
        continue

    # Print status every 100 addresses
    if len(results) % 100 == 0:
        logger.info("Completed {} of {} address".format(len(results), len(addresses)))

    # Every 500 addresses, save progress to file(in case of a failure so you have something!)
    if len(results) % 500 == 0:
        pd.DataFrame(results).to_csv("{}_bak".format(output_filename))

# All done
logger.info("Finished geocoding all addresses")
# Write the full results to csv using the pandas library.
pd.DataFrame(results).to_csv(output_filename, encoding='utf8')

### Search for matching footprints from Microsoft footprints data
`There are two ways to do this: either search the GeoJSON files uploaded to the VM, or search the DL vector product that contains all the Microsoft footprints.`

In [None]:
# Set your input file containing the geocoded addresses
input_filename = "geocded_addresses.csv"

# Columns carried along for every roof. The order matters: the search cells
# read each row positionally as X[0] (longitude), X[1] (latitude), X[2], ...
# NOTE(review): the geocoding cell writes only geocoder fields to this file;
# 'Warranty Start Date', 'Twelve Months' and 'Solar-Initial' are not among
# them -- presumably they were merged in from the ground-truth data before
# this cell is run. Confirm, otherwise the selection below raises a KeyError.
footprint_columns = [
    'longitude',
    'latitude',
    'input_string',
    'Warranty Start Date',
    'Twelve Months',
    'Solar-Initial',
]

# Read the data to a Pandas DataFrame
df = pd.read_csv(input_filename, encoding='utf8')

# One tuple per row, as a Series indexed like the DataFrame
addresses = df[footprint_columns].apply(tuple, axis=1)
addresses

##### Download footprints directly from geojson

In [None]:
# Load the Microsoft building footprints for California (used as the example
# state here) into memory as a GeoJSON dict.
# GeoJSON is UTF-8 encoded; state the encoding explicitly instead of relying on
# the platform's locale default.
with open('/data/phase_i/microsoft_footprints/California.geojson', encoding='utf-8') as f:
    js = json.load(f)

In [None]:
# For every geocoded address, find the first footprint polygon in the loaded
# GeoJSON that contains the geocoded point and write it to its own shapefile.
# NOTE(review): `data_root` is not defined in the visible cells; it must be set
# (with a trailing path separator) before running this cell.

# Define a polygon feature geometry with one attribute.
# It is identical for every output file, so it is built once.
schema = {
    'geometry': 'Polygon',
    'properties': {'id': 'int'},
}

# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
for _, X in addresses.items():
    lon, lat = X[0], X[1]

    # A failed geocode leaves the coordinates empty (NaN after the CSV round
    # trip); no polygon can contain such a point, so skip the full scan.
    if pd.isna(lon) or pd.isna(lat):
        print('Skipping address without coordinates:', X[2])
        continue

    print('Searching matching polygon for:', lon, lat)
    # construct point based on lat/long returned by geocoder
    point = Point(lon, lat)

    # check each polygon to see if it contains the point
    for feature in js['features']:
        geometry = feature.get('geometry')
        if not geometry:
            # GeoJSON allows features with a null geometry
            continue
        polygon = shape(geometry)
        if polygon.contains(point):
            print('Found containing polygon:', feature)

            # Write a new Shapefile. The file goes into the 'george/'
            # sub-directory, matching the path used by the DL-based search.
            shp_path = data_root + 'footprint_shapes/george/' + str(lat) + ',' + str(lon) + '_msfootprint.shp'
            with fiona.open(shp_path, 'w', 'ESRI Shapefile', schema) as c:
                ## If there are multiple geometries, put the "for" loop here
                c.write({
                    'geometry': mapping(polygon),
                    'properties': {'id': 123},
                })
            # As soon as you find the matching polygon, break out of the loop
            break
            


###### Download footprints from DL

In [None]:
# Handle on the vector product holding the Microsoft footprints.
# NOTE(review): `FeatureCollection` is not imported in the visible cells --
# presumably it comes from the DL (Descartes Labs) vector client; confirm.
dl_product_id = 'put the DL vector product id here'
microsoft_footprints = FeatureCollection(dl_product_id)

In [None]:
# For every geocoded address, query the DL vector product for a footprint that
# intersects a tiny buffer around the geocoded point, write the first hit to
# its own shapefile, and record one row per matched roof.
# NOTE(review): `dl` and `data_root` are not defined in the visible cells; both
# must be set up before running this cell.

# Parallel lists, one entry per matched roof.
roof_add = []
foot_path = []
start_dt = []
end_dt = []
in_solar = []
# thr_solar = []
lat = []
long = []
bld_shp = []

# Define a polygon feature geometry with one attribute.
# It is identical for every output file, so it is built once.
schema = {
    'geometry': 'Polygon',
    'properties': {'id': 'int'},
}

cnt = 0
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
for cnt, (_, X) in enumerate(addresses.items(), start=1):
    print(cnt)

    # A failed geocode leaves the coordinates empty (NaN after the CSV round
    # trip); there is nothing to search for, so do not query the service.
    if pd.isna(X[0]) or pd.isna(X[1]):
        print('Skipping address without coordinates:', X[2])
        continue

    print('Searching matching polygon for:', X[0], X[1])
    # construct point based on lat/long returned by geocoder
    point = Point(X[0], X[1])
    # search with a small buffer around the point rather than the bare point
    buf = mapping(point.buffer(0.00001))

    s = dl.Vector().search_features('a35126a241bd022c026e96ab9fe5e0ea23967d08:USBuildingFootprints', geometry=buf, query_expr=None, query_limit=None,)

    # Only the first returned feature is used.
    feature = next(iter(s), None)
    if feature is None:
        continue

    att = feature.attributes
    polygon = shape(att['geometry'])

    # Build the output path once so the file written and the path recorded
    # in the results can never drift apart.
    shp_path = data_root + 'footprint_shapes/george/' + str(X[1]) + ',' + str(X[0]) + '_msfootprint.shp'

    # Write a new Shapefile
    with fiona.open(shp_path, 'w', 'ESRI Shapefile', schema) as c:
        ## If there are multiple geometries, put the "for" loop here
        c.write({
            'geometry': mapping(polygon),
            'properties': {'id': 123},
        })

    roof_add.append(X[2])
    foot_path.append(shp_path)
    start_dt.append(X[3])
    end_dt.append(X[4])
    in_solar.append(X[5])
#     thr_solar.append(X[6])
    lat.append(X[1])
    long.append(X[0])
    bld_shp.append(polygon)

# Collect the results in a pandas DataFrame.
df = pd.DataFrame({'roof_add': roof_add, 'footprint_path': foot_path, 'footprint_shapes': bld_shp, 'longitude': long, 'latitude': lat,
                   'start_date': start_dt, 'end_date': end_dt, 'Solar-Initial': in_solar})

# Write the full results to csv using the pandas library.
df.to_csv('footprints_data.csv', encoding='utf8')

------------------------------------------------------------------