In [5]:
import pandas as pd
import numpy as np
import os
import time
import json
import ast
import sys
import ast

from geopy.exc import (
    GeocoderQueryError,
    GeocoderQuotaExceeded,
    ConfigurationError,
    GeocoderParseError,
    GeocoderTimedOut
)

# Config

In [6]:
API_KEY = "AIzaSyAM6rMvNGdyCaJlhXvOjb_rYwqpzzKSGZ8"

COMPONENT_RESTRICTIONS = { 'country': 'CA'}

P_GEO_COLS = ["Address", "Postal Code"]
C_GEO_COLS = ["Clinic Address", "Postal Code"]

N = 6
RETRY_COUNTER_CONST = 10

P_FALLBACK_SCHEMA = ['FSA', 'City', 'Province']
C_FALLBACK_SCHEMA = ['FSA', 'Clinic City', 'Province']

OUTPUT_COLS = ["Patient_ID","Pat_Geo_Cols","Pat_Geocode","Pat_Address","Pat_Postal_Code",
    "Pat_FSA","Nearest_Clinic_ID","Clinic_Geo_Cols","Clinic_Geocode",
    "Clinic_Address","Clinic_Postal_Code","Clinic_FSA","Clinic_Distance"]
    
DEFAULT_LOC = [43.6532, 79.3832]
DEFAULT_DIST = 9999


# Load files

In [7]:
def load_files(P_FILE, C_FILE):
    try:
        p_df = pd.read_csv(P_FILE)
        c_df = pd.read_csv(C_FILE)
    except:
        print("Fail to load data, please check input data path is correct")
        sys.exit(1)

    return p_df, c_df

# Geocoding

In [8]:
import geopy
from geopy.geocoders import GoogleV3
from functools import partial 

from tqdm import tqdm
tqdm.pandas()

In [9]:
def generate_address_list(r, Geo_Cols, schema):
    '''
    Generate a list of fallback address list in the order per schema
    '''
    address_list = []
    
    address = ""
    for c in Geo_Cols.split(','):
        if not address:
            address += r[c]
        else:
            address += ", " + r[c]
            
    address_list.append(address)
    
    for cc in schema:
        address_list.append(r[cc])
    
    return address_list

In [10]:
def geocode_address(geo_locator, address_list, retry_counter=1):
    '''
    Request GoogleV3 service for geocode data
    Exceptions are caught and handled
        - if Connection Error, retry $RETRY_COUNTER_CONST times
        - otherwise fall back to next precision level address and request the geolocation again
    Inputs: geo_locator: GoogleV3 geocoder
            address_list: a fall back address list in precision order, most precise address at position 0
            retry_counter: initial retry value
    Outputs: location_result: Geocoded lat/lng values received from the server
                            *note: if all precision level failed to retrive a valid data, return DEFAULT_LOC
            al: final state of address fallback list, according its consumed level, 
                assign the corresponding Geo_Cols info to the dataframe
    '''
    
    try:
        # default case if exhuasted the fallback list still no location found
        if not address_list:
            al = address_list
            return DEFAULT_LOC, al

        # the geopy GoogleV3 geocoding call
        location = geo_locator(address_list[0])
        
        if location is not None:
            location_result = tuple(location.point[:2])
            al = address_list
#             print(f'location : {location}, al: {al}')
        else:
            location_result, al = geocode_address(geo_locator, address_list[1:], retry_counter)

    # To catch generic geocoder errors.
    except (ValueError, GeocoderQuotaExceeded, ConfigurationError, GeocoderParseError) as error:
        if hasattr(error, 'message'):
            error_message = error.message
        else:
            error_message = error
        print(error_message)
        lcoation_result, al = geocode_address(geo_locator, address_list[1:], retry_counter)
    # To retry because intermittent failures and timeout sometimes occurs
    except (GeocoderTimedOut, GeocoderQueryError) as geocodingerror:
        if retry_counter < RETRY_COUNTER_CONST:
            print(f'Retrying {retry_counter} time(s) ' )
            return geocode_address(geo_locator, address_list, retry_counter+1)
        else:
            if hasattr(geocodingerror, 'message'):
                error_message = geocodingerror.message
            else:
                error_message = geocodingerror
            print(error_message)
            location_result, al = geocode_address(geo_locator, address_list[1:], retry_counter)
    # To retry because intermittent failures and timeout sometimes occurs
    except BaseException as error:
        if retry_counter < RETRY_COUNTER_CONST:
#             time.sleep(5)
            print(f'Retrying {retry_counter} time(s) ')
            return geocode_address(geo_locator, address_list, retry_counter+1)
        else:
            print(error)
            location_result, al = geocode_address(geo_locator, address_list[1:], retry_counter)

    return location_result, al

In [11]:
def geocode_files(P_df, C_df, output_path):
    '''
    Calling geocode_address function and adding corresponding result cols
    Outputs: processed Patients & Clinics dataframes
    '''
    
    geocoder = GoogleV3(api_key=API_KEY)
    geo_locator = partial(geocoder.geocode, components=COMPONENT_RESTRICTIONS)
    
    P_df["Pat_Geo_Cols"] = ",".join(map(str, P_GEO_COLS))
    C_df["Clinic_Geo_Cols"] = ",".join(map(str, C_GEO_COLS))
    
    P_df["Pat_Geocode"] = ''
    C_df["Clinic_Geocode"] = ''
    
    # geocode addresses by rows and modifies 'Geo_Cols' if needed
    for i,p_r in P_df.iterrows():
        p_address_list = generate_address_list(p_r, p_r["Pat_Geo_Cols"], P_FALLBACK_SCHEMA)
        
        P_df.at[i, "Pat_Geocode"], p_al = geocode_address(geo_locator, p_address_list)#, retry_counter=1)
        if len(p_al) != len(p_address_list):
            P_df.at[i, "Pat_Geo_Cols"] = P_FALLBACK_SCHEMA[-len(p_al)] if len(p_al) else "DEFAULT_LOC"
            
    for i,c_r in C_df.iterrows():
        c_address_list = generate_address_list(c_r, c_r["Clinic_Geo_Cols"], C_FALLBACK_SCHEMA)
        
        C_df.at[i, "Clinic_Geocode"], c_al = geocode_address(geo_locator, c_address_list)#, retry_counter=1)
        if len(c_al) != len(c_address_list):
            C_df.at[i, "Clinic_Geo_Cols"] = C_FALLBACK_SCHEMA[-len(c_al)] if len(c_al) else "DEFAULT_LOC"
            
    # output processed dataframes
    try:
        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        P_df.to_csv(os.path.join(output_path, 'P_df.csv'), index=False)
        C_df.to_csv(os.path.join(output_path, 'C_df.csv'), index=False)
    except:
        print("Fail to output processed Patients & Clinics dataframes, please check the output path is corret")
    
    return P_df, C_df

# Matching

In [12]:
from geopy import distance
import googlemaps

In [13]:
def get_travel_dist(gmaps, P_node, C_node, retry_counter=1):
    '''
    apply gmaps Google Distance matrix API to find the nearest one with shortest travel distance
    Inputs: P_node: target patient node
            C_node: Client node
    Output: dist: shortest travel distance
    '''
    
    try:
        # the gmaps api call
        dist = gmaps.distance_matrix(P_node, C_node, mode='driving')
        
        if dist is not None:
            # locate the distance value from returned string
            dist_result = dist["rows"][0]["elements"][0]["distance"]["value"]/1000
        else:
            #if API server fails all, travel distances will set to the same default value
            # later the one with shorter geodesic distance will be picked
            dist_result = DEFAULT_DIST
     
    # To catch generic geocoder errors.
    except (ValueError, GeocoderQuotaExceeded, ConfigurationError, GeocoderParseError) as error:
        if hasattr(error, 'message'):
            error_message = error.message
        else:
            error_message = error
        print(error_message)
        dist_result = DEFAULT_DIST
    # To retry because intermittent failures and timeout sometimes occurs
    except (GeocoderTimedOut, GeocoderQueryError) as geocodingerror:
        if retry_counter < RETRY_COUNTER_CONST:
            print(f'Retrying {retry_counter} time(s)' )
            return get_travel_dist(P_node, C_node, retry_counter+1)
        else:
            if hasattr(geocodingerror, 'message'):
                error_message = geocodingerror.message
            else:
                error_message = geocodingerror
            print(error_message)
            dist_result = DEFAULT_DIST
    # To retry because intermittent failures and timeout sometimes occurs
    except BaseException as error:
        if retry_counter < RETRY_COUNTER_CONST:
#             time.sleep(5)
            print(f'Retrying {retry_counter} time(s)')
            return get_travel_dist(P_node, C_node, retry_counter+1)
        else:
            print(error)
            dist_result = DEFAULT_DIST
    
    return dist_result

In [69]:
def map_files(P_df, C_df, output_path):
    
    '''
    first using geodesic to filter out N nearest locations
    then apply google distance matrix to compute the travel distance
    '''
    # cols mapping
    PC_mapped = pd.DataFrame(columns = OUTPUT_COLS)
    PC_mapped["Patient_ID"] = P_df["ID"]
    PC_mapped["Pat_Geo_Cols"] = P_df["Pat_Geo_Cols"]
    PC_mapped["Pat_Geocode"] = P_df["Pat_Geocode"]
    PC_mapped["Pat_Address"] = P_df["Address"]
    PC_mapped["Pat_Postal_Code"] = P_df["Postal Code"]
    PC_mapped["Pat_FSA"] = P_df["FSA"]
    
    # initialize the distance API
    gmaps = googlemaps.Client(key=API_KEY)
    
    for i, p_r, in PC_mapped.iterrows():
        # target patient node
        P_node = p_r['Pat_Geocode']
        
        # target pool of N locations with neareset geodesic distance
        C_df['dist'] = C_df['Clinic_Geocode'].apply(lambda p : distance.geodesic(P_node, p).km)
        N_pool = C_df.nsmallest(N, 'dist', keep='all').reset_index()
        
        N_pool['travel_dist'] = ''
        
        # function that call the Google distance matrix API
        for n_i, r in N_pool.iterrows():
            N_pool.at[n_i,'travel_dist'] = get_travel_dist(gmaps, P_node, r["Clinic_Geocode"])
        
        # pick the one with smallest travel_dist, in case of duplicate, keep the first 
        # thus the one with shorter geodesic distance will be picked
        N_pool['travel_dist'] = N_pool['travel_dist'].apply(np.float)
        Nearest = N_pool.nsmallest(1, 'travel_dist', keep='first')

        PC_mapped.at[i, "Nearest_Clinic_ID"] = Nearest["Clinic ID"].values[0]
        PC_mapped.at[i, "Clinic_Geo_Cols"] = Nearest["Clinic_Geo_Cols"].values[0]
        PC_mapped.at[i, "Clinic_Geocode"] = Nearest["Clinic_Geocode"].values[0]
        PC_mapped.at[i, "Clinic_Address"] = Nearest["Clinic Address"].values[0]
        PC_mapped.at[i, "Clinic_Postal_Code"] = Nearest["Postal Code"].values[0]
        PC_mapped.at[i, "Clinic_FSA"] = Nearest["FSA"].values[0]
        PC_mapped.at[i, "Clinic_Distance"] = Nearest["travel_dist"].values[0]
       
        
    # output final mapped dataframes
    try:
        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        PC_mapped.to_csv(os.path.join(output_path, 'PC_mapped.csv'), index=False)
    except:
        print("Fail to output final Mapped dataframes, please check the output path is corret")
    
    return PC_mapped

# Main

In [21]:
# import click

# @click.command()
# @click.option('patients_file', '--patients', default='./data/patients.csv', type=click.Path(exists=True),
#              help='Patients.csv file path')
# @click.option('clinics_file', '--clinics', default='./data/clinics.csv', type=click.Path(exists=True),
#              help='Clinics.csv file path')
# @click.option('output_path', '--output', default='./output', type=click.Path(),
#              help='Output directory')
# def main(patients_file, clinics_file, output_path):
    
#     click.echo('Loading data files...')
#     P_df, C_df = load_files(patients_file, clinics_file)
    
#     click.echo('Geocoding...')
#     P_df, C_df = geocode_files(P_df, C_df, output_path)
    
#     click.echo('Calculating travel distance and matching...')
#     PC_mapped = map_files(P_df, C_df, output_path)

    
# if __name__ == '__main__':
#     main()

In [17]:
patients_file = './data/patients.csv'
clinics_file = './data/clinics.csv'
output_path = './output'


# P_df, C_df = load_files(patients_file, clinics_file)

In [None]:
P_df, C_df = geocode_files(P_df, C_df, output_path)



Retrying 1 time(s) 
Retrying 1 time(s) 
Retrying 2 time(s) 
Retrying 3 time(s) 


In [2]:
P_df.head()

NameError: name 'P_df' is not defined

In [18]:
P_df = pd.read_csv('./output/P_df.csv', converters={'Pat_Geocode': ast.literal_eval})
C_df = pd.read_csv('./output/C_df.csv', converters={'Clinic_Geocode': ast.literal_eval})

In [None]:
# PC_mapped = map_files(P_df.iloc[11:12, :], C_df, './output')
nearest = map_files(P_df, C_df, './output')