In [1]:
# Import local libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp

## Define the Spark session if it is not defined yet
There is no need to define `spark` when the notebook runs in an external cloud environment that already provides it.

In [2]:
# Reuse `spark` when the hosting environment has already defined it;
# evaluating the bare name raises NameError otherwise, in which case a
# local session is built here.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName('appName')
        .master('local[*]')
        # Kryo serialization with Sedona's registrator for its geometry types.
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # Anonymous S3A credentials, set under both the plain and the
        # `spark.hadoop.`-prefixed key.
        .config("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .getOrCreate()
    )

23/04/06 01:56:23 WARN Utils: Your hostname, Kanchans-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.1.13 instead (on interface en0)
23/04/06 01:56:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/opt/homebrew/Cellar/apache-spark/3.3.1/libexec/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kanchan/.ivy2/cache
The jars for the packages stored in: /Users/kanchan/.ivy2/jars
org.apache.sedona#sedona-python-adapter-3.0_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3aeb0ce4-0fa7-4c4f-9f7d-06f96cd53a7d;1.0
	confs: [default]
	found org.apache.sedona#sedona-python-adapter-3.0_2.12;1.2.0-incubating in central
	found org.locationtech.jts#jts-core;1.18.0 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.apache.sedona#sedona-core-3.0_2.12;1.2.0-incubating in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.5.0 in central
	found org.apache.sedona#sedona-sql-3.0_2.12;1.2.0-incubating in central
	found org.datasyslab#geotools-wrapper;geotools-24.0 in central
:: resolution report :: resolve 157ms :: artifacts dl 13ms
	:: modules in use:
	org.apache.sedona#sedona-core-3.0_2.12;1.2.0-incubating from central in [default

23/04/06 01:56:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Register Sedona's types and SQL functions on the active session.
SedonaRegistrator.registerAll(spark)

sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")

# Set anonymous S3A credentials directly on the running context's Hadoop
# configuration as well.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

23/04/06 01:57:17 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.geom.Geometry, which is already registered.
23/04/06 01:57:17 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.index.SpatialIndex, which is already registered.
23/04/06 01:57:17 WARN SimpleFunctionRegistry: The function st_pointfromtext replaced a previously registered function.
23/04/06 01:57:17 WARN SimpleFunctionRegistry: The function st_polygonfromtext replaced a previously registered function.
23/04/06 01:57:17 WARN SimpleFunctionRegistry: The function st_linestringfromtext replaced a previously registered function.
23/04/06 01:57:17 WARN SimpleFunctionRegistry: The function st_geomfromtext replaced a previously registered function.
23/04/06 01:57:17 WARN SimpleFunctionRegistry: The function st_geomfromwkt replaced a previously registered function.
23/04/06 01:57:17 WARN SimpleFunctionRegistry: The function st_geomfromwkb replaced a previously registered function.
23/04/06 01:57:

# Use the prefix in all your DBFS paths

If you use DBFS, Databricks requires that all paths be absolute. You can use the variable below as the prefix for all paths.

In [3]:
from pathlib import Path

# When the ENV_WB environment variable is exactly 'true', paths are rooted at
# the current user's home directory (with a trailing slash); in every other
# case, including when the variable is unset, the prefix is the filesystem root.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = str(Path.home()) + '/'
else:
    PATH_PREFIX = '/'

print(PATH_PREFIX)

/


## Reverse Geocoding
Perform reverse geocoding using the package road_network_toolbox

### Import PyMaps and Set SparkSession to RoadNetworkToolbox

In [5]:
import pymaps
from pymaps import SparkRegistration

# Pass the notebook's active Spark session to PyMaps.
SparkRegistration.set_spark_session(spark)

## Create an Instance of Reverse Geocoding

In [6]:
# Reverse-geocoding object; later cells load OSM data into it and query it.
rgc = pymaps.ReverseGeocoding()

### Load OpenStreetMaps File Into RoadNetworkToolbox
The following `load_OSM` step needs to be executed only once to load the OSM data. It might take some time.

In [8]:
# OSM XML extract stored on S3, addressed through the s3a scheme.
osm_source = "s3a://wherobots-examples/data/osm.xml"
rgc.load_OSM(osm_source)

# Address records extracted from the loaded OSM data, as a DataFrame;
# preview the first rows.
dfLocations = rgc.get_address_dataframe()
dfLocations.show()

23/04/06 01:59:44 WARN SimpleFunctionRegistry: The function get_address_nodes replaced a previously registered function.


[Stage 7:>                                                          (0 + 1) / 1]

+----------+-----------+---------+-----+--------------------+------------------+-------+-----+-----+--------+--------+-------+-------+------+------+-----------+--------+------+----------+-----------+--------------------+
|        id|housenumber|housename|flats|              street|conscriptionnumber|   city|place|state|province|postcode|country|postbox|hamlet|suburb|subdistrict|district|county|       lat|        lon|            geometry|
+----------+-----------+---------+-----+--------------------+------------------+-------+-----+-----+--------+--------+-------+-------+------+------+-----------+--------+------+----------+-----------+--------------------+
| 367815760|       null|     null| null|                null|              null|   null| null|   MI|    null|    null|   null|   null|  null|  null|       null|    null|  null|42.3517621|-83.0626801|POINT (-83.062680...|
|1217721054|       4421|     null| null|     Woodward Avenue|              null|Detroit| null|   MI|    null|   4820

                                                                                

In [9]:
# Print the column names and types of the address DataFrame.
dfLocations.printSchema()

root
 |-- id: long (nullable = true)
 |-- housenumber: string (nullable = true)
 |-- housename: string (nullable = true)
 |-- flats: string (nullable = true)
 |-- street: string (nullable = true)
 |-- conscriptionnumber: string (nullable = true)
 |-- city: string (nullable = true)
 |-- place: string (nullable = true)
 |-- state: string (nullable = true)
 |-- province: string (nullable = true)
 |-- postcode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- postbox: string (nullable = true)
 |-- hamlet: string (nullable = true)
 |-- suburb: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- district: string (nullable = true)
 |-- county: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- geometry: geometry (nullable = false)



### Perform Reverse Geocoding for a Latitude and Longitude Pair

#### After loading OSM data, call the following methods every time you need to perform reverse geocoding

In [10]:
# Query coordinate as a (latitude, longitude) pair.
userLat, userLon = 42.3527285, -83.0621298

In [14]:
# Reverse-geocode the single coordinate pair defined above.
fullAddress = rgc.get_address(userLat, userLon)
# Bare expression so the notebook displays the result; in the recorded run it
# is a dict with a 'found' flag plus address fields such as 'street' and 'city'.
fullAddress

                                                                                

{'found': True,
 'housenumber': '4421',
 'street': 'Woodward Avenue',
 'city': 'Detroit',
 'state': 'MI',
 'postcode': '48201'}

### Perform Reverse Geocoding for a Sedona DataFrame

In [15]:
# Build a small input sample: keep only the geometry column, capped at
# 100 rows.
geometry_only = dfLocations.select("geometry")
df_points = geometry_only.limit(100)

# Preview the first five points.
df_points.show(5)

+--------------------+
|            geometry|
+--------------------+
|POINT (-83.062680...|
|POINT (-83.062129...|
|POINT (-83.065367...|
|POINT (-83.061481...|
|POINT (-83.065562...|
+--------------------+
only showing top 5 rows



[Stage 10:>                                                         (0 + 1) / 1]                                                                                

In [16]:
# DataFrame form of get_address: pass the DataFrame and the name of its
# geometry column.
addressDf = rgc.get_address(df_points, "geometry")
# NOTE(review): the recorded output has two columns named `geometry`
# (presumably the input point and the matched address point) plus a
# `distance` column; confirm, and rename one if selecting by name downstream.
addressDf.show(5)

                                                                                

+--------------------+----------+-----------+---------+-----+------------------+------------------+-------+-----+-----+--------+--------+-------+-------+------+------+-----------+--------+------+----------+-----------+--------------------+--------+
|            geometry|        id|housenumber|housename|flats|            street|conscriptionnumber|   city|place|state|province|postcode|country|postbox|hamlet|suburb|subdistrict|district|county|       lat|        lon|            geometry|distance|
+--------------------+----------+-----------+---------+-----+------------------+------------------+-------+-----+-----+--------+--------+-------+-------+------+------+-----------+--------+------+----------+-----------+--------------------+--------+
|POINT (-83.064125...|2418338352|        422|     null| null|West Willis Street|              null|Detroit| null| null|    null|   48201|   null|   null|  null|  null|       null|    null|  null| 42.350535| -83.064125|POINT (-83.064125...|     0.0|
|POI

