In [1]:
from time import time

import h3
import pyspark.sql.functions as F
import pyspark.sql.types as T
from py4j.java_gateway import java_import
from pyspark.sql import SparkSession

In [2]:
# Session-wide settings, applied to the builder in the order listed.
SPARK_CONF = {
    "spark.sql.sources.partitionOverwriteMode": "static",
    "spark.sql.caseSensitive": "true",
    # Driver JVM, executor JVMs and the SQL session are all pinned to a
    # zero-offset time zone.
    "spark.driver.extraJavaOptions": "-Duser.timezone=GMT",
    "spark.executor.extraJavaOptions": "-Duser.timezone=GMT",
    "spark.sql.session.timeZone": "UTC",
}

builder = SparkSession.builder.appName("test")
for option, value in SPARK_CONF.items():
    builder = builder.config(option, value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = builder.getOrCreate()

In [3]:
# Load the input dataset (parquet files under a path relative to the working directory).
sdf = spark.read.format("parquet").load("data/dataset_3")

# 1. Python
## 1.1. Python library

In [4]:
# Sanity check of the Python h3 library: index the point (0, 0) at resolution 8.
# `h3` is imported in the notebook's import cell so every dependency is declared
# in one place.
h3.geo_to_h3(0, 0, 8)

'88754e6499fffff'

## 1.2. Python UDF

In [5]:
@F.udf(T.StringType())
def geo_to_h3_py(latitude, longitude, resolution):
    """Return the H3 cell address of a point as a string.

    Args:
        latitude: Latitude of the point, in degrees.
        longitude: Longitude of the point, in degrees.
        resolution: H3 resolution passed through to ``h3.geo_to_h3``.

    Returns:
        The H3 address string, or ``None`` when any of the inputs is null.
    """
    # Imported inside the function so the UDF does not depend on whatever the
    # notebook-level name `h3` happens to be bound to when the UDF is built.
    import h3

    # Null in, null out: h3 cannot index a point with a missing component.
    if latitude is None or longitude is None or resolution is None:
        return None

    return h3.geo_to_h3(latitude, longitude, resolution)

In [6]:
# Add an "h8" column computed by the Python UDF (resolution 8) and print the first rows.
sdf.withColumn(
    "h8",
    geo_to_h3_py(F.col("latitude"), F.col("longitude"), F.lit(8)),
).show()

+-------------------+-------------------+---------------+
|           latitude|          longitude|             h8|
+-------------------+-------------------+---------------+
|-21.312207074342027| -160.0670321180626|88b4ee4e09fffff|
|  50.19929821057201| 162.28091785280452|8816a9480bfffff|
|  41.31619353494014|-126.25339414248694|88280e8c45fffff|
|  80.74685363258044| -146.7956606121309|880385740bfffff|
| 61.132119913605635| 3.9118133429158206|8809aad433fffff|
|   78.3401760559614|-106.11650754917403|880203b86dfffff|
| -58.54991587056003| 113.45900052031868|88e5a820d5fffff|
| -42.03748505685331|-161.50714721056474|88d4c6d5c3fffff|
|   68.6213813476812|  19.86834445821239|8808e89a8dfffff|
|  59.85531571059457|   5.23330694280304|8809802217fffff|
|-23.319622712468345|-21.532138804100356|88a4463b33fffff|
|  -33.4208833127299|-3.1036409270136858|88c15ba897fffff|
|  64.17563826776947| -9.793711461081045|88076481c1fffff|
| 21.269421204176865|-44.992039917657564|8857b671b7fffff|
|  36.02039510

# 2. Java
## 2.1. Java function

In [7]:
# Using h3 library
# The Java H3Core instance is bound to `h3_core` rather than `h3`, so the Python
# `h3` module used by the earlier cells keeps its name and those cells can be
# re-run in any order.
h3_core = spark.sparkContext._jvm.com.uber.h3core.H3Core.newInstance()
h3_core.geoToH3Address(0.0, 0.0, 8)

'88754e6499fffff'

In [8]:
# Using our static function: same point and resolution as above, called through
# the JVM gateway.
(
    spark.sparkContext._jvm
    .com.villoro.SimpleH3
    .toH3Address(0.0, 0.0, 8)
)

'88754e6499fffff'

## 2.2. Java UDF

In [9]:
# Expose the Java UDF class to Spark SQL under the name "geo_to_h3".
spark.udf.registerJavaFunction(
    name="geo_to_h3",
    javaClassName="com.villoro.toH3AddressUDF",
    returnType=T.StringType(),
)

In [10]:
# Add an "h8" column computed by the registered Java UDF (resolution 8) and
# print the first rows.
sdf.withColumn(
    "h8",
    F.expr("geo_to_h3(latitude, longitude, 8)"),
).show()

+-------------------+-------------------+---------------+
|           latitude|          longitude|             h8|
+-------------------+-------------------+---------------+
|-21.312207074342027| -160.0670321180626|88b4ee4e09fffff|
|  50.19929821057201| 162.28091785280452|8816a9480bfffff|
|  41.31619353494014|-126.25339414248694|88280e8c45fffff|
|  80.74685363258044| -146.7956606121309|880385740bfffff|
| 61.132119913605635| 3.9118133429158206|8809aad433fffff|
|   78.3401760559614|-106.11650754917403|880203b86dfffff|
| -58.54991587056003| 113.45900052031868|88e5a820d5fffff|
| -42.03748505685331|-161.50714721056474|88d4c6d5c3fffff|
|   68.6213813476812|  19.86834445821239|8808e89a8dfffff|
|  59.85531571059457|   5.23330694280304|8809802217fffff|
|-23.319622712468345|-21.532138804100356|88a4463b33fffff|
|  -33.4208833127299|-3.1036409270136858|88c15ba897fffff|
|  64.17563826776947| -9.793711461081045|88076481c1fffff|
| 21.269421204176865|-44.992039917657564|8857b671b7fffff|
|  36.02039510