In [1]:
# Import local libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp

## Define spark session if not defined yet
No need to define spark if run in an external cloud

In [2]:
try:
    spark
except NameError:
    spark = SparkSession. \
    builder. \
    appName('appName'). \
    master('local[*]'). \
    config("spark.serializer", KryoSerializer.getName). \
    config("spark.kryo.registrator", SedonaKryoRegistrator.getName). \
    config("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"). \
    config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"). \
    getOrCreate()

23/03/02 12:16:41 WARN Utils: Your hostname, Kanchans-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.1.13 instead (on interface en0)
23/03/02 12:16:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/opt/homebrew/Cellar/apache-spark/3.3.1/libexec/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/kanchan/.ivy2/cache
The jars for the packages stored in: /Users/kanchan/.ivy2/jars
org.apache.sedona#sedona-python-adapter-3.0_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8c19fe1e-55aa-4a5e-accb-78ec6c608f24;1.0
	confs: [default]
	found org.apache.sedona#sedona-python-adapter-3.0_2.12;1.3.1-incubating in central
	found org.locationtech.jts#jts-core;1.18.2 in central
	found org.wololo#jts2geojson;0.16.1 in central
	found org.apache.sedona#sedona-core-3.0_2.12;1.3.1-incubating in central
	found org.apache.sedona#sedona-common;1.3.1-incubating in central
	found org.apache.sedona#sedona-sql-3.0_2.12;1.3.1-incubating in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.5.0 in central
	found org.datasyslab#geotools-wrapper;1.1.0-25.2 in central
:: resolution report :: resolve 141ms :: artifacts dl 10ms
	:: modules in use:
	org.apache.sedon

23/03/02 12:16:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
SedonaRegistrator.registerAll(spark)
sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")
sc._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

# Use the prefix in all your DBFS path

If you use DBFS, Databricks requires that all paths must be absolute. You can use the variable below as the prefix for all paths.

In [4]:
from pathlib import Path

PATH_PREFIX= str(Path.home()) + '/' if os.environ.get('ENV_WB', 'false') == 'true' else '/'

print(PATH_PREFIX)




## Load Taxi Pick Up Data to Sedona

In [5]:
taxiDf = spark.read.format("csv").option("header", True).load("s3a://wherobots-examples/data/nyc-taxi-data.csv")
taxiDf = taxiDf.filter(taxiDf.Start_Lat >= 40).filter(taxiDf.Start_Lat <= 41).filter(taxiDf.Start_Lon >= -75).filter(taxiDf.Start_Lon <= -73)
taxiDf.createOrReplaceTempView("taxiDf")
taxiDf.show(5)

+-----------+--------------------+---------------------+---------------+-------------------+-------------------+------------------+---------+-----------------+-------------------+------------------+------------+------------------+---------+-------+------------------+---------+------------------+
|vendor_name|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|      Trip_Distance|          Start_Lon|         Start_Lat|Rate_Code|store_and_forward|            End_Lon|           End_Lat|Payment_Type|          Fare_Amt|surcharge|mta_tax|           Tip_Amt|Tolls_Amt|         Total_Amt|
+-----------+--------------------+---------------------+---------------+-------------------+-------------------+------------------+---------+-----------------+-------------------+------------------+------------+------------------+---------+-------+------------------+---------+------------------+
|        VTS| 2009-01-04 02:52:00|  2009-01-04 03:02:00|              1| 2.6299999999999999|-73.9919569999999

In [6]:
## Select only required or target columns
taxiDf = spark.sql("select ST_Point(Double(Start_Lon), Double(Start_Lat)) as pickup_loc, Trip_Pickup_DateTime as pickup_time from taxiDf")
taxiDf.createOrReplaceTempView("taxiDf")
taxiDf.show(5)

+--------------------+-------------------+
|          pickup_loc|        pickup_time|
+--------------------+-------------------+
|POINT (-73.991957...|2009-01-04 02:52:00|
|POINT (-73.982102...|2009-01-04 03:31:00|
|POINT (-74.002587...|2009-01-03 15:43:00|
|POINT (-73.974267...|2009-01-01 20:52:58|
|POINT (-74.00158 ...|2009-01-24 16:18:23|
+--------------------+-------------------+
only showing top 5 rows



In [7]:
taxiDf.count()

                                                                                

13876530

## Spatial Sampling Analysis
Perform spatially stratified sampling with lampy

In [10]:
from lampy import SpatialSampling as ss
from lampy import SparkRegistration

SparkRegistration.set_spark_session(spark)

In [11]:
ssDf = ss.get_spatially_stratified_samples(taxiDf, "pickup_loc", 0.1)
ssDf.show(10)

                                                                                

23/03/02 12:16:54 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.




+--------------------+-------------------+
|          pickup_loc|        pickup_time|
+--------------------+-------------------+
|POINT (-75.968462...|2009-01-24 00:46:00|
|POINT (-75.95602 ...|2009-01-24 00:18:00|
|POINT (-75.764997...|2009-01-23 19:05:00|
|POINT (-75.806533...|2009-01-26 22:58:00|
|POINT (-75.571363...|2009-01-26 05:52:00|
|POINT (-74.866317...|2009-01-16 19:24:00|
|POINT (-74.890647...|2009-01-16 20:10:00|
|POINT (-74.874623...|2009-01-16 18:58:00|
|POINT (-75.844613...|2009-01-27 00:41:00|
|POINT (-74.831133...|2009-01-07 06:28:00|
+--------------------+-------------------+
only showing top 10 rows





In [12]:
ssDf.count()

23/03/02 12:17:01 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.


                                                                                

1389530