In [2]:
# Import local libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp

## Define spark session if not defined yet
There is no need to define `spark` when the notebook runs in an external cloud environment that already provides a Spark session.

In [3]:
try:
    # Reuse a session that the hosting environment already put in scope.
    spark
except NameError:
    # Nothing named `spark` exists yet: build a local session that serializes
    # with Kryo (using Sedona's registrator) and reads S3 with the anonymous
    # credentials provider.
    spark = (
        SparkSession.builder
        .appName('appName')
        .master('local[*]')
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .getOrCreate()
    )



23/03/13 13:08:32 WARN Utils: Your hostname, Kanchans-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.1.13 instead (on interface en0)
23/03/13 13:08:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/13 13:08:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Register Sedona on the session; the SQL cells below call ST_Point, which
# presumably comes from this registration.
SedonaRegistrator.registerAll(spark)

sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")

# Point the s3a connector at the anonymous credentials provider on the live
# Hadoop configuration (the session may have been created elsewhere without it).
sc._jsc.hadoopConfiguration().set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)

# Use the prefix in all your DBFS paths

If you use DBFS, Databricks requires all paths to be absolute. You can use the variable below as the prefix for all paths.

In [4]:
from pathlib import Path

# The home directory (with a trailing slash) is used as the prefix only when
# the ENV_WB environment variable is exactly 'true'; otherwise the prefix is
# the filesystem root.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = f"{Path.home()}/"
else:
    PATH_PREFIX = '/'

print(PATH_PREFIX)




## Load Taxi Pick Up Data to Sedona

In [5]:
# Load the NYC taxi trips CSV. No schema is supplied and inferSchema is not
# enabled, so the coordinate columns are read as strings.
taxiDf = spark.read.format("csv").option("header", True).load("s3a://wherobots-examples/data/nyc-taxi-data.csv")

# Keep only pickups inside the bounding box lat [40, 41], lon [-75, -73].
# The coordinates are cast to double explicitly rather than relying on Spark's
# implicit string-vs-integer coercion, which can truncate the fractional part
# (letting e.g. 41.5 pass "<= 41") or fail under ANSI mode.
# `between` is inclusive on both ends, matching the original >= / <= checks.
# Only the filter casts; the columns themselves stay as loaded.
taxiDf = taxiDf.filter(
    col("Start_Lat").cast("double").between(40.0, 41.0)
    & col("Start_Lon").cast("double").between(-75.0, -73.0)
)
taxiDf.createOrReplaceTempView("taxiDf")
taxiDf.show(5)

+-----------+--------------------+---------------------+---------------+-------------------+-------------------+------------------+---------+-----------------+-------------------+------------------+------------+------------------+---------+-------+------------------+---------+------------------+
|vendor_name|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|Passenger_Count|      Trip_Distance|          Start_Lon|         Start_Lat|Rate_Code|store_and_forward|            End_Lon|           End_Lat|Payment_Type|          Fare_Amt|surcharge|mta_tax|           Tip_Amt|Tolls_Amt|         Total_Amt|
+-----------+--------------------+---------------------+---------------+-------------------+-------------------+------------------+---------+-----------------+-------------------+------------------+------------+------------------+---------+-------+------------------+---------+------------------+
|        VTS| 2009-01-04 02:52:00|  2009-01-04 03:02:00|              1| 2.6299999999999999|-73.9919569999999

In [6]:
# Project the trips down to two columns: a point geometry built from the
# pickup longitude/latitude (cast to double) and the pickup timestamp.
# NOTE: this replaces the "taxiDf" view that the query itself reads from, so
# re-running this cell on its own fails (the new view has no Start_Lon /
# Start_Lat columns); re-run the load cell first.
pickup_query = "select ST_Point(Double(Start_Lon), Double(Start_Lat)) as pickup_loc, Trip_Pickup_DateTime as pickup_time from taxiDf"
taxiDf = spark.sql(pickup_query)
taxiDf.createOrReplaceTempView("taxiDf")
taxiDf.show(5)

+--------------------+-------------------+
|          pickup_loc|        pickup_time|
+--------------------+-------------------+
|POINT (-73.991957...|2009-01-04 02:52:00|
|POINT (-73.982102...|2009-01-04 03:31:00|
|POINT (-74.002587...|2009-01-03 15:43:00|
|POINT (-73.974267...|2009-01-01 20:52:58|
|POINT (-74.00158 ...|2009-01-24 16:18:23|
+--------------------+-------------------+
only showing top 5 rows



## Hotspot Analysis
Hotspot analysis identifies the places and time periods that have the highest occurrence of a target event. Lampy supports hotspot analysis in two dimensions: spatial and temporal. In the spatial dimension, it converts the spatial coverage of the dataset into a grid. The temporal dimension is optional; it is considered only if the parameter `col_date` is not None.

In [10]:
from lampy import HotspotAnalysis as ha, SparkRegistration

# Hand the active Spark session to lampy before calling any of its analyses.
SparkRegistration.set_spark_session(spark)

In [11]:
# Spatial-only hotspot analysis over the pickup points (no col_date is passed).
# NOTE(review): the two positional 10s are presumably the grid dimensions used
# to partition the spatial coverage — confirm against the lampy signature.
haDf = ha.get_hotspots(taxiDf, "pickup_loc", 10, 10)
# Expose the result to Spark SQL under the name "haDf".
haDf.createOrReplaceTempView("haDf")
haDf.show()



+--------------------+------------+-------------+
|       cell_geometry|_id_timestep|hotspot_count|
+--------------------+------------+-------------+
|POLYGON ((-73.826...|          91|            1|
|POLYGON ((-73.826...|         163|            1|
|POLYGON ((-73.826...|         646|            1|
|POLYGON ((-73.826...|         665|            1|
|POLYGON ((-73.826...|          33|            1|
|POLYGON ((-73.826...|          42|            1|
|POLYGON ((-73.826...|          45|            1|
|POLYGON ((-73.826...|          64|            1|
|POLYGON ((-73.826...|          82|            1|
|POLYGON ((-73.826...|          87|            1|
|POLYGON ((-73.826...|          88|            2|
|POLYGON ((-73.826...|         108|            1|
|POLYGON ((-73.826...|         109|            1|
|POLYGON ((-73.826...|         118|            1|
|POLYGON ((-73.826...|         130|            1|
|POLYGON ((-73.826...|         138|            1|
|POLYGON ((-73.826...|         151|            1|


                                                                                

### Hotspot Analysis with Temporal Dimension

In [None]:
# Hotspot analysis with the temporal dimension enabled via col_date.
# The pickup_time values in taxiDf look like "2009-01-04 02:52:00", so the
# pattern must be yyyy-MM-dd HH:mm:ss; the previous "M/d/yy H:m" pattern does
# not match that layout.
# NOTE(review): assumes lampy interprets date_format as a Spark/Java datetime
# pattern — confirm against the lampy documentation.
haTimeDf = ha.get_hotspots(taxiDf, "pickup_loc", 10, 10, col_date="pickup_time", date_format="yyyy-MM-dd HH:mm:ss")
haTimeDf.show()