In [None]:
%%configure
{
    "conf":{
        "spark.pyspark.virtualenv.enabled": "false"
    }
}

In [4]:
# Import local libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time
import math

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp
from sedona.core.enums import GridType, IndexType
from sedona.core.SpatialRDD import CircleRDD
from sedona.core.spatialOperator import JoinQuery

## Define the Spark session if it is not defined yet
There is no need to define `spark` when the notebook runs in an external cloud environment.

In [5]:
# Reuse `spark` when the environment already defines it; otherwise build a
# local session configured for Sedona and anonymous S3A access.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName('appName')
        .master('local[*]')
        # Kryo serializer together with Sedona's Kryo registrator.
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # Anonymous S3A credentials provider, set under both the plain key
        # and the "spark.hadoop."-prefixed key.
        .config("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .getOrCreate()
    )

In [None]:
# Keep a handle on the SparkContext; the later cells pass it to Sedona readers.
sc = spark.sparkContext

# Register Sedona with the session.
SedonaRegistrator.registerAll(spark)

# Set Sedona's global charset system property to UTF-8.
sc.setSystemProperty("sedona.global.charset", "utf8")

# Use the anonymous credentials provider for s3a:// access on the live
# Hadoop configuration as well.
sc._jsc.hadoopConfiguration().set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)

def delete_path(sc, path):
    """Recursively delete ``path`` through the Hadoop FileSystem API.

    The filesystem is resolved from the path itself, so a fully qualified
    URI (for example ``s3a://bucket/prefix``) is deleted on the filesystem
    that owns it instead of on the default filesystem.

    Args:
        sc: SparkContext whose JVM gateway and Hadoop configuration are used.
        path: Path or URI string of the file or directory to delete.
    """
    hadoop_path = sc._jvm.org.apache.hadoop.fs.Path(path)
    # FileSystem.get(conf) returns only the default filesystem and rejects
    # paths with another scheme ("Wrong FS"); asking the path for its own
    # filesystem works for any scheme the configuration supports.
    fs = hadoop_path.getFileSystem(sc._jsc.hadoopConfiguration())
    # Second argument True = recursive delete.
    fs.delete(hadoop_path, True)

## Use the prefix in all your EMR paths

If you use EMR, note that EMR requires all paths to be relative. Please use the variable below as the prefix for all paths, because it automatically detects whether you are in the Wherobots environment.

In [10]:
from pathlib import Path

# Prefix for every data path: the home directory (with a trailing slash) when
# the ENV_WB environment variable is exactly 'true', otherwise empty.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = str(Path.home()) + '/'
else:
    PATH_PREFIX = ''

print(PATH_PREFIX)

/


## Load Area Landmark Data to Sedona

In [None]:
# Read the NYC area-landmark shapefile from S3 into a geometry RDD ...
area_rdd = ShapefileReader.readToGeometryRDD(
    sc,
    "s3a://wherobots-examples/data/nyc-area-landmark-shapefile",
)
# ... convert it to a DataFrame and preview the first rows.
area_df = adp.toDf(area_rdd, spark)
area_df.show(5)

In [6]:
# Print the column names and types of the landmark DataFrame.
area_df.printSchema()

root
 |-- geometry: geometry (nullable = true)
 |-- statefp: string (nullable = true)
 |-- ansicode: string (nullable = true)
 |-- areaid: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- mtfcc: string (nullable = true)
 |-- aland: string (nullable = true)
 |-- awater: string (nullable = true)
 |-- intptlat: string (nullable = true)
 |-- intptlon: string (nullable = true)



In [7]:
# Keep only the geometry column; the colocation step below reads nothing else
# from this frame.
area_df = area_df.select(col("geometry"))
area_df.show(5)

+--------------------+
|            geometry|
+--------------------+
|POLYGON ((-73.894...|
|POLYGON ((-73.781...|
|POLYGON ((-73.911...|
|POLYGON ((-73.772...|
|POLYGON ((-73.787...|
+--------------------+
only showing top 5 rows



## Load Taxi Trip Data to Sedona

In [None]:
# Load the NYC taxi trip CSV from S3, treating the first line as the header.
taxi_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("s3a://wherobots-examples/data/nyc-taxi-data.csv")
)
taxi_df.show(5)

In [10]:
# Register the raw trip table as the "taxi_df" SQL view only while the frame
# still carries the start-coordinate columns. This cell rebinds `taxi_df` to
# the derived point frame, so an unconditional registration on a re-run would
# replace the view with a frame that has no Start_Lon / Start_Lat and make
# the query below fail.
if {"start_lon", "start_lat"} <= {name.lower() for name in taxi_df.columns}:
    taxi_df.createOrReplaceTempView("taxi_df")

# Cast the start coordinates to double and build one point geometry per trip.
taxi_df = spark.sql("select ST_Point(Double(Start_Lon), Double(Start_Lat)) as point from taxi_df")
taxi_df.show(5)

+--------------------+
|               point|
+--------------------+
|POINT (-73.991957...|
|POINT (-73.982102...|
|POINT (-74.002587...|
|POINT (-73.974267...|
|POINT (-74.00158 ...|
+--------------------+
only showing top 5 rows



## Colocation Pattern Detection
There are various algorithms for detecting colocation patterns between two spatial datasets. Here, we use an algorithm based on Ripley's K function.

In [11]:
from lampy import ColocationPatterns, SparkRegistration

# Hand the notebook's Spark session to lampy before calling its algorithms.
SparkRegistration.set_spark_session(spark)

In [12]:
# Run the Ripley's-K-based colocation test between the landmark polygons
# ("geometry" column) and the taxi points ("point" column), and time it.
t1 = time.time()
colocated = ColocationPatterns.get_colocation_pattern_on_ripleys_k(
    area_df, taxi_df, "geometry", "point"
)
print("Colocated" if colocated else "Not Colocated")
t2 = time.time()
print("Required time:", t2-t1)

                                                                                

Colocated
Required time: 3.1826131343841553
