In [None]:
%%configure
{
    "conf":{
        "spark.pyspark.virtualenv.enabled": "false"
    }
}

In [None]:
# Import local libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp

## Define the Spark session if it is not defined yet
There is no need to define `spark` yourself when the notebook runs in an external cloud environment that already provides it.

In [None]:
# Reuse an existing `spark` session when one is already defined in the
# notebook namespace; only build a new one when the name is missing.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName('appName')
        # Local mode ('local[*]').
        .master('local[*]')
        # Kryo serialization, with Sedona's Kryo registrator.
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # Anonymous S3A credentials, set under both the bare and the
        # `spark.hadoop.`-prefixed key.
        .config("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .getOrCreate()
    )

In [None]:
# Grab the SparkContext and register Sedona with the active session.
sc = spark.sparkContext
SedonaRegistrator.registerAll(spark)

# Set Sedona's global charset system property to UTF-8.
sc.setSystemProperty("sedona.global.charset", "utf8")

# Set the anonymous S3A credentials provider directly on the live Hadoop
# configuration as well (it is also passed through the session builder).
sc._jsc.hadoopConfiguration().set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)

def delete_path(sc, path):
    """Recursively delete ``path`` through the Hadoop FileSystem API.

    The filesystem is resolved from the path itself, so a fully qualified
    URI (for example ``s3a://bucket/dir``) is deleted on its own filesystem
    instead of on the cluster default. A path without a scheme still
    resolves to the default filesystem, exactly as before.

    Args:
        sc: Active SparkContext; its JVM gateway and Hadoop configuration
            are used to reach the Hadoop classes.
        path: Path or URI string of the file or directory to delete.
    """
    hadoop_conf = sc._jsc.hadoopConfiguration()
    target = sc._jvm.org.apache.hadoop.fs.Path(path)
    # FileSystem.get(conf) would always return the default filesystem and
    # reject paths on any other scheme ("Wrong FS"); ask the path instead.
    fs = target.getFileSystem(hadoop_conf)
    # Second argument True = recursive delete.
    fs.delete(target, True)

## Use the prefix in all your EMR paths

If you use EMR, all paths must be relative. Use the variable below as the prefix for every path: it automatically detects whether you are running in the Wherobots environment.

In [None]:
from pathlib import Path

# Prefix for the paths used in this notebook: the user's home directory
# (with a trailing slash) when the ENV_WB environment variable is exactly
# 'true', otherwise an empty string.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = f"{Path.home()}/"
else:
    PATH_PREFIX = ''

print(PATH_PREFIX)

## Load Taxi Pick Up Data to Sedona

In [None]:
# Read the pickup-zone shapefile from S3 into a Sedona geometry RDD.
zones_rdd = ShapefileReader.readToGeometryRDD(sc, "s3a://wherobots-examples/data/pickup_data/shape_file")

# Convert to a DataFrame, drop the `_id` column, attach a zipWithIndex row
# index as `ids`, and replace `pickup_cou` with an integer `pickup_count`.
zones_df = (
    adp.toDf(zones_rdd, spark)
    .drop("_id")
    .rdd.zipWithIndex()
    .toDF()
    # zipWithIndex yields (row, index) pairs: `_1` is the row, `_2` the index.
    .select(col("_1.*"), col("_2").alias('ids'))
    .withColumn("pickup_count", col("pickup_cou").cast(IntegerType()))
    .drop("pickup_cou")
)
zones_df.show(5)

## Find the Adjacency of Each Pickup Zone
The returned DataFrame will contain a new column named `binary_adjacency`. For each row (zone), it contains the adjacency (1/0) with the other zones.

In [None]:
# lampy supplies the adjacency and spatial-autocorrelation helpers used below.
from lampy import Adjacency, SparkRegistration
from lampy import SpatialAutocorrelation as sa

# Hand the active Spark session to lampy.
SparkRegistration.set_spark_session(spark)

In [None]:
# Build the adjacency DataFrame for the zone polygons, passing "ids" as the
# id column and "geometry" as the geometry column of zones_df. The
# accompanying notebook text says the result carries a `binary_adjacency`
# column of 1/0 flags.
adjacency_df = Adjacency.get_polygons_adjacency(zones_df, "ids", "geometry")

## Compute Spatial Autocorrelation
Compute the local Moran's statistic of the Sedona DataFrame for the attribute `pickup_count`. The Sedona DataFrame is returned with a new attribute named `local_moran`, which contains the value of the local Moran's statistic for each location.

In [None]:
# Compute the local Moran statistic of `pickup_count` from the zones and
# their `binary_adjacency` column; per the notebook text the result gains a
# `local_moran` column.
# NOTE(review): the trailing "ids" and "id" arguments presumably name the id
# columns of zones_df and adjacency_df respectively — confirm against
# lampy's get_local_moran signature.
local_moran_df = sa.get_local_moran(zones_df, adjacency_df, "pickup_count", "binary_adjacency", "ids", "id")
# Preview the first five rows.
local_moran_df.show(5)

## Visualize the Local Moran's Index Value for Each Zone

In [None]:
# Convert the Spark result to a pandas DataFrame, then wrap it as a
# GeoDataFrame using the `geometry` column and the EPSG:4326 CRS.
local_moran_pdf = local_moran_df.toPandas()
local_moran_gdf = gpd.GeoDataFrame(local_moran_pdf, geometry="geometry", crs="EPSG:4326")

# Choropleth of the local Moran value per zone.
local_moran_gdf.plot(
    column='local_moran',
    cmap='OrRd',
    edgecolor='k',
    aspect='equal',
    legend=True,
)

In [None]:
%matplot plt