In [None]:
# Import standard-library and general-purpose data libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp

VBox()

Starting Spark application




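## Define the Spark session if it is not defined yet
If the notebook runs in an external cloud environment that already provides a `spark` session, that session is reused and nothing needs to be defined here.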

In [None]:
# Reuse `spark` when the name already exists in the kernel namespace;
# build a local session only when it is undefined.
try:
    spark
except NameError:
    anonymous_s3a_provider = "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
    spark = (
        SparkSession.builder
        .appName('appName')
        .master('local[*]')
        # Kryo serializer together with Sedona's Kryo registrator.
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        # The S3A credentials provider is set under both the bare key and
        # the `spark.hadoop.`-prefixed key.
        .config("fs.s3a.aws.credentials.provider", anonymous_s3a_provider)
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", anonymous_s3a_provider)
        .getOrCreate()
    )

In [None]:
# Register Sedona with the active Spark session.
SedonaRegistrator.registerAll(spark)

# `sc` is the SparkContext used by the shapefile reader further down.
sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")

# Set the S3A credentials provider directly on the context's Hadoop
# configuration as well.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)

# Use the prefix in all your DBFS paths

If you use DBFS, Databricks requires all paths to be absolute. You can use the variable below as the prefix for all paths.

In [None]:
from pathlib import Path

# The prefix is the home directory (with a trailing slash) only when the
# ENV_WB environment variable is exactly 'true'; otherwise it is the root '/'.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = f"{Path.home()}/"
else:
    PATH_PREFIX = '/'

print(PATH_PREFIX)

## Load Taxi Pick Up Data to Sedona

In [None]:
# Read the shapefile into a geometry RDD, convert it to a DataFrame and
# drop the "_id" column.
zones_rdd = ShapefileReader.readToGeometryRDD(sc, "s3a://wherobots-examples/data/pickup_data/shape_file")
zones_raw_df = adp.toDf(zones_rdd, spark).drop("_id")

# zipWithIndex pairs every row with its index; after toDF() the row sits in
# `_1` and the index in `_2`, so expand the row fields and name the index `ids`.
zones_indexed_df = (
    zones_raw_df.rdd
    .zipWithIndex()
    .toDF()
    .select(col("_1.*"), col("_2").alias('ids'))
)

# Expose `pickup_cou` as an integer column named `pickup_count`, then drop
# the original column.
zones_df = (
    zones_indexed_df
    .withColumn("pickup_count", zones_indexed_df.pickup_cou.cast(IntegerType()))
    .drop("pickup_cou")
)
zones_df.show(5)

## Outlier Detection
Compute the box-map statistics of the Sedona DataFrame for the attribute `pickup_count`. The Sedona DataFrame is returned with a new attribute named `box_statistics_class`, which defines the class of each record; outlier is one of the classes.

In [None]:
from lampy import Outliers, SparkRegistration

# Hand the active Spark session to lampy before any lampy call is made.
SparkRegistration.set_spark_session(spark)

In [None]:
# Attribute whose values the box-map statistics are computed on.
box_map_attribute = "pickup_count"

outlier_df = Outliers.get_box_map_statistics(zones_df, box_map_attribute)
outlier_df.show(n=5)

## Visualize the Class of each Record
Each class is drawn in its own color.

In [None]:
# Collect the Spark result to pandas and wrap it as a GeoDataFrame keyed on
# the "geometry" column, declared as EPSG:4326.
outlier_pdf = outlier_df.toPandas()
outlier_gdf = gpd.GeoDataFrame(outlier_pdf, geometry="geometry", crs="EPSG:4326")

# Anchor the legend outside the axes, vertically centred on the right edge.
legend_options = {'loc': 'center left', 'bbox_to_anchor': (1, 0.5)}

# One color per box_statistics_class value.
ax = outlier_gdf.plot(
    column='box_statistics_class',
    categorical=True,
    aspect='equal',
    legend=True,
    legend_kwds=legend_options,
)