In [None]:
%%configure
{
    "conf":{
        "spark.pyspark.virtualenv.enabled": "false"
    }
}

In [1]:
# Import local libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

# Import GeoPandas
import geopandas as gpd

# Import PySpark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, FloatType
from pyspark.sql.functions import col, array, udf

# Import Apache Sedona
from sedona.register import SedonaRegistrator
from sedona.core.formatMapper.shapefileParser import ShapefileReader
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.utils.adapter import Adapter as adp

## Define spark session if not defined yet
There is no need to define `spark` when the notebook runs in an external cloud environment that already provides a Spark session.

In [2]:
# Reuse the session when the environment already defines `spark`; otherwise
# build a local one that uses Sedona's Kryo serializer/registrator and the
# anonymous S3A credentials provider.
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName('appName')
        .master('local[*]')
        .config("spark.serializer", KryoSerializer.getName)
        .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
        .config("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
        .getOrCreate()
    )

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/09 10:05:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Register Sedona with the active Spark session.
SedonaRegistrator.registerAll(spark)

# Keep a handle on the SparkContext for the lower-level settings below.
sc = spark.sparkContext
sc.setSystemProperty("sedona.global.charset", "utf8")

# Set the anonymous S3A credentials provider on the live Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

def delete_path(sc, path):
    """Recursively delete ``path`` through the Hadoop FileSystem API.

    The filesystem is resolved from the path itself rather than taken from the
    default filesystem, so a fully qualified URI (for example
    ``s3a://bucket/prefix``) is deleted on the filesystem it belongs to.
    Paths without a scheme still resolve to the default filesystem.

    Args:
        sc: Active SparkContext; its JVM gateway (``_jvm``) and Hadoop
            configuration (``_jsc.hadoopConfiguration()``) are used.
        path: Path or URI string of the file or directory to remove.

    Returns:
        None. The result of the underlying ``delete`` call is discarded.
    """
    hadoop_conf = sc._jsc.hadoopConfiguration()
    target = sc._jvm.org.apache.hadoop.fs.Path(path)
    # Ask the path for its own filesystem; FileSystem.get(conf) would return
    # the default filesystem and reject paths that carry a different scheme.
    fs = target.getFileSystem(hadoop_conf)
    # Second argument True requests a recursive delete.
    fs.delete(target, True)

## Use the prefix in all your EMR paths

EMR requires all paths to be relative. If you use EMR, prepend the variable below to every path: it automatically detects whether you are running in the Wherobots environment and sets the prefix accordingly.

In [4]:
from pathlib import Path

# Inside the Wherobots environment (ENV_WB == 'true') paths are anchored at the
# user's home directory; everywhere else the prefix is empty.
if os.environ.get('ENV_WB', 'false') == 'true':
    PATH_PREFIX = str(Path.home()) + '/'
else:
    PATH_PREFIX = ''

print(PATH_PREFIX)

/home/wherobots/


## Load Airbnb Data to Sedona

In [5]:
# Read the listings CSV (first row is the header) and discard the `price`
# column; the query below selects `log_price` instead.
listings_raw = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("s3a://wherobots-examples/data/airbnb_listings.csv")
    .drop("price")
)
listings_raw.createOrReplaceTempView("listings_df")

# Keep only the model columns and cast each of them to double. The view name
# is reused so later SQL sees the typed frame.
listings_df = spark.sql("select double(log_price), double(host_listings_count), double(bedrooms), double(beds), double(accommodates), double(latitude) as latitude, double(longitude) as longitude from listings_df")
listings_df.createOrReplaceTempView("listings_df")
listings_df.show(5)

23/02/09 10:05:45 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


[Stage 4:>                                                          (0 + 1) / 1]

+------------------+-------------------+--------+----+------------+------------------+------------------+
|         log_price|host_listings_count|bedrooms|beds|accommodates|          latitude|         longitude|
+------------------+-------------------+--------+----+------------+------------------+------------------+
| 4.941642429752162|                1.0|     1.0| 2.0|         3.0|          30.26057|         -97.73441|
|3.7135720910945516|                1.0|     1.0| 1.0|         2.0|30.456970000000002|-97.78421999999999|
| 5.267858164217968|                1.0|     2.0| 4.0|         4.0|          30.80862|           -98.374|
| 6.542471961947727|                1.0|     2.0| 3.0|         5.0|          30.25328|         -97.72968|
| 5.257495377236115|                3.0|     3.0| 3.0|         8.0|30.283540000000002|         -97.64966|
+------------------+-------------------+--------+----+------------+------------------+------------------+
only showing top 5 rows



                                                                                

## Split the DataFrame into Train and Test Parts (60/40 Percent)

In [6]:
# 60 % of the rows go to training and 40 % to testing; the fixed seed keeps the
# split repeatable.
train_df, test_df = listings_df.randomSplit([0.6, 0.4], seed=24)

## Perform Geographically Weighted Regression
Fit a Geographically Weighted Regression (GWR) model that predicts `log_price` from the listing features, using each listing's latitude and longitude as its location. The fitted model is then evaluated on the held-out test DataFrame.

In [7]:
# lampy provides the GWR model class and the Spark-session registration entry point.
from lampy import ML_GWR
from lampy import SparkRegistration

# Hand the active Spark session to lampy before any model is created.
SparkRegistration.set_spark_session(spark)

In [8]:
# Column names passed to the GWR model's fit/predict calls.
x_names = ['host_listings_count', 'bedrooms', 'beds', 'accommodates']  # explanatory variables
y_name = 'log_price'    # regression target
lat_name = 'latitude'   # listing location: latitude column
lon_name = 'longitude'  # listing location: longitude column

### Train GWR Model

In [None]:
# Create a GWR model and fit it on the training split, passing the coordinate
# columns, the explanatory columns and the target column by name.
model = ML_GWR()
# NOTE(review): the structure of `train_results` is not visible in this notebook and it is not used later.
train_results = model.fit(train_df, lat_name, lon_name, x_names, y_name)

### Test the Trained Model with Test DataFrame

In [None]:
# Predict log prices for the held-out listings.
pred_results = model.predict(test_df, lat_name, lon_name, x_names)
test_predictions = pred_results.predictions

# Flatten both vectors to 1-D before subtracting. If the predictions come back
# as a column vector of shape (n, 1), subtracting them from a 1-D target of
# length n would silently broadcast to an (n, n) matrix and corrupt both metrics.
# NOTE(review): assumes predictions are returned in the same row order as
# test_df.toPandas() — confirm.
y_test = np.asarray(test_df.select(y_name).toPandas()[y_name], dtype=float).ravel()
y_pred = np.asarray(test_predictions, dtype=float).ravel()
residuals = y_test - y_pred

mae = np.abs(residuals)
print('Mean Absolute Error:', round(np.mean(mae), 2))

mse = (residuals ** 2).mean()
rmse = np.sqrt(mse)
print("Root mean square error: " + str(rmse))

### Get the Model Parameters as a DataFrame

In [None]:
# Export the fitted model's parameters as a DataFrame. It can be written to any
# path and read back later to restore the model without refitting.
model_params_df = model.get_model_params() # You can write the model_params_df dataframe to any path for future use

### Load a New Model with the Previously Saved Parameters

In [None]:
# Create a fresh, unfitted model and restore its state from the exported parameters.
model_new = ML_GWR()
# Here the in-memory DataFrame is reused; in a new session, read model_params_df
# back from wherever it was stored first.
model_new.load_model_params(model_params_df) # First, you should read model_params_df from the path where it was stored

### Test the New Loaded Model with Test DataFrame

In [None]:
# Predict with the model restored from saved parameters; the metrics should
# match those of the originally fitted model.
pred_results = model_new.predict(test_df, lat_name, lon_name, x_names)
test_predictions = pred_results.predictions

# Flatten both vectors to 1-D before subtracting. If the predictions come back
# as a column vector of shape (n, 1), subtracting them from a 1-D target of
# length n would silently broadcast to an (n, n) matrix and corrupt both metrics.
# NOTE(review): assumes predictions are returned in the same row order as
# test_df.toPandas() — confirm.
y_test = np.asarray(test_df.select(y_name).toPandas()[y_name], dtype=float).ravel()
y_pred = np.asarray(test_predictions, dtype=float).ravel()
residuals = y_test - y_pred

mae = np.abs(residuals)
print('Mean Absolute Error:', round(np.mean(mae), 2))

mse = (residuals ** 2).mean()
rmse = np.sqrt(mse)
print("Root mean square error: " + str(rmse))

## Perform Post-Processing on Output and Test Data for Visualization

In [None]:
# Flatten the predictions into a plain Python list of floats so each value can
# be looked up by row index. ravel() (unlike squeeze) still yields a list when
# there is only a single prediction.
test_predictions = np.asarray(test_predictions, dtype=float).ravel().tolist()


def get_pred_column_value(id):
    """Return the prediction stored at position ``id`` of ``test_predictions``."""
    return test_predictions[id]


# Declare the return type explicitly. Without it the UDF produces a StringType
# column, and a later numeric na.fill(0) on the predicted price is silently
# skipped because it only applies to numeric columns.
udf_get_prediction = udf(get_pred_column_value, DoubleType())

In [None]:
# Attach a sequential row index so each row can be paired with its prediction.
# NOTE(review): assumes zipWithIndex enumerates rows in the same order in which
# the predictions were produced — confirm.
indexed_rows = test_df.rdd.zipWithIndex().toDF()
test_df = indexed_rows.select(col("_1.*"), col("_2").alias('idx'))

# Look up each row's prediction by its index.
test_df = test_df.withColumn("predicted_log_price", udf_get_prediction(col("idx")))
test_df.createOrReplaceTempView("test_df")

# Keep the actual and predicted log prices and turn the coordinates into a
# Sedona point geometry (longitude first, then latitude).
test_df = spark.sql("select idx, log_price, predicted_log_price, ST_Point(double(longitude), double(latitude)) as airbnb_loc from test_df")
test_df.createOrReplaceTempView("test_df")
test_df.show(5)

### Loading Shape File for Austin Zip Code Boundaries

In [None]:
# Read the Austin boundary shapefile into a Sedona geometry RDD, convert it to
# a DataFrame and keep only the geometry column.
austin_rdd = ShapefileReader.readToGeometryRDD(sc, "s3a://wherobots-examples/data/austin_boundaries")
austin_df = adp.toDf(austin_rdd, spark).select("geometry")
austin_df.createOrReplaceTempView("austin_df")
austin_df.show(5)

### Find Average Predicted Airbnb Price in Each Zip Code Boundary

In [None]:
# Left outer spatial join: every boundary is kept, paired with each test
# listing whose point lies inside it (nulls where a boundary has no listings).
listings_per_zone = spark.sql("select a.geometry as geometry, b.predicted_log_price as pred_price, b.log_price as price from austin_df a left outer join test_df b on ST_Contains(a.geometry, b.airbnb_loc) == True")

# Replace missing prices with 0 so boundaries without listings average to 0.
# NOTE(review): a numeric fill only affects numeric columns — confirm that
# pred_price is numeric at this point.
joined_df = listings_per_zone.na.fill(value=0, subset=["pred_price", "price"])
joined_df.createOrReplaceTempView("joined_df")

# Average the predicted and actual log prices per boundary geometry.
joined_df = spark.sql("select geometry, avg(pred_price) as avg_pred_price, avg(price) as avg_price from joined_df group by geometry")
joined_df.createOrReplaceTempView("joined_df")
joined_df.show(5)

## Visualize Predicted Average Airbnb Price in Each Zip Code Area

In [None]:
# Collect the aggregated result as pandas and wrap it in a GeoDataFrame with
# WGS84 (EPSG:4326) as the coordinate reference system.
joined_pdf = joined_df.toPandas()
joined_gdf = gpd.GeoDataFrame(joined_pdf, geometry="geometry", crs="EPSG:4326")

# Choropleth of the average predicted log price per boundary.
joined_gdf.plot(
    column='avg_pred_price',
    cmap='OrRd',
    edgecolor='k',
    legend=True,
)

In [None]:
%matplot plt