# Process the rooftop data from the public S3 bucket in an EMR Jupyter notebook and save the result to our bucket

In [None]:
# Make sure the EMR cluster has been created and bootstrapped with the required packages

# The following packages need to be installed

In [None]:
# pip install pygeohash
# pip install shapely

In [1]:
import shapely # this is to change the geometry to latitude and longitude
import pygeohash as pgh # this is to change into geohash
from shapely import wkt

# Create the Spark session using the Python 3 kernel on the EMR Jupyter Notebook server

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# getOrCreate() hands back the session already running in this kernel, if any;
# otherwise it starts a new one under the application name 'rooftop'.
spark = SparkSession.builder.appName('rooftop').getOrCreate()

## S3 bucket where the input files are located. We will post-process the 2013 New York and Albany rooftop data files using Spark

In [None]:
# Individual Parquet part files in the oedi-data-lake bucket, one from the
# albany_ny_13 partition and one from the newyork_ny_13 partition.
input_file_rooftop_albany = "s3://oedi-data-lake/pv-rooftop/developable-planes/city_year=albany_ny_13/part-00007-63280d0e-7793-41b1-bef8-06c2e2c20a1c-c000.snappy.parquet"
# NOTE(review): this key ends in "-c0000" whereas the Albany key ends in "-c000";
# verify the object actually exists under this exact name.
input_file_rooftop_NY = "s3://oedi-data-lake/pv-rooftop/developable-planes/city_year=newyork_ny_13/part-00000-0baf61da-9506-4594-b3b1-6783ba8e64d5-c0000.snappy.parquet"

In [None]:
# S3 output prefix; ".parquet" is appended to it when the result is written.
# NOTE(review): the following cell rebinds output_file_tosave to a local path,
# so when the notebook is run top to bottom this S3 value is not the one used.
output_file_tosave  = 's3://data-est2-cap/rooftop/data_input_ny_albany'

In [None]:
# Local output prefix (presumably for runs outside EMR — confirm).
# NOTE(review): this rebinds output_file_tosave and therefore replaces the S3
# destination assigned in the previous cell; run only the one you intend.
output_file_tosave = "../../../Data/albany_13/final/final_ny"

In [None]:
# Loader option: read every Parquet file under the local albany_13 directory.
# NOTE(review): later cells also assign rooftop_input; the last loader cell run wins.
rooftop_input = spark.read.parquet("../../../Data/albany_13/")

In [4]:
# Loader option: read every Parquet file under the local rooftop_ny directory.
# NOTE(review): this rebinds rooftop_input; run only the loader cell you intend to use.
rooftop_input = spark.read.parquet("../../../Data/rooftop_ny/")

In [None]:
# Loader option: read the Albany and New York S3 part files into a single DataFrame.
# NOTE(review): this rebinds rooftop_input, replacing anything loaded by an earlier cell.
rooftop_input = spark.read.parquet(input_file_rooftop_albany, input_file_rooftop_NY)

In [5]:
# Print column names, types and nullability to check which fields are available.
rooftop_input.printSchema()

root
 |-- bldg_fid: long (nullable = true)
 |-- footprint_m2: double (nullable = true)
 |-- slope: long (nullable = true)
 |-- flatarea_m2: double (nullable = true)
 |-- slopeconversion: double (nullable = true)
 |-- slopearea_m2: double (nullable = true)
 |-- aspect: long (nullable = true)
 |-- gid: long (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- year: long (nullable = true)
 |-- the_geom_96703: string (nullable = true)
 |-- the_geom_4326: string (nullable = true)



## We use the_geom_4326 to compute the geohash. We need state, city, flatarea_m2, slopearea_m2, the_geom_4326

In [6]:
# Keep only the geometry, location and area columns used in the following cells.
df1 = rooftop_input.select("the_geom_4326", "state", "city", "flatarea_m2", "slopearea_m2")

# Get the total rooftop area by adding the slope area and the flat area

In [7]:
# Unrounded per-row total of flat and slope area.
# NOTE(review): the next cell recomputes df_total from df1 with rounding, so this
# result is overwritten and never used on a top-to-bottom run.
df_total = df1.withColumn("total_area_m2", F.col("flatarea_m2") + F.col("slopearea_m2"))

In [8]:
# Per-row total = flat area + slope area, rounded to 2 decimals.
# NOTE(review): the schema also carries `slopeconversion`; if slopearea_m2 is
# already flatarea_m2 scaled by that factor, adding both double-counts each
# plane — confirm this sum is the intended quantity.
df_total = df1.withColumn(
    "total_area_m2",
    F.round(df1["flatarea_m2"] + df1["slopearea_m2"], 2),
)

In [9]:
# Narrow to the three columns needed for the geohash aggregation.
df_total = df_total.select("the_geom_4326", "city", "total_area_m2")

In [None]:
# Preview the first 5 rows to sanity-check the computed total area.
df_total.show(5)

## This converts the_geom_4326 to a geohash. We use a precision of 6 (cells of roughly 1.2 km × 0.6 km) to limit the computational cost

In [10]:
def Geom4326ToGeo(the_geom_4326, precision=6):
    """Convert a WKT geometry string to a geohash.

    Args:
        the_geom_4326: WKT text of the geometry, with x as longitude and y as
            latitude. May be None or an empty string.
        precision: Number of characters in the returned geohash.

    Returns:
        The geohash of the geometry's reference point, or None when the input
        is missing or parses to an empty geometry.

    Raises:
        Whatever ``shapely.wkt.loads`` raises for malformed WKT text.
    """
    # The source column is nullable; yield a null geohash instead of letting
    # the WKT parser raise and fail the whole Spark task.
    if not the_geom_4326:
        return None

    geometry = shapely.wkt.loads(the_geom_4326)

    # An empty geometry (e.g. "POLYGON EMPTY") has no usable centroid or bounds.
    if geometry.is_empty:
        return None

    if geometry.geom_type == 'MultiPolygon':
        # bounds is (minx, miny, maxx, maxy): this takes the lower-left corner
        # of the bounding box as the reference point.
        # NOTE(review): this is not a centroid, unlike the branch below —
        # confirm the difference is intentional.
        min_lon, min_lat = geometry.bounds[0], geometry.bounds[1]
        latitude, longitude = min_lat, min_lon
    else:
        # Every other geometry type is represented by its centroid.
        centroid = geometry.centroid
        latitude, longitude = centroid.y, centroid.x

    return pgh.encode(latitude, longitude, precision=precision)
    

In [11]:
# Wrap the converter as a Spark UDF, pinning the geohash precision to 6.
udf_geohash = F.udf(
    lambda geom_wkt: Geom4326ToGeo(geom_wkt, precision=6)
)

In [12]:
# Add a 'geohash' column computed per row from the WKT geometry column.
df_geohash = df_total.withColumn(
    'geohash',
    udf_geohash(F.col("the_geom_4326")),
)

In [13]:
# Sum the per-row areas for each (geohash, city) pair, keeping the column name total_area_m2.
df_geohash_final = (
    df_geohash
    .groupBy("geohash", "city")
    .agg(F.sum("total_area_m2").alias("total_area_m2"))
)

In [14]:
# Preview 4 aggregated rows. show() is an action: it forces the geohash UDF and
# the groupBy to actually run, so this cell can take a long time on the full input.
df_geohash_final.show(4)

KeyboardInterrupt: 

In [None]:
# Round the summed area of each group to 2 decimals.
df_geohash_final = df_geohash_final.withColumn(
    "total_area_m2",
    F.round(F.col("total_area_m2"), 2),
)

In [None]:
# Write the per-geohash totals as Parquet, replacing any previous output at the same path.
(
    df_geohash_final
    .write
    .mode("overwrite")
    .parquet(f"{output_file_tosave}.parquet")
)

In [None]:
#df_geohash_final.write.mode("overwrite").parquet("rootop_data.parquet")