# Spark in Action - Chapter 3 Python Version - Lab 230

In [None]:
import os
import logging
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) a Spark session that runs locally with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("Union of two dataframes")
    .master("local[*]")
    .getOrCreate()
)

# Lower Spark's log verbosity to the warn level for the rest of the notebook.
spark.sparkContext.setLogLevel('warn')

22/10/22 09:49:07 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
22/10/22 09:49:07 INFO SharedState: Warehouse path is 'file:/Users/development/ml/Spark/spark-warehouse'.
22/10/22 09:49:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
def get_absolute_file_path(path, filename):
    """Return the location of ``filename`` inside ``path``, anchored at the cwd.

    Args:
        path: Directory that holds the file, relative to the current working
            directory. A trailing separator is optional.
        filename: Name of the file inside ``path``.

    Returns:
        The joined path as a string. If ``path`` is itself absolute, the
        current working directory is ignored (standard ``os.path.join``
        behaviour).
    """
    # The current working directory is the base here; a notebook has no
    # usable __file__ to derive the script directory from.
    # os.path.join only inserts a separator where one is missing, so both
    # "data/" and "data" resolve to the same file instead of "dataNAME".
    return os.path.join(os.getcwd(), path, filename)

In [None]:
# Resolve where the Durham County restaurants dataset lives on disk.
path = 'net.jgp.books.spark.ch03/data/'
filename = 'Restaurants_in_Durham_County_NC.json'
absolute_file_path = get_absolute_file_path(path=path, filename=filename)

In [None]:
# Ingest Restaurants_in_Durham_County_NC.json into a dataframe, then
# preview its first rows, its schema and its size.
df = spark.read.json(absolute_file_path)

logging.warning("*** Right after ingestion")
df.show(5)
df.printSchema()

record_count = df.count()
logging.warning("We have {} records.".format(record_count))



+----------------+--------------------+--------------------+--------------------+--------------------+
|       datasetid|              fields|            geometry|    record_timestamp|            recordid|
+----------------+--------------------+--------------------+--------------------+--------------------+
|restaurants-data|{null, Full-Servi...|{[-78.9573299, 35...|2017-07-13T09:15:...|1644654b953d1802c...|
|restaurants-data|{null, Nursing Ho...|{[-78.8895483, 36...|2017-07-13T09:15:...|93573dbf8c9e799d8...|
|restaurants-data|{null, Fast Food ...|{[-78.9593263, 35...|2017-07-13T09:15:...|0d274200c7cef50d0...|
|restaurants-data|{null, Full-Servi...|{[-78.9060312, 36...|2017-07-13T09:15:...|cf3e0b175a6ebad2a...|
|restaurants-data|{null, null, [36....|{[-78.9135175, 36...|2017-07-13T09:15:...|e796570677f7c39cc...|
+----------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

root
 |-- datasetid: string (nullable = true)
 |



In [None]:
# Promote values from the nested "fields" struct to top-level columns.
# Each entry is (new column name, column expression); entries are applied
# one after another, in the order listed.
# NOTE(review): the ingested data already has a `datasetid` column; judging
# by the printed schema, "datasetId" takes its place - confirm that is intended.
derived_columns = [
    ("county", F.lit("Durham")),
    ("datasetId", F.col("fields.id")),
    ("name", F.col("fields.premise_name")),
    ("address1", F.col("fields.premise_address1")),
    ("address2", F.col("fields.premise_address2")),
    ("city", F.col("fields.premise_city")),
    ("state", F.col("fields.premise_state")),
    ("zip", F.col("fields.premise_zip")),
    ("tel", F.col("fields.premise_phone")),
    ("dateStart", F.col("fields.opening_date")),
    ("dateEnd", F.col("fields.closing_date")),
    # Keep only the part of the description that follows the " - " separator.
    ("type", F.split(F.col("fields.type_description"), " - ").getItem(1)),
    # geolocation is an array; its first two elements become geoX and geoY.
    ("geoX", F.col("fields.geolocation").getItem(0)),
    ("geoY", F.col("fields.geolocation").getItem(1)),
]

for column_name, column_expr in derived_columns:
    df = df.withColumn(column_name, column_expr)

In [None]:
# Build a composite identifier shaped like <state>_<county>_<datasetId>.
separator = F.lit("_")
id_parts = [
    F.col("state"), separator,
    F.col("county"), separator,
    F.col("datasetId"),
]
df = df.withColumn("id", F.concat(*id_parts))

In [None]:
# Show the new identifier next to the columns it was built from,
# followed by the full schema of the transformed dataframe.
logging.warning("*** Dataframe transformed")
preview_columns = ("id", "state", "county", "datasetId")
id_preview = df.select(*preview_columns)
id_preview.show(5)
df.printSchema()



+---------------+-----+------+---------+
|             id|state|county|datasetId|
+---------------+-----+------+---------+
|NC_Durham_56060|   NC|Durham|    56060|
|NC_Durham_58123|   NC|Durham|    58123|
|NC_Durham_70266|   NC|Durham|    70266|
|NC_Durham_97837|   NC|Durham|    97837|
|NC_Durham_60690|   NC|Durham|    60690|
+---------------+-----+------+---------+
only showing top 5 rows

root
 |-- datasetId: string (nullable = true)
 |-- fields: struct (nullable = true)
 |    |-- closing_date: string (nullable = true)
 |    |-- est_group_desc: string (nullable = true)
 |    |-- geolocation: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- hours_of_operation: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- insp_freq: long (nullable = true)
 |    |-- opening_date: string (nullable = true)
 |    |-- premise_address1: string (nullable = true)
 |    |-- premise_address2: string (nullable = true)
 |    |-- premise_city: strin

In [None]:
# Report how many partitions back the dataframe before repartitioning.
logging.warning("*** Looking at partitions")
underlying_rdd = df.rdd
partitionCount = underlying_rdd.getNumPartitions()
before_message = "Partition count before repartition: {}".format(partitionCount)
logging.warning(before_message)




In [None]:
# Redistribute the dataframe over four partitions and log the resulting count.
target_partitions = 4
df = df.repartition(target_partitions)

partitions_after = df.rdd.getNumPartitions()
logging.warning("Partition count after repartition: {}".format(partitions_after))




In [None]:
# The lab is finished: stop the Spark session created at the top of the notebook.
spark.stop()