# Spark in Action - Chapter 3 Python Version - Lab 200

In [None]:
import os
import logging
from pyspark.sql import SparkSession


In [None]:
#from pyspark.sql.functions import F
import pyspark.sql.functions as F

In [None]:
def get_absolute_file_path(path, filename):
    """Build the path to ``filename`` inside ``path``, anchored at the
    current working directory.

    Args:
        path: Directory holding the file, relative to the current working
            directory. A trailing separator is optional.
        filename: Name of the file inside ``path``.

    Returns:
        The result of joining the current working directory, ``path`` and
        ``filename`` with ``os.path.join``.
    """
    # The working directory is used as the anchor (rather than
    # os.path.dirname(__file__)).
    current_dir = os.getcwd()
    # os.path.join only inserts a separator when one is missing, so both
    # "data" and "data/" resolve to the same file.
    return os.path.join(current_dir, path, filename)

In [None]:
def main(spark):
    """Ingest the Wake County restaurants CSV, reshape it and inspect its
    partitioning.

    The function reads the CSV into a dataframe, logs and prints a preview,
    adds a constant ``county`` column, renames the source columns, drops
    three unused columns, builds an ``id`` column from state, county and
    dataset id, and finally repartitions the dataframe into 4 partitions,
    logging the partition count before and after.

    Args:
        spark: The Spark session used to read the CSV file.

    Returns:
        None. Results are printed via ``show``/``printSchema`` and logged.
    """
    # The processing code.
    filename = 'Restaurants_in_Wake_County_NC.csv'
    path = 'net.jgp.books.spark.ch03/data/'
    absolute_file_path = get_absolute_file_path(path, filename)

    # Reads a CSV file with header, called
    # Restaurants_in_Wake_County_NC.csv,
    # stores it in a dataframe
    df = spark.read.csv(path=absolute_file_path, header=True, inferSchema=True)

    # logging.warning is used throughout: logging.warn is a deprecated alias.
    logging.warning("*** Right after ingestion")
    df.show(5)
    df.printSchema()

    logging.warning("We have %s records.", df.count())

    # Let's transform our dataframe: add the county first, then map the
    # source column names onto the target names and drop unused columns.
    df = (
        df.withColumn("county", F.lit("Wake"))
        .withColumnRenamed("HSISID", "datasetId")
        .withColumnRenamed("NAME", "name")
        .withColumnRenamed("ADDRESS1", "address1")
        .withColumnRenamed("ADDRESS2", "address2")
        .withColumnRenamed("CITY", "city")
        .withColumnRenamed("STATE", "state")
        .withColumnRenamed("POSTALCODE", "zip")
        .withColumnRenamed("PHONENUMBER", "tel")
        .withColumnRenamed("RESTAURANTOPENDATE", "dateStart")
        .withColumnRenamed("FACILITYTYPE", "type")
        .withColumnRenamed("X", "geoX")
        .withColumnRenamed("Y", "geoY")
        .drop("OBJECTID", "PERMITID", "GEOCODESTATUS")
    )

    # Identifier of the form <state>_<county>_<datasetId>.
    df = df.withColumn(
        "id",
        F.concat(
            F.col("state"), F.lit("_"),
            F.col("county"), F.lit("_"),
            F.col("datasetId"),
        ),
    )

    # Shows at most 5 rows from the dataframe
    logging.warning("*** Dataframe transformed")
    df.show(5)

    # for book only: a narrower view with fewer columns
    book_df = df.drop(
        "address2", "zip", "tel", "dateStart",
        "geoX", "geoY", "address1", "datasetId",
    )

    # Second argument truncates cell values to 15 characters.
    book_df.show(5, 15)
    # end

    df.printSchema()

    logging.warning("*** Looking at partitions")
    partition_count = df.rdd.getNumPartitions()
    logging.warning("Partition count before repartition: %s", partition_count)

    df = df.repartition(4)

    logging.warning(
        "Partition count after repartition: %s", df.rdd.getNumPartitions()
    )


In [None]:
# Get or create the Spark session for this lab, running on a local master.
spark = (
    SparkSession.builder
    .appName("Restaurants in Wake County, NC")
    .master("local[*]")
    .getOrCreate()
)

22/10/22 01:24:49 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
22/10/22 01:24:49 INFO SharedState: Warehouse path is 'file:/Users/development/ml/Spark/spark-warehouse'.
22/10/22 01:24:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
# Comment this line to see full log
spark.sparkContext.setLogLevel('warn')
try:
    main(spark)
finally:
    # Stop the session even when main() raises, so a failed run does not
    # leave the Spark session running.
    spark.stop()

  logging.warn("*** Right after ingestion")


+--------+----------+--------------------+--------------------+--------+-----------+-----+----------+--------------+-------------------+-----------------+--------+------------+-----------+-------------+
|OBJECTID|    HSISID|                NAME|            ADDRESS1|ADDRESS2|       CITY|STATE|POSTALCODE|   PHONENUMBER| RESTAURANTOPENDATE|     FACILITYTYPE|PERMITID|           X|          Y|GEOCODESTATUS|
+--------+----------+--------------------+--------------------+--------+-----------+-----+----------+--------------+-------------------+-----------------+--------+------------+-----------+-------------+
|    1001|4092016024|                WABA|2502 1/2 HILLSBOR...|    null|    RALEIGH|   NC|     27607|(919) 833-1710|2011-10-18 02:00:00|       Restaurant|    6952|-78.66818477|35.78783803|            M|
|    1002|4092021693|  WALMART DELI #2247|2010 KILDAIRE FAR...|    null|       CARY|   NC|     27518|(919) 852-6651|2011-11-08 01:00:00|       Food Stand|    6953|-78.78211173|35.73717591|

  logging.warn("We have {} records.".format(df.count()))
  logging.warn("*** Dataframe transformed")


+----------+--------------------+--------------------+--------+-----------+-----+----------+--------------+-------------------+-----------------+------------+-----------+------+------------------+
| datasetId|                name|            address1|address2|       city|state|       zip|           tel|          dateStart|             type|        geoX|       geoY|county|                id|
+----------+--------------------+--------------------+--------+-----------+-----+----------+--------------+-------------------+-----------------+------------+-----------+------+------------------+
|4092016024|                WABA|2502 1/2 HILLSBOR...|    null|    RALEIGH|   NC|     27607|(919) 833-1710|2011-10-18 02:00:00|       Restaurant|-78.66818477|35.78783803|  Wake|NC_Wake_4092016024|
|4092021693|  WALMART DELI #2247|2010 KILDAIRE FAR...|    null|       CARY|   NC|     27518|(919) 852-6651|2011-11-08 01:00:00|       Food Stand|-78.78211173|35.73717591|  Wake|NC_Wake_4092021693|
|4092017012|CAR

  logging.warn("*** Looking at partitions")
  logging.warn("Partition count before repartition: {}".format(partitionCount))


+---------------+-----------+-----+---------------+------+---------------+
|           name|       city|state|           type|county|             id|
+---------------+-----------+-----+---------------+------+---------------+
|           WABA|    RALEIGH|   NC|     Restaurant|  Wake|NC_Wake_4092...|
|WALMART DELI...|       CARY|   NC|     Food Stand|  Wake|NC_Wake_4092...|
|CAROLINA SUS...|    RALEIGH|   NC|     Restaurant|  Wake|NC_Wake_4092...|
|THE CORNER V...|    RALEIGH|   NC|Mobile Food ...|  Wake|NC_Wake_4092...|
|   SUBWAY #3726|WAKE FOREST|   NC|     Restaurant|  Wake|NC_Wake_4092...|
+---------------+-----------+-----+---------------+------+---------------+
only showing top 5 rows

root
 |-- datasetId: long (nullable = true)
 |-- name: string (nullable = true)
 |-- address1: string (nullable = true)
 |-- address2: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- tel: string (nullable = tru

  logging.warn("Partition count after repartition: {}".format(df.rdd.getNumPartitions()))


In [None]:
# Stop the Spark session.
# NOTE(review): the preceding cell already calls spark.stop() after main();
# this call looks redundant — confirm it is still needed.
spark.stop()