# Spark in Action - Chapter 3 Python Version - Lab 210

In [None]:
import os
import json
import logging
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [None]:
# Create the Spark session for this lab, or reuse one that is already running.
# The "local[*]" master runs Spark inside this process.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Schema introspection for restaurants in Wake County, NC")
    .getOrCreate()
)

# Keep the notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel('warn')

22/10/22 09:28:12 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
22/10/22 09:28:12 INFO SharedState: Warehouse path is 'file:/Users/development/ml/Spark/spark-warehouse'.
22/10/22 09:28:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
def get_absolute_file_path(path: str, filename: str) -> str:
    """Return the absolute path of ``filename`` located in directory ``path``.

    The result is anchored at the current working directory of the process.

    Args:
        path: Directory holding the file, relative to the current working
            directory. A trailing path separator is optional.
        filename: Name of the file inside ``path``.

    Returns:
        The working directory, ``path`` and ``filename`` joined into a
        single path string.
    """
    # Join the components instead of concatenating them as strings, so a
    # directory given without a trailing separator still yields
    # ".../dir/file" rather than ".../dirfile".
    return os.path.join(os.getcwd(), path, filename)

In [None]:
# Locate the Wake County restaurant dataset relative to the working directory.
path = 'net.jgp.books.spark.ch03/data/'
filename = 'Restaurants_in_Wake_County_NC.csv'
absolute_file_path = get_absolute_file_path(path, filename)

# Load the CSV into a dataframe: the first row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    absolute_file_path,
    inferSchema=True,
    header=True,
)

In [None]:
# Normalize the dataframe: tag every row with its county, then rename the
# upper-case source columns to the names used in the rest of the lab.
column_renames = [
    ("HSISID", "datasetId"),
    ("NAME", "name"),
    ("ADDRESS1", "address1"),
    ("ADDRESS2", "address2"),
    ("CITY", "city"),
    ("STATE", "state"),
    ("POSTALCODE", "zip"),
    ("PHONENUMBER", "tel"),
    ("RESTAURANTOPENDATE", "dateStart"),
    ("FACILITYTYPE", "type"),
    ("X", "geoX"),
    ("Y", "geoY"),
]

df = df.withColumn("county", F.lit("Wake"))
for source_name, target_name in column_renames:
    df = df.withColumnRenamed(source_name, target_name)

In [None]:
# Build the "id" column as <state>_<county>_<datasetId>.
separator = F.lit("_")
id_parts = [
    F.col("state"),
    separator,
    F.col("county"),
    separator,
    F.col("datasetId"),
]
df = df.withColumn("id", F.concat(*id_parts))

In [None]:
# Announce the next output via the root logger, then print the dataframe's
# schema as an indented tree.
logging.warning("*** Schema as a tree:")
df.printSchema()



root
 |-- OBJECTID: integer (nullable = true)
 |-- datasetId: long (nullable = true)
 |-- name: string (nullable = true)
 |-- address1: string (nullable = true)
 |-- address2: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- tel: string (nullable = true)
 |-- dateStart: timestamp (nullable = true)
 |-- type: string (nullable = true)
 |-- PERMITID: integer (nullable = true)
 |-- geoX: double (nullable = true)
 |-- geoY: double (nullable = true)
 |-- GEOCODESTATUS: string (nullable = true)
 |-- county: string (nullable = false)
 |-- id: string (nullable = true)



In [None]:
# Log the schema's string representation.
schema_message = "*** Schema as string: {}".format(df.schema)
logging.warning(schema_message)

# Serialize the schema to a JSON string, then parse it back into plain
# Python objects so it can be pretty-printed later.
schemaAsJson = df.schema.json()
parsedSchemaAsJson = json.loads(schemaAsJson)



In [None]:
# Pretty-print the parsed schema with a two-space indent and log it.
pretty_schema = json.dumps(parsedSchemaAsJson, indent=2)
logging.warning("*** Schema as JSON: {}".format(pretty_schema))

  "fields": [
    {
      "metadata": {},
      "name": "OBJECTID",
      "nullable": true,
      "type": "integer"
    },
    {
      "metadata": {},
      "name": "datasetId",
      "nullable": true,
      "type": "long"
    },
    {
      "metadata": {},
      "name": "name",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "address1",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "address2",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "city",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "state",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "zip",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "tel",
      "nullable": true,
      "type": "string"
    },
    {
      "metadata": {},
      "

In [None]:
# Stop the Spark session now that the lab is finished.
spark.stop()