# Spark in Action - Chapter 3 Scala Version - Lab 210

In [None]:
import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.SparkSession

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.1.41:4040
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1666423942219)
SparkSession available as 'spark'


import org.apache.spark.sql.functions.concat
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.SparkSession


In [None]:
// Obtains the Spark session used by this lab. getOrCreate hands back an
// already-running session when one exists rather than building a new one.
val spark = SparkSession
  .builder()
  .appName("Schema introspection for restaurants in Wake County, NC")
  .master("local")
  .getOrCreate()

22/10/22 09:33:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6a9c68b2


In [None]:
// Loads the Wake County restaurants CSV into a dataframe, treating the
// first line of the file as the header row.
// Declared as a var because the cells below reassign df after each
// transformation.
var df = spark
  .read
  .format("csv")
  .option("header", "true")
  .load("net.jgp.books.spark.ch03/data/Restaurants_in_Wake_County_NC.csv")

df: org.apache.spark.sql.DataFrame = [OBJECTID: string, HSISID: string ... 13 more fields]


In [None]:
// Normalizes the dataframe: adds a constant "county" column, then renames
// the source columns. Renames are applied in the order listed, starting
// from the dataframe that already carries the new "county" column.
df = Seq(
  "HSISID" -> "datasetId",
  "NAME" -> "name",
  "ADDRESS1" -> "address1",
  "ADDRESS2" -> "address2",
  "CITY" -> "city",
  "STATE" -> "state",
  "POSTALCODE" -> "zip",
  "PHONENUMBER" -> "tel",
  "RESTAURANTOPENDATE" -> "dateStart",
  "FACILITYTYPE" -> "type",
  "X" -> "geoX",
  "Y" -> "geoY"
).foldLeft(df.withColumn("county", lit("Wake"))) {
  case (renamed, (oldName, newName)) => renamed.withColumnRenamed(oldName, newName)
}

df: org.apache.spark.sql.DataFrame = [OBJECTID: string, datasetId: string ... 14 more fields]


In [None]:
// Adds an "id" column built as <state>_<county>_<datasetId>.
df = df.withColumn(
  "id",
  concat(df("state"), lit("_"), df("county"), lit("_"), df("datasetId"))
)

df: org.apache.spark.sql.DataFrame = [OBJECTID: string, datasetId: string ... 15 more fields]


In [None]:
// Captures the transformed dataframe's schema so it can be printed
// in several representations (tree, string, JSON).
val schema = df.schema

schema: org.apache.spark.sql.types.StructType = StructType(StructField(OBJECTID,StringType,true),StructField(datasetId,StringType,true),StructField(name,StringType,true),StructField(address1,StringType,true),StructField(address2,StringType,true),StructField(city,StringType,true),StructField(state,StringType,true),StructField(zip,StringType,true),StructField(tel,StringType,true),StructField(dateStart,StringType,true),StructField(type,StringType,true),StructField(PERMITID,StringType,true),StructField(geoX,StringType,true),StructField(geoY,StringType,true),StructField(GEOCODESTATUS,StringType,true),StructField(county,StringType,false),StructField(id,StringType,true))


In [None]:
// Prints the schema as an indented tree, one line per column.
println("*** Schema as a tree:")
println(schema.treeString)

*** Schema as a tree:
root
 |-- OBJECTID: string (nullable = true)
 |-- datasetId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address1: string (nullable = true)
 |-- address2: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- tel: string (nullable = true)
 |-- dateStart: string (nullable = true)
 |-- type: string (nullable = true)
 |-- PERMITID: string (nullable = true)
 |-- geoX: string (nullable = true)
 |-- geoY: string (nullable = true)
 |-- GEOCODESTATUS: string (nullable = true)
 |-- county: string (nullable = false)
 |-- id: string (nullable = true)



In [None]:
// Renders every field of the schema back to back in a single string;
// mkString is called without a separator, so fields are not delimited.
val schemaAsString = schema.mkString
println(s"*** Schema as string: $schemaAsString")

*** Schema as string: StructField(OBJECTID,StringType,true)StructField(datasetId,StringType,true)StructField(name,StringType,true)StructField(address1,StringType,true)StructField(address2,StringType,true)StructField(city,StringType,true)StructField(state,StringType,true)StructField(zip,StringType,true)StructField(tel,StringType,true)StructField(dateStart,StringType,true)StructField(type,StringType,true)StructField(PERMITID,StringType,true)StructField(geoX,StringType,true)StructField(geoY,StringType,true)StructField(GEOCODESTATUS,StringType,true)StructField(county,StringType,false)StructField(id,StringType,true)


schemaAsString: String = StructField(OBJECTID,StringType,true)StructField(datasetId,StringType,true)StructField(name,StringType,true)StructField(address1,StringType,true)StructField(address2,StringType,true)StructField(city,StringType,true)StructField(state,StringType,true)StructField(zip,StringType,true)StructField(tel,StringType,true)StructField(dateStart,StringType,true)StructField(type,StringType,true)StructField(PERMITID,StringType,true)StructField(geoX,StringType,true)StructField(geoY,StringType,true)StructField(GEOCODESTATUS,StringType,true)StructField(county,StringType,false)StructField(id,StringType,true)


In [None]:
// Renders the schema as pretty-printed (multi-line, indented) JSON.
val schemaAsJson = schema.prettyJson
println(s"*** Schema as JSON: $schemaAsJson")

*** Schema as JSON: {
  "type" : "struct",
  "fields" : [ {
    "name" : "OBJECTID",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "datasetId",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "name",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "address1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "address2",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "city",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "state",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "zip",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "tel",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "dateStart",
    "type" : "string",
    "nullable" : tru

schemaAsJson: String =
{
  "type" : "struct",
  "fields" : [ {
    "name" : "OBJECTID",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "datasetId",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "name",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "address1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "address2",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "city",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "state",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "zip",
    "type" : "string",
    "nul...


In [None]:
// Stops the Spark session now that the lab is finished.
spark.stop()