# Spark in Action - Chapter 3 Scala Version - Lab 230

In [None]:
import org.apache.spark.sql.functions.{col, concat, lit, split}
import org.apache.spark.sql.{Dataset, Row, SparkSession}

Initializing Scala interpreter ...

Spark Web UI available at http://192.168.1.41:4041
SparkContext available as 'sc' (version = 3.3.0, master = local[*], app id = local-1667056989974)
SparkSession available as 'spark'


import org.apache.spark.sql.functions.{col, concat, lit, split}
import org.apache.spark.sql.{Dataset, Row, SparkSession}


In [None]:
// Obtains the Spark session for this lab on a local master ("local[*]").
// getOrCreate() returns the already-running session when there is one.
val spark = SparkSession.builder()
  .appName("Union of two dataframes")
  .master("local[*]")
  .getOrCreate()

22/10/29 17:23:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@35aab3d4


In [None]:
// Wake County restaurants, read from a CSV file whose first line holds the
// column names. No schema inference is requested, so the columns are loaded
// as strings. Declared as a val because the reference is never reassigned.
val df1 = spark
            .read
            .format("csv")
            .option("header", "true")
            .load("../net.jgp.books.spark.ch03/data/Restaurants_in_Wake_County_NC.csv")

df1: org.apache.spark.sql.DataFrame = [OBJECTID: string, HSISID: string ... 13 more fields]


In [None]:
// Durham County restaurants, read from a JSON file; most of the restaurant
// attributes sit in the nested "fields" struct. Declared as a val because the
// reference is never reassigned.
val df2 = spark
            .read
            .format("json")
            .load("../net.jgp.books.spark.ch03/data/Restaurants_in_Durham_County_NC.json")

df2: org.apache.spark.sql.DataFrame = [datasetid: string, fields: struct<closing_date: string, est_group_desc: string ... 21 more fields> ... 3 more fields]


In [None]:
/**
* Builds the dataframe containing the Wake county restaurants.
*
* Renames the source columns to the schema shared with the Durham county
* dataframe, adds a constant `county` column, an empty `dateEnd` column and an
* `id` column built as `<state>_<county>_<datasetId>`.
*
* @param df Raw dataframe loaded from the Wake county CSV file
* @return A dataframe using the common restaurant schema
*/
private def buildWakeRestaurantsDataframe(df: Dataset[Row]): Dataset[Row] = {
    val droppedColumns = List("OBJECTID", "GEOCODESTATUS", "PERMITID")

    df.withColumn("county", lit("Wake"))
        .withColumnRenamed("HSISID", "datasetId")
        .withColumnRenamed("NAME", "name")
        .withColumnRenamed("ADDRESS1", "address1")
        .withColumnRenamed("ADDRESS2", "address2")
        .withColumnRenamed("CITY", "city")
        .withColumnRenamed("STATE", "state")
        .withColumnRenamed("POSTALCODE", "zip")
        .withColumnRenamed("PHONENUMBER", "tel")
        .withColumnRenamed("RESTAURANTOPENDATE", "dateStart")
        // A bare lit(null) produces an untyped ("void") column; cast it so that
        // dateEnd is a string here, like the Durham dateEnd column.
        .withColumn("dateEnd", lit(null).cast("string"))
        .withColumnRenamed("FACILITYTYPE", "type")
        .withColumnRenamed("X", "geoX")
        .withColumnRenamed("Y", "geoY")
        // The CSV columns are loaded as strings; cast the coordinates to double
        // so they have the same type as the Durham geoX/geoY columns.
        .withColumn("geoX", col("geoX").cast("double"))
        .withColumn("geoY", col("geoY").cast("double"))
        .drop(droppedColumns: _*)
        .withColumn("id",
            concat(col("state"), lit("_"), col("county"), lit("_"), col("datasetId")))
    // Append .repartition(4) to the chain above if you want to play with
    // repartitioning.
}

val wakeRestaurantsDf = buildWakeRestaurantsDataframe(df1)

wakeRestaurantsDf: org.apache.spark.sql.DataFrame = [datasetId: string, name: string ... 13 more fields]


In [None]:
/**
* Builds the dataframe containing the Durham county restaurants.
*
* Lifts the attributes of the nested `fields` struct into top-level columns,
* adds a constant `county` column, drops the source-only columns and adds an
* `id` column built as `<state>_<county>_<datasetId>`.
*
* @param df Raw dataframe loaded from the Durham county JSON file
* @return A dataframe with the flattened restaurant columns
*/
private def buildDurhamRestaurantsDataframe(df: Dataset[Row]) = {
    // Target column -> attribute of the "fields" struct that is copied as is.
    // The order matters: columns are added in this sequence.
    val copiedFields = Seq(
        "datasetId" -> "id",
        "name"      -> "premise_name",
        "address1"  -> "premise_address1",
        "address2"  -> "premise_address2",
        "city"      -> "premise_city",
        "state"     -> "premise_state",
        "zip"       -> "premise_zip",
        "tel"       -> "premise_phone",
        "dateStart" -> "opening_date",
        "dateEnd"   -> "closing_date")

    val withCounty = df.withColumn("county", lit("Durham"))
    val flattened = copiedFields.foldLeft(withCounty) { case (acc, (target, source)) =>
        acc.withColumn(target, col(s"fields.$source"))
    }

    // The type is the second part of the description once split on " - ";
    // geoX and geoY are the first and second items of the geolocation.
    val descriptionParts = split(col("fields.type_description"), " - ")
    val geolocation = col("fields.geolocation")
    val located = flattened
        .withColumn("type", descriptionParts.getItem(1))
        .withColumn("geoX", geolocation.getItem(0))
        .withColumn("geoY", geolocation.getItem(1))

    val trimmed = located.drop("fields", "geometry", "record_timestamp", "recordid")
    val idColumn = concat(col("state"), lit("_"), col("county"), lit("_"), col("datasetId"))

    // Add .repartition(4) to the expression below if you want to play with
    // repartitioning.
    trimmed.withColumn("id", idColumn)
}

val durhamRestaurantsDf = buildDurhamRestaurantsDataframe(df2)

durhamRestaurantsDf: org.apache.spark.sql.DataFrame = [datasetId: string, county: string ... 13 more fields]


In [None]:
// Print the column names and types of the Wake county dataframe
wakeRestaurantsDf.printSchema()

root
 |-- datasetId: string (nullable = true)
 |-- name: string (nullable = true)
 |-- address1: string (nullable = true)
 |-- address2: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- tel: string (nullable = true)
 |-- dateStart: string (nullable = true)
 |-- type: string (nullable = true)
 |-- geoX: string (nullable = true)
 |-- geoY: string (nullable = true)
 |-- county: string (nullable = false)
 |-- dateEnd: void (nullable = true)
 |-- id: string (nullable = true)



In [None]:
// Print the column names and types of the Durham county dataframe
durhamRestaurantsDf.printSchema()

root
 |-- datasetId: string (nullable = true)
 |-- county: string (nullable = false)
 |-- name: string (nullable = true)
 |-- address1: string (nullable = true)
 |-- address2: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- tel: string (nullable = true)
 |-- dateStart: string (nullable = true)
 |-- dateEnd: string (nullable = true)
 |-- type: string (nullable = true)
 |-- geoX: double (nullable = true)
 |-- geoY: double (nullable = true)
 |-- id: string (nullable = true)



In [None]:
/**
* Unions two dataframes by column name and prints a summary of the result:
* its first five rows, its schema, its record count and its partition count.
*
* @param df1 Left dataframe of the union
* @param df2 Right dataframe, appended to the left one
*/
def combineDataframes(df1: Dataset[Row], df2: Dataset[Row]): Unit = {
    val combined = df1.unionByName(df2)

    combined.show(5)
    combined.printSchema()

    val recordCount = combined.count()
    println(s"We have $recordCount records.")

    println(s"Partition count: ${combined.rdd.getNumPartitions}")
}

combineDataframes: (df1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], df2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row])Unit


In [None]:
// Union the Wake and Durham dataframes and print a summary of the result
combineDataframes(wakeRestaurantsDf, durhamRestaurantsDf)

+-----------+--------------------+--------------------+--------+-----------+-----+----------+--------------+--------------------+-----------------+------------+-----------+------+-------+-------------------+
|  datasetId|                name|            address1|address2|       city|state|       zip|           tel|           dateStart|             type|        geoX|       geoY|county|dateEnd|                 id|
+-----------+--------------------+--------------------+--------+-----------+-----+----------+--------------+--------------------+-----------------+------------+-----------+------+-------+-------------------+
|04092016024|                WABA|2502 1/2 HILLSBOR...|    null|    RALEIGH|   NC|     27607|(919) 833-1710|2011-10-18T00:00:...|       Restaurant|-78.66818477|35.78783803|  Wake|   null|NC_Wake_04092016024|
|04092021693|  WALMART DELI #2247|2010 KILDAIRE FAR...|    null|       CARY|   NC|     27518|(919) 852-6651|2011-11-08T00:00:...|       Food Stand|-78.78211173|35.73717

In [None]:
// Stop the Spark session now that the lab is finished
spark.stop()