# Part I. Gentle Overview of Big Data and Spark

*by Bill Chambers and Matei Zaharia* 

*Notebook authored by Tarek Allam Jr.*

## Download and install Spark

```
$ cd ~/Downloads
$ tar -xf spark-2.2.0-bin-hadoop2.7.tgz 
$ cd spark-2.2.0-bin-hadoop2.7
```

OR

```
$ brew install apache-spark
$ cat ~/.bashrc
...
# ==================================================================================================
#                                           SPARK
# ==================================================================================================
SPARK_VERSION_BREW=$(brew list --versions apache-spark | awk '{print $2}')
export SPARK_HOME=/usr/local/Cellar/apache-spark/$SPARK_VERSION_BREW/libexec
export PATH=$FINK_HOME/bin:$PATH

export SPARKLIB=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip
export PYTHONPATH="${SPARKLIB}:${FINK_HOME}:${FINK_HOME}/python:$PYTHONPATH"
export PATH="${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${PATH}"

# PySpark
export PYSPARK_SUBMIT_ARGS="--master local[*] pyspark-shell"

```

## Setup environment

In [4]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession via the builder (getOrCreate), then end
# the cell on the session object so the notebook displays it.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
spark

In [5]:
# Display the SparkContext attached to the session.
spark.sparkContext

# Chapter 2. A Gentle Introduction to Spark

In [6]:
# Build a DataFrame from spark.range(1000) and name its single column "number".
myRange = (
    spark
    .range(1000)
    .toDF("number")
)

In [7]:
# Keep only the rows of myRange where `number % 2 = 0` (the even values).
divisBy2 = myRange.where("number % 2 = 0")

In [8]:
# Read the 2015 flight summary CSV with the "header" and "inferSchema"
# reader options both set to "true".
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("../data/flight-data/csv/2015-summary.csv")
)

In [9]:
# Print rows of the flight data using show()'s default arguments.
flightData2015.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [10]:
# Register the DataFrame as the temp view "flight_data_2015" so that
# spark.sql(...) queries can refer to it by that name.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [11]:
# The same per-destination row count written two ways: as SQL against the
# temp view, and with the DataFrame API.
destination_count_sql = """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
sqlWay = spark.sql(destination_count_sql)

dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()

In [12]:
# Print the physical plan of each formulation so the two can be compared.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#14, 200)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#14] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/tallamjr/github/forks/Spark-The-Definitive-Guide/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>
== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#14, 200)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#14] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/tallamjr/github/forks/Spark-The-Definitive-Guide/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY

In [13]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import max`, which would rebind Python's builtin
# `max` for every cell that runs afterwards.
from pyspark.sql import functions as F

# Largest value of the `count` column; take(1) returns it as a list of Rows.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [14]:
# Top five destination countries by summed `count`, expressed in SQL.
top_destinations_sql = """
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
"""
maxSql = spark.sql(top_destinations_sql)

maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [15]:
from pyspark.sql.functions import desc

# DataFrame-API version of the top-five-destinations query: group, sum,
# rename the aggregate column, sort descending, limit, and print.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [16]:
# Same chain as the top-five-destinations query, but ending in explain() to
# print the physical plan instead of the rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#99L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#14,destination_total#99L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[sum(cast(count#16 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#14, 200)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_sum(cast(count#16 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#14,count#16] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/tallamjr/github/forks/Spark-The-Definitive-Guide/data/flight-data/c..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


# Chapter 3. A Tour of Spark’s Toolset

### Running Production Applications

> Spark makes it easy to develop and create big data programs. Spark also makes it easy to turn your interactive exploration into production applications with spark-submit, a built-in
command-line tool. spark-submit does one thing: it lets you send your application code to a cluster and launch it to execute there. Upon submission, the application will run until it exits (completes the task) or encounters an error. You can do this with all of Spark’s supported cluster managers including Standalone, Mesos, and YARN.
spark-submit offers several controls with which you can specify the resources your application needs as well as how it should be run and its command-line arguments.
You can write applications in any of Spark’s supported languages and then submit them for execution. The simplest example is running an application on your local machine. We’ll show this by running a sample Scala application that comes with Spark, using the following command in the directory where you downloaded Spark:
```
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master local \
./examples/jars/spark-examples_2.11-2.2.0.jar 10
```
> We can also run a Python version of the application using the following command:
```
./bin/spark-submit \
--master local \
./examples/src/main/python/pi.py 10
```

In [17]:
# Load every per-day retail CSV into one static DataFrame, using the header
# row and inferred column types.
staticDataFrame = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../data/retail-data/by-day/*.csv")
)

# Keep the schema in a variable and register the DataFrame as the temp view
# "retail_data".
staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

In [18]:
from pyspark.sql.functions import window, column, desc, col

# Sum total_cost (UnitPrice * Quantity) per customer per 1-day window of
# InvoiceDate, then print the five largest sums.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .sort(desc("sum(total_cost)"))
    .show(5)
)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   17450.0|[2011-09-20 01:00...|          71601.44|
|      null|[2011-11-14 00:00...|          55316.08|
|      null|[2011-11-07 00:00...|          42939.17|
|      null|[2011-03-29 01:00...| 33521.39999999998|
|      null|[2011-12-08 00:00...|31975.590000000007|
+----------+--------------------+------------------+
only showing top 5 rows



In [19]:
# Streaming reader over the same per-day CSV files, reusing the schema taken
# from the static DataFrame and setting maxFilesPerTrigger to 1.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load("../data/retail-data/by-day/*.csv")
)

In [20]:
# Streaming counterpart of the static aggregation: sum of total_cost per
# customer per window of InvoiceDate.
# NOTE: despite the "PerHour" in the variable name, the window is "1 day".
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [21]:
# Display the session's active streaming queries (before any query is started).
spark.streams.active

[]

In [22]:
# Start the streaming aggregation, writing to the "memory" format under the
# query name "customer_purchases" with output mode "complete".
activityQuery = (
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

In [23]:
# Display the session's active streaming queries again, now that one has been started.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x117080e50>]

In [24]:
# Query the customer_purchases table for the five largest sums.
# NOTE(review): the recorded output for this cell is empty — presumably the
# stream had not produced results yet when it ran.
top_purchases = spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
top_purchases.show(5)

+----------+------+---------------+
|CustomerId|window|sum(total_cost)|
+----------+------+---------------+
+----------+------+---------------+



In [26]:
# Re-run the same query against customer_purchases and print the five
# largest sums.
top_purchases = spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
top_purchases.show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   12415.0|[2011-03-03 00:00...|          16558.14|
|   15769.0|[2011-03-17 00:00...|           10065.0|
|      null|[2011-03-17 00:00...| 7876.000000000018|
|   12435.0|[2011-03-17 00:00...|3978.9899999999993|
|      null|[2011-03-03 00:00...| 3538.750000000001|
+----------+--------------------+------------------+
only showing top 5 rows



In [25]:
# Display whether the streaming query is currently active.
activityQuery.isActive

True

In [28]:
# Print the physical plan of the streaming query.
activityQuery.explain()

== Physical Plan ==
*(4) HashAggregate(keys=[CustomerID#338, window#186], functions=[sum(total_cost#176)])
+- StateStoreSave [CustomerID#338, window#186], state info [ checkpoint = file:/private/var/folders/9y/6kx7fns90pn84gtycx7dyl680000gn/T/temporary-f8e9cfcb-78d6-490e-b745-beb7e42ee19f/state, runId = 1c2f3a8f-cb07-43d1-93f2-7565269dbc99, opId = 0, ver = 4, numPartitions = 200], Complete, 0, 2
   +- *(3) HashAggregate(keys=[CustomerID#338, window#186], functions=[merge_sum(total_cost#176)])
      +- StateStoreRestore [CustomerID#338, window#186], state info [ checkpoint = file:/private/var/folders/9y/6kx7fns90pn84gtycx7dyl680000gn/T/temporary-f8e9cfcb-78d6-490e-b745-beb7e42ee19f/state, runId = 1c2f3a8f-cb07-43d1-93f2-7565269dbc99, opId = 0, ver = 4, numPartitions = 200], 2
         +- *(2) HashAggregate(keys=[CustomerID#338, window#186], functions=[merge_sum(total_cost#176)])
            +- Exchange hashpartitioning(CustomerID#338, window#186, 200)
               +- *(1) HashAggregat

In [29]:
# Stop the streaming query started earlier.
activityQuery.stop()

In [30]:
# Display the query's active flag again after stop() has been called.
activityQuery.isActive

False

* Reference — PySpark Structured Streaming API documentation: https://spark.apache.org/docs/2.4.5/api/python/pyspark.sql.html#module-pyspark.sql.streaming

In [31]:
from pyspark.sql.functions import date_format, col

# Fill nulls with 0, add a day_of_week column formatted from InvoiceDate with
# the "EEEE" pattern, and coalesce to 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .na.fill(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)


In [32]:
# Split by invoice date: rows before 2011-07-01 form the training set, rows
# on or after that date form the test set.
trainDataFrame = preppedDataFrame.where("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.where("InvoiceDate >= '2011-07-01'")

In [33]:
from pyspark.ml.feature import StringIndexer

# Indexer stage: reads day_of_week and writes day_of_week_index.
indexer = (
    StringIndexer()
    .setInputCol("day_of_week")
    .setOutputCol("day_of_week_index")
)

In [34]:
from pyspark.ml.feature import OneHotEncoder

# Encoder stage: reads day_of_week_index and writes day_of_week_encoded.
encoder = (
    OneHotEncoder()
    .setInputCol("day_of_week_index")
    .setOutputCol("day_of_week_encoded")
)

In [35]:
from pyspark.ml.feature import VectorAssembler

# Assembler stage: combines UnitPrice, Quantity and the encoded day of week
# into a single "features" column.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])
    .setOutputCol("features")
)

In [36]:
from pyspark.ml import Pipeline

# Chain the three feature stages in order: indexer, encoder, assembler.
feature_stages = [indexer, encoder, vectorAssembler]
transformationPipeline = Pipeline().setStages(feature_stages)

In [37]:
# Fit the transformation pipeline on the training split.
fittedPipeline = transformationPipeline.fit(trainDataFrame)

In [38]:
# Apply the fitted pipeline to the training split.
transformedTraining = fittedPipeline.transform(trainDataFrame)

In [39]:
from pyspark.ml.clustering import KMeans

# KMeans estimator configured with k=20 and a fixed seed of 42.
kmeans = (
    KMeans()
    .setK(20)
    .setSeed(42)
)

In [40]:
# Fit the KMeans estimator on the transformed training data.
kmModel = kmeans.fit(transformedTraining)

In [41]:
# Apply the pipeline fitted on the training split to the test split.
transformedTest = fittedPipeline.transform(testDataFrame)

In [42]:
from pyspark.sql import Row

# Parallelize three single-field Rows and convert the result to a DataFrame;
# the cell ends on the DataFrame so the notebook displays it.
(
    spark.sparkContext
    .parallelize([Row(1), Row(2), Row(3)])
    .toDF()
)

DataFrame[_1: bigint]

In [44]:
# Build the same three-row DataFrame, keep it in pDF, and display its type.
pDF = (
    spark.sparkContext
    .parallelize([Row(1), Row(2), Row(3)])
    .toDF()
)
type(pDF)

pyspark.sql.dataframe.DataFrame