In [None]:
# The Python packaging for Spark is not intended to replace all of the other use cases. 
# This Python packaged version of Spark is suitable for interacting with an existing cluster (be it Spark standalone, YARN, or Mesos) - but does not contain the tools required to setup your own standalone Spark cluster. 
# You can download the full version of Spark from the Apache Spark downloads page.
! pip install pyspark==2.4.4

In [None]:
# Below variables are to be set in the shell profile
# export SPARK_HOME=/Users/pmacharl/spark-2.4.4-bin-hadoop2.7
# export PATH=$PATH:$SPARK_HOME/bin
# export PYSPARK_SUBMIT_ARGS="pyspark-shell"
# export PYSPARK_DRIVER_PYTHON=/usr/local/bin/python3
# export PYSPARK_PYTHON=/usr/local/bin/python3

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkConf
config = SparkConf()
config.set("spark.driver.memory", "2g")
config.set("spark.executor.memory", "1g")

#Because you are likely running in local mode, it is a good practice to set the number of shuffle partitions
# to something that is going to fit local mode. By default, the value is 200, but there aren't many executors
# on this machine, its worth reducing this to 5
config.set("spark.sql.shuffle.partitions", "5")

# Cluster mode
# https://spark.apache.org/docs/latest/submitting-applications.html
# config.setMaster("spark://192.168.0.7:7077") # If spark is started in local cluster mode

<pyspark.conf.SparkConf at 0x10ae42e50>

In [2]:
spark = SparkSession.builder.config(conf=config).master("local").appName("MyApp").getOrCreate()
spark

In [4]:
df = spark.read.load("../height_weight.csv",
                     format="csv", sep=",", inferSchema="true", header="true")
df.show()

+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
+---+------+------+-----+-----+
|  M|    77|   182|   77|  180|
|  F|    58|   161|   51|  159|
|  F|    53|   161|   54|  158|
|  M|    68|   177|   70|  175|
|  F|    59|   157|   59|  155|
|  M|    76|   170|   76|  165|
|  M|    76|   167|   77|  165|
|  M|    69|   186|   73|  180|
|  M|    71|   178|   71|  175|
|  M|    65|   171|   64|  170|
|  M|    70|   175|   75|  174|
|  F|   166|    57|   56|  163|
|  F|    51|   161|   52|  158|
|  F|    64|   168|   64|  165|
|  F|    52|   163|   57|  160|
|  F|    65|   166|   66|  165|
|  M|    92|   187|  101|  185|
|  F|    62|   168|   62|  165|
|  M|    76|   197|   75|  200|
|  F|    61|   175|   61|  171|
+---+------+------+-----+-----+
only showing top 20 rows



# Create a temp table or view
- Temporary views in Spark SQL are session-scoped and will disappear if the session that creates it terminates (Note: You can have many Spark sessions)

In [5]:
df.createOrReplaceTempView("height_weight")

In [6]:
spark.sql("select * from height_weight").show()

+---+------+------+-----+-----+
|sex|weight|height|repwt|repht|
+---+------+------+-----+-----+
|  M|    77|   182|   77|  180|
|  F|    58|   161|   51|  159|
|  F|    53|   161|   54|  158|
|  M|    68|   177|   70|  175|
|  F|    59|   157|   59|  155|
|  M|    76|   170|   76|  165|
|  M|    76|   167|   77|  165|
|  M|    69|   186|   73|  180|
|  M|    71|   178|   71|  175|
|  M|    65|   171|   64|  170|
|  M|    70|   175|   75|  174|
|  F|   166|    57|   56|  163|
|  F|    51|   161|   52|  158|
|  F|    64|   168|   64|  165|
|  F|    52|   163|   57|  160|
|  F|    65|   166|   66|  165|
|  M|    92|   187|  101|  185|
|  F|    62|   168|   62|  165|
|  M|    76|   197|   75|  200|
|  F|    61|   175|   61|  171|
+---+------+------+-----+-----+
only showing top 20 rows



# Slightly more involved code 

In [7]:
df.groupBy("sex").count().show()

+---+-----+
|sex|count|
+---+-----+
|  F|  112|
|  M|   88|
+---+-----+



In [8]:
spark.sql("select count(*) from height_weight group by sex").show()

+--------+
|count(1)|
+--------+
|     112|
|      88|
+--------+



# Inspect the Physical plan for both
- Both are exactly the same
- There is no performance difference between writing SQL queries or writing DataFrame code, they both “compile” to the same underlying plan that we specify in DataFrame code

In [9]:
df.groupBy("sex").count().explain()

== Physical Plan ==
*(2) HashAggregate(keys=[sex#10], functions=[count(1)])
+- Exchange hashpartitioning(sex#10, 200)
   +- *(1) HashAggregate(keys=[sex#10], functions=[partial_count(1)])
      +- *(1) FileScan csv [sex#10] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/pmacharl/git-projects/personal/github.com/data_analysis_pandas_spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<sex:string>


In [10]:
spark.sql("select count(*) from height_weight group by sex").explain()

== Physical Plan ==
*(2) HashAggregate(keys=[sex#10], functions=[count(1)])
+- Exchange hashpartitioning(sex#10, 200)
   +- *(1) HashAggregate(keys=[sex#10], functions=[partial_count(1)])
      +- *(1) FileScan csv [sex#10] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/pmacharl/git-projects/personal/github.com/data_analysis_pandas_spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<sex:string>


# Global Temp View
- If you want to have a temporary view that is shared among all sessions and keep alive until the Spark application terminates, you can create a global temporary view. 
- Global temporary view is tied to a system preserved database global_temp, and we must use the qualified name to refer it, e.g. SELECT * FROM global_temp.view1.

In [11]:
df_real_estate = spark.read.load("../Real_Estate_Sales_2001-2017.csv", 
                                 format="csv", sep=",", header="true")

In [12]:
# Register the DataFrame as a global temporary view
df_real_estate.createGlobalTempView("real_estate_sales")

In [13]:
# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.real_estate_sales").show()

+------+------------+--------+--------------------+----------+--------------------+-------------+----------+-----------------+------------+---------------+----------------+--------------------+
|    ID|SerialNumber|ListYear|        DateRecorded|      Town|             Address|AssessedValue|SaleAmount|       SalesRatio|PropertyType|ResidentialType|      NonUseCode|             Remarks|
+------+------------+--------+--------------------+----------+--------------------+-------------+----------+-----------------+------------+---------------+----------------+--------------------+
|815906|      170177|    2017|04/05/1999 12:00:...|New London|      293 PEQUOT AVE|       132440|    252500|            0.525|        null|     Two Family|            null|                null|
|     2|      900035|    2009|07/20/2010 12:00:...|   Andover|     1 DOGWOOD DRIVE|        55600|     99000|0.561616161616162| Vacant Land|             NA|              NA|                  NA|
|     3|       14011|    2014|

In [14]:
# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.real_estate_sales").show()

+------+------------+--------+--------------------+----------+--------------------+-------------+----------+-----------------+------------+---------------+----------------+--------------------+
|    ID|SerialNumber|ListYear|        DateRecorded|      Town|             Address|AssessedValue|SaleAmount|       SalesRatio|PropertyType|ResidentialType|      NonUseCode|             Remarks|
+------+------------+--------+--------------------+----------+--------------------+-------------+----------+-----------------+------------+---------------+----------------+--------------------+
|815906|      170177|    2017|04/05/1999 12:00:...|New London|      293 PEQUOT AVE|       132440|    252500|            0.525|        null|     Two Family|            null|                null|
|     2|      900035|    2009|07/20/2010 12:00:...|   Andover|     1 DOGWOOD DRIVE|        55600|     99000|0.561616161616162| Vacant Land|             NA|              NA|                  NA|
|     3|       14011|    2014|

# Creating New Session (Indepth Internal and optional topic)
- If SparkSession.builder is used, the below way does NOT return new app. It returns the same app which was previously created. It is by design and single responsibility principle
- As discussed before there can only be one SparkContext per JVM and SparkSession is a wrapper around SparkContext
- A new session is created by calling .newSession() on existing session only
- Feel free to open the spark UI and you can see that "MyApp" will be the name of the app
- The way to have MyApp1 would be to execute spark.stop() and start new

**Although configuration option spark.driver.allowMultipleContexts exists, it is misleading because usage of multiple Spark contexts is discouraged. This option is used only for Spark internal tests and is not supposed to be used in user programs. You can get unexpected results while running more than one Spark context in a single JVM.**  
[StackOverFlow](https://stackoverflow.com/questions/32827333/spark-multiple-contexts)

In [20]:
spark_new = SparkSession.builder.config(conf=config).master("local").appName("MyApp1").getOrCreate()
spark_new # This is the same session as spark

In [23]:
spark == spark_new

True

# One more slightly complex example
- Top 3 PropertyType by TotalSales
- Observe that physical plan for both is exactly the same

In [15]:
df_real_estate = spark.read.load("../Real_Estate_Sales_2001-2017.csv", 
                                 format="csv", sep=",", header="true")
df_real_estate.printSchema()

root
 |-- ID: string (nullable = true)
 |-- SerialNumber: string (nullable = true)
 |-- ListYear: string (nullable = true)
 |-- DateRecorded: string (nullable = true)
 |-- Town: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- AssessedValue: string (nullable = true)
 |-- SaleAmount: string (nullable = true)
 |-- SalesRatio: string (nullable = true)
 |-- PropertyType: string (nullable = true)
 |-- ResidentialType: string (nullable = true)
 |-- NonUseCode: string (nullable = true)
 |-- Remarks: string (nullable = true)



In [16]:
# Use \ to break multiple lines
from pyspark.sql.types import * # for DoubleType()
df_top_3 = df_real_estate.withColumn("SaleAmount",df_real_estate.SaleAmount.cast(DoubleType())) \
.groupBy("PropertyType").sum("SaleAmount") \
.withColumnRenamed("sum(SaleAmount)","TotalSales") \
.sort("TotalSales",ascending=False).limit(3)

In [17]:
df_top_3.withColumn('TotalSales', df_top_3.TotalSales.cast(DecimalType(18, 2))).show()

+------------+---------------+
|PropertyType|     TotalSales|
+------------+---------------+
| Residential|204614966269.29|
|  Commercial| 30602284248.00|
|       Condo| 25301417810.00|
+------------+---------------+



In [18]:
df_real_estate.createOrReplaceTempView("real_estate_sales")

In [19]:
df_sql_top_3 = spark.sql("select PropertyType, sum(SaleAmount) as TotalSales from real_estate_sales GROUP BY PropertyType ORDER BY sum(SaleAmount) DESC LIMIT 3")

In [20]:
df_sql_top_3.withColumn('TotalSales', df_sql_top_3.TotalSales.cast(DecimalType(18, 2))).show()

+------------+---------------+
|PropertyType|     TotalSales|
+------------+---------------+
| Residential|204614966269.29|
|  Commercial| 30602284248.00|
|       Condo| 25301417810.00|
+------------+---------------+



In [57]:
df_top_3.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=3, orderBy=[TotalSales#911 DESC NULLS LAST], output=[PropertyType#180,TotalSales#911])
+- *(2) HashAggregate(keys=[PropertyType#180], functions=[sum(SaleAmount#880)])
   +- Exchange hashpartitioning(PropertyType#180, 200)
      +- *(1) HashAggregate(keys=[PropertyType#180], functions=[partial_sum(SaleAmount#880)])
         +- *(1) Project [cast(SaleAmount#178 as double) AS SaleAmount#880, PropertyType#180]
            +- *(1) FileScan csv [SaleAmount#178,PropertyType#180] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/pmacharl/git-projects/personal/github.com/data_analysis_pandas_spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<SaleAmount:string,PropertyType:string>


In [58]:
df_sql_top_3.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=3, orderBy=[aggOrder#930 DESC NULLS LAST], output=[PropertyType#180,TotalSales#928])
+- *(2) HashAggregate(keys=[PropertyType#180], functions=[sum(cast(SaleAmount#178 as double))])
   +- Exchange hashpartitioning(PropertyType#180, 200)
      +- *(1) HashAggregate(keys=[PropertyType#180], functions=[partial_sum(cast(SaleAmount#178 as double))])
         +- *(1) FileScan csv [SaleAmount#178,PropertyType#180] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/pmacharl/git-projects/personal/github.com/data_analysis_pandas_spar..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<SaleAmount:string,PropertyType:string>


# Stop Spark

In [21]:
spark.stop()