In [1]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Obtain the SparkSession used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("spark-guide-book")
    .getOrCreate()
)

In [3]:
# One-column DataFrame produced by spark.range(1000), with the column
# renamed to "number".
myRange = (
    spark
    .range(1000)
    .toDF("number")
)

In [4]:
# Keep only the even values; this is a lazy transformation, nothing runs yet.
divisBy2 = myRange.filter("number % 2 = 0")

In [5]:
# Print the kernel's working directory; the relative "../data" paths used
# below are resolved against it.
!pwd

/home/gong/spark/books/Spark-The-Definitive-Guide/notebook


In [6]:
# List the datasets available next to the notebook directory.
!ls ../data

activity-data	       flight-data-hive		  sample_movielens_ratings.txt
bike-data	       multiclass-classification  simple-ml
binary-classification  README.md		  simple-ml-integers
clustering	       regression		  simple-ml-scaling
deep-learning-images   retail-data
flight-data	       sample_libsvm_data.txt


In [7]:
# Location of the 2015 flight summary, relative to the notebook directory.
file_path = "../data/flight-data/csv/2015-summary.csv"

# Load the CSV: the first line supplies the column names and Spark samples
# the data to infer column types.
flightData2015 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

In [11]:
# Preview the first five rows of the flight data.
flightData2015.show(n=5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [8]:
# Expose the DataFrame to Spark SQL under the table name used by the
# queries below.
flightData2015.createOrReplaceTempView(name="flight_data_2015")

In [9]:
# SQL formulation: number of rows per destination country.
sqlWay = spark.sql(
    sqlQuery="""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
)

In [10]:
# Preview five rows of the per-destination counts.
sqlWay.show(n=5)

+-----------------+--------+
|DEST_COUNTRY_NAME|count(1)|
+-----------------+--------+
|         Anguilla|       1|
|           Russia|       1|
|         Paraguay|       1|
|          Senegal|       1|
|           Sweden|       1|
+-----------------+--------+
only showing top 5 rows



In [12]:
# DataFrame-API formulation of the same per-destination row count.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [13]:
# Print the physical plan for the SQL version (physical plan only, not the
# extended logical plans).
sqlWay.explain(extended=False)

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#14, 200)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#14] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/gong/spark-2.4.3-bin-hadoop2.7/books/Spark-The-Definitive-Guide/data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [14]:
# Print the physical plan for the DataFrame version so it can be compared
# with the SQL version's plan.
dataFrameWay.explain(extended=False)

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#14, 200)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#14] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/gong/spark-2.4.3-bin-hadoop2.7/books/Spark-The-Definitive-Guide/data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [16]:
# Largest value in the "count" column, returned as a one-element list of Rows.
# F.max (F is pyspark.sql.functions, imported in the first cell) is used
# instead of `from pyspark.sql.functions import max`, because that import
# rebinds Python's built-in max() for every cell that runs afterwards.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [17]:
# SQL formulation: the five destination countries with the highest summed
# "count", largest first.
maxSql = spark.sql(
    sqlQuery="""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
"""
)

maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [18]:
from pyspark.sql.functions import desc

# DataFrame-API formulation of the top-five query: sum "count" per
# destination, rename the aggregate column, order descending, keep five rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .orderBy(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [19]:
# Same top-five pipeline as the previous cell, but printing its physical
# plan instead of executing it.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .orderBy(F.desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#107L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#14,destination_total#107L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[sum(cast(count#16 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#14, 200)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_sum(cast(count#16 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#14,count#16] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/gong/spark-2.4.3-bin-hadoop2.7/books/Spark-The-Definitive-Guide/data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
