In [47]:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import max, desc, col

In [2]:
# Start (or reuse) a local Spark session that runs with 6 worker threads.
spark = (
    SparkSession.builder
    .master("local[6]")
    .appName('intro_to_spark')
    .getOrCreate()
)

# One-column DataFrame of 1000 consecutive integers, with the column named "number".
myRange = spark.range(1000).toDF("number")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/20 12:24:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:

# COMMAND ----------

# Keep only the even values of myRange. This is a transformation, so nothing is
# computed until an action is called on divisBy2.
divisBy2 = myRange.filter("number % 2 = 0")


In [4]:


# COMMAND ----------
# Explicit schema for the 2015 flight-summary CSV: two string columns and an
# integer "count" column, all nullable.
schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", IntegerType(), True),
])

# The schema is supplied explicitly, so "inferSchema" is intentionally NOT set:
# Spark skips inference whenever a user schema is given, and requesting both
# only obscures which one is actually in effect.
flightData2015 = (
    spark.read
    .option("header", "true")   # first line of the file holds the column names
    .option("delimiter", ",")
    .schema(schema)
    .csv("../data/flight-data/csv/2015-summary.csv")
)



In [5]:
# COMMAND ----------

# Register the DataFrame with Spark SQL under the name "flight_data_2015".
flightData2015.createOrReplaceTempView("flight_data_2015")

# Peek at the two rows with the smallest "count" (ascending sort).
flightData2015.orderBy("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [6]:

# COMMAND ----------

# The same per-destination row count expressed twice -- once as SQL text and
# once through the DataFrame API -- so their physical plans can be compared.
dest_count_query = """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
sqlWay = spark.sql(dest_count_query)

dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()



In [7]:
# Print the (non-extended) physical plan Spark built for the SQL version.
sqlWay.explain(extended=False)


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#4], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#4, 200), ENSURE_REQUIREMENTS, [id=#20]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#4], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#4] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/Vivek_Goyal/dev/core-codecommit/repos/ds/Spark-The-Definit..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [8]:
# Print the (non-extended) physical plan for the DataFrame-API version.
dataFrameWay.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#4], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#4, 200), ENSURE_REQUIREMENTS, [id=#33]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#4], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#4] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/Vivek_Goyal/dev/core-codecommit/repos/ds/Spark-The-Definit..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [9]:
# Largest "count" value in the table, returned as a one-element list of Rows.
spark.sql("SELECT max(count) FROM flight_data_2015").limit(1).collect()


[Row(max(count)=370002)]

In [10]:
# Print the DataFrame's schema as a tree (column names, types, nullability).
flightData2015.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [14]:
# Show the row(s) whose "count" equals the table-wide maximum, using a SQL subquery.
busiest_route_sql = "SELECT * FROM flight_data_2015 WHERE count in (SELECT max(count) FROM flight_data_2015)"
spark.sql(busiest_route_sql).show()


+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
+-----------------+-------------------+------+



In [49]:
# DataFrame-API version of the max-count lookup: bring the maximum to the
# driver first, then filter on it. `max` here is pyspark.sql.functions.max
# (imported at the top of the notebook), not the Python builtin.
highest_count = flightData2015.select(max("count")).collect()[0][0]
flightData2015.filter(flightData2015["count"] == highest_count).show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
+-----------------+-------------------+------+



In [44]:
# Collect the single aggregated Row and display its only field: the maximum "count".
max_count_rows = flightData2015.select(max("count")).collect()
max_count_rows[0][0]


370002

In [12]:
# Top five destination countries by total flight count, via Spark SQL.
top_destinations_sql = """
            SELECT DEST_COUNTRY_NAME, sum(count) as total_count 
            FROM flight_data_2015 
            GROUP BY DEST_COUNTRY_NAME 
            ORDER BY sum(count) DESC 
            LIMIT 5
        """
spark.sql(top_destinations_sql).show()

+-----------------+-----------+
|DEST_COUNTRY_NAME|total_count|
+-----------------+-----------+
|    United States|     411352|
|           Canada|       8399|
|           Mexico|       7140|
|   United Kingdom|       2025|
|            Japan|       1548|
+-----------------+-----------+



In [13]:
# DataFrame-API equivalent of the top-five-destinations query; only the
# physical plan is printed here, the query itself is not executed.
top_destinations = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "total_count")
    .orderBy(desc("total_count"))
    .limit(5)
)
top_destinations.explain()


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[total_count#74L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#4,total_count#74L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#4], functions=[sum(count#6)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#4, 200), ENSURE_REQUIREMENTS, [id=#165]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#4], functions=[partial_sum(count#6)])
            +- FileScan csv [DEST_COUNTRY_NAME#4,count#6] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/Vivek_Goyal/dev/core-codecommit/repos/ds/Spark-The-Definit..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


