In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, expr

In [2]:
# Obtain the SparkSession used throughout this notebook, registered under the
# application name "ch01".
spark = (
    SparkSession.builder
    .appName("ch01")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 23:13:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/05 23:13:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# DDL-style schema string for the fire-calls CSV: one "`column` TYPE" entry per
# column, in file order. Entries are joined with ", " followed by a newline,
# which yields the same text as a hand-written multi-line literal.
schema = ", \n".join([
    "`CallNumber` INT",
    "`UnitID` STRING",
    "`IncidentNumber` INT",
    "`CallType` STRING",
    "`CallDate` STRING",
    "`WatchDate` STRING",
    "`CallFinalDisposition` STRING",
    "`AvailableDtTm` STRING",
    "`Address` STRING",
    "`City` STRING",
    "`Zipcode` INT",
    "`Battalion` STRING",
    "`StationArea` STRING",
    "`Box` STRING",
    "`OriginalPriority` STRING",
    "`Priority` STRING",
    "`FinalPriority` INT",
    "`ALSUnit` BOOLEAN",
    "`CallTypeGroup` STRING",
    "`NumAlarms` INT",
    "`UnitType` STRING",
    "`UnitSequenceInCallDispatch` INT",
    "`FirePreventionDistrict` STRING",
    "`SupervisorDistrict` STRING",
    "`Neighborhood` STRING",
    "`Location` STRING",
    "`RowID` STRING",
    "`Delay` FLOAT",
])

> Spark provides **DataFrameReader**, an interface that enables you to read data into a DataFrame from myriad
data sources in formats such as JSON, CSV, Parquet, Text, Avro, ORC, etc. Likewise,
to write a DataFrame back to a data source in a particular format, Spark uses
**DataFrameWriter**.

In [5]:
# Location of the fire-calls CSV, relative to the notebook's working directory.
sf_fire_file = "./learning-spark-v2/sf-fire/sf-fire-calls.csv"
# Read the CSV with the explicit schema defined above; the first line of the
# file is treated as a header row rather than data.
fire_df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(sf_fire_file)
)

In [7]:
# printSchema() writes the schema tree to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
fire_df.printSchema()
# Preview the first five rows of the loaded DataFrame.
fire_df.show(5)

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

24/05/05 23:16:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

> Parquet, a popular columnar format, is the default format; it uses
snappy compression to compress the data. If the DataFrame is written as Parquet, the
schema is preserved as part of the Parquet metadata.
> 

# Transformation in Action

In [8]:
# Project three columns and keep only rows whose CallType differs from
# "Medical Incident".
few_fire_df = (
    fire_df
    .select("IncidentNumber", "AvailableDtTm", "CallType")
    .filter(col("CallType") != "Medical Incident")
)
few_fire_df.show(5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [9]:
# import all possible Functions

In [10]:
# In Python
import pyspark.sql.functions as F

In [14]:
# Wrapping the chain in parentheses lets it span several lines.
# Count the distinct CallType values among rows where CallType is not null.
(
    fire_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .agg(F.countDistinct("CallType").alias("DistinctCallTypes"))
    .show()
)

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [16]:
# The same distinct-count query written as a single-line chain.
fire_df.select("CallType").filter(col("CallType").isNotNull()).agg(F.countDistinct("CallType").alias("DistinctCallTypes")).show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [17]:
# Show ten of the distinct non-null CallType values, without truncating them.
(
    fire_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .distinct()
    .show(10, truncate=False)
)

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Aircraft Emergency           |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|HazMat                       |
|Explosion                    |
|Oil Spill                    |
|Vehicle Fire                 |
|Suspicious Package           |
+-----------------------------+
only showing top 10 rows



In [18]:
# Rename the Delay column, then display a few rows whose renamed value
# exceeds 5 (minutes, going by the new column name).
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(
    new_fire_df
    .select("ResponseDelayedinMins")
    .filter(col("ResponseDelayedinMins") > 5)
    .show(5, truncate=False)
)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



> The columns CallDate, WatchDate, and AvailableDtTm are strings
rather than either Unix timestamps or SQL dates, both of which Spark supports and
can easily manipulate during transformations or actions. <br>
> pyspark.sql.functions has a set of to/from date/timestamp functions, such as to_timestamp() and to_date(), that we can use for just this
purpose.

In [20]:
# Parse the three string date columns into timestamp columns under new names,
# then remove the original string columns in a single drop() call.
fire_ts_df = (
    new_fire_df
    .withColumn("IncidentDate", F.to_timestamp(col("CallDate"), "MM/dd/yyyy"))
    .withColumn("OnWatchDate", F.to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
    .withColumn(
        "AvailableDtTS",
        F.to_timestamp(col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a"),
    )
    .drop("CallDate", "WatchDate", "AvailableDtTm")
)

In [21]:
# Inspect the three converted timestamp columns.
(
    fire_ts_df
    .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
    .show(5, truncate=False)
)

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



> Now that we have modified the dates, we can query using functions from
pyspark.sql.functions like month(), year(), and day() to explore our data further.

In [23]:
# List the distinct years found in IncidentDate, sorted in ascending order.
(
    fire_ts_df
    .select(F.year("IncidentDate"))
    .distinct()
    .sort(F.year("IncidentDate"))
    .show()
)

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+

