In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import StorageLevel

from pyspark.sql import Row
from pyspark.sql.functions import *

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

In [2]:
# Build (or reuse) a SparkSession that runs against a local master
spark = (
    SparkSession.builder
    .master("local")
    .appName("Pyspark")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and log at INFO level
sc = spark.sparkContext
sc.setLogLevel("INFO")

In [3]:
# Three swimmer records, each one a complete JSON document held in its own
# string (fields: id, name, age, eyeColor), distributed as an RDD of strings.
stringJSONRDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Katie",
    "age": 19,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Michael",
    "age": 22,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Simone",
    "age": 23,
    "eyeColor": "blue"
  }""")
)

In [4]:
# Create a DataFrame by reading the RDD of JSON strings
swimmersJSON = spark.read.json(stringJSONRDD)

In [6]:
# Register the DataFrame as the temporary view 'swimmersJSON' so that it
# can be referenced by name in spark.sql queries
swimmersJSON.createOrReplaceTempView('swimmersJSON')

In [7]:
# DataFrame API: print the DataFrame's rows as a text table
swimmersJSON.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



In [9]:
# SQL query: select every row of the temp view and collect the result
# as a list of Row objects
spark.sql('select * from swimmersJSON').collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='345', name='Simone')]

In [13]:
# Query the temp view with Spark SQL.
# The `%sql` cell magic is specific to Databricks notebooks; in a plain Jupyter
# kernel (this notebook builds its own local SparkSession) it is not defined,
# so the same query is issued through the SparkSession instead.
spark.sql('select * from swimmersJSON').show()

<b>Inferring the Schema Using Reflection</b>


Note that Apache Spark is inferring the schema using reflection; i.e., it automatically determines the schema by inspecting the JSON data.

In [14]:
# Print the schema Spark inferred for the JSON data (column names, types, nullability)
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



Notice that Spark was able to infer the schema on its own (as shown by `.printSchema()`).

But what if we want to programmatically specify the schema?

<b> Programmatically Specifying the Schema</b>

In this case, let's specify the schema for CSV-style records (held here in an RDD of tuples).



In [15]:
from pyspark.sql.types import *

In [17]:
# Swimmer records as (id, name, age, eyeColor) tuples, distributed as an RDD
stringCSVRDD = sc.parallelize(
    [
        (123, 'Katie', 19, 'brown'),
        (234, 'Michael', 22, 'green'),
        (345, 'Simone', 23, 'blue'),
    ]
)

In [18]:
# Space-separated column names of the swimmer records
schemaString = 'id name age eyeColor'

# Explicit schema: one nullable StructField per column, in record order
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('id', LongType()),
        ('name', StringType()),
        ('age', LongType()),
        ('eyeColor', StringType()),
    )
])

In [19]:
# Build the DataFrame from the RDD of tuples, using the explicit schema
# rather than letting Spark infer one
swimmers = spark.createDataFrame(stringCSVRDD, schema=schema)

In [34]:
# Register the DataFrame as the temporary view 'swimmers' so that it can be
# referenced by name in spark.sql queries
swimmers.createOrReplaceTempView('swimmers')

In [35]:
# Print the schema of the DataFrame built with the explicitly specified schema
swimmers.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)



In [36]:
# Display the rows of the swimmers DataFrame
swimmers.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [37]:
# Check the Python type of the object created from the JSON data
type(swimmersJSON)

pyspark.sql.dataframe.DataFrame

In [38]:
# Check the Python type of the object created with the explicit schema
type(swimmers)

pyspark.sql.dataframe.DataFrame

## Querying with SQL
With DataFrames, you can start writing your queries using Spark SQL - a SQL dialect that is compatible with the Hive Query Language (or HiveQL).

In [41]:
# Execute a SQL query against the 'swimmers' view and display the result
spark.sql('select * from swimmers').show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [42]:
# Get the number of rows in the 'swimmers' view using SQL
spark.sql('select count(1) from swimmers').show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



Note, you can make use of %sql within the notebook cells of a Databricks notebook.



In [50]:
# Query id and age for swimmers with age = 22 via SQL
spark.sql('select id ,age from swimmers where age ==22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [45]:
# Query id and age for swimmers with age = 22 via the DataFrame API,
# passing the filter condition as a SQL expression string
swimmers.select('id','age').filter('age ==22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [51]:
# Same age = 22 query through the DataFrame API, this time built from
# Column objects instead of a SQL expression string
(
    swimmers
    .select(swimmers['id'], swimmers['age'])
    .where(swimmers['age'] == 22)
    .show()
)

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [58]:
# Show all rows again for reference before the eye-color queries
swimmers.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [59]:
# Query name and eye color for swimmers whose eye color starts with the letter 'b' (via SQL)
spark.sql('select name, eyeColor from swimmers where eyeColor like "b%"' ).show()


+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



In [61]:
# Same eye-color query via the DataFrame API, with the LIKE condition
# passed to filter() as a SQL expression string
swimmers.select('name','eyeColor').filter('eyeColor like "b%"').show()


+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



### Querying with the DataFrame API

With DataFrames, you can start writing your queries using the DataFrame API



In [62]:
# Show the rows of the swimmers DataFrame
swimmers.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [64]:
# Count the rows in the DataFrame
swimmers.count()

3

In [69]:
# Get the id and age where age = 22 (DataFrame API with a SQL expression filter)
swimmers.select('id','age').filter('age = 22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [70]:
spark.sql('select id, age from swimmers where age = 22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



## On-Time Flight Performance

Query flight departure delays by state and city by joining the departure-delay data to the airport codes (which identify each airport's state and city).



### DataFrame Queries 
Let's run a flight performance analysis using DataFrames; first, let's build the DataFrames from the source datasets.



In [72]:
# List the entries of the current working directory
import os

os.listdir('.')

['.DS_Store',
 'spark-warehouse',
 '.ipynb_checkpoints',
 'Pyspark.ipynb',
 'Chapter3',
 'Chapter4']

In [None]:
# Set the file path: a relative directory, resolved against the working directory
# NOTE(review): the os.listdir() output above shows no 'data' entry in the
# working directory — confirm this path exists before reading from it
flightPerFilePath = 'data/'