In [1]:
from time import sleep

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
from pyspark.sql.types import *


spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# 1. HOW TO CREATE DATAFRAME

In [3]:
first_df = spark.read. \
    format("json"). \
    option("inferSchema", "true"). \
    load("data/cars")

first_df.show()
first_df.printSchema()

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|      null|
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
|        11.0|        8|       318.0|       150|            18.0|  plymouth satellite|   USA|         3436|1970-01-01|
|        12.0|        8|       304.0|       150|            16.0|       amc rebel sst|   USA|         3433|1970-01-01|
|        10.5|        8|       302.0|       140|

In [3]:
# specify a schema manually
cars_schema = StructType([
    StructField("Name", StringType()),
    StructField("Acceleration", DoubleType()),
    StructField("Cylinders", LongType()),
    StructField("Displacement", DoubleType()),
    StructField("Horsepower", IntegerType()),
    StructField("Miles_per_Gallon", DoubleType()),
    StructField("Origin", StringType()),
    StructField("Weight_in_lbs", LongType()),
    StructField("Year", StringType()),
])

# TODO - Int Type
#!!!!!!!!!!! Very Important Warning Long Type vs IntType

# reading a DF with a manual schema
cars_df = spark.read. \
    format("json"). \
    schema(cars_schema). \
    load("data/cars")

cars_df.filter(col("Cylinders") > 7).show()

+--------------------+------------+---------+------------+----------+----------------+------+-------------+----------+
|                Name|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|Origin|Weight_in_lbs|      Year|
+--------------------+------------+---------+------------+----------+----------------+------+-------------+----------+
|chevrolet chevell...|        12.0|        8|       307.0|       130|            18.0|   USA|         3504|      null|
|chevrolet chevell...|        12.0|        8|       307.0|       130|            18.0|   USA|         3504|1970-01-01|
|   buick skylark 320|        11.5|        8|       350.0|       165|            15.0|   USA|         3693|1970-01-01|
|  plymouth satellite|        11.0|        8|       318.0|       150|            18.0|   USA|         3436|1970-01-01|
|       amc rebel sst|        12.0|        8|       304.0|       150|            16.0|   USA|         3433|1970-01-01|
|         ford torino|        10.5|        8|   

In [13]:
# How to compare Data Frames

# TODO
# Parsed Plan = dont check fields, functions, expression
# Analyzed plan =
# Opt logical plan = 30-40 optimisations,
# Physical plan =

# Comparing
#  1 ==> first_df == first_df_inferSchema
#  2 ==> count1 union + distinct  first_df first_df_inferSchema, count1 = 10, count2 = 10,  count1 union (union all) + distinct = union = 20 or 10, antijoin
#  3 ==>

first_df_inferSchema = spark.read. \
    format("json"). \
    option("inferSchema", "true"). \
    load("data/cars")

first_df_inferSchema.printSchema()
cars_df.printSchema()
assert(first_df_inferSchema.schema == cars_df.schema)

root
 |-- Acceleration: double (nullable = true)
 |-- Cylinders: long (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: long (nullable = true)
 |-- Miles_per_Gallon: double (nullable = true)
 |-- Name: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Weight_in_lbs: long (nullable = true)
 |-- Year: string (nullable = true)

root
 |-- Name: string (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Cylinders: long (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Miles_per_Gallon: double (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Weight_in_lbs: long (nullable = true)
 |-- Year: string (nullable = true)



AssertionError: 

# 3. Catalyst Optimiser = Logical Plans + Codegeneration

In [5]:
#  DAG => Parsed Plan => Analyzed plan => Opt plan => Physical plan => Codegeneration
# Logical Plans
# Parsed Plan
# Analyzed plan
# Opt plan

most_powered_df = cars_df. \
    where(cars_df.Cylinders > 4). \
    withColumn("new", expr("Acceleration + 10")). \
    sort(cars_df.Horsepower.desc(), cars_df.Acceleration.asc())

most_powered_df.explain(True)

most_powered_df.show()

# WORKS ONLY IN SCALA
# most_powered_df.queryExecution().debug().codegen()

== Parsed Logical Plan ==
Sort [Horsepower#4 DESC NULLS LAST, Acceleration#1 ASC NULLS FIRST], true
+- Project [Name#0, Acceleration#1, Cylinders#2L, Displacement#3, Horsepower#4, Miles_per_Gallon#5, Origin#6, Weight_in_lbs#7L, Year#8, (Acceleration#1 + cast(10 as double)) AS new#125]
   +- Filter (Cylinders#2L > cast(4 as bigint))
      +- Relation [Name#0,Acceleration#1,Cylinders#2L,Displacement#3,Horsepower#4,Miles_per_Gallon#5,Origin#6,Weight_in_lbs#7L,Year#8] json

== Analyzed Logical Plan ==
Name: string, Acceleration: double, Cylinders: bigint, Displacement: double, Horsepower: int, Miles_per_Gallon: double, Origin: string, Weight_in_lbs: bigint, Year: string, new: double
Sort [Horsepower#4 DESC NULLS LAST, Acceleration#1 ASC NULLS FIRST], true
+- Project [Name#0, Acceleration#1, Cylinders#2L, Displacement#3, Horsepower#4, Miles_per_Gallon#5, Origin#6, Weight_in_lbs#7L, Year#8, (Acceleration#1 + cast(10 as double)) AS new#125]
   +- Filter (Cylinders#2L > cast(4 as bigint))
    