In [37]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (reusing an existing one if present),
# configured with the 'local[*]' master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Consultas e seleções

In [38]:
# Smoke test: run a literal one-row query to confirm the session executes SQL,
# then print the resulting single-column table.
status_query = '''select 'ok' as Status'''
df = spark.sql(status_query)
df.show()

+------+
|Status|
+------+
|    ok|
+------+



# Importing Data

In [39]:
# Load the cereal dataset from CSV. header=True takes column names from the
# first row and inferSchema=True lets Spark infer each column's type.
# The path uses a forward slash so it resolves on Linux/macOS as well as on
# Windows; a backslash is only treated as a path separator on Windows.
df = spark.read.csv('Dados/cereal.csv', sep=',', inferSchema=True, header=True)
# Print the leading rows of the loaded DataFrame as a text table.
df.show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
|Apple Cinnamon 

# Manipulating Data with Spark SQL

In [40]:
# Register df as the temporary view 'cereal' so that spark.sql() queries can
# refer to it by name.
df.createOrReplaceTempView('cereal')

In [41]:
# Keep only the rows of the 'cereal' view whose type is 'C', then print them.
type_c_query = '''SELECT * FROM cereal WHERE type = 'C' '''
cereal = spark.sql(type_c_query)
cereal.show()

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
|Apple Cinnamon 

# Select Distinct no Spark SQL

In [42]:
# Print df's schema as a tree: column names, data types and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- mfr: string (nullable = true)
 |-- type: string (nullable = true)
 |-- calories: integer (nullable = true)
 |-- protein: integer (nullable = true)
 |-- fat: integer (nullable = true)
 |-- sodium: integer (nullable = true)
 |-- fiber: double (nullable = true)
 |-- carbo: double (nullable = true)
 |-- sugars: integer (nullable = true)
 |-- potass: integer (nullable = true)
 |-- vitamins: integer (nullable = true)
 |-- shelf: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- cups: double (nullable = true)
 |-- rating: double (nullable = true)



In [43]:
# Register df under the view name 'cereal' again. This replaces any view
# already registered under that name, so the SQL queries below read from df.
df.createOrReplaceTempView('cereal')

In [44]:
# List every distinct (type, mfr) combination present in the 'cereal' view.
distinct_type_mfr_query = ''' SELECT DISTINCT type, mfr FROM cereal '''
cereal = spark.sql(distinct_type_mfr_query)
cereal.show()

+----+---+
|type|mfr|
+----+---+
|   C|  P|
|   C|  Q|
|   C|  N|
|   H|  Q|
|   C|  R|
|   H|  N|
|   C|  G|
|   H|  A|
|   C|  K|
+----+---+



# Where no Spark SQL

In [45]:
# Select rows from manufacturer 'K' with at least 100 calories.
where_query = ''' SELECT * FROM cereal WHERE mfr = 'K' AND calories >= 100 '''
cereal = spark.sql(where_query)
# Leave the row count as the cell's final expression so it is displayed.
cereal.count()

19

# Group By

In [46]:
# Aggregate the 'cereal' view per (mfr, type) pair: number of rows and the sum
# of calories in each group.
group_by_query = ''' 
                   SELECT 
                        mfr,
                        type,
                        COUNT(*) AS total,
                        SUM(calories) AS total_calories
                   FROM
                        cereal
                   GROUP BY
                        mfr,
                        type '''
cereal = spark.sql(group_by_query)
cereal.show()

+---+----+-----+--------------+
|mfr|type|total|total_calories|
+---+----+-----+--------------+
|  A|   H|    1|           100|
|  P|   C|    9|           980|
|  K|   C|   23|          2500|
|  G|   C|   22|          2450|
|  Q|   C|    7|           660|
|  R|   C|    8|           920|
|  Q|   H|    1|           100|
|  N|   H|    1|           100|
|  N|   C|    5|           420|
+---+----+-----+--------------+



# Case When

In [47]:
# Inspect which values the type column takes before mapping them with CASE.
distinct_type_query = ''' SELECT DISTINCT type FROM cereal '''
cereal = spark.sql(distinct_type_query)
cereal.show()

+----+
|type|
+----+
|   C|
|   H|
+----+



In [48]:
# Same per-(mfr, type) aggregation as the GROUP BY example, plus a derived
# type_new column: 'C' maps to 'A', 'H' maps to 'B', anything else to 'D'.
case_when_query = ''' 
                    SELECT
                        mfr,
                        type,
                        (CASE 
                            WHEN type = 'C' THEN 'A'
                            WHEN type = 'H' THEN 'B'
                            ELSE 'D' END) AS type_new,
                        COUNT(*) AS total,                   
                        SUM(calories) AS total_calories
                    FROM
                        cereal
                    GROUP BY
                        mfr,
                        type
                    '''
cereal = spark.sql(case_when_query)

cereal.show()

+---+----+--------+-----+--------------+
|mfr|type|type_new|total|total_calories|
+---+----+--------+-----+--------------+
|  A|   H|       B|    1|           100|
|  P|   C|       A|    9|           980|
|  K|   C|       A|   23|          2500|
|  G|   C|       A|   22|          2450|
|  Q|   C|       A|    7|           660|
|  R|   C|       A|    8|           920|
|  Q|   H|       B|    1|           100|
|  N|   H|       B|    1|           100|
|  N|   C|       A|    5|           420|
+---+----+--------+-----+--------------+



# Consultas avançadas em SQL usando PySpark

In [49]:
# Preview only the first 5 rows of df.
df.show(5)

+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|                name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+--------------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|           100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|   100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|            All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
|All-Bran with Ext...|  K|   C|      50|      4|  0|   140| 14.0|  8.0|     0|   330|      25|    3|   1.0| 0.5|93.704912|
|      Almond Delight|  R|   C|     110|      2|  2|   200|  1.0| 14.0|     8|    -1|      25|    3|   1.0|0.75|34.384843|
+---------------

In [50]:
# Per-(mfr, type) summary of calories, carbo and vitamins: sum, min, max and
# the average cast to DECIMAL(10, 2). Also counts distinct and non-null names.
# Results are ordered by mfr, then type.
summary_query = '''
                    SELECT
                        mfr,
                        type,
                   
                        SUM(calories) AS total_calories,
                        MIN(calories) AS min_calories,
                        MAX(calories) AS max_calories,
                        CAST(AVG(calories) AS DECIMAL(10, 2)) AS mean_calories,
                   
                        SUM(carbo) AS total_carbo,
                        MIN(carbo) AS min_carbo,
                        MAX(carbo) AS max_carbo,
                        CAST(AVG(carbo) AS DECIMAL(10, 2)) AS mean_carbo,

                        SUM(vitamins) AS total_vitamins,
                        MIN(vitamins) AS min_vitamins,
                        MAX(vitamins) AS max_vitamins,
                        CAST(AVG(vitamins) AS DECIMAL(10, 2)) AS mean_vitamins,

                        COUNT(DISTINCT name) AS dist_qtd_name,
                        COUNT(name) AS qtd_name
                   FROM cereal
                   GROUP BY mfr, type
                   ORDER BY mfr, type
                '''
cereal = spark.sql(summary_query)

cereal.show()

+---+----+--------------+------------+------------+-------------+-----------+---------+---------+----------+--------------+------------+------------+-------------+-------------+--------+
|mfr|type|total_calories|min_calories|max_calories|mean_calories|total_carbo|min_carbo|max_carbo|mean_carbo|total_vitamins|min_vitamins|max_vitamins|mean_vitamins|dist_qtd_name|qtd_name|
+---+----+--------------+------------+------------+-------------+-----------+---------+---------+----------+--------------+------------+------------+-------------+-------------+--------+
|  A|   H|           100|         100|         100|       100.00|       16.0|     16.0|     16.0|     16.00|            25|          25|          25|        25.00|            1|       1|
|  G|   C|          2450|         100|         140|       111.36|      324.0|     10.5|     21.0|     14.73|           775|          25|         100|        35.23|           22|      22|
|  K|   C|          2500|          50|         160|       108.70|

In [51]:
# Same per-(mfr, type) summary statistics as the previous query, with an extra
# type_fruit column that maps each mfr code to a label via CASE ('NA' when the
# code matches none of the listed values).
labelled_summary_query = '''
                    SELECT
                        mfr,
                        type,
                        (CASE
                            WHEN mfr = 'A' THEN 'Abacaxi'
                            WHEN mfr = 'G' THEN 'Goiaba'
                            WHEN mfr = 'K' THEN 'Kiwi'
                            WHEN mfr = 'N' THEN 'Morango'
                            WHEN mfr = 'P' THEN 'Pera'
                            WHEN mfr = 'Q' THEN 'Banana'
                            WHEN mfr = 'R' THEN 'Maca'
                            ELSE 'NA'
                        END) AS type_fruit,
                   
                        SUM(calories) AS total_calories,
                        MIN(calories) AS min_calories,
                        MAX(calories) AS max_calories,
                        CAST(AVG(calories) AS DECIMAL(10, 2)) AS mean_calories,
                   
                        SUM(carbo) AS total_carbo,
                        MIN(carbo) AS min_carbo,
                        MAX(carbo) AS max_carbo,
                        CAST(AVG(carbo) AS DECIMAL(10, 2)) AS mean_carbo,

                        SUM(vitamins) AS total_vitamins,
                        MIN(vitamins) AS min_vitamins,
                        MAX(vitamins) AS max_vitamins,
                        CAST(AVG(vitamins) AS DECIMAL(10, 2)) AS mean_vitamins,

                        COUNT(DISTINCT name) AS dist_qtd_name,
                        COUNT(name) AS qtd_name
                   FROM cereal
                   GROUP BY mfr, type
                   ORDER BY mfr, type
                '''
cereal = spark.sql(labelled_summary_query)

cereal.show()

+---+----+----------+--------------+------------+------------+-------------+-----------+---------+---------+----------+--------------+------------+------------+-------------+-------------+--------+
|mfr|type|type_fruit|total_calories|min_calories|max_calories|mean_calories|total_carbo|min_carbo|max_carbo|mean_carbo|total_vitamins|min_vitamins|max_vitamins|mean_vitamins|dist_qtd_name|qtd_name|
+---+----+----------+--------------+------------+------------+-------------+-----------+---------+---------+----------+--------------+------------+------------+-------------+-------------+--------+
|  A|   H|   Abacaxi|           100|         100|         100|       100.00|       16.0|     16.0|     16.0|     16.00|            25|          25|          25|        25.00|            1|       1|
|  G|   C|    Goiaba|          2450|         100|         140|       111.36|      324.0|     10.5|     21.0|     14.73|           775|          25|         100|        35.23|           22|      22|
|  K|   C|

# JOINs

## INNER