In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [0]:
# Session shared by every cell below; getOrCreate() hands back an
# already-active session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("PySpark Introduction")
    .getOrCreate()
)

In [0]:
# Explicit schema: id and nome are declared non-nullable, while idade and
# salario are allowed to be null.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("nome", StringType(), False)
    .add("idade", IntegerType(), True)
    .add("salario", DoubleType(), True)
)

# Sample rows, one tuple per person, laid out in schema order:
# (id, nome, idade, salario).
dados = [
    (1, "João", 30, 5000.0),
    (2, "Maria", 28, 6200.0),
    (3, "Pedro", 35, 7100.0),
    (4, "Ana", 40, 8800.0),
    (5, "Lucas", 25, 4500.0),
    (6, "Carla", 32, 5400.0),
    (7, "Bruno", 29, 4900.0),
    (8, "Fernanda", 37, 7600.0),
    (9, "Tiago", 41, 8200.0),
    (10, "Juliana", 27, 5100.0),
]

# DataFrame queried by all of the following cells.
df = spark.createDataFrame(data=dados, schema=schema)

In [0]:
# Print the table; the arguments are show()'s defaults written out
# (at most 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+---+--------+-----+-------+
| id|    nome|idade|salario|
+---+--------+-----+-------+
|  1|    João|   30| 5000.0|
|  2|   Maria|   28| 6200.0|
|  3|   Pedro|   35| 7100.0|
|  4|     Ana|   40| 8800.0|
|  5|   Lucas|   25| 4500.0|
|  6|   Carla|   32| 5400.0|
|  7|   Bruno|   29| 4900.0|
|  8|Fernanda|   37| 7600.0|
|  9|   Tiago|   41| 8200.0|
| 10| Juliana|   27| 5100.0|
+---+--------+-----+-------+



In [0]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- nome: string (nullable = false)
 |-- idade: integer (nullable = true)
 |-- salario: double (nullable = true)



In [0]:
# Keep only the rows whose salario is strictly greater than 5000.0
# (a salary of exactly 5000.0 is excluded).
(
    df
    .where(col("salario") > 5000.0)
    .show()
)

+---+--------+-----+-------+
| id|    nome|idade|salario|
+---+--------+-----+-------+
|  2|   Maria|   28| 6200.0|
|  3|   Pedro|   35| 7100.0|
|  4|     Ana|   40| 8800.0|
|  6|   Carla|   32| 5400.0|
|  8|Fernanda|   37| 7600.0|
|  9|   Tiago|   41| 8200.0|
| 10| Juliana|   27| 5100.0|
+---+--------+-----+-------+



In [0]:
# Per-age summary: how many rows share each idade and their mean salario.
(
    df.groupBy("idade")
    .agg(
        count("id").alias("quantidade"),
        avg("salario").alias("media_salario"),
    )
    .show()
)

+-----+----------+-------------+
|idade|quantidade|media_salario|
+-----+----------+-------------+
|   30|         1|       5000.0|
|   28|         1|       6200.0|
|   35|         1|       7100.0|
|   40|         1|       8800.0|
|   25|         1|       4500.0|
|   32|         1|       5400.0|
|   29|         1|       4900.0|
|   37|         1|       7600.0|
|   27|         1|       5100.0|
|   41|         1|       8200.0|
+-----+----------+-------------+



In [0]:
# List everyone from the highest salario to the lowest.
df.orderBy(col("salario").desc()).show()

+---+--------+-----+-------+
| id|    nome|idade|salario|
+---+--------+-----+-------+
|  4|     Ana|   40| 8800.0|
|  9|   Tiago|   41| 8200.0|
|  8|Fernanda|   37| 7600.0|
|  3|   Pedro|   35| 7100.0|
|  2|   Maria|   28| 6200.0|
|  6|   Carla|   32| 5400.0|
| 10| Juliana|   27| 5100.0|
|  1|    João|   30| 5000.0|
|  7|   Bruno|   29| 4900.0|
|  5|   Lucas|   25| 4500.0|
+---+--------+-----+-------+



In [0]:
# Project only the name and salary columns.
df.select(col("nome"), col("salario")).show()

+--------+-------+
|    nome|salario|
+--------+-------+
|    João| 5000.0|
|   Maria| 6200.0|
|   Pedro| 7100.0|
|     Ana| 8800.0|
|   Lucas| 4500.0|
|   Carla| 5400.0|
|   Bruno| 4900.0|
|Fernanda| 7600.0|
|   Tiago| 8200.0|
| Juliana| 5100.0|
+--------+-------+

