In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session (an existing one is reused, otherwise a new one is
# created). The `spark` handle is what the later cells use to build and read
# DataFrames and to run SQL.
spark = (
    SparkSession.builder
    .appName("PySpark Docker Example")
    .getOrCreate()
)

In [2]:
# Tiny in-memory example: three (id, name) tuples turned into a DataFrame
# whose column names come from `columns`.
columns = ["id", "name"]
data = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
df = spark.createDataFrame(data, schema=columns)

In [3]:
# Print the DataFrame as a text table (20 rows is the default limit).
df.show(n=20)

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [6]:
# Read the Iris CSV: the first line is treated as the header and column types
# are inferred from the data.
# NOTE: this rebinds `df`, so from this cell on `df` is the Iris DataFrame.
df = spark.read.options(header=True, inferSchema=True).csv("./iris.csv")

# Preview the first rows as a text table.
df.show()

# Count the rows and report the total.
num_linhas = df.count()
print(f"Número de linhas no DataFrame: {num_linhas}")

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [8]:
# Project the two sepal measurement columns, keeping only the rows whose
# Species value is 'Iris-setosa'.
df_transformado = (
    df.select("SepalLengthCm", "SepalWidthCm")
    .where(df.Species == 'Iris-setosa')
)

In [9]:
# Print the filtered sepal measurements (20 rows is the default limit).
df_transformado.show(n=20)

+-------------+------------+
|SepalLengthCm|SepalWidthCm|
+-------------+------------+
|          5.1|         3.5|
|          4.9|         3.0|
|          4.7|         3.2|
|          4.6|         3.1|
|          5.0|         3.6|
|          5.4|         3.9|
|          4.6|         3.4|
|          5.0|         3.4|
|          4.4|         2.9|
|          4.9|         3.1|
|          5.4|         3.7|
|          4.8|         3.4|
|          4.8|         3.0|
|          4.3|         3.0|
|          5.8|         4.0|
|          5.7|         4.4|
|          5.4|         3.9|
|          5.1|         3.5|
|          5.7|         3.8|
|          5.1|         3.8|
+-------------+------------+
only showing top 20 rows



In [10]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("tabela_temporaria")

# Run a SQL query against the view: average sepal width for each distinct
# sepal length. The two adjacent literals concatenate into a single query string.
resultado_sql = spark.sql(
    "SELECT SepalLengthCm, AVG(SepalWidthCm) "
    "FROM tabela_temporaria GROUP BY SepalLengthCm"
)
resultado_sql.show()

+-------------+------------------+
|SepalLengthCm| avg(SepalWidthCm)|
+-------------+------------------+
|          5.4|3.5500000000000003|
|          7.0|               3.2|
|          6.1|              2.85|
|          7.7|              3.05|
|          6.6|              2.95|
|          4.5|               2.3|
|          5.7|               3.1|
|          6.7|3.0500000000000003|
|          7.4|               2.8|
|          6.5|               3.0|
|          4.9|2.8666666666666667|
|          6.2|2.8249999999999997|
|          5.1| 3.477777777777778|
|          7.3|               2.9|
|          4.3|               3.0|
|          7.9|               3.8|
|          4.7|               3.2|
|          5.3|               3.7|
|          7.2| 3.266666666666667|
|          7.6|               3.0|
+-------------+------------------+
only showing top 20 rows

