In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Obtain the notebook's SparkSession: run against the local master and point
# the SQL warehouse directory at ./temp (relative to the working directory).
session_builder = SparkSession.builder.master('local[*]')
session_builder = session_builder.config("spark.sql.warehouse.dir", "./temp")

# getOrCreate() hands back an already-active session when one exists.
spark = session_builder.getOrCreate()

In [5]:
# Location of the transactions CSV consumed by this notebook.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
caminho_transacoes = r"C:\dados-tcs-itau\Etapa 4\dados\transacoes.csv"

# Read the file using its first line as the header and letting Spark infer
# the column types, then preview the loaded rows.
df = spark.read.csv(
    caminho_transacoes,
    header=True,
    inferSchema=True,
)
df.show()

+------------+----------+-----+--------------+-----------+
|id_transacao|id_cliente|valor|data_transacao|  categoria|
+------------+----------+-----+--------------+-----------+
|           1|         1|100.0|    2023-11-01|    Compras|
|           2|         2|200.0|    2023-11-02|Alimentação|
|           3|         3|150.0|    2023-11-03|    Compras|
|           4|         4|300.0|    2023-11-01|     Viagem|
|           5|         5| 50.0|    2023-11-04|Alimentação|
+------------+----------+-----+--------------+-----------+



In [10]:
# Derive the transaction year from the `data_transacao` column.
# Spark DataFrames are immutable: withColumn() returns a NEW DataFrame, so the
# result must be bound to a name. The original call discarded it, meaning the
# preview below only showed `ano_transacao` because of leftover kernel state.
# Re-running this cell is safe: withColumn replaces a same-named column.
df = df.withColumn("ano_transacao", F.year(F.col("data_transacao")))
df.show()

+------------+----------+-----+--------------+-----------+-------------+
|id_transacao|id_cliente|valor|data_transacao|  categoria|ano_transacao|
+------------+----------+-----+--------------+-----------+-------------+
|           1|         1|100.0|    2023-11-01|    Compras|         2023|
|           2|         2|200.0|    2023-11-02|Alimentação|         2023|
|           3|         3|150.0|    2023-11-03|    Compras|         2023|
|           4|         4|300.0|    2023-11-01|     Viagem|         2023|
|           5|         5| 50.0|    2023-11-04|Alimentação|         2023|
+------------+----------+-----+--------------+-----------+-------------+



In [17]:
# Per-category aggregates: total and mean of the `valor` column.
agregacoes = [
    F.sum("valor").alias("valor_total"),
    F.avg("valor").alias("valor_medio"),
]

# Sort key: largest total first. Only `valor_total` is used for ordering, so
# the relative order of categories with equal totals is not pinned down here.
ordem_por_total = F.col("valor_total").desc()

df_resultado = (
    df.groupBy("categoria")
    .agg(*agregacoes)
    .orderBy(ordem_por_total)
)

df_resultado.show()

+-----------+-----------+-----------+
|  categoria|valor_total|valor_medio|
+-----------+-----------+-----------+
|     Viagem|      300.0|      300.0|
|    Compras|      250.0|      125.0|
|Alimentação|      250.0|      125.0|
+-----------+-----------+-----------+

