In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import os
import shutil
from pathlib import Path
import csv

In [42]:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import os

def run_notebook(notebook_path):
    """Execute a notebook end to end and report whether it ran cleanly.

    The file is parsed as nbformat version 4 and executed through
    ``ExecutePreprocessor`` with ``timeout=600`` and the ``python3`` kernel.
    The directory that contains the notebook is passed as the execution path.

    Args:
        notebook_path: Path to the ``.ipynb`` file to execute.

    Raises:
        Exception: Any error raised while reading or executing the notebook.
            A failure message is printed first, then the original exception
            is re-raised unchanged.
    """
    try:
        # .ipynb files are UTF-8 encoded JSON; do not depend on the platform's
        # default text encoding, which may not be UTF-8.
        with open(notebook_path, encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
        ep.preprocess(nb, {'metadata': {'path': os.path.dirname(notebook_path)}})

        print(f"Test {notebook_path} passed successfully. OK!")
    except Exception as e:
        print(f"Test {notebook_path} failed: {e}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise

In [43]:
# Execute the extract test notebook; run_notebook re-raises on any failure,
# which stops this notebook before the ETL cells below are reached.
run_notebook("tests/test_extract.ipynb")

25/03/29 13:05:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".                                               
Setting Spark log level to "OFF".                                               
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".                                               
Setting Spark log level to "OFF".                                               
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".                                               
Setting Spark log level to "OFF".                                               
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".                                      

Test tests/test_extract.ipynb passed successfully. OK!


In [44]:
# Spark session for the ETL job. The MySQL connector jar is added through
# "spark.jars" so the JDBC driver class named below can be loaded.
spark = (
    SparkSession.builder
    .appName("ETL Completo Python")
    .master("spark://spark-master:7077")
    .config("spark.jars", "/opt/bitnami/spark/jars/mysql-connector-j-8.0.33.jar")
    .config("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")
    .config("spark.log.level", "OFF")
    .getOrCreate()
)

# JDBC connection settings. Each value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the defaults
# are the values this notebook has always used, so behaviour is unchanged when
# the variables are not set.
jdbc_url = os.environ.get("ETL_JDBC_URL", "jdbc:mysql://mysql:3306/desafio?")
jdbc_properties = {
    "user": os.environ.get("ETL_JDBC_USER", "sparkuser"),
    "password": os.environ.get("ETL_JDBC_PASSWORD", "sparkpass"),
    "driver": "com.mysql.cj.jdbc.Driver",
}

Setting Spark log level to "OFF".


In [45]:
# Load the four source tables over JDBC, in the same order as the target
# names on the left-hand side.
associados, contas, cartoes, movimentos = (
    spark.read.jdbc(jdbc_url, tabela, properties=jdbc_properties)
    for tabela in ("associado", "conta", "cartao", "movimento")
)

In [46]:
# Join predicates, named after the relationship each one expresses.
conta_do_associado = associados["id"] == contas["id_associado"]
cartao_do_associado_e_conta = (
    (associados["id"] == cartoes["id_associado"])
    & (contas["id"] == cartoes["id_conta"])
)
movimento_do_cartao = cartoes["id"] == movimentos["id_cartao"]

# associado -> conta -> cartao -> movimento, keeping only matching rows.
movimento_flat_join = (
    associados
    .join(contas, conta_do_associado, "inner")
    .join(cartoes, cartao_do_associado_e_conta, "inner")
    .join(movimentos, movimento_do_cartao, "inner")
)

In [47]:
# Source column -> output name. None means the column keeps its own name.
# Insertion order defines the column order of the result.
colunas_saida = {
    "nome": "nome_associado",
    "sobrenome": "sobrenome_associado",
    "idade": "idade_associado",
    "vlr_transacao": "vlr_transacao_movimento",
    "des_transacao": "des_transacao_movimento",
    "data_movimento": None,
    "num_cartao": "numero_cartao",
    "nom_impresso": "nome_impresso_cartao",
    "tipo_conta": None,
    "data_criacao": "data_criacao_conta",
}

movimento_flat_columns = movimento_flat_join.select(
    *(
        col(origem) if destino is None else col(origem).alias(destino)
        for origem, destino in colunas_saida.items()
    )
)

In [48]:
# Cast every projected column to string, keeping its name.
colunas_como_texto = [
    col(nome_coluna).cast("string").alias(nome_coluna)
    for nome_coluna in movimento_flat_columns.columns
]
movimento_flat = movimento_flat_columns.select(colunas_como_texto)

In [49]:
# movimento_flat.show()

In [50]:
# --- Resolve the output location: <current working directory>/csv/<nome> ---
diretorio_atual = os.getcwd()
print(f"Diretório atual: {diretorio_atual}")

nome = "movimento_flat_python.csv"
dir_destino = os.path.join(diretorio_atual, "csv")
caminho_final = os.path.join(dir_destino, nome)

# --- Make sure the target directory exists and open up its permissions ---
os.makedirs(dir_destino, exist_ok=True)
os.chmod(dir_destino, 0o777)
print(f"Pasta criada e permissões ajustadas: {dir_destino}")
permissoes = oct(os.stat(dir_destino).st_mode)[-3:]
print(f"Permissões de {dir_destino}: {permissoes}")

# --- Pull the whole DataFrame to the driver as a list of plain dicts ---
print("Extraindo dados do DataFrame do Spark...")
data = movimento_flat.collect()
header = movimento_flat.columns
rows = [registro.asDict() for registro in data]

# --- Write the CSV with the standard library (header row, then the data) ---
print(f"Salvando CSV com Python puro em {caminho_final}...")
with open(caminho_final, mode='w', newline='', encoding='ISO-8859-1') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    writer.writerows(rows)
print(f"CSV salvo com sucesso em {caminho_final}")

# The Spark session is no longer needed once the file has been written.
spark.stop()

print(f"ETL concluído! Arquivo salvo em {caminho_final}")

Diretório atual: /notebooks
Pasta criada e permissões ajustadas: /notebooks/csv
Permissões de /notebooks/csv: 777
Extraindo dados do DataFrame do Spark...


                                                                                

Salvando CSV com Python puro em /notebooks/csv/movimento_flat_python.csv...
CSV salvo com sucesso em /notebooks/csv/movimento_flat_python.csv
ETL concluído! Arquivo salvo em /notebooks/csv/movimento_flat_python.csv


In [51]:
# Execute the load test notebook after the CSV export above has finished;
# run_notebook re-raises on any failure.
run_notebook("tests/test_load.ipynb")

25/03/29 13:05:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".


Test tests/test_load.ipynb passed successfully. OK!


In [52]:
# Execute the transformation test notebook; run_notebook re-raises on any failure.
run_notebook("tests/test_transformation.ipynb")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".
Setting Spark log level to "OFF".


Test tests/test_transformation.ipynb passed successfully. OK!
