In [2]:
import unittest
import os
import csv
from pyspark.sql import SparkSession

In [3]:
class TestTransformations(unittest.TestCase):
    def setUp(self):
        self.spark = SparkSession.builder \
            .appName("Transformation Test") \
            .master("local[*]") \
            .config("spark.log.level", "OFF") \
            .getOrCreate()
        
        self.dir_destino = "file:///notebooks/csv/"
        self.nome_python = "movimento_flat_python.csv"
        self.nome_scala = "movimento_flat_scala.csv"
        self.caminho_final_python = os.path.join(self.dir_destino, self.nome_python)
        self.caminho_final_scala = os.path.join(self.dir_destino, self.nome_scala)

    def tearDown(self):
        if self.spark is not None:
            self.spark.stop()
            self.spark = None

    def test_python_colunas_esperadas(self):
        df = self.spark.read.option("header", "true").csv(self.caminho_final_python)
        colunas_esperadas = [
            "nome_associado", "sobrenome_associado", "idade_associado",
            "vlr_transacao_movimento", "des_transacao_movimento", "data_movimento",
            "numero_cartao", "nome_impresso_cartao", "tipo_conta", "data_criacao_conta"
        ]
        self.assertEqual(sorted(df.columns), sorted(colunas_esperadas))

    def test_scala_colunas_esperadas(self):
        df = self.spark.read.option("header", "true").csv(self.caminho_final_scala)
        colunas_esperadas = [
            "nome_associado", "sobrenome_associado", "idade_associado",
            "vlr_transacao_movimento", "des_transacao_movimento", "data_movimento",
            "numero_cartao", "nome_impresso_cartao", "tipo_conta", "data_criacao_conta"
        ]
        self.assertEqual(sorted(df.columns), sorted(colunas_esperadas))
# 
    def test_python_tipos_dados(self):
        df = self.spark.read.option("header", "true").csv(self.caminho_final_python)
        for col_name, dtype in df.dtypes:
            self.assertEqual(dtype, "string")
# 
    def test_scala_tipos_dados(self):
        df = self.spark.read.option("header", "true").csv(self.caminho_final_scala)
        for col_name, dtype in df.dtypes:
            self.assertEqual(dtype, "string")
# 
    def test_python_valores_nulos(self):
        df = self.spark.read.option("header", "true").csv(self.caminho_final_python)
        colunas_criticas = ["nome_associado", "vlr_transacao_movimento", "numero_cartao"]
        for col in colunas_criticas:
            nulos = df.filter(df[col].isNull()).count()
            self.assertEqual(nulos, 0)
# 
    def test_scala_valores_nulos(self):
        df = self.spark.read.option("header", "true").csv(self.caminho_final_scala)
        colunas_criticas = ["nome_associado", "vlr_transacao_movimento", "numero_cartao"]
        for col in colunas_criticas:
            nulos = df.filter(df[col].isNull()).count()
            self.assertEqual(nulos, 0)
 
        


In [4]:
# Build a suite from the TestTransformations test case and run it with the
# plain-text runner.
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestTransformations)
runner = unittest.TextTestRunner()
# Left as the cell's last expression, so the result object returned by run()
# is also shown as the cell output.
runner.run(suite)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/29 12:44:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/29 12:44:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
.25/03/29 12:45:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
.25/03/29 12:45:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
.25/03/29 12:45:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
.25/03/29 12:45:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
.25/03/29 12:45:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
.
----------------------------------------------------------------------
Ran 6 tests in 10.085s

OK


<unittest.runner.TextTestResult run=6 errors=0 failures=0>

25/03/29 12:47:29 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/spark-96c9ec72-0c7b-4210-b937-44afcc4fe443/pyspark-dfaf3645-bccf-4994-9b77-1da087c7026b. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/spark-96c9ec72-0c7b-4210-b937-44afcc4fe443/pyspark-dfaf3645-bccf-4994-9b77-1da087c7026b
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:166)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:109)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:90)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively(SparkFileUtils.scala:121)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively$(SparkFileUtils.scala:120)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1126)
	at org.apache.spark.util.ShutdownHookManager$.$anonfun$new$4(ShutdownHookManager.scala:65)
	at org.apache.spark.util.ShutdownHookManager$.$anonfun