In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, expr, concat_ws, lit, floor, rand

In [2]:
# Create (or reuse) the Spark session for the data-generation job. The MySQL
# Connector/J jar is handed to Spark through the spark.jars setting so the
# JDBC writes further down can load the driver class.
spark = (
    SparkSession.builder
    .appName("Gerar Massa de Dados")
    .master("spark://spark-master:7077")
    .config("spark.jars", "/opt/bitnami/spark/jars/mysql-connector-j-8.0.33.jar")
    .getOrCreate()
)

# JDBC connection settings for the target MySQL database.
# The URL and credentials can be overridden with environment variables so real
# secrets never have to be committed with the notebook; when a variable is not
# set, the previous hard-coded value is used, so existing setups keep working.
jdbc_url = os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://mysql:3306/desafio")
jdbc_properties = {
    "user": os.environ.get("MYSQL_USER", "sparkuser"),
    "password": os.environ.get("MYSQL_PASSWORD", "sparkpass"),
    "driver": "com.mysql.cj.jdbc.Driver",
}

25/03/29 15:26:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Pools of first names and surnames used to build the synthetic members.
nomes = [
    "Wilian", "Natalia", "Gustavo", "Eder",
    "Wallace", "Wendy", "Caio", "Joao",
]
sobrenomes = [
    "Gomes", "Santana", "Mateo", "Murada",
    "Vieira", "Cruz", "Monteiro", "Lanzana",
]



In [4]:
def _random_pick_sql(values):
    """Return a Spark SQL expression that picks one element of `values` at random.

    The values are embedded as single-quoted SQL string literals inside an
    `array(...)`, which is then indexed with `floor(rand() * len(values))`.
    """
    # str.format is used for the quoting on purpose: reusing the enclosing
    # quote character inside an f-string replacement field is only accepted
    # from Python 3.12 onwards and is a SyntaxError on older interpreters.
    literals = ",".join("'{}'".format(value) for value in values)
    return f"array({literals})[cast(floor(rand() * {len(values)}) as int)]"


# 50 synthetic members with ids 1..50.
associados = (
    spark.range(1, 51)
    .select(
        col("id"),
        expr(_random_pick_sql(nomes)).alias("nome"),
        expr(_random_pick_sql(sobrenomes)).alias("sobrenome"),
        (floor(rand() * 50) + 18).alias("idade"),  # ages 18..67
    )
    # The e-mail is added in a second step so that it reads the nome/sobrenome
    # columns produced by the select above, instead of referring to aliases
    # declared in the very same select list (which not every Spark version
    # can resolve).
    .withColumn(
        "email",
        concat_ws(
            "",
            expr("lower(nome)"),
            expr("lower(sobrenome)"),
            expr("cast(id as string)"),
            lit("@email.com"),
        ),
    )
)

print("Dados de 'associado':")
associados.show(5)


Dados de 'associado':
+---+-------+---------+-----+--------------------+
| id|   nome|sobrenome|idade|               email|
+---+-------+---------+-----+--------------------+
|  1|   Caio|   Murada|   21|caiomurada1@email...|
|  2|Natalia|   Murada|   28|nataliamurada2@em...|
|  3|Wallace|     Cruz|   24|wallacecruz3@emai...|
|  4| Wilian|    Gomes|   44|wiliangomes4@emai...|
|  5| Wilian|    Gomes|   67|wiliangomes5@emai...|
+---+-------+---------+-----+--------------------+
only showing top 5 rows



                                                                                

In [5]:
# One account row per member: the account id and id_associado both reuse the
# member id, the account type is chosen at random and the creation time is the
# current timestamp.
colunas_conta = [
    associados["id"],
    expr("if(rand() > 0.5, 'Corrente', 'Poupança')").alias("tipo_conta"),
    current_timestamp().alias("data_criacao"),
    associados["id"].alias("id_associado"),
]
contas = associados.select(*colunas_conta)

print("DataFrame 'conta' criado:")
contas.show(5)

DataFrame 'conta' criado:
+---+----------+--------------------+------------+
| id|tipo_conta|        data_criacao|id_associado|
+---+----------+--------------------+------------+
|  1|  Corrente|2025-03-29 15:26:...|           1|
|  2|  Poupança|2025-03-29 15:26:...|           2|
|  3|  Corrente|2025-03-29 15:26:...|           3|
|  4|  Corrente|2025-03-29 15:26:...|           4|
|  5|  Poupança|2025-03-29 15:26:...|           5|
+---+----------+--------------------+------------+
only showing top 5 rows



In [6]:
# One card row per member.
colunas_cartao = [
    associados["id"],
    # Random 8-digit card number (10000000..99999999).
    (floor(rand() * 90000000) + 10000000).alias("num_cartao"),
    # Name printed on the card: "<nome> <sobrenome>".
    concat_ws(" ", associados["nome"], associados["sobrenome"]).alias("nom_impresso"),
    # Links the card to the account that has the same id.
    associados["id"].alias("id_conta"),
    associados["id"].alias("id_associado"),
]
cartoes = associados.select(*colunas_cartao)

print("DataFrame 'cartao' criado:")
cartoes.show(5)

DataFrame 'cartao' criado:
+---+----------+--------------+--------+------------+
| id|num_cartao|  nom_impresso|id_conta|id_associado|
+---+----------+--------------+--------+------------+
|  1|  29759406|   Caio Murada|       1|           1|
|  2|  84824596|Natalia Murada|       2|           2|
|  3|  86389315|  Wallace Cruz|       3|           3|
|  4|  72568346|  Wilian Gomes|       4|           4|
|  5|  75052481|  Wilian Gomes|       5|           5|
+---+----------+--------------+--------+------------+
only showing top 5 rows



In [7]:
# One transaction row per card; id and id_cartao both reuse the card id.
colunas_movimento = [
    cartoes["id"],
    # Random amount below 1000, stored with two decimal places.
    (rand() * 1000).cast("decimal(10,2)").alias("vlr_transacao"),
    # Transaction description picked at random from three options.
    expr("array('Compra', 'Saque', 'Depósito')[cast(floor(rand() * 3) as int)]").alias("des_transacao"),
    current_timestamp().alias("data_movimento"),
    cartoes["id"].alias("id_cartao"),
]
movimentos = cartoes.select(*colunas_movimento)

print("DataFrame 'movimento' criado:")
movimentos.show(5)

DataFrame 'movimento' criado:
+---+-------------+-------------+--------------------+---------+
| id|vlr_transacao|des_transacao|      data_movimento|id_cartao|
+---+-------------+-------------+--------------------+---------+
|  1|       737.50|     Depósito|2025-03-29 15:26:...|        1|
|  2|       436.70|     Depósito|2025-03-29 15:26:...|        2|
|  3|       291.96|     Depósito|2025-03-29 15:26:...|        3|
|  4|       262.64|       Compra|2025-03-29 15:26:...|        4|
|  5|       533.38|     Depósito|2025-03-29 15:26:...|        5|
+---+-------------+-------------+--------------------+---------+
only showing top 5 rows



In [8]:
# Progress message shown before the JDBC write cells.
print("Gravando no MySQL...")

Gravando no MySQL...


In [9]:
# Append the generated members to the "associado" table over JDBC.
(
    associados.write
    .mode("append")
    .jdbc(url=jdbc_url, table="associado", properties=jdbc_properties)
)

                                                                                

In [10]:
# Append the generated accounts to the "conta" table over JDBC.
(
    contas.write
    .mode("append")
    .jdbc(url=jdbc_url, table="conta", properties=jdbc_properties)
)

In [11]:
# Append the generated cards to the "cartao" table over JDBC.
(
    cartoes.write
    .mode("append")
    .jdbc(url=jdbc_url, table="cartao", properties=jdbc_properties)
)

In [12]:
# Append the generated transactions to the "movimento" table over JDBC.
(
    movimentos.write
    .mode("append")
    .jdbc(url=jdbc_url, table="movimento", properties=jdbc_properties)
)

In [13]:
# Final status message, placed after the four JDBC write cells.
print("Dados gerados e inseridos com sucesso!")

Dados gerados e inseridos com sucesso!


In [14]:
# Stop the Spark session
spark.stop()