# In [0]: (notebook cell marker)
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, IntegerType, StringType, DoubleType
import json

# 1. Load the job configuration file.
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# 2. Initialize variables from the loaded configuration.
# Each section is indexed once; a missing section or key raises KeyError.
_kafka_settings = config["kafka"]
kafka_bootstrap_servers = _kafka_settings["bootstrap_servers"]
kafka_topic = _kafka_settings["topic"]
kafka_sasl_username = _kafka_settings["username"]
kafka_sasl_password = _kafka_settings["password"]

_mongo_settings = config["mongodb"]
mongo_uri = _mongo_settings["uri"]
mongo_db = _mongo_settings["database"]
mongo_collection = _mongo_settings["collection"]

# 3. Create (or reuse) the Spark session.
# The Kafka source and MongoDB connector are requested as a single
# comma-separated list of package coordinates.
_connector_packages = (
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0,"
    "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"
)
spark = (
    SparkSession.builder
    .appName("KafkaToMongoBatchJob")
    .config("spark.jars.packages", _connector_packages)
    .getOrCreate()
)

# 4. Define the schema of the Kafka message payload.
# StructType.add appends to the schema in place, so adding the fields in a
# loop yields the same field order as a fluent .add() chain.
schema = StructType()
for _field_name, _field_type in (
    ("Id", IntegerType()),
    ("ClienteId", StringType()),
    ("Monto", DoubleType()),
    ("FechaPago", StringType()),
    ("MetodoPago", StringType()),
    ("Estado", StringType()),
):
    schema.add(_field_name, _field_type)

# 5. Read messages from Kafka (streaming mode).
def _jaas_quote(value):
    """Return *value* as a double-quoted JAAS option value.

    Backslashes and double quotes are backslash-escaped so that a credential
    containing either character cannot end the quoted value early or be read
    as an escape sequence when the JAAS configuration string is parsed.
    """
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


kafka_options = {
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": kafka_topic,
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    # SASL/PLAIN login module line; the credentials are quoted and escaped.
    "kafka.sasl.jaas.config": (
        "org.apache.kafka.common.security.plain.PlainLoginModule required "
        f"username={_jaas_quote(kafka_sasl_username)} "
        f"password={_jaas_quote(kafka_sasl_password)};"
    ),
    # "earliest" asks the Kafka source to start from the oldest available offsets.
    "startingOffsets": "earliest",
}
df_kafka_streaming = spark.readStream.format("kafka").options(**kafka_options).load()
# Second name for the same streaming DataFrame; the parsing step reads df_kafka_raw.
df_kafka_raw = df_kafka_streaming

# 6. Cast the 'value' column to string and parse it as JSON with the schema;
#    the parsed struct is exposed as a single column named 'data'.
_payload_text = col("value").cast("string")
_parsed_payload = from_json(_payload_text, schema).alias("data")
df_parsed = df_kafka_raw.select(_parsed_payload)

# 7. Expand the 'data' struct so every field becomes its own column.
df_final = df_parsed.select("data.*")

# 8. Rename the 'Id' column to '_id'.
#    NOTE(review): presumably so MongoDB uses it as the document key — confirm.
df_mongo = df_final.withColumnRenamed("Id", "_id")
# NOTE(review): DataFrame.display() appears to rely on the notebook runtime
# rather than open-source PySpark — confirm before running elsewhere.
df_mongo.display()

# 9. Write the stream to MongoDB Atlas in append mode.
# All sink options, including the checkpoint location, are collected in one
# mapping and applied with a single .options() call.
_mongo_sink_options = {
    "spark.mongodb.connection.uri": mongo_uri,
    "spark.mongodb.database": mongo_db,
    "spark.mongodb.collection": mongo_collection,
    "checkpointLocation": "dbfs:/mnt/checkpoints/pagos_nrt",
}
(
    df_mongo.writeStream
    .format("mongodb")
    .options(**_mongo_sink_options)
    .outputMode("append")
    .start()
)

# Alternative checkpoint location, kept for reference:
#   .option("checkpointLocation", "/tmp/checkpoint_pagos")