# In [0]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, IntegerType, StringType, DoubleType

# 1. Create the Spark session.
#    "spark.jars.packages" takes a comma-separated list of Maven coordinates;
#    here it pulls in the Kafka source and the MongoDB connector.
spark = (
    SparkSession.builder
    .appName("KafkaToMongoBatchJob")
    .config(
        "spark.jars.packages",
        ",".join([
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0",
            "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
        ]),
    )
    .getOrCreate()
)

# 2. Schema of the JSON payload carried in each Kafka message.
#    Field names are kept exactly as they appear in the payload.
schema = (
    StructType()
    .add("Id", IntegerType())
    .add("ClienteId", StringType())
    .add("Monto", DoubleType())
    .add("FechaPago", StringType())
    .add("MetodoPago", StringType())
    .add("Estado", StringType())
)

# 3. Read messages from Kafka as a streaming source. The trigger that decides
#    how the stream is processed is set on the writer, not here.
# === Kafka configuration (Confluent Cloud) ===
# Secrets are read from the environment instead of being committed to source
# control. os.environ[...] raises KeyError right away when a required variable
# is missing, so a misconfigured job fails before contacting the broker.
# NOTE(review): the API key/secret that used to be hard-coded in this file
# should be treated as compromised and rotated.
kafka_bootstrap_servers = os.environ.get(
    "KAFKA_BOOTSTRAP_SERVERS", "pkc-619z3.us-east1.gcp.confluent.cloud:9092"
)
kafka_topic = os.environ.get("KAFKA_TOPIC", "Pagos")
kafka_sasl_username = os.environ["KAFKA_SASL_USERNAME"]
kafka_sasl_password = os.environ["KAFKA_SASL_PASSWORD"]

kafka_options = {
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": kafka_topic,
    # SASL/PLAIN over TLS.
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_sasl_username}" password="{kafka_sasl_password}";',
    # Only applies when the query starts without an existing checkpoint.
    "startingOffsets": "earliest",
}

df_kafka_streaming = spark.readStream.format("kafka").options(**kafka_options).load()

df_kafka_raw = df_kafka_streaming
# Cast the Kafka 'value' column to a string and parse it as JSON with the schema.
df_parsed = df_kafka_raw.select(
    from_json(col("value").cast("string"), schema).alias("data")
)
# Flatten the 'data' struct into top-level columns.
# from_json yields NULL for payloads it cannot parse, which would turn into
# all-null rows; rows without an Id (malformed or incomplete messages) are
# dropped so they never reach MongoDB with a null document key.
df_final = df_parsed.select("data.*").where(col("Id").isNotNull())

# Rename 'Id' to '_id' so it is written as the MongoDB document key.
df_mongo = df_final.withColumnRenamed("Id", "_id")

# 4. Write to MongoDB Atlas.
# The connection string carries the database user and password, so it is read
# from the environment rather than stored in the source; a missing MONGO_URI
# raises KeyError before the query is started.
# NOTE(review): the password that used to be hard-coded in this file should be
# treated as compromised and changed.
mongo_uri = os.environ["MONGO_URI"]
mongo_db = os.environ.get("MONGO_DB", "bddprodserv")
mongo_collection = os.environ.get("MONGO_COLLECTION", "Pagos")
# The checkpoint stores the consumed Kafka offsets. Point this at durable
# storage in production: if it is lost, the topic is re-read from 'earliest'.
checkpoint_location = os.environ.get("CHECKPOINT_LOCATION", "/tmp/checkpoint_pagos")

mongo_query = (
    df_mongo.writeStream
    .format("mongodb")
    .option("spark.mongodb.connection.uri", mongo_uri)
    .option("spark.mongodb.database", mongo_db)
    .option("spark.mongodb.collection", mongo_collection)
    .option("checkpointLocation", checkpoint_location)
    .outputMode("append")
    # Batch-style run: process everything currently available in the topic,
    # then stop, instead of running as an unbounded stream.
    .trigger(availableNow=True)
    .start()
)

# Block until the run finishes so the process does not exit before the data is
# written; this also re-raises any failure from the streaming query.
mongo_query.awaitTermination()
