In [1]:
!pip install pandas geopandas matplotlib faker pyspark pyarrow

[0m--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1752, in print
    extend(render(renderable, render_options))
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1390, in render
    for render_output in iter_render:
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/segment.py", line 245, in split_lines
    for segment in segments:
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/console.py", line 1368, in render
    renderable = rich_cast(renderable)
  File "/usr/local/lib/python3.8/dist-packages/pip/_vendor/rich/protocol.py", lin

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, TimestampType

# Crear sesión de Spark
spark = SparkSession \
    .builder \
    .appName("SparkStreamingFromSocket") \
    .master("local[*]") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "4") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

# Definir el esquema para los datos JSON que se recibirán
schema = StructType() \
    .add("latitude", DoubleType()) \
    .add("longitude", DoubleType()) \
    .add("date", TimestampType()) \
    .add("customer_id", StringType()) \
    .add("employee_id", StringType()) \
    .add("quantity_products", IntegerType()) \
    .add("order_id", StringType()) \
    .add("commune_code", StringType()) \
    .add("commune_name", StringType()) \
    .add("customer_name", StringType()) \
    .add("employee_name", StringType()) \
    .add("employee_commission", DoubleType())

# Leer datos desde el socket
streaming_df = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 12345) \
    .load()

# Parsear los datos JSON utilizando el esquema definido
parsed_df = streaming_df \
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value")) \
    .select("parsed_value.*")

# Función para guardar los datos recibidos en bronze
def process_data(df, epoch_id):
    try:
        hdfs_path = "/user/root/bronze"
        df.write \
          .format("parquet") \
          .mode("append") \
          .save(hdfs_path)
        df.show(truncate=False)
    except Exception as e:
        print(f"Error al procesar los datos: {e}")
        
# Escribir los resultados en la consola
query = parsed_df \
    .writeStream \
    .foreachBatch(process_data) \
    .outputMode("append") \
    .start()

# Mantener el stream en ejecución
query.awaitTermination()

+--------+---------+----+-----------+-----------+-----------------+--------+------------+------------+-------------+-------------+-------------------+
|latitude|longitude|date|customer_id|employee_id|quantity_products|order_id|commune_code|commune_name|customer_name|employee_name|employee_commission|
+--------+---------+----+-----------+-----------+-----------------+--------+------------+------------+-------------+-------------+-------------------+
+--------+---------+----+-----------+-----------+-----------------+--------+------------+------------+-------------+-------------+-------------------+



                                                                                

+------------------+------------------+-------------------+-----------+-----------+-----------------+----------+------------+------------------------------------------+------------------+-----------------+-------------------+
|latitude          |longitude         |date               |customer_id|employee_id|quantity_products|order_id  |commune_code|commune_name                              |customer_name     |employee_name    |employee_commission|
+------------------+------------------+-------------------+-----------+-----------+-----------------+----------+------------+------------------------------------------+------------------+-----------------+-------------------+
|6.230073881503961 |-75.4992337499913 |2024-02-09 06:15:09|5041       |6337       |64               |463624432 |90          |CORREGIMIENTO DE SANTA ELENA              |Benedict Underwood|Phyllis Hubbard  |0.1                |
|6.248935218793185 |-75.61616199766836|2023-01-04 16:52:51|3690       |5668       |85           

                                                                                

+-----------------+------------------+-------------------+-----------+-----------+-----------------+----------+------------+-------------------------------------+------------------+---------------+-------------------+
|latitude         |longitude         |date               |customer_id|employee_id|quantity_products|order_id  |commune_code|commune_name                         |customer_name     |employee_name  |employee_commission|
+-----------------+------------------+-------------------+-----------+-----------+-----------------+----------+------------+-------------------------------------+------------------+---------------+-------------------+
|6.178295435372668|-75.53388470694372|2024-06-03 03:38:02|8033       |5668       |29               |2843614140|90          |CORREGIMIENTO DE SANTA ELENA         |Justine Burch     |Melanie Ball   |0.18               |
|6.251426142581339|-75.60798102540919|2024-02-16 18:21:31|7062       |9438       |98               |5374211788|12          |LA A

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, expr, round, lit, year, month, dayofmonth, trim
import os 

# Obtener la sesión de Spark existente si está activa
spark = SparkSession.builder.getOrCreate()

# Función para leer archivos Parquet desde HDFS
def leer_archivos_parquet(path: str) -> DataFrame:
    try:
        # Verificar la existencia del archivo antes de leerlo
        if os.system(f"hdfs dfs -test -e {path}") == 0:
            return spark.read.parquet(path)
        else:
            print(f"El archivo Parquet {path} no existe.")
            return None
    except Exception as e:
        print(f"Error al leer el archivo Parquet {path}: {e}")
        return None
    
# Ruta de bronze
bronze_path = "hdfs:///user/root/bronze"  

# Leer archivos Parquet desde el directorio en HDFS
df_bronze = leer_archivos_parquet(bronze_path)

# Función para agregar una columna con valor constante al precio y dividir la fecha
def transformar_df(df: DataFrame) -> DataFrame:
        df_transformado = df \
            .withColumn("price", lit(3500)) \
            .withColumn("sales", col("quantity_products") * col("price")) \
            .withColumn("commission_value", round(col("sales") * col("employee_commission"), 0)) \
            .withColumn("customer_name", trim(col("customer_name"))) \
            .withColumn("employee_name", trim(col("employee_name"))) \
            .withColumn("commune_name", trim(col("commune_name"))) \
            .withColumn("year", year(col("date"))) \
            .withColumn("month", month(col("date"))) \
            .withColumn("day", dayofmonth(col("date")))
        return df_transformado

# Función para guardar DataFrame en un archivo Parquet en la capa Silver, siempre sobreescribe
def guardar_archivo_parquet(df: DataFrame, path: str) -> None:
        df.coalesce(1).write.mode("overwrite").parquet(path)

# Función principal para unir archivos de Bronze a Silver
def unir_archivos_bronze_a_silver(bronze_path: str, silver_path: str) -> None:
    # Leer archivos Parquet desde la capa Bronze
    df_bronze = leer_archivos_parquet(bronze_path)

    # Transformar el DataFrame de Bronze
    df_transformado = transformar_df(df_bronze)

    # Guardar el DataFrame transformado en la capa Silver
    guardar_archivo_parquet(df_transformado, silver_path)

    # Leer archivos Parquet desde la capa Silver después de transformar
    df_silver_transformado = leer_archivos_parquet(silver_path)

    if df_silver_transformado is not None:
        # Contar la cantidad de registros en la capa Silver después de transformar
        records_processed = df_silver_transformado.count()
        print(f"Cantidad de registros procesados en silver después de transformar: {records_processed}")
    
    # Mostrar el DataFrame transformado (opcional)
    df_transformado.printSchema()
    df_transformado.show()

# Rutas de ejemplo ajustadas
bronze_path = "hdfs:///user/root/bronze"  
silver_path = "hdfs:///user/root/silver/unificado.parquet"  

# Ejecutar el proceso de unión
unir_archivos_bronze_a_silver(bronze_path, silver_path)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

Cantidad de registros procesados en silver después de transformar: 590
root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- employee_id: string (nullable = true)
 |-- quantity_products: integer (nullable = true)
 |-- order_id: string (nullable = true)
 |-- commune_code: string (nullable = true)
 |-- commune_name: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- employee_commission: double (nullable = true)
 |-- price: integer (nullable = false)
 |-- sales: integer (nullable = true)
 |-- commission_value: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

+------------------+------------------+-------------------+-----------+-----------+-----------------+----------+------------+--------------------+----------------

In [28]:
#!hdfs dfs -ls /user/root/silver/unificado.parquet
#!hdfs dfs -mkdir -p /user/root/gold
#!hdfs dfs -ls /user/root/bronze
#!hdfs dfs -copyFromLocal medellin_neighborhoods.parquet /user/root/bronze
#!hdfs dfs -ls /user/root/silver
#!hdfs dfs -ls /user/root/gold
#!hdfs dfs -ls /user/root/bronze
!hdfs dfs -ls /user/root/bronze | grep medellin_neighborhoods.parquet

-rw-r--r--   1 root supergroup    2287806 2024-06-17 04:19 /user/root/bronze/medellin_neighborhoods.parquet


In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType
from shapely.geometry import Point
import geopandas as gpd
import matplotlib.pyplot as plt

# Crear la sesión de Spark con configuración ajustada
spark = SparkSession.builder \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

# Definir la ruta del archivo Parquet con datos de ventas en la capa Silver
cargue_inicial_path = 'hdfs:///user/root/silver/unificado.parquet'

try:
    # Cargar el archivo Parquet de cargue inicial como un DataFrame de Spark
    df_cargue = spark.read.parquet(cargue_inicial_path).limit(10)  # Limitar a 10 registros para pruebas

    # Definir la función Python para crear puntos a partir de latitud y longitud
    def create_point(longitude, latitude):
        return Point(float(longitude), float(latitude))

    # Registrar la función como una UDF (User Defined Function) en Spark
    create_point_udf = udf(create_point, returnType=FloatType())

    # Convertir las columnas de longitud y latitud a tipo Float y crear columna 'geom'
    df_cargue = df_cargue.withColumn('longitude', col('longitude').cast(FloatType())) \
                         .withColumn('latitude', col('latitude').cast(FloatType())) \
                         .withColumn('geom', create_point_udf(col('longitude'), col('latitude')))
    df_cargue.show()
#     # Convertir el DataFrame de Spark en un GeoDataFrame de GeoPandas
#     gdf_cargue = gpd.GeoDataFrame(df_cargue.toPandas(), geometry='geom')
    
#     # Imprimir las primeras filas para verificar la conversión
#     print("Primeras filas de GeoDataFrame:")
#     print(gdf_cargue.head())

#     # Definir la ruta del archivo Parquet con geometrías de Medellín en la capa Bronze
#     medellin_neighborhoods_path = 'hdfs:///user/root/bronze/medellin_neighborhoods.parquet'
    
#     # Cargar el archivo Parquet de geometrías de Medellín como un GeoDataFrame de GeoPandas
#     medellin_neighborhoods = gpd.read_parquet(medellin_neighborhoods_path)
    
#     # Imprimir las primeras filas para verificar la carga de datos
#     print("\nPrimeras filas de medellin_neighborhoods GeoDataFrame:")
#     print(medellin_neighborhoods.head())

# #     # Crear el gráfico con las geometrías de Medellín y los puntos del cargue inicial
# #     fig, ax = plt.subplots(figsize=(20, 20))
# #     medellin_neighborhoods.plot(ax=ax, color='lightgrey', edgecolor='darkblue')
# #     gdf_cargue.plot(ax=ax, color='blue', markersize=10, alpha=0.6)

# #     plt.title('Datos simulados de ventas en Medellín y ubicaciones de clientes')
# #     plt.xlabel('Longitud')
# #     plt.ylabel('Latitud')
# #     plt.grid(True)
# #     plt.show()
    
# #     # Guardar la figura en un archivo si la visualización es correcta
# #     plt.savefig('./data/medellin_neighborhoods_simulacion.png')
# #     plt.close()

except Exception as e:
    print(f"Error al convertir o visualizar los datos con GeoPandas: {str(e)}")


Error al convertir o visualizar los datos con GeoPandas: An error occurred while calling o568.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 45.0 failed 1 times, most recent failure: Lost task 0.0 in stage 45.0 (TID 45) (0f90e151db83 executor driver): net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for shapely.io.from_wkb)
	at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
	at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:773)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:213)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:123)
	at net.razorvine.pickle.Unpickler.loads(Unpickler.java:136)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.$anonfun$evaluate$6(BatchEvalPythonExec.scala:94)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator

24/06/17 04:46:44 ERROR executor.Executor: Exception in task 0.0 in stage 45.0 (TID 45)
net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for shapely.io.from_wkb)
	at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
	at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:773)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:213)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:123)
	at net.razorvine.pickle.Unpickler.loads(Unpickler.java:136)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.$anonfun$evaluate$6(BatchEvalPythonExec.scala:94)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$Generat