In [1]:
!pip install pandas geopandas matplotlib faker pyspark pyarrow

Collecting geopandas
  Downloading geopandas-0.13.2-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting shapely>=1.7.1
  Downloading shapely-2.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting pyproj>=3.0.1
  Downloading pyproj-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting fiona>=1.8.19
  Downloading fiona-1.9.6-cp38-cp38-manylinux2014_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting click-plugins>

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, TimestampType

# Create (or reuse) the Spark session used by the streaming job
spark = (
    SparkSession.builder
    .appName("SparkStreamingFromSocket")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Schema of the JSON records expected on the stream
schema = (
    StructType()
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
    .add("date", TimestampType())
    .add("customer_id", StringType())
    .add("employee_id", StringType())
    .add("quantity_products", IntegerType())
    .add("order_id", StringType())
    .add("commune_code", StringType())
    .add("commune_name", StringType())
    .add("customer_name", StringType())
    .add("employee_name", StringType())
    .add("employee_commission", DoubleType())
)

# Read the raw stream from the socket at localhost:12345
streaming_df = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 12345)
    .load()
)

# Parse the "value" column as JSON with the schema above and
# expand the resulting struct into one column per field
parsed_df = (
    streaming_df
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
    .select("parsed_value.*")
)

# Handler invoked for every micro-batch received from the stream
def process_data(df, epoch_id):
    """Append one micro-batch to the bronze Parquet location and display it.

    The batch is persisted before use because two actions run on it (the
    Parquet write and ``show``); without caching, Spark may recompute the
    batch for each action. The cache is always released afterwards.

    Args:
        df: The micro-batch DataFrame handed over by ``foreachBatch``.
        epoch_id: Identifier of the micro-batch (currently unused).

    Returns:
        None. Any exception raised while processing is caught and printed,
        so a failing batch does not stop the streaming query.
    """
    # Target directory; resolved against the session's default filesystem
    # (presumably HDFS, given the variable name -- confirm for the deployment).
    hdfs_path = "/user/root/bronze"
    try:
        df.persist()
        try:
            df.write \
              .format("parquet") \
              .mode("append") \
              .save(hdfs_path)
            df.show(truncate=False)
        finally:
            # Release the cached batch even when the write or show failed.
            df.unpersist()
    except Exception as e:
        print(f"Error al procesar los datos: {e}")
        
# Start the streaming query: each micro-batch is handed to process_data
query = (
    parsed_df.writeStream
    .foreachBatch(process_data)
    .outputMode("append")
    .start()
)

# Keep the stream running: block until the query stops or fails
query.awaitTermination()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 22:17:25 ERROR streaming.MicroBatchExecution: Query [id = 3b50c140-81d7-4359-b8fa-468205acd6cd, runId = fbe7fb6c-59b3-4b12-8dd6-dde928db893d] terminated with error
java.net.ConnectException: Connection refused (Connection refused)
	at java.net.PlainSocketImpl.socketConnect(Native Method)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:607)
	at java.net.Socket.connect(Socket.java:556)
	at java.net.Socket.<init>(Socket.java:452)
	at java.net.Socket.<init>(Socket.java:229)
	at org.apache.spark.sql.execution.streaming.sources.TextSocketMicroBatchStream.ini

StreamingQueryException: Connection refused (Connection refused)
=== Streaming Query ===
Identifier: [id = 3b50c140-81d7-4359-b8fa-468205acd6cd, runId = fbe7fb6c-59b3-4b12-8dd6-dde928db893d]
Current Committed Offsets: {}
Current Available Offsets: {TextSocketV2[host: localhost, port: 12345]: -1}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Project [parsed_value#2.latitude AS latitude#4, parsed_value#2.longitude AS longitude#5, parsed_value#2.date AS date#6, parsed_value#2.customer_id AS customer_id#7, parsed_value#2.employee_id AS employee_id#8, parsed_value#2.quantity_products AS quantity_products#9, parsed_value#2.order_id AS order_id#10, parsed_value#2.commune_code AS commune_code#11, parsed_value#2.commune_name AS commune_name#12, parsed_value#2.customer_name AS customer_name#13, parsed_value#2.employee_name AS employee_name#14, parsed_value#2.employee_commission AS employee_commission#15]
+- Project [from_json(StructField(latitude,DoubleType,true), StructField(longitude,DoubleType,true), StructField(date,TimestampType,true), StructField(customer_id,StringType,true), StructField(employee_id,StringType,true), StructField(quantity_products,IntegerType,true), StructField(order_id,StringType,true), StructField(commune_code,StringType,true), StructField(commune_name,StringType,true), StructField(customer_name,StringType,true), StructField(employee_name,StringType,true), StructField(employee_commission,DoubleType,true), cast(value#0 as string), Some(Etc/UTC)) AS parsed_value#2]
   +- StreamingDataSourceV2Relation [value#0], org.apache.spark.sql.execution.streaming.sources.TextSocketTable$$anon$1@675992a8, TextSocketV2[host: localhost, port: 12345]


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, TimestampType

# Create (or reuse) the Spark session used by the streaming job
spark = (
    SparkSession.builder
    .appName("SparkStreamingFromSocket")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Schema of the JSON records expected on the stream
schema = (
    StructType()
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
    .add("date", TimestampType())
    .add("customer_id", StringType())
    .add("employee_id", StringType())
    .add("quantity_products", IntegerType())
    .add("order_id", StringType())
    .add("commune_code", StringType())
    .add("commune_name", StringType())
    .add("customer_name", StringType())
    .add("employee_name", StringType())
    .add("employee_commission", DoubleType())
)

# Read the raw stream from the socket at localhost:12345
streaming_df = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 12345)
    .load()
)

# Parse the "value" column as JSON with the schema above and
# expand the resulting struct into one column per field
parsed_df = (
    streaming_df
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
    .select("parsed_value.*")
)

# Handler invoked for every micro-batch received from the stream
def process_data(df, epoch_id):
    """Append one micro-batch to the bronze Parquet location and display it.

    The batch is persisted before use because two actions run on it (the
    Parquet write and ``show``); without caching, Spark may recompute the
    batch for each action. The cache is always released afterwards.

    Args:
        df: The micro-batch DataFrame handed over by ``foreachBatch``.
        epoch_id: Identifier of the micro-batch (currently unused).

    Returns:
        None. Any exception raised while processing is caught and printed,
        so a failing batch does not stop the streaming query.
    """
    # Target directory; resolved against the session's default filesystem
    # (presumably HDFS, given the variable name -- confirm for the deployment).
    hdfs_path = "/user/root/bronze"
    try:
        df.persist()
        try:
            df.write \
              .format("parquet") \
              .mode("append") \
              .save(hdfs_path)
            df.show(truncate=False)
        finally:
            # Release the cached batch even when the write or show failed.
            df.unpersist()
    except Exception as e:
        print(f"Error al procesar los datos: {e}")
        
# Start the streaming query: each micro-batch is handed to process_data
query = (
    parsed_df.writeStream
    .foreachBatch(process_data)
    .outputMode("append")
    .start()
)

# Keep the stream running: block until the query stops or fails
query.awaitTermination()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/16 22:17:25 ERROR streaming.MicroBatchExecution: Query [id = 3b50c140-81d7-4359-b8fa-468205acd6cd, runId = fbe7fb6c-59b3-4b12-8dd6-dde928db893d] terminated with error
java.net.ConnectException: Connection refused (Connection refused)
	at java.net.PlainSocketImpl.socketConnect(Native Method)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:607)
	at java.net.Socket.connect(Socket.java:556)
	at java.net.Socket.<init>(Socket.java:452)
	at java.net.Socket.<init>(Socket.java:229)
	at org.apache.spark.sql.execution.streaming.sources.TextSocketMicroBatchStream.ini

StreamingQueryException: Connection refused (Connection refused)
=== Streaming Query ===
Identifier: [id = 3b50c140-81d7-4359-b8fa-468205acd6cd, runId = fbe7fb6c-59b3-4b12-8dd6-dde928db893d]
Current Committed Offsets: {}
Current Available Offsets: {TextSocketV2[host: localhost, port: 12345]: -1}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Project [parsed_value#2.latitude AS latitude#4, parsed_value#2.longitude AS longitude#5, parsed_value#2.date AS date#6, parsed_value#2.customer_id AS customer_id#7, parsed_value#2.employee_id AS employee_id#8, parsed_value#2.quantity_products AS quantity_products#9, parsed_value#2.order_id AS order_id#10, parsed_value#2.commune_code AS commune_code#11, parsed_value#2.commune_name AS commune_name#12, parsed_value#2.customer_name AS customer_name#13, parsed_value#2.employee_name AS employee_name#14, parsed_value#2.employee_commission AS employee_commission#15]
+- Project [from_json(StructField(latitude,DoubleType,true), StructField(longitude,DoubleType,true), StructField(date,TimestampType,true), StructField(customer_id,StringType,true), StructField(employee_id,StringType,true), StructField(quantity_products,IntegerType,true), StructField(order_id,StringType,true), StructField(commune_code,StringType,true), StructField(commune_name,StringType,true), StructField(customer_name,StringType,true), StructField(employee_name,StringType,true), StructField(employee_commission,DoubleType,true), cast(value#0 as string), Some(Etc/UTC)) AS parsed_value#2]
   +- StreamingDataSourceV2Relation [value#0], org.apache.spark.sql.execution.streaming.sources.TextSocketTable$$anon$1@675992a8, TextSocketV2[host: localhost, port: 12345]


In [3]:
# List the files in the notebook's current working directory.
!ls

Gen_Datos.sh		ambiente_python.txt    leer_fuentes.py	spark.txt
Hilos_Multiples_Gen.py	data		       simulacion.py
Socket.py		grafica_simulacion.py  spark.ipynb
