# 🔄 Update 5 Records

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create Spark Session
spark = SparkSession.builder \
    .appName("Update5Records") \
    .master("local[*]") \
    .config("spark.es.nodes", "elasticsearch") \
    .config("spark.es.port", "9200") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.14.3") \
    .getOrCreate()

print(f"Application ID: {spark.sparkContext.applicationId}")

Application ID: local-1757837541431


In [3]:
# Define updated records data
updated_records_data = [
    (24410114, "Tran Trieu Thuan updated", 30),
    (24410100, "Nguyen Phuong Tan updated", 30),
    (24410109, "Nguyen Thi Thu Thao updated", 28),
    (24410092, "Huynh Duy Quoc updated", 35),
    (24410040, "Ha Huy Hung updated", 22)
]

update_ids = [record[0] for record in updated_records_data]
print(f"Updating records with IDs: {update_ids}")

# Read current data
current_df = spark.read \
    .format("org.elasticsearch.spark.sql") \
    .option("es.nodes", "elasticsearch") \
    .option("es.port", "9200") \
    .option("es.resource", "2_people_data_2k") \
    .load()

Updating records with IDs: [24410114, 24410100, 24410109, 24410092, 24410040]


In [4]:
# Show records before update
records_before_update = current_df.filter(col("id").isin(update_ids))
print("Records before update:")
records_before_update.show()

Records before update:
+----------+--------+---+-----+--------+----+--------+----+-------+-------------------+
|@timestamp|@version|age|event|filename|host|      id| log|message|               name|
+----------+--------+---+-----+--------+----+--------+----+-------+-------------------+
|      NULL|    NULL| 28| NULL|    NULL|NULL|24410109|NULL|   NULL|Nguyen Thi Thu Thao|
|      NULL|    NULL| 30| NULL|    NULL|NULL|24410100|NULL|   NULL|  Nguyen Phuong Tan|
|      NULL|    NULL| 30| NULL|    NULL|NULL|24410114|NULL|   NULL|   Tran Trieu Thuan|
|      NULL|    NULL| 35| NULL|    NULL|NULL|24410092|NULL|   NULL|     Huynh Duy Quoc|
|      NULL|    NULL| 22| NULL|    NULL|NULL|24410040|NULL|   NULL|        Ha Huy Hung|
+----------+--------+---+-----+--------+----+--------+----+-------+-------------------+



In [6]:
# Create DataFrame with updated data
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])

updated_df = spark.createDataFrame(updated_records_data, schema)

print("Records after update:")
updated_df.show()

Records after update:
+--------+--------------------+---+
|      id|                name|age|
+--------+--------------------+---+
|24410114|Tran Trieu Thuan ...| 30|
|24410100|Nguyen Phuong Tan...| 30|
|24410109|Nguyen Thi Thu Th...| 28|
|24410092|Huynh Duy Quoc up...| 35|
|24410040| Ha Huy Hung updated| 22|
+--------+--------------------+---+



In [7]:
# Write updated records back to Elasticsearch
print("Writing updated records to Elasticsearch...")
try:
    updated_df.write \
        .format("org.elasticsearch.spark.sql") \
        .option("es.nodes", "elasticsearch") \
        .option("es.port", "9200") \
        .option("es.resource", "2_people_data_2k") \
        .option("es.mapping.id", "id") \
        .option("es.batch.size.bytes", "10mb") \
        .option("es.batch.size.entries", "1000") \
        .mode("append") \
        .save()
    print("✓ Records updated successfully!")
except Exception as e:
    print(f"✗ Update failed: {e}")

Writing updated records to Elasticsearch...
✓ Records updated successfully!


In [None]:
# Verify update
print("\nVerifying update...")
# Read fresh data to verify the update
verification_df = spark.read \
    .format("org.elasticsearch.spark.sql") \
    .option("es.nodes", "elasticsearch") \
    .option("es.port", "9200") \
    .option("es.resource", "2_people_data_2k") \
    .load()

updated_records = verification_df.filter(col("id").isin(update_ids))
updated_records.show()

In [None]:
# Stop Spark session
spark.stop()
print("Spark session stopped.")