# 📝 Insert 5 Records

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
# Create Spark Session
spark = SparkSession.builder \
    .appName("Insert5Records") \
    .master("local[*]") \
    .config("spark.es.nodes", "elasticsearch") \
    .config("spark.es.port", "9200") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.14.3") \
    .getOrCreate()

print(f"Application ID: {spark.sparkContext.applicationId}")

Application ID: local-1757837430143


In [3]:
# Define schema for new records
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])

# Create 5 new records
new_data = [
    (24410114, "Tran Trieu Thuan", 30),
    (24410100, "Nguyen Phuong Tan", 30),
    (24410109, "Nguyen Thi Thu Thao", 28),
    (24410092, "Huynh Duy Quoc", 35),
    (24410040, "Ha Huy Hung", 22)
]

print("Creating 5 new records:")
for record in new_data:
    print(f"  ID: {record[0]}, Name: {record[1]}, Age: {record[2]}")

Creating 5 new records:
  ID: 24410114, Name: Tran Trieu Thuan, Age: 30
  ID: 24410100, Name: Nguyen Phuong Tan, Age: 30
  ID: 24410109, Name: Nguyen Thi Thu Thao, Age: 28
  ID: 24410092, Name: Huynh Duy Quoc, Age: 35
  ID: 24410040, Name: Ha Huy Hung, Age: 22


In [4]:
# Create DataFrame
new_df = spark.createDataFrame(new_data, schema)
new_df.show()

+--------+-------------------+---+
|      id|               name|age|
+--------+-------------------+---+
|24410114|   Tran Trieu Thuan| 30|
|24410100|  Nguyen Phuong Tan| 30|
|24410109|Nguyen Thi Thu Thao| 28|
|24410092|     Huynh Duy Quoc| 35|
|24410040|        Ha Huy Hung| 22|
+--------+-------------------+---+



In [5]:
# Insert to Elasticsearch
print("Inserting records to Elasticsearch...")
try:
    new_df.write \
        .format("org.elasticsearch.spark.sql") \
        .option("es.nodes", "elasticsearch") \
        .option("es.port", "9200") \
        .option("es.resource", "2_people_data_2k") \
        .option("es.write.operation", "index") \
        .option("es.mapping.id", "id") \
        .option("es.batch.size.bytes", "10mb") \
        .option("es.batch.size.entries", "1000") \
        .mode("append") \
        .save()
    print("✓ Records inserted successfully!")
except Exception as e:
    print(f"✗ Insert failed: {e}")

Inserting records to Elasticsearch...
✗ Insert failed: An error occurred while calling o63.save.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 3.0 failed 1 times, most recent failure: Lost task 9.0 in stage 3.0 (TID 25) (jupyter executor driver): org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Failed to parse [10485760#ex:10MB]
	at org.elasticsearch.hadoop.util.unit.ByteSizeValue.parseBytesSizeValue(ByteSizeValue.java:177)
	at org.elasticsearch.hadoop.cfg.Settings.getBatchSizeInBytes(Settings.java:170)
	at org.elasticsearch.hadoop.rest.bulk.BulkProcessor.<init>(BulkProcessor.java:105)
	at org.elasticsearch.hadoop.rest.RestRepository.lazyInitWriting(RestRepository.java:136)
	at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:169)
	at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:83)
	at org.elasticsearch.spark.sql.EsSparkSQL$.$anonfun$saveToEs$1(EsSparkSQL.scala:103)
	at org.elasticsearch.s

In [None]:
# Verify insertion
print("Verifying insertion...")
all_df = spark.read \
    .format("org.elasticsearch.spark.sql") \
    .option("es.nodes", "elasticsearch") \
    .option("es.port", "9200") \
    .option("es.resource", "2_people_data_2k") \
    .load()

new_total = all_df.count()
print(f"Total records after insertion: {new_total}")

In [None]:
# Stop Spark session
spark.stop()
print("Spark session stopped.")