# 📝 Insert 5 Records

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [None]:
# Build (or reuse) the Spark session shared by the rest of the notebook.
# The Elasticsearch connector and serializer settings are kept in one table
# so they can be scanned and extended in a single place.
session_settings = (
    ("spark.es.nodes", "elasticsearch"),
    ("spark.es.port", "9200"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.14.3"),
)

session_builder = SparkSession.builder.appName("Insert5Records").master("local[*]")
for setting_name, setting_value in session_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Print the application ID of the session's SparkContext.
print(f"Application ID: {spark.sparkContext.applicationId}")

In [None]:
# Column layout of the records to insert; every field is declared nullable.
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("id", IntegerType()),
            ("name", StringType()),
            ("age", IntegerType()),
        )
    ]
)

# The five rows to add, as (id, name, age) tuples matching `schema`.
new_data = [
    (24410114, "Tran Trieu Thuan", 30),
    (24410100, "Nguyen Phuong Tan", 30),
    (24410109, "Nguyen Thi Thu Thao", 28),
    (24410092, "Huynh Duy Quoc", 35),
    (24410040, "Ha Huy Hung", 22),
]

# Echo the rows so the notebook output shows exactly what will be written.
print("Creating 5 new records:")
for person_id, full_name, years in new_data:
    print(f"  ID: {person_id}, Name: {full_name}, Age: {years}")

In [None]:
# Turn the raw tuples into a DataFrame typed by `schema`, then preview it.
new_df = spark.createDataFrame(data=new_data, schema=schema)
new_df.show()

In [None]:
# Append the new rows to the "2_people_data_2k_spark" Elasticsearch resource,
# with `es.mapping.id` pointing at the "id" column.
# Any failure is reported with a message instead of being raised, so the
# remaining cells still run.
print("Inserting records to Elasticsearch...")
try:
    es_writer = (
        new_df.write.format("org.elasticsearch.spark.sql")
        .option("es.nodes", "elasticsearch")
        .option("es.port", "9200")
        .option("es.resource", "2_people_data_2k_spark")
        .option("es.mapping.id", "id")
        .mode("append")
    )
    es_writer.save()
    print("✓ Records inserted successfully!")
except Exception as error:
    print(f"✗ Insert failed: {error}")

In [None]:
# Read the target resource back from Elasticsearch and report its row count.
print("Verifying insertion...")
es_reader = (
    spark.read.format("org.elasticsearch.spark.sql")
    .option("es.nodes", "elasticsearch")
    .option("es.port", "9200")
    .option("es.resource", "2_people_data_2k_spark")
)
all_df = es_reader.load()

new_total = all_df.count()
print(f"Total records after insertion: {new_total}")

In [None]:
# Stop the Spark session now that the insert and verification are done.
# Cells that use `spark` must not be run after this point without first
# re-running the session-creation cell.
spark.stop()
print("Spark session stopped.")