# 📊 Get All Data from Elasticsearch

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

In [10]:
# Build (or reuse) the Spark session used by every cell below.
# The Elasticsearch connector is requested through spark.jars.packages, and the
# es.* settings point the session at host "elasticsearch" on port 9200.
spark = (
    SparkSession.builder
    .appName("GetAllData")
    .master("local[*]")
    .config("spark.es.nodes", "elasticsearch")
    .config("spark.es.port", "9200")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.14.3")
    .getOrCreate()
)

# Print the application id as a quick confirmation that the session is up
print(f"Application ID: {spark.sparkContext.applicationId}")

Application ID: local-1757837416759


In [12]:
# Load the 2_people_data_2k index into a DataFrame through the
# Elasticsearch Spark SQL data source
print("Reading from 2_people_data_2k index...")
people_df = (
    spark.read
    .format("org.elasticsearch.spark.sql")
    .option("es.nodes", "elasticsearch")
    .option("es.port", "9200")
    .option("es.resource", "2_people_data_2k")
    .load()
)

# Count the rows in the loaded DataFrame and report the total
total_count = people_df.count()
print(f"Total records: {total_count}")

Reading from 2_people_data_2k index...
Total records: 2001


In [14]:
# Preview the first 10 rows to eyeball the columns
print("Sample data (first 10 records):")
people_df.show(n=10)

Sample data (first 10 records):
+--------------------+--------+---+--------------------+--------------------+--------------+---+--------------------+--------------------+---------------+
|          @timestamp|@version|age|               event|            filename|          host| id|                 log|             message|           name|
+--------------------+--------+---+--------------------+--------------------+--------------+---+--------------------+--------------------+---------------+
|2025-09-14 08:09:...|       1| 63| {642,Lê Tấn Lâm,63}|2_people_data_2k.csv|{00656bc1591c}|642|{{/usr/share/logs...|   642,Lê Tấn Lâm,63|     Lê Tấn Lâm|
|2025-09-14 08:09:...|       1| 53| {643,Lê Hải Lâm,53}|2_people_data_2k.csv|{00656bc1591c}|643|{{/usr/share/logs...|   643,Lê Hải Lâm,53|     Lê Hải Lâm|
|2025-09-14 08:09:...|       1| 46|{644,Nguyễn Đức H...|2_people_data_2k.csv|{00656bc1591c}|644|{{/usr/share/logs...|644,Nguyễn Đức Hạ...|Nguyễn Đức Hạnh|
|2025-09-14 08:09:...|       1| 38|{64

In [16]:
# Number of records per age value, sorted by age ascending
print("Age distribution:")
age_dist = (
    people_df
    .groupBy("age")
    .count()
    .orderBy("age")
)
# Print only the first 20 age buckets
age_dist.show(n=20)

Age distribution:
+---+-----+
|age|count|
+---+-----+
|  0|    1|
| 18|   41|
| 19|   43|
| 20|   26|
| 21|   28|
| 22|   38|
| 23|   27|
| 24|   41|
| 25|   33|
| 26|   39|
| 27|   37|
| 28|   25|
| 29|   37|
| 30|   33|
| 31|   25|
| 32|   32|
| 33|   37|
| 34|   33|
| 35|   35|
| 36|   36|
+---+-----+
only showing top 20 rows



In [18]:
# Rank names by how many records carry them, most frequent first
print("Most common names (top 10):")
name_stats = (
    people_df
    .groupBy("name")
    .count()
    .orderBy("count", ascending=False)
)
# NOTE(review): "count" is the only sort key, so the relative order of names
# with equal counts is not pinned by this query.
name_stats.show(n=10)

Most common names (top 10):
+----------+-----+
|      name|count|
+----------+-----+
| Hoàng Bảo|   14|
|Trần Thành|   12|
| Bùi Thành|   10|
|   Mai Bảo|   10|
| Hoàng Chi|    9|
| Phạm Hạnh|    8|
|Đặng Thành|    8|
| Dương Mai|    8|
|Bùi Phương|    8|
|Dương Tùng|    8|
+----------+-----+
only showing top 10 rows



In [19]:
# Shut down the Spark session created at the top of the notebook.
# After this cell runs, the DataFrames defined above (people_df, age_dist,
# name_stats) can no longer be evaluated; re-run the session cell to continue.
spark.stop()
print("Spark session stopped.")

Spark session stopped.
