# 📊 Get All Data from Elasticsearch

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

In [None]:
# Build (or reuse) a local Spark session configured for the Elasticsearch
# connector. The builder chain is wrapped in parentheses so that no
# backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName("GetAllData")
    .master("local[*]")
    # Connector defaults: Elasticsearch host and port.
    .config("spark.es.nodes", "elasticsearch")
    .config("spark.es.port", "9200")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Maven coordinates of the elasticsearch-spark connector to load.
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.14.3")
    .getOrCreate()
)

# Print the application ID of the session that was just obtained.
print(f"Application ID: {spark.sparkContext.applicationId}")

In [None]:
# Load the 2_people_data_2k index into a DataFrame via the
# Elasticsearch Spark SQL data source.
print("Reading from 2_people_data_2k index...")
people_df = (
    spark.read
    .format("org.elasticsearch.spark.sql")
    # Connector settings are passed in one call; the dotted keys are not
    # valid Python identifiers, hence the dict unpacking.
    .options(
        **{
            "es.nodes": "elasticsearch",
            "es.port": "9200",
            "es.resource": "2_people_data_2k",
        }
    )
    .load()
)

# Count the rows (this triggers a Spark action) and report the total.
total_count = people_df.count()
print(f"Total records: {total_count}")

In [None]:
# Preview the first ten rows of the loaded DataFrame.
print("Sample data (first 10 records):")
people_df.show(n=10)

In [None]:
# Count records per distinct age, sorted by age ascending, and display
# up to twenty rows of the result.
print("Age distribution:")
age_dist = (
    people_df
    .groupBy("age")
    .count()
    .orderBy("age")
)
age_dist.show(n=20)

In [None]:
# Count records per name and rank the names from most to least frequent;
# only the ten most frequent names are displayed.
print("Most common names (top 10):")
name_stats = (
    people_df
    .groupBy("name")
    .count()
    .orderBy("count", ascending=False)
)
name_stats.show(n=10)

In [None]:
# Shut down the Spark session now that the analysis is finished,
# then confirm the shutdown on stdout.
spark.stop()
print("Spark session stopped.")