# 🗑️ Delete 5 Records

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import requests

In [None]:
# Build (or reuse) a local Spark session configured for Elasticsearch access
spark = (
    SparkSession.builder
    .appName("Delete5Records")
    .master("local[*]")
    .config("spark.es.nodes", "elasticsearch")
    .config("spark.es.port", "9200")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.jars.packages", "org.elasticsearch:elasticsearch-spark-30_2.12:8.14.3")
    .getOrCreate()
)

# Echo the application ID so this run can be identified
print(f"Application ID: {spark.sparkContext.applicationId}")

In [None]:
def delete_record_by_id(doc_id, timeout=10):
    """Delete one document from the Elasticsearch index via HTTP DELETE.

    Args:
        doc_id: Value placed in the request URL as the document ``_id``.
            NOTE(review): assumes the ``_id`` equals the record's ``id``
            field -- confirm against how the index was written.
        timeout: Seconds to wait for Elasticsearch before giving up, so an
            unreachable node cannot hang the notebook indefinitely.

    Returns:
        True if Elasticsearch answered with HTTP 200; False for any other
        status code or when the request itself failed.
    """
    url = f"http://elasticsearch:9200/2_people_data_2k_spark/_doc/{doc_id}"
    try:
        # refresh=true makes the delete visible to searches immediately, so a
        # verification read issued right after this call does not return
        # documents that were already removed.
        response = requests.delete(
            url,
            params={"refresh": "true"},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs, etc.
        print(f"Error deleting record {doc_id}: {e}")
        return False

    if response.status_code != 200:
        # Surface the status so the caller's failure message can be diagnosed.
        print(f"Delete of record {doc_id} returned HTTP {response.status_code}")
        return False
    return True

In [None]:
# Target record IDs for this cleanup run
delete_ids = [24410114, 24410100, 24410109, 24410092, 24410040]
print(f"Deleting records with IDs: {delete_ids}")

# Load the index as a DataFrame so the targeted rows can be inspected first
current_df = (
    spark.read.format("org.elasticsearch.spark.sql")
    .option("es.nodes", "elasticsearch")
    .option("es.port", "9200")
    .option("es.resource", "2_people_data_2k_spark")
    .load()
)

# Preview the rows whose "id" is in the delete list, ahead of the deletion
id_is_targeted = col("id").isin(delete_ids)
records_to_delete = current_df.filter(id_is_targeted)
print(f"Records to delete ({records_to_delete.count()}):")
records_to_delete.show()

In [None]:
# Issue one HTTP DELETE per ID and tally the successes
print("Deleting records...")
deleted_count = 0
for doc_id in delete_ids:
    if not delete_record_by_id(doc_id):
        print(f"✗ Failed to delete record ID: {doc_id}")
        continue
    print(f"✓ Deleted record ID: {doc_id}")
    deleted_count += 1

print(f"Deleted {deleted_count} out of {len(delete_ids)} records")

In [None]:
# Re-read the index and check whether any of the targeted IDs are still present
print("Verifying deletion...")
updated_df = (
    spark.read.format("org.elasticsearch.spark.sql")
    .option("es.nodes", "elasticsearch")
    .option("es.port", "9200")
    .option("es.resource", "2_people_data_2k_spark")
    .load()
)

remaining_records = updated_df.filter(col("id").isin(delete_ids))
# Evaluated left to right: leftover rows first, then the whole index
remaining_count, total_count = remaining_records.count(), updated_df.count()

print(f"Remaining records with deleted IDs: {remaining_count}")
print(f"Total records after deletion: {total_count}")

# Only display rows when at least one targeted record is still there
if remaining_count:
    print("Remaining records:")
    remaining_records.show()

In [None]:
# Stop the Spark session created at the top of the notebook; the DataFrames
# built from it above cannot be used after this point.
spark.stop()
print("Spark session stopped.")