# In [0]:
import math

def calculate_executor_memory(node_memory_gb, executors_per_node, overhead_ratio=0.1, min_overhead_gb=0.384):
    """Split a node's memory evenly across executors and reserve overhead.

    Each executor gets ``node_memory_gb / executors_per_node``. From that
    share an overhead of ``max(min_overhead_gb, overhead_ratio * share)`` is
    subtracted; what remains is reported as the usable executor memory.

    NOTE(review): the defaults (10% ratio, 0.384 GB floor) presumably mirror
    Spark's executor memory-overhead rule -- confirm against the Spark
    version in use.

    Args:
        node_memory_gb: Total memory of one node, in GB.
        executors_per_node: Number of executors sharing the node; must be
            greater than zero.
        overhead_ratio: Fraction of the per-executor share reserved as
            overhead.
        min_overhead_gb: Lower bound for the overhead, in GB.

    Returns:
        A ``(executor_memory_gb, overhead_gb)`` tuple, each rounded to two
        decimals. ``executor_memory_gb`` is negative when the per-executor
        share is smaller than the overhead.

    Raises:
        ValueError: If ``executors_per_node`` is not positive.
    """
    # A zero count would divide by zero and a negative one would yield a
    # meaningless negative share, so reject both with a clear message.
    if executors_per_node <= 0:
        raise ValueError(
            f"executors_per_node must be positive, got {executors_per_node!r}"
        )

    mem_per_executor = node_memory_gb / executors_per_node
    overhead = max(min_overhead_gb, overhead_ratio * mem_per_executor)
    executor_memory_gb = mem_per_executor - overhead
    return round(executor_memory_gb, 2), round(overhead, 2)

def recommend_cluster_config(data_size_gb, partition_size_mb=128, node_memory_gb=16, cores_per_node=16, cores_per_executor=4, partitions_per_executor=4):
    """Print a sizing recommendation for a Spark cluster.

    The estimate is derived in four steps:

    1. partitions = ceil(data_size_gb * 1024 / partition_size_mb)
    2. executors  = ceil(partitions / partitions_per_executor)
    3. executors per node = floor(cores_per_node / cores_per_executor),
       nodes = ceil(executors / executors per node)
    4. per-executor memory and overhead via ``calculate_executor_memory``
       (called with its default overhead settings).

    The result is written to stdout; nothing is returned.

    Args:
        data_size_gb: Size of the input data, in GB. Must not be negative.
        partition_size_mb: Target size of one partition, in MB. Must be
            positive.
        node_memory_gb: Memory of one node, in GB.
        cores_per_node: CPU cores available on one node.
        cores_per_executor: CPU cores given to each executor. Must be
            positive and no larger than ``cores_per_node``.
        partitions_per_executor: Number of partitions each executor is
            expected to handle. Must be positive.

    Raises:
        ValueError: If any argument would make the sizing arithmetic
            undefined (zero/negative divisors, or no executor fitting on a
            node).
    """
    # Validate up front so bad input fails with a descriptive message before
    # anything is printed, instead of an opaque ZeroDivisionError mid-way.
    if data_size_gb < 0:
        raise ValueError(f"data_size_gb must not be negative, got {data_size_gb!r}")
    if partition_size_mb <= 0:
        raise ValueError(f"partition_size_mb must be positive, got {partition_size_mb!r}")
    if partitions_per_executor <= 0:
        raise ValueError(
            f"partitions_per_executor must be positive, got {partitions_per_executor!r}"
        )
    if cores_per_executor <= 0:
        raise ValueError(f"cores_per_executor must be positive, got {cores_per_executor!r}")

    # Step 3 (first half) is computed early because it is also a validity
    # check: if an executor needs more cores than a node has, none fit.
    executors_per_node = math.floor(cores_per_node / cores_per_executor)
    if executors_per_node < 1:
        raise ValueError(
            f"cores_per_executor ({cores_per_executor!r}) exceeds cores_per_node "
            f"({cores_per_node!r}); no executor fits on a node"
        )

    # Step 1: Calculate number of partitions required (GB -> MB uses 1024)
    total_partitions = math.ceil(data_size_gb * 1024 / partition_size_mb)

    # Step 2: Calculate total executors needed
    total_executors_needed = math.ceil(total_partitions / partitions_per_executor)

    # Step 3: Calculate nodes needed to host those executors
    total_nodes_required = math.ceil(total_executors_needed / executors_per_node)

    # Step 4: Compute memory allocation per executor
    executor_memory_gb, overhead_gb = calculate_executor_memory(node_memory_gb, executors_per_node)

    # Output summary
    print(f"Recommended Spark Cluster Configuration for {data_size_gb} GB of Input Data")
    print("=" * 70)
    print(f"Ideal Partition Size (MB)                : {partition_size_mb}")
    print(f"Total Partitions                        : {total_partitions}")
    print(f"Partitions per Executor                 : {partitions_per_executor}")
    print(f"Total Executors Required                : {total_executors_needed}")
    print(f"Cores per Executor                      : {cores_per_executor}")
    print(f"Executors per Node                      : {executors_per_node}")
    print(f"Total Nodes Required                    : {total_nodes_required}")
    print(f"Node Memory (GB)                        : {node_memory_gb}")
    print(f"Cores per Node                          : {cores_per_node}")
    print(f"Executor Memory (usable)                : {executor_memory_gb} GB")
    print(f"Memory Overhead per Executor            : {overhead_gb} GB")

    # Heuristic guidance: the 4 GB / 8 GB bounds are rule-of-thumb thresholds
    # hard-coded here; values in between produce no extra message.
    if executor_memory_gb < 4:
        print("⚠️ Warning: Executor memory is too low. Consider increasing node memory.")
    elif executor_memory_gb > 8:
        print("💡 Suggestion: You may be over-allocating memory. Tune down to reduce cost.")

# Example usage: print the recommendation for a 500 GB input, leaving every
# other parameter at the function's defaults. This is a top-level statement,
# so it runs whenever the file is executed or imported.
recommend_cluster_config(data_size_gb=500)