In [None]:
# Spark Session

from pyspark.sql import SparkSession

# Configure the session in named steps: application name first, then the
# shuffle-partition setting (32).
_builder = SparkSession.builder.appName("Executor-Profiling")
_builder = _builder.config("spark.sql.shuffle.partitions", 32)

# getOrCreate() hands back an already-active session when one exists,
# otherwise it builds a new one from the builder's settings.
spark = _builder.getOrCreate()

# Restrict Spark's log output to WARN and above.
spark.sparkContext.setLogLevel("WARN")


In [None]:
# Import Profiler

from src.evaluation.executor_profiler import (
    get_executor_info,
    monitor_resources
)

import pandas as pd

In [None]:
# Executor Inventory

# Ask the profiler helper for executor details and tabulate them with pandas.
executors = get_executor_info(spark)
df_exec = pd.DataFrame(data=executors)

# The heading goes to stdout; the bare expression below is rendered as the
# cell's output.
print("Executor Configuration:")
df_exec


In [None]:
# Run Workload + Monitor

import threading


def run_workload():
    """Simulated workload similar to tweet processing.

    Distributes the integers 0..4,999,999 over 32 partitions, squares each
    element and triggers the job with ``count()``. The count itself is
    discarded; the job exists only to generate load while resources are
    being sampled.
    """
    rdd = spark.sparkContext.parallelize(range(5_000_000), 32)
    rdd.map(lambda x: x * x).count()


# An exception raised inside a thread does not propagate to the code that
# started it, so the wrapper below stores it here and it is re-raised after
# the join. Without this the cell would finish normally even if the Spark job
# failed, and the plots would be drawn from a run with no real workload.
workload_errors = []


def _run_workload_captured():
    """Run ``run_workload`` and record any exception instead of losing it."""
    try:
        run_workload()
    except Exception as exc:
        workload_errors.append(exc)


# start workload
t = threading.Thread(target=_run_workload_captured, name="spark-workload")
t.start()

# monitor resources during execution; always wait for the worker thread,
# even when monitoring itself fails, so it is never left unattended
try:
    snapshots = monitor_resources(interval_sec=2, duration_sec=30)
finally:
    t.join()

# Surface a failed workload as a cell error, chained to the original cause.
if workload_errors:
    raise RuntimeError("Spark workload failed during profiling") from workload_errors[0]

df_usage = pd.DataFrame(snapshots)
df_usage


In [None]:
# CPU Utilization Plot

import matplotlib.pyplot as plt

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(7, 5))

# One circle marker per row of df_usage, plotted against the row index.
ax.plot(df_usage["cpu_percent"], marker="o")

ax.set_xlabel("Time Step")
ax.set_ylabel("CPU Utilization (%)")
ax.set_title("Driver CPU Utilization During Distributed Execution")
ax.grid(True)

fig.tight_layout()
plt.show()


In [None]:
# Memory Utilization Plot

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(7, 5))

# One square marker per row of df_usage, plotted against the row index.
ax.plot(df_usage["memory_percent"], marker="s")

ax.set_xlabel("Time Step")
ax.set_ylabel("Memory Utilization (%)")
ax.set_title("Driver Memory Usage During Spark Processing")
ax.grid(True)

fig.tight_layout()
plt.show()
