In [None]:
# Spark Session
# Obtain the SparkSession that the benchmark cell passes to benchmark_partitions.

from pyspark.sql import SparkSession

# getOrCreate() returns an already-active session when one exists; otherwise it
# starts a new one with the application name and SQL setting given here.
# NOTE(review): 32 shuffle partitions equals the largest entry of PARTITIONS in
# the benchmark cell -- presumably intentional; confirm that a fixed shuffle
# width does not distort the measurements at smaller partition counts.
spark = SparkSession.builder.appName("Spark-Parallelism-Proof").config(
    "spark.sql.shuffle.partitions", 32
).getOrCreate()

# Limit Spark's console logging to WARN and above.
spark.sparkContext.setLogLevel("WARN")


In [None]:
# Run Benchmark
# Call benchmark_partitions once with the partition counts below and load what
# it returns into a pandas DataFrame (df_results) for the plotting cells.

import pandas as pd
from src.analysis.parallelism_analysis import benchmark_partitions

DATA_SIZE = 2_000_000  # representative workload
PARTITIONS = [2**exp for exp in range(6)]  # powers of two: 1, 2, 4, 8, 16, 32

results = benchmark_partitions(spark=spark, data_size=DATA_SIZE, partition_list=PARTITIONS)

# NOTE(review): the plotting cells read the columns "partitions", "time_sec",
# "speedup", "efficiency" and "throughput_rec_per_sec" from this frame -- the
# schema is produced inside benchmark_partitions; confirm it matches.
df_results = pd.DataFrame(data=results)
df_results

In [None]:
# Scaling Curve
# Plot the measured execution time (df_results["time_sec"]) against the
# partition count, one marker per benchmark row.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(df_results["partitions"], df_results["time_sec"], marker="o")
ax.set(
    xlabel="Number of Partitions (p)",
    ylabel="Execution Time T(p) [sec]",
    title="Strong Scaling Behavior of Spark Pipeline",
)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Speedup Curve
# Overlay the observed speedup column on a dashed y = x reference line.

fig, ax = plt.subplots(figsize=(7, 5))

# Measured values, taken directly from df_results["speedup"].
ax.plot(df_results["partitions"], df_results["speedup"], marker="s", label="Observed")

# Ideal linear speedup: the partition count plotted against itself.
ax.plot(df_results["partitions"], df_results["partitions"], linestyle="--", label="Ideal Linear")

ax.set(
    xlabel="Number of Partitions (p)",
    ylabel="Speedup S(p)",
    title="Spark Parallel Speedup Analysis",
)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Efficiency Curve
# Plot df_results["efficiency"] against the partition count.
# NOTE(review): efficiency is presumably speedup / p -- it is computed inside
# benchmark_partitions, not in this notebook; confirm there.

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(df_results["partitions"], df_results["efficiency"], marker="^")
ax.set(
    xlabel="Number of Partitions (p)",
    ylabel="Efficiency E(p)",
    title="Parallel Efficiency of Distributed Processing",
)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Throughput Scaling
# Plot df_results["throughput_rec_per_sec"] against the partition count.

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(df_results["partitions"], df_results["throughput_rec_per_sec"], marker="d")
ax.set(
    xlabel="Number of Partitions (p)",
    ylabel="Throughput (records/sec)",
    title="Throughput Scalability of Fusion-Spark Pipeline",
)
ax.grid(True)
fig.tight_layout()
plt.show()
