# Spark Event Analysis

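This notebook analyzes Spark task events (currently a small hard-coded sample standing in for real event logs) and extracts performance metrics: overall task-duration statistics, the duration distribution, and per-executor statistics.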

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

# Placeholder for loaded event-log records; nothing is appended to it in this cell.
event_logs = []

# Hard-coded sample task events: one dict per task, built from compact row tuples.
sample_events = [
    {"timestamp": ts, "task_id": task, "executor_id": executor, "duration_ms": duration}
    for ts, task, executor, duration in (
        ("2023-12-15T10:00:00", 1, "1", 1000),
        ("2023-12-15T10:00:01", 2, "2", 1500),
        ("2023-12-15T10:00:02", 3, "1", 800),
    )
]

# One row per event; columns follow the dict key order.
df_events = pd.DataFrame(data=sample_events)

# Heading followed by the first rows, each on its own line.
print("Event Summary:", df_events.head(), sep="\n")

In [None]:
# Headline metrics computed over every row of df_events.
total_tasks = df_events.shape[0]                       # number of task events (rows)
avg_duration = df_events.loc[:, 'duration_ms'].mean()  # mean task duration in ms
max_duration = df_events.loc[:, 'duration_ms'].max()   # longest single task in ms
executor_count = df_events.loc[:, 'executor_id'].nunique()  # distinct executor ids

# Emit the four summary lines in one call, one metric per line.
print(
    f"Total Tasks: {total_tasks}",
    f"Average Duration: {avg_duration:.2f} ms",
    f"Max Duration: {max_duration} ms",
    f"Number of Executors: {executor_count}",
    sep="\n",
)

In [None]:
# Histogram of task durations, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_events['duration_ms'], bins=20, edgecolor='black')
ax.set_xlabel('Task Duration (ms)')
ax.set_ylabel('Frequency')
ax.set_title('Task Duration Distribution')
# Light grid so bar heights are easier to read off.
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Per-executor duration statistics. Named aggregation: each keyword is the
# output column name and its value is the reduction applied to duration_ms.
executor_stats = (
    df_events
    .groupby('executor_id')['duration_ms']
    .agg(count='count', mean='mean', max='max', min='min')
    .round(2)
)

# Blank line, heading, then the table indexed by executor_id.
print("\nExecutor Statistics:", executor_stats, sep="\n")

## Performance Insights

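1. Identify slow tasks by comparing each task's duration against the average and maximum durations.
2. Detect executor imbalance by comparing task counts and mean durations across executors.
3. Find optimization opportunities based on the slow tasks and imbalances identified in the previous two steps.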