# ML Training Notebook

This notebook demonstrates training a linear regression model to predict Spark job runtime from job configuration features, using synthetic data.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Build a reproducible synthetic dataset of job configurations.
np.random.seed(42)
n_samples = 1000

# Draw the four configuration features. The draws must stay in this exact
# order because they all consume the same seeded global random stream.
partition_count = np.random.randint(10, 500, n_samples)
data_size_gb = np.random.uniform(1, 100, n_samples)
executor_cores = np.random.choice([4, 8, 16], n_samples)
executor_memory_gb = np.random.choice([4, 8, 16, 32], n_samples)

# Synthetic target: a linear mix of three of the features plus Gaussian noise
# (mean 0, standard deviation 500). executor_memory_gb does not enter the target.
noise = np.random.normal(0, 500, n_samples)
runtime_ms = partition_count * 10 + data_size_gb * 100 + executor_cores * 50 + noise

# Column order here determines the DataFrame's column order.
data = {
    'partition_count': partition_count,
    'data_size_gb': data_size_gb,
    'executor_cores': executor_cores,
    'executor_memory_gb': executor_memory_gb,
    'runtime_ms': runtime_ms,
}

df = pd.DataFrame(data)
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

In [None]:
# Separate the predictor columns from the regression target.
feature_columns = ['partition_count', 'data_size_gb', 'executor_cores', 'executor_memory_gb']
X = df[feature_columns]
y = df['runtime_ms']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler is fit on the training rows only and
# then applied unchanged to both splits, so the test rows do not influence
# the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

In [None]:
# Fit a linear regression on the standardized training features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict the target for the held-out rows, using the scaled test features.
y_pred = model.predict(X_test_scaled)

# Evaluate on the test split: RMSE is the square root of the mean squared
# error (so it is in the target's units), and R² is the coefficient of
# determination returned by r2_score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Evaluation:")
print(f"RMSE: {rmse:.2f} ms")
# The label below used to be printed as mojibake (the UTF-8 bytes of the
# superscript two decoded as Latin-1); it is now the intended "R²".
print(f"R² Score: {r2:.4f}")

In [None]:
# Rank features by the magnitude of their fitted coefficients.
# The names are taken from the training frame's columns rather than a second
# hard-coded list, so they always line up with the columns the model was fit on.
feature_names = list(X_train.columns)
coefficients = model.coef_

# The 'coefficient' column holds absolute values, so the sign (direction of
# the effect) is not shown here. Because the model was fit on standardized
# features, the magnitudes are comparable across features.
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': np.abs(coefficients)
}).sort_values('coefficient', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

In [None]:
# Two side-by-side diagnostic panels: fit quality (left) and residuals (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Left panel: predictions against actual values, with a dashed y = x
# reference line spanning the observed range of actual runtimes.
actual_lo, actual_hi = y_test.min(), y_test.max()
ax_fit.scatter(y_test, y_pred, alpha=0.5)
ax_fit.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--', lw=2)
ax_fit.set(
    xlabel='Actual Runtime (ms)',
    ylabel='Predicted Runtime (ms)',
    title='Actual vs Predicted Runtime',
)
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual minus predicted) against the prediction,
# with a dashed zero line for reference.
residuals = y_test - y_pred
ax_resid.scatter(y_pred, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set(
    xlabel='Predicted Runtime (ms)',
    ylabel='Residuals',
    title='Residual Plot',
)
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Model Summary

This linear regression model predicts Spark job runtime (trained and evaluated here on synthetic data) from the following features:
- Partition count
- Data size
- Executor configuration (cores and memory)

The fitted model, together with the fitted scaler used to standardize its inputs, can be saved and deployed for real-time predictions.