In [6]:
"""
Google Borg Cluster Trace Processing for AI-Based CPU Scheduler
================================================================
This notebook processes Borg trace data to train a Linear Regression model
that predicts the next CPU burst time based on the history of the last 3 bursts.

Author: AI Scheduler Project
Dataset: Google Borg Cluster Trace (2019) - Kaggle version
"""

# ============================================================================
# STEP 1: Setup and Imports
# ============================================================================
print("Installing required packages...")
import sys
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import joblib
from google.colab import drive, files
import warnings
warnings.filterwarnings('ignore')

print("✓ Imports successful")

# ============================================================================
# STEP 2: Mount Google Drive
# ============================================================================
# Print the section banner, then attach Google Drive under /content/drive.
# NOTE(review): Step 3 reads its CSV from /content, not from this mount --
# confirm whether the mount is still needed by this cell.
divider = "=" * 70
print(f"\n{divider}")
print("STEP 2: Mounting Google Drive")
print(divider)

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
print("✓ Google Drive mounted successfully")

# ============================================================================
# STEP 3: Load CSV Data
# ============================================================================
# Reads at most `max_rows` rows of the trace CSV into `df`.
#
# Only one failure is worth retrying: pandas older than 1.3 rejects the
# `on_bad_lines` keyword with a TypeError. In that case the read is repeated
# with the older `error_bad_lines=False` spelling so malformed lines are still
# skipped. Any other failure (missing file, unparsable data) would fail again
# on a retry of the same file, so it is reported once and re-raised.
print("\n" + "="*70)
print("STEP 3: Loading Borg Trace Data")
print("="*70)

file_path = '/content/borg_traces_data.csv'
max_rows = 1_000_000  # upper bound on the number of rows read from the CSV
print(f"Reading from: {file_path}")
print(f"Loading first {max_rows:,} rows...")

try:
    # Preferred path (pandas >= 1.3): drop malformed lines instead of failing.
    df = pd.read_csv(
        file_path,
        nrows=max_rows,
        on_bad_lines='skip'
    )
except FileNotFoundError:
    # Retrying cannot help when the file is absent; fail once, clearly.
    print(f"✗ Error loading CSV: file not found at {file_path}")
    raise
except TypeError as e:
    # Older pandas does not know `on_bad_lines`; use its predecessor so the
    # fallback keeps the same tolerance for malformed lines.
    print(f"✗ Error loading CSV: {e}")
    print("\nTrying alternative loading method...")
    df = pd.read_csv(
        file_path,
        nrows=max_rows,
        error_bad_lines=False,
        low_memory=False
    )
    print(f"✓ Loaded {len(df):,} rows with alternative method")
else:
    print(f"✓ Loaded {len(df):,} rows successfully")
    print(f"✓ Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())

# ============================================================================
# STEP 4: Robust Feature Extraction
# ============================================================================
# Derives one "burst duration" per event as the time elapsed since the
# previous event of the same task, then keeps only the plausible ones.
section_rule = "=" * 70
print(f"\n{section_rule}")
print("STEP 4: Extracting CPU Burst Durations")
print(section_rule)

# A task is keyed by (collection_id, instance_index); `time` orders its events.
task_keys = ['collection_id', 'instance_index']
required_cols = task_keys + ['time']
df = df[required_cols].copy()

print(f"Working with {len(df):,} rows")
print("Sorting by task and time...")

# Order events by task, then chronologically, and renumber the index.
df = df.sort_values(by=required_cols).reset_index(drop=True)

print("Calculating time deltas between consecutive events...")

# The first event of every task has no predecessor, so its previous time
# (and therefore its duration) is NaN and gets filtered out below.
previous_event_time = df.groupby(task_keys)['time'].shift(1)
df['prev_time'] = previous_event_time
df['duration'] = df['time'] - df['prev_time']

print("Filtering valid durations...")
duration = df['duration']
# Keep durations that are present, strictly positive, and below the 1e15
# outlier cut-off.
is_valid = duration.notna() & (duration > 0) & (duration < 1e15)
valid_durations = df.loc[is_valid].copy()

print(f"✓ Extracted {len(valid_durations):,} valid burst durations")
print("\nDuration statistics (raw):")
print(valid_durations['duration'].describe())

# ============================================================================
# STEP 5: Scaling & Normalization
# ============================================================================
# Compresses the raw durations with log1p, then min-max scales the result to
# integers in [1, 100]. `min_log` and `max_log` are kept as globals because
# Step 9 records them in the model metadata.
print("\n" + "="*70)
print("STEP 5: Scaling and Normalizing Durations")
print("="*70)

# Without any valid duration the min/max below would be NaN and the failure
# would only surface later, during training, with an unrelated message.
if valid_durations.empty:
    raise ValueError(
        "No valid burst durations to normalize; "
        "check the Step 4 filters and the input data."
    )

# Apply log transform to handle wide variance
print("Applying log transform...")
valid_durations['log_duration'] = np.log1p(valid_durations['duration'])

print("Normalizing to 1-100 range (simulating milliseconds)...")
min_log = valid_durations['log_duration'].min()
max_log = valid_durations['log_duration'].max()
log_span = max_log - min_log

if log_span > 0:
    scaled = 1 + 99 * (valid_durations['log_duration'] - min_log) / log_span
else:
    # Every duration is identical: min-max scaling would divide by zero and
    # the NaN results could not be cast to int. Pin all values to the lower
    # bound of the range instead.
    scaled = pd.Series(1.0, index=valid_durations.index)

valid_durations['normalized_duration'] = scaled.round().astype(int)

print(f"\n✓ Normalized duration statistics:")
print(valid_durations['normalized_duration'].describe())

# ============================================================================
# STEP 6: Create Training Sequences
# ============================================================================
# Turns the normalized bursts into supervised samples: each row of X holds
# `window_size` consecutive bursts and the matching entry of y is the burst
# that follows them.
print("\n" + "="*70)
print("STEP 6: Creating Training Sequences")
print("="*70)

# Flatten into a stream of bursts
# NOTE(review): the stream concatenates all tasks in sorted order, so some
# windows span the boundary between two different tasks -- confirm that this
# is intended for the simulator.
burst_stream = valid_durations['normalized_duration'].values

print(f"Total bursts in stream: {len(burst_stream):,}")

window_size = 3
print(f"Creating sliding windows (window_size={window_size})...")

# Each sample needs `window_size` inputs plus one target.
num_samples = len(burst_stream) - window_size
if num_samples <= 0:
    raise ValueError(
        f"Need more than {window_size} bursts to build training windows, "
        f"got {len(burst_stream)}."
    )

# Row i of `window_index` is [i, i+1, ..., i+window_size]; indexing the stream
# with it builds every window at once instead of appending in a Python loop.
window_index = np.arange(num_samples)[:, None] + np.arange(window_size + 1)
windows = burst_stream[window_index]

# Sliding windows: [T-3, T-2, T-1] -> [T]
X = np.ascontiguousarray(windows[:, :window_size])  # input features
y = windows[:, window_size].copy()                  # target (next burst)

print(f"✓ Created {len(X):,} training samples")
print(f"✓ Input shape: {X.shape}")
print(f"✓ Target shape: {y.shape}")

print("\nSample training data:")
print("First 5 samples:")
for features, target in zip(X[:5], y[:5]):
    print(f"  Input: {features} -> Target: {target}")

# ============================================================================
# STEP 7: Train Linear Regression Model
# ============================================================================
# Fits an ordinary least-squares model on the leading 80% of the samples and
# holds back the trailing 20% for evaluation (no shuffling).
banner = "=" * 70
print(f"\n{banner}")
print("STEP 7: Training Linear Regression Model")
print(banner)

train_fraction = 0.8
split_idx = int(train_fraction * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

print("\nTraining Linear Regression model...")
# `fit` returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train, y_train)

print("✓ Model trained successfully")
print(f"\nModel coefficients: {model.coef_}")
print(f"Model intercept: {model.intercept_:.2f}")

# ============================================================================
# STEP 8: Evaluate Model
# ============================================================================
# Scores the fitted model with mean absolute error on both splits and prints
# a handful of test-set predictions next to their true values.
rule = "=" * 70
print(f"\n{rule}")
print("STEP 8: Model Evaluation")
print(rule)

# Predictions for both splits
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Error on each split, in the same normalized units as the targets
mae_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
mae_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

print("Mean Absolute Error (MAE):")
print(f"  Training set: {mae_train:.2f} ms")
print(f"  Test set:     {mae_test:.2f} ms")

print("\nSample predictions vs actual (Test set):")
# Slicing caps the preview at ten rows and copes with shorter test sets.
for features, predicted, actual in zip(X_test[:10], y_pred_test[:10], y_test[:10]):
    print(f"  Input: {features} -> Predicted: {predicted:.1f} ms, Actual: {actual} ms")

# ============================================================================
# STEP 9: Save Model and Download
# ============================================================================
# Persists the fitted model with joblib, writes a plain-text metadata file
# next to it, hands both files to the Colab download helper, and prints a
# run summary.
separator = "=" * 70
print(f"\n{separator}")
print("STEP 9: Saving and Downloading Model")
print(separator)

# --- Model artifact ---------------------------------------------------------
model_filename = 'burst_predictor.pkl'
print(f"Saving model to {model_filename}...")
joblib.dump(model, model_filename)
print("✓ Model saved successfully")

# --- Metadata ---------------------------------------------------------------
# Includes the log-scale bounds from Step 5 alongside the fitted parameters
# and the evaluation scores.
metadata = {
    'model_type': 'LinearRegression',
    'window_size': window_size,
    'num_training_samples': len(X_train),
    'num_test_samples': len(X_test),
    'mae_train': mae_train,
    'mae_test': mae_test,
    'coefficients': model.coef_.tolist(),
    'intercept': float(model.intercept_),
    'min_log': float(min_log),
    'max_log': float(max_log),
    'normalization_range': [1, 100]
}

metadata_filename = 'model_metadata.txt'
with open(metadata_filename, 'w') as f:
    # One "key: value" line per metadata entry, in insertion order.
    f.writelines(f"{key}: {value}\n" for key, value in metadata.items())

print(f"✓ Metadata saved to {metadata_filename}")

# --- Download ---------------------------------------------------------------
print("\nDownloading files to your local machine...")
for artifact in (model_filename, metadata_filename):
    files.download(artifact)

# --- Summary ----------------------------------------------------------------
print(f"\n{separator}")
print("✓✓✓ PIPELINE COMPLETE ✓✓✓")
print(separator)
summary_lines = [
    f"  - Processed {len(df):,} raw events",
    f"  - Extracted {len(valid_durations):,} valid bursts",
    f"  - Created {len(X):,} training samples",
    f"  - Model MAE: {mae_test:.2f} ms",
    f"  - Model saved as: {model_filename}",
]
print("\nSummary:")
for line in summary_lines:
    print(line)
print("\nYou can now use this model in your SJF scheduler simulator!")

# ============================================================================
# BONUS: Example Usage Function
# ============================================================================
# Section banner for the usage example that follows.
bonus_rule = "=" * 70
print(f"\n{bonus_rule}")
print("BONUS: Example Model Usage")
print(bonus_rule)

def predict_next_burst(last_three_bursts, estimator=None):
    """
    Predict the next CPU burst time given the most recent bursts.

    Args:
        last_three_bursts: List/array of 3 numbers in the normalized
            1-100 range, oldest first.
        estimator: Optional fitted object exposing ``predict(X)`` for a
            2-D input of shape (1, n_features). Defaults to the module-level
            ``model`` trained earlier in this script.

    Returns:
        Predicted next burst time as a built-in float, clamped to [1, 100].
    """
    if estimator is None:
        # Fall back to the model trained earlier in this script.
        estimator = model

    # The estimator expects one row per sample, so reshape to (1, n_features).
    features = np.asarray(last_three_bursts, dtype=float).reshape(1, -1)

    # Convert to a built-in float up front so the clamped and unclamped paths
    # return the same type (previously int when clamped, NumPy scalar otherwise).
    prediction = float(estimator.predict(features)[0])
    return min(100.0, max(1.0, prediction))  # Clamp to valid range

# Run the predictor on a few hand-picked burst histories and print the
# results.
print("\nExample predictions:")
examples = [
    [10, 15, 20],
    [50, 45, 40],
    [80, 85, 90],
    [25, 30, 35],
]

for history in examples:
    forecast = predict_next_burst(history)
    print(f"  Last 3 bursts: {history} -> Predicted next: {forecast:.1f} ms")

closing_rule = "=" * 70
print(f"\n{closing_rule}")
print("Script execution completed successfully!")
print(closing_rule)

Installing required packages...
✓ Imports successful

STEP 2: Mounting Google Drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted successfully

STEP 3: Loading Borg Trace Data
Reading from: /content/borg_traces_data.csv
Loading first 1,000,000 rows...
✓ Loaded 405,894 rows successfully
✓ Columns: ['Unnamed: 0', 'time', 'instance_events_type', 'collection_id', 'scheduling_class', 'collection_type', 'priority', 'alloc_collection_id', 'instance_index', 'machine_id', 'resource_request', 'constraint', 'collections_events_type', 'user', 'collection_name', 'collection_logical_name', 'start_after_collection_ids', 'vertical_scaling', 'scheduler', 'start_time', 'end_time', 'average_usage', 'maximum_usage', 'random_sample_usage', 'assigned_memory', 'page_cache_memory', 'cycles_per_instruction', 'memory_accesses_per_instruction', 'sample_rate', 'cpu_usage_distribution', 'tail_cpu_usage_distrib

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✓✓✓ PIPELINE COMPLETE ✓✓✓

Summary:
  - Processed 405,894 raw events
  - Extracted 159,850 valid bursts
  - Created 159,847 training samples
  - Model MAE: 5.68 ms
  - Model saved as: burst_predictor.pkl

You can now use this model in your SJF scheduler simulator!

BONUS: Example Model Usage

Example predictions:
  Last 3 bursts: [10, 15, 20] -> Predicted next: 37.5 ms
  Last 3 bursts: [50, 45, 40] -> Predicted next: 58.0 ms
  Last 3 bursts: [80, 85, 90] -> Predicted next: 80.8 ms
  Last 3 bursts: [25, 30, 35] -> Predicted next: 46.8 ms

Script execution completed successfully!
