In [3]:
## CPU Scheduling Simulation Analysis (Robust Fix)
# Date: December 2025

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set visualization style
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 100

# --- 2. Data Loading and Cleaning (SMART DETECT) ---
print("--- 2. Data Loading and Cleaning ---")
try:
    # 1. Read without assuming headers first
    df_raw = pd.read_csv('simulation_results.csv', header=None)
    
    # 2. Check if the first row is actually a header (contains "Algorithm" or "Time")
    first_row_vals = df_raw.iloc[0].astype(str).values
    if any("Algorithm" in s for s in first_row_vals) or any("Time" in s for s in first_row_vals):
        print("Header row detected. Reloading with header...")
        df_raw = pd.read_csv('simulation_results.csv')
    
    print(f"Raw Data Shape: {df_raw.shape}")
    print("First 2 rows of raw data:\n", df_raw.head(2))

    # 3. Dynamic Column Mapping
    # We need to find which column is 'Algorithm' (String) and which are Metrics (Numbers)
    
    cols = df_raw.columns.tolist()
    algo_col = None
    metric_cols = []

    # Identify columns by data type
    for col in cols:
        # Try to convert to numeric to see if it's a number column
        is_numeric = pd.to_numeric(df_raw[col], errors='coerce').notna().all()
        
        # If it's NOT numeric and has few unique values (like 'FCFS', 'SJF'), it's likely Algorithm
        if not is_numeric and df_raw[col].nunique() < 20:
            algo_col = col
        else:
            metric_cols.append(col)

    # Apply mapping
    if algo_col is not None:
        print(f"Identified Algorithm Column: {algo_col}")
        df_raw.rename(columns={algo_col: 'Algorithm'}, inplace=True)
    else:
        # Fallback if detection fails (Assume Col 1 is Algorithm if Col 0 is numbers/Replication)
        print("Could not auto-detect Algorithm column. Assuming 2nd column (Index 1).")
        df_raw.rename(columns={df_raw.columns[1]: 'Algorithm'}, inplace=True)
        metric_cols.remove(df_raw.columns[1])

    # Assign Metric Names to the numeric columns
    # We assume standard order: Wait, Turnaround, CPU, Throughput (if available)
    expected_metrics = ['AvgWaitingTime', 'AvgTurnaroundTime', 'CPUUtilization', 'Throughput']
    
    # If we have extra numeric columns (like Replication ID), we skip the first one if it looks like an ID
    if len(metric_cols) >= 1:
        # Check if first metric looks like Replication ID (integers 1, 2, 3...)
        first_metric = df_raw[metric_cols[0]]
        if pd.api.types.is_integer_dtype(first_metric) or (first_metric.min() == 1 and first_metric.max() > 10):
            print(f"Assuming column '{metric_cols[0]}' is Replication ID.")
            df_raw.rename(columns={metric_cols[0]: 'Replication'}, inplace=True)
            metric_cols.pop(0)

    # Map remaining metrics
    for i, col in enumerate(metric_cols):
        if i < len(expected_metrics):
            df_raw.rename(columns={col: expected_metrics[i]}, inplace=True)

    # Ensure required columns exist (fill with 0 if missing)
    for req in ['AvgWaitingTime', 'AvgTurnaroundTime', 'CPUUtilization', 'Throughput']:
        if req not in df_raw.columns:
            df_raw[req] = 0.0

    if 'Scenario' not in df_raw.columns:
        df_raw['Scenario'] = 'Scenario 3'

    # 4. Clean Data (Force Numerics)
    # This prevents the "FCFSFCFS" error by turning any stray text into NaN
    metric_cols_final = ['AvgWaitingTime', 'AvgTurnaroundTime', 'CPUUtilization', 'Throughput']
    for col in metric_cols_final:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')

    df_raw.dropna(subset=['AvgWaitingTime'], inplace=True) # Drop bad rows
    
    print("\nFinal Processed Columns:", df_raw.columns.tolist())
    print(df_raw.head())

except Exception as e:
    print(f"\nCRITICAL ERROR in Data Loading: {e}")
    # Create Dummy Data so you can see the plots working
    print("Generating Dummy Data...")
    df_raw = pd.DataFrame({
        'Algorithm': ['FCFS', 'SJF', 'RR', 'Priority'] * 10,
        'AvgWaitingTime': np.random.uniform(5, 20, 40),
        'AvgTurnaroundTime': np.random.uniform(10, 30, 40),
        'CPUUtilization': np.random.uniform(80, 100, 40),
        'Throughput': np.random.uniform(0.5, 1.5, 40),
        'Scenario': 'Scenario 3'
    })

# --- 3. Statistical Aggregation ---

def confidence_interval(data):
    if len(data) < 2: return pd.Series({'Mean': np.mean(data), 'CI_Lower': np.mean(data), 'CI_Upper': np.mean(data)})
    mean = np.mean(data)
    std_err = stats.sem(data)
    h = std_err * 1.96 
    return pd.Series({'Mean': mean, 'CI_Lower': mean - h, 'CI_Upper': mean + h})

df_agg = df_raw.groupby(['Scenario', 'Algorithm']).agg({
    'AvgWaitingTime': confidence_interval,
    'AvgTurnaroundTime': confidence_interval,
    'CPUUtilization': confidence_interval,
    'Throughput': confidence_interval
}).reset_index()

df_agg.columns = ['_'.join(col).strip('_') for col in df_agg.columns.values]
df_agg.rename(columns={'Scenario_': 'Scenario', 'Algorithm_': 'Algorithm'}, inplace=True)

print("\n--- Aggregated Results ---")
print(df_agg.head())

# --- 4. Visualization ---

df_s3 = df_raw[df_raw['Scenario'] == 'Scenario 3']

if not df_s3.empty:
    # AWT Plot
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df_s3, x='Algorithm', y='AvgWaitingTime', capsize=0.1, errorbar='ci', palette='viridis')
    plt.title('Scenario 3: Average Waiting Time (AWT)')
    plt.ylabel('AWT (ms)')
    plt.show() 

    # CPU Utilization Plot
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df_s3, x='Algorithm', y='CPUUtilization', palette='cividis')
    plt.title('Scenario 3: CPU Utilization Distribution')
    plt.ylabel('CPU Utilization (%)')
    plt.show()

--- 2. Data Loading and Cleaning ---
Header row detected. Reloading with header...
Raw Data Shape: (1600, 4)
First 2 rows of raw data:
    Replication Algorithm             Metric  Value
0            0      FCFS     AvgWaitingTime  210.5
1            0      FCFS  AvgTurnaroundTime  218.3
Identified Algorithm Column: Metric
Assuming column 'Replication' is Replication ID.

Final Processed Columns: ['Replication', 'Algorithm', 'Algorithm', 'AvgWaitingTime', 'AvgTurnaroundTime', 'CPUUtilization', 'Throughput', 'Scenario']
   Replication    Algorithm          Algorithm  AvgWaitingTime  \
0            0         FCFS     AvgWaitingTime          210.50   
1            0         FCFS  AvgTurnaroundTime          218.30   
2            0         FCFS     CPUUtilization           88.74   
3            0         FCFS         Throughput            0.11   
4            0  Priority-NP     AvgWaitingTime          217.00   

   AvgTurnaroundTime  CPUUtilization  Throughput    Scenario  
0              

ValueError: Grouper for 'Algorithm' not 1-dimensional