In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import joblib
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

# Define file paths
DATA_DIR = Path('../data')
RAW_DATA_PATH = DATA_DIR / 'raw'
PROCESSED_DATA_PATH = DATA_DIR / 'processed'
MODELS_DIR = Path('../models')

# Ensure models directory exists
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print("Libraries imported and paths defined.")

Libraries imported and paths defined.


In [2]:
# Load the original simulated data
simulated_df = pd.read_csv(RAW_DATA_PATH / 'simulated_patient_data.csv', parse_dates=['date'])

# Load our new, real EEG features
eeg_features_df = pd.read_csv(PROCESSED_DATA_PATH / 'patient_01_eeg_features.csv')

print(f"Loaded simulated data: {simulated_df.shape}")
print("Loaded real EEG features:")
eeg_features_df.head()

Loaded simulated data: (90, 8)
Loaded real EEG features:


Unnamed: 0,patient_id,date,r_e_l___p_o_w_e_r___d_e_l_t_a,r_e_l___p_o_w_e_r___t_h_e_t_a,r_e_l___p_o_w_e_r___a_l_p_h_a,r_e_l___p_o_w_e_r___b_e_t_a,r_e_l___p_o_w_e_r___g_a_m_m_a,s_p_i_k_e___c_o_u_n_t
0,chb01,2025-01-01,0.690999,0.105066,0.016118,0.072614,0.083428,
1,chb01,2025-01-01,0.107901,0.032186,0.005577,0.055179,0.059479,
2,chb01,2025-01-01,,,,,,1767.0


In [3]:
# --- DEBUG CELL: FIND YOUR COLUMN NAMES ---
print("Your actual column names are:")
print(simulated_df.columns)


Your actual column names are:
Index(['date', 'patient_id', 'hours_of_sleep', 'stress_level',
       'medication_taken', 'eeg_feature_1', 'mri_lesion_present',
       'seizure_occurred'],
      dtype='object')


In [4]:
print("Extracting real statistics from EEG feature file...")

# Load the main simulated data
full_df = pd.read_csv(RAW_DATA_PATH / 'simulated_patient_data.csv', parse_dates=['date'])

# Load our new, real EEG features
eeg_features_df = pd.read_csv(PROCESSED_DATA_PATH / 'patient_01_eeg_features.csv')

# --- START OF SMART FIX ---
# Programmatically find the mean/std columns to avoid KeyErrors
stats_row = eeg_features_df.iloc[0]
eeg_stats = {}
new_feature_names = []

# Find all mean/std columns
mean_cols = [col for col in eeg_features_df.columns if '_mean' in col]
std_cols = [col for col in eeg_features_df.columns if '_std' in col]
sum_cols = [col for col in eeg_features_df.columns if '_sum' in col]

# Process power features
for mean_col in mean_cols:
    base_name = mean_col.replace('_mean', '')
    std_col = f"{base_name}_std"
    
    mean = stats_row[mean_col]
    std = stats_row[std_col] if std_col in stats_row else (mean * 0.1) # Default 10% std
    
    eeg_stats[base_name] = (mean, std)
    new_feature_names.append(base_name)
    print(f"  - Found power feature: {base_name} (mean={mean:.4f}, std={std:.4f})")

# Process count features
for sum_col in sum_cols:
    base_name = sum_col.replace('_sum', '')
    mean = stats_row[sum_col]
    std = mean * 0.1 # Default 10% std
    
    eeg_stats[base_name] = (mean, std)
    new_feature_names.append(base_name)
    print(f"  - Found count feature: {base_name} (mean={mean:.4f}, std={std:.4f})")

# Now, create these new features in our main simulated dataframe
num_rows = len(full_df)
for base_name, (mean, std) in eeg_stats.items():
    sim_data = np.random.normal(loc=mean, scale=std, size=num_rows)
    
    if 'count' in base_name:
        full_df[base_name] = np.abs(sim_data).round().astype(int)
    else:
        full_df[base_name] = np.clip(sim_data, 0, 1)

# Drop ALL old/garbled/simulated EEG/MRI columns
# (based on your previous error output)
cols_to_drop = [col for col in full_df.columns if 'eeg_' in col or 'mri_' in col or 'r_e_l_' in col or 's_p_i_k_e' in col]
full_df = full_df.drop(columns=cols_to_drop, errors='ignore')

print("\nFull dataset with new, realistic EEG features created.")
print("Old simulated EEG/MRI features removed.")
full_df.head()

Extracting real statistics from EEG feature file...


KeyError: 'rel_power_delta_mean'