In [None]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

# Recorded channel index -> electrode label.
# The index (as a string) is the CSV column that gets read, and it is also
# the suffix of the per-channel output file name.
CHANNELS = {1: 'FP2', 4: 'F3', 5: 'P4', 6: 'P3', 7: 'CZ', 8: 'F4'}

# strptime/strftime pattern applied to the 'time' column once any fractional
# part has been cut off; change it here if the recordings use another layout.
time_format = '%H:%M:%S'

def statistical_features(data):
    """
    Compute 10 statistical features from raw EEG data

    Args:
        data: pandas Series, numpy array or plain sequence of raw EEG values

    Returns:
        numpy array with 10 features, in this order: power, differential
        entropy, mean, standard deviation, skewness, kurtosis, IQR, median,
        Hjorth activity, Hjorth mobility
    """
    from scipy.stats import kurtosis, skew

    # Convert once to a float ndarray: plain sequences can then be squared,
    # narrow integer dtypes cannot wrap around in the power term, and every
    # feature is computed with the same NumPy semantics whatever the input
    # container was.
    values = np.asarray(data, dtype=float)

    # The variance feeds the entropy, the Hjorth activity and the Hjorth
    # mobility, so compute it a single time.
    variance = np.var(values)

    # 2. Differential Entropy (approximation)
    # H = 0.5 * log(2*pi*e*variance); reported as 0 when the variance is not
    # positive, because the logarithm is undefined there.
    if variance > 0:
        diff_entropy = 0.5 * np.log(2 * np.pi * np.e * variance)
    else:
        diff_entropy = 0

    # 10. Hjorth Mobility
    # Mobility = sqrt(var(diff(signal)) / var(signal)); reported as 0 when
    # there is no successive difference or the signal has no variance.
    diff_values = np.diff(values)
    if diff_values.size > 0 and variance > 0:
        mobility = np.sqrt(np.var(diff_values) / variance)
    else:
        mobility = 0

    q25, q75 = np.percentile(values, [25, 75])

    return np.array([
        np.sum(values ** 2),   # 1. Power (sum of squared values)
        diff_entropy,          # 2. Differential Entropy
        np.mean(values),       # 3. Mean
        np.std(values),        # 4. Standard Deviation
        skew(values),          # 5. Skewness
        kurtosis(values),      # 6. Kurtosis (scipy default definition)
        q75 - q25,             # 7. IQR (Interquartile Range)
        np.median(values),     # 8. Median
        variance,              # 9. Hjorth Activity (variance)
        mobility,              # 10. Hjorth Mobility
    ])


def _load_participant_data(data_file):
    """
    Read one participant/task CSV and return it ordered by parsed time

    Args:
        data_file: path of the CSV file to read

    Returns:
        DataFrame without the 'Unnamed: 0' / 'val' columns, with an added
        'ftime' datetime column, sorted by 'ftime' and re-indexed from 0

    Raises:
        FileNotFoundError: if data_file does not exist
    """
    data = pd.read_csv(data_file)

    # Clean up columns
    unused = [col for col in ('Unnamed: 0', 'val') if col in data.columns]
    data = data.drop(columns=unused)

    # Parse time column, ignoring everything from the first '.' onwards
    data['ftime'] = data['time'].map(
        lambda x: datetime.datetime.strptime(x.partition('.')[0].strip(), time_format)
    )

    # Samples within the same second share one 'ftime' value after the
    # truncation above. A stable sort keeps them in file order; the default
    # quicksort gives no such guarantee, and the features include an
    # order-dependent one (successive differences).
    return data.sort_values('ftime', kind='mergesort').reset_index(drop=True)


def _iter_window_features(data, channel_col, window, min_samples=5):
    """
    Yield (window start time, feature vector) for consecutive time windows

    Args:
        data: DataFrame sorted by 'ftime' with a 0..n-1 index
        channel_col: name of the column holding the raw EEG values
        window: datetime.timedelta giving the window length
        min_samples: windows with fewer rows than this are skipped

    Yields:
        tuple of (window start formatted with time_format, 1-D feature array)
    """
    times = data['ftime']
    values = data[channel_col].values
    n_rows = len(data)

    start = 0
    while start + 1 < n_rows:
        window_start = times.iloc[start]

        # 'times' is sorted, so the rows with
        # window_start <= ftime < window_start + window are exactly the
        # positions start .. stop-1.
        stop = int(times.searchsorted(window_start + window, side='left'))

        if stop - start >= min_samples:
            yield (
                window_start.strftime(time_format),
                statistical_features(values[start:stop]),
            )

        # The next window begins at the first row outside this one; always
        # advance by at least one row so the loop cannot stall.
        start = max(stop, start + 1)


def gen_features_all_channels_all_data(task=2, participant_list=None):
    """
    Generate statistical features for ALL data
    Processes all 6 channels and saves features separately

    For every channel in CHANNELS, each participant's recording is cut into
    consecutive 10-second windows, windows with at least 5 samples are turned
    into a feature row, and the rows of all participants are written to
    'features_{task}_{channel}.csv' together with the participant id and the
    window start time. Missing files, missing channel columns and per-file
    processing errors are reported on stdout and skipped.

    Args:
        task: task number (1, 2, or 3)
        participant_list: iterable of participant IDs, or None for all

    Returns:
        None; results are written to CSV files in the working directory
    """
    ten_sec = datetime.timedelta(seconds=10)

    # If no participant list provided, use all
    if participant_list is None:
        participant_list = range(1, 31)  # Participants 1-30
    else:
        # The participants are walked once per channel, so a one-shot
        # iterable must be materialised or only the first channel gets data.
        participant_list = list(participant_list)

    # Process each channel separately
    for channel_num, channel_name in CHANNELS.items():

        print(f"\n{'='*80}")
        print(f"Processing Channel {channel_num} ({channel_name}) for Task {task}")
        print(f"{'='*80}")

        # Rows are collected in lists and stacked once at the end; growing an
        # array with np.concatenate per window copies it every time.
        feature_rows = []
        metadata = []  # Store participant_id and timestamp for each row

        # Columns are looked up by the channel number as a string
        channel_col = str(channel_num)

        for participant in tqdm(participant_list, desc=f"Task {task} - {channel_name}"):

            # Read raw EEG data for this participant and task
            data_file = f'../input/eeg-processed-extended/{participant}/user{participant}_t{task}.csv'

            try:
                data = _load_participant_data(data_file)

                if channel_col not in data.columns:
                    print(f"  Warning: Channel {channel_col} not found for participant {participant}")
                    continue

                # Process data in 10-second windows
                for window_start, features in _iter_window_features(data, channel_col, ten_sec):
                    feature_rows.append(features)
                    metadata.append({
                        'participant_id': participant,
                        'ftime': window_start
                    })

            except FileNotFoundError:
                print(f"  Warning: Data file not found for participant {participant}")
                continue
            except Exception as e:
                # Deliberate best effort: one bad recording must not abort
                # the whole run, so report it and move on.
                print(f"  Error processing participant {participant}: {e}")
                continue

        # Save features for this channel
        if feature_rows:
            X_channel = np.vstack(feature_rows)

            # Save feature matrix
            feature_file = f'features_{task}_{channel_num}.csv'

            # Create DataFrame with metadata
            feature_columns = [
                'power_1', 'differential_entropy1', 'mean1', 'std1',
                'skew1', 'kurtosis1', 'iqr1', 'median1',
                'hjorth_01', 'hjorth_11'
            ]

            df_features = pd.DataFrame(X_channel, columns=feature_columns)
            df_metadata = pd.DataFrame(metadata)

            # Combine metadata and features
            df_final = pd.concat([df_metadata, df_features], axis=1)

            # Save to CSV
            df_final.to_csv(feature_file, index=False)

            print(f"\n✓ Saved {len(df_final)} samples to {feature_file}")
            print(f"  Shape: {X_channel.shape}")
            print(f"  Participants: {df_metadata['participant_id'].nunique()}")
        else:
            print(f"\n✗ No data processed for channel {channel_num}")


# Opening banner
print("=" * 80, "STATISTICAL FEATURE EXTRACTION FROM RAW EEG DATA", "=" * 80, sep="\n")

# One extraction pass per task, each over all participants and all data
for task in (1, 2, 3):
    gen_features_all_channels_all_data(task=task)

# Closing banner, then the heading of the file listing
print("", "=" * 80, "FEATURE EXTRACTION COMPLETE", "=" * 80, "", "Generated files:", sep="\n")

# File names used by the extraction, one per task/channel pair
for task in (1, 2, 3):
    for channel_num in CHANNELS:
        print(f"  - features_{task}_{channel_num}.csv")
