In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import loadmat
from scipy.signal import butter, filtfilt

EEG_FOLDER = f'../datasets/EEG_data_for_Mental_Attention_State_Detection'

bands = {
    'delta': (1, 4),
    'theta': (4, 8),
    'alpha': (8, 14),
    'beta': (14, 32),
    'gamma': (32, 60),
}

data_root = f'{EEG_FOLDER}/EEG_Data'
channels = ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1', 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4']


In [2]:


def bp_filter(data, f_lo, f_hi, fs):
    """ Digital band pass filter (6-th order Butterworth)
    Args:
        data: numpy.array, time along axis 0
        (f_lo, f_hi): frequency band to extract [Hz]
        fs: sampling frequency [Hz]
    Returns:
        data_filt: band-pass filtered data, same shape as data """
    data_filt = np.zeros_like(data)
    f_ny = fs / 2.  # Nyquist frequency
    b_lo = f_lo / f_ny  # normalized frequency [0..1]
    b_hi = f_hi / f_ny  # normalized frequency [0..1]
    # band-pass filter parameters
    p_lp = {"N":6, "Wn":b_hi, "btype":"lowpass", "analog":False, "output":"ba"}
    p_hp = {"N":6, "Wn":b_lo, "btype":"highpass", "analog":False, "output":"ba"}
    bp_b1, bp_a1 = butter(**p_lp)
    bp_b2, bp_a2 = butter(**p_hp)
    data_filt = filtfilt(bp_b1, bp_a1, data, axis=0)
    data_filt = filtfilt(bp_b2, bp_a2, data_filt, axis=0)
    return data_filt


#  Function to read in the EEG data and extract the valid lead data, low and high pass filter and z-transform the data.
#  Returns a dataframe.
def get_EEG_data(data_root, filename):
    # Extract the data from one of these files.
    hz = fs
    #filename = 'eeg_record30.mat'
    mat = loadmat(f'{data_root}/{filename}')
    data = pd.DataFrame.from_dict(mat["o"]["data"][0,0])

    # Limit the data to the 7 valid EEG leads.
    dat = data.filter(list(range(3, 17)))
    dat.columns = list(range(1, 15))
    dat = dat.filter([1,2, 3, 4,5,6, 7, 8, 9,10,11,12,13,14,17], axis=1)
    labels = ['AF3','F7', 'F3','FC5','T7','P7','O1', 'O2','P8','T8', 'FC6','F4','F8','AF4']  # FP2 should really be AF4
    dat.columns = labels

    # Filter the data, high pass .5 Hz, low pass 62 Hz.
    lo, hi = .5, 62
    # Do the filtering.
    datf = bp_filter(dat.to_numpy(), lo, hi, hz)

    # Convert back to a dataframe.
    dat = pd.DataFrame({c: datf[:, i] for i, c in enumerate(labels)})

    # Z-transform each column
    # dat = dat.apply(zscore)

    return dat

def get_trial_data(trial):
    dat = get_EEG_data(data_root, f'eeg_record{trial}.mat')
    return dat

def plot_spectrogram(data, channel_name, fs, fmin, fmax, show=True):
    # plot the spectrogram
    plt.figure(figsize=(14, 5))
    Sxx, freqs, bins, im = plt.specgram(data, NFFT=2*fs, Fs=fs,
                                         noverlap=fs, cmap='jet')
    if show:
        plt.ylim(fmin, fmax)
        plt.xlabel('Time (s)')
        plt.ylabel('Frequency (Hz)')
        plt.title(f'ch={channel_name} fs={fs} band=({fmin} - {fmax})Hz')
        plt.imshow(Sxx, aspect='auto',
                   cmap='jet', origin='lower',
                   extent=[bins[0], bins[-1], freqs[0], freqs[-1]],
                   vmin=0, vmax=np.max(Sxx[(freqs >= fmin) & (freqs <= fmax)]))
        plt.colorbar()
    else:
        plt.close()
    return Sxx

In [3]:
fs = 128

class Subject:

    def __init__(self, number, slice_from, slice_to, sample_length=10, trials=None):
        self.number = number
        # records = np.arange(1, 7) + (number * 7)
        trials = trials or np.arange(3, 7) + (number * 7)
        samples = np.arange(0, slice_to - slice_from, sample_length)
        self.focus_label = Label('focus', samples, trials, slice_from, slice_to, sample_length)
        self.unfocus_label = Label('unfocus', samples, trials, slice_from + 600, slice_to + 600, sample_length)
        self.drowsed_label = Label('drowsed', samples, trials, slice_from + 1200, slice_to + 1200, sample_length)


class Label:

    def __init__(self, label, samples, trials, slice_from, slice_to, sample_length):
        self.label = label
        self.slice_to = slice_to
        self.slice_from = slice_from
        self.sample_length = sample_length

        topographic = {ch: np.zeros((len(samples) * len(trials), fs + 1, sample_length))
                       for ch in channels}
        time_signals = {ch: np.zeros((len(samples) * len(trials), fs * sample_length))
                       for ch in channels}
        for i, trial in enumerate(trials):
            trial_data = get_trial_data(trial)
            for ch in trial_data.columns:
                fft_spectogram = plot_spectrogram(trial_data[ch][slice_from * fs:slice_to * fs], channel_name=ch, fs=fs, fmin=0, fmax=63, show=False)
                for j, sample in enumerate(samples):
                    # fft_spectogram.shape[1] is 129, probably not divided by sample length.
                    actual_sample_length = fft_spectogram[:, sample:sample+sample_length].shape[1]
                    topographic[ch][i * len(samples) + j, :, :actual_sample_length] = fft_spectogram[:, sample:sample+sample_length]

                    # time domain signal
                    time_signals[ch][i * len(samples) + j, :] = trial_data[ch][(slice_from + sample) * fs:(slice_from + sample+sample_length) * fs]

        self.topographic = topographic
        self.time_signals = time_signals

slice_from = 2 * 60
slice_to = 9 * 60
subject0 = Subject(0, slice_from, slice_to)

In [4]:
def create_data_set(trials):
    dfs = []
    for trial in trials:
        trial_data = get_trial_data(trial).iloc[:30 * 60 * fs]
        trial_data['ts_sample'] = trial_data.index
        trial_data['trial'] = trial
        trial_data['subject'] = trial // 7
        trial_data['label'] = trial_data.index // (10 * 60 * fs)
        dfs.append(trial_data)
    df = pd.concat(dfs, ignore_index=True)
    return df

df = create_data_set([3, 4, 5, 6, 7])

In [5]:
df

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,ts_sample,trial,subject,label
0,-0.241079,18.672493,3.903603,0.654297,-0.244147,-0.180874,8.679878,10.275817,9.483967,0.117731,1.895261,0.044824,-0.222238,0.224840,0,3,0,0
1,-0.252600,11.530178,-0.646735,-1.364736,-0.251230,-1.332069,4.914937,3.072823,1.157794,0.117770,5.522281,0.046851,1.314589,-6.153273,1,3,0,0
2,-0.254323,14.704255,1.606917,-0.875217,-0.266992,-0.920235,11.543228,9.394244,4.243965,0.118960,9.775144,0.044350,0.783386,-5.293466,2,3,0,0
3,-0.265599,17.773842,5.296367,2.215564,-0.273738,0.536253,12.328250,8.217208,7.218984,0.118524,7.753763,0.045689,0.286773,-2.464811,3,3,0,0
4,-0.267920,16.338601,6.030688,3.237110,-0.290486,0.953128,11.745019,2.747784,3.663866,0.119776,6.891651,0.043695,-0.262150,-4.179449,4,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151995,0.089910,-6.362355,-5.999113,-0.145249,-2.193949,-3.102257,-2.818757,-2.121525,-7.643784,2.038443,-1.782159,-0.000313,0.434231,-7.729805,230395,7,1,2
1151996,1.686424,-4.563661,-5.544239,2.413793,-1.286190,-3.032180,-2.661725,-1.446323,-3.991723,1.084666,-2.321150,-0.000265,-0.524244,-8.369114,230396,7,1,2
1151997,1.107759,-3.246932,-6.676752,1.905364,-1.694267,-2.519027,-3.015780,2.432445,-1.074235,-0.012125,-2.809865,-0.000221,-1.621546,-7.702001,230397,7,1,2
1151998,-0.890509,-8.630045,-7.246335,-1.691854,-1.303867,-7.555983,-10.541277,-3.551215,-11.777958,0.057235,-1.805201,-0.000172,-1.549245,-6.790633,230398,7,1,2


In [12]:
# def create_df_features(df, feature, time_slot):
#     feature_result = pd.DataFrame(columns=df.columns)
#     trials = df['trial'].unique()
#     for trial in trials:
#         for ch in channels:
#             feature_result[ch] = df[df['trial'] == trial, ch]
grouped_df = df.groupby(df.index // fs)
features_df = grouped_df[channels].rolling(128).var().dropna()
features_df['label'] = grouped_df['label'].rolling(128).max().dropna().astype(np.int8)
features_df = features_df.reset_index()
features_df
# features_df['ts_sample'] = grouped_df['ts_sample'].rolling(128).max() // 128
# print(features_df)

Unnamed: 0,level_0,level_1,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,label
0,0,127,25.357877,2124.374466,591.380509,53.151116,3.762714,626.996370,1043.287750,1592.343067,803.713534,0.825309,858.540301,5.358026e-01,9.635931,293.772350,0
1,1,255,4.109462,3549.051921,895.578083,4.935973,2.429660,1119.816683,4846.676498,11730.174531,464.146193,3.695099,47.056097,2.875960e-02,1.739605,1295.712996,0
2,2,383,10.216768,1685.447400,753.402418,15.617861,2.495635,1262.820959,3211.231715,4065.692989,786.543245,1.138012,594.325193,1.901822e-01,5.297749,815.308515,0
3,3,511,24.241998,2000.419888,1456.946584,348.530487,20.033104,1171.665434,2112.229061,2212.752086,1540.948703,0.223475,5641.646997,2.179878e+00,17.403679,1302.446334,0
4,4,639,4.773806,74.515812,72.532591,17.034154,0.608673,112.410020,185.728771,455.563467,247.471269,4.743169,496.009370,2.774916e-01,5.092447,154.180710,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,8995,1151487,3.241144,56.340529,57.957592,2.398450,6.990153,57.361707,529.408693,75.736977,57.582960,11.596293,3.065674,4.800543e-06,0.483963,52.466752,2
8996,8996,1151615,4.211585,66.928787,95.590741,3.313154,9.457415,95.134279,803.409509,55.928795,92.450975,9.339025,2.962043,1.664122e-06,1.553185,155.739279,2
8997,8997,1151743,3.790368,362.550289,463.143520,3.551659,4.424311,580.329004,2347.960966,452.285872,715.451862,6.763233,2.762448,1.015088e-06,0.556263,576.408082,2
8998,8998,1151871,3.908863,133.959457,157.247654,2.588597,7.351878,164.148822,147.796359,180.885744,186.554081,3.127393,3.327528,4.764704e-07,0.434529,98.128275,2


# AutoGluon Models

In [None]:
from autogluon.tabular import TabularPredictor
import pandas as pd

# Load your dataframe
df = pd.read_csv('your_data.csv')

# Split the dataframe into features (X) and target variable (y)
X = df.drop(columns=['target_column'])
y = df['target_column']

# Define the problem type (classification or regression)
problem_type = 'classification'  # or 'regression' depending on your task

# Define the AutoGluon TabularPredictor
predictor = TabularPredictor(label='target_column', problem_type=problem_type)

# Train the AutoGluon model
predictor.fit(train_data=X, tuning_data=X, time_limit=3600)  # Adjust the time_limit as needed

# Make predictions on new data
new_data = pd.read_csv('new_data.csv')
predictions = predictor.predict(new_data)

# Evaluate the model
leaderboard = predictor.leaderboard()

# Print the leaderboard
print(leaderboard)
