In [1]:
from extract import extract_raw_data, compute_band_power, compute_spectral_entropy, compute_statistic, compute_waveform_length, compute_zero_crossing_rate
from process import bandpass_filter, apply_ica
import pandas as pd
import matplotlib.pyplot as plt
import mne
import json 
import copy
import pickle

file_path = './data/eeg_record1.mat'

channel_names = ['AF3', 'F7', 'F3', 'FC5', 'T7', 'P7', 'O1',
                 'O2', 'P8', 'T8', 'FC6', 'F4', 'F8', 'AF4']
sfreq = 128  # Specify the sampling frequency
low_f = 0.5  # Specify the lower cut-off frequency
high_f = 50  # Specify the higher cut-off frequency

Preprocessing and Extracting feature

In [None]:
import mne
import numpy as np
from scipy.stats import skew, kurtosis
import pandas as pd


# ===================================== DATA LOADING =====================================
def extract_raw_data(file_path: str, channel_names: list, sfreq: int) -> tuple:
    """Load and structure EEG data into MNE format."""
    # Load data
    from scipy.io import loadmat
    mat_file = loadmat(file_path)
    data = mat_file['o'][0][0]['data'][:, 3:17]  # EEG data
    
    # Create MNE Raw object
    info = mne.create_info(channel_names, sfreq, ch_types="eeg")
    raw = mne.io.RawArray(data.T, info)
    
    # Split into conditions
    focus_data = raw.copy().crop(tmin=0, tmax=10 * 60)
    unfocus_data = raw.copy().crop(tmin=10 * 60, tmax=20 * 60)
    drowsy_data = raw.copy().crop(tmin=20 * 60)

    return [focus_data, unfocus_data, drowsy_data]


# ===================================== FEATURE EXTRACTION =====================================
def compute_band_power(raw: mne.io.Raw, fmin, fmax) -> tuple:
    """Compute band power using Welch's method."""
    # n_fft = int(raw.info['sfreq'] * 2)
    # psds, freqs = raw.compute_psd(method='welch', fmin=fmin, fmax=fmax, n_fft = n_fft).get_data(return_freqs=True)

    fs = int(raw.info['sfreq'])
    psds, freqs = raw.compute_psd(method='welch', fmin=fmin, fmax=fmax, n_per_seg = fs*2).get_data(return_freqs=True)

    bands = {'delta': (0.5, 4), 'theta': (4, 8), 'alpha': (8, 12), 'beta': (12, 30), 'gamma': (30, 100)}
    band_power_data = {}
    for band, (low, high) in bands.items():
        band_idx = np.logical_and(freqs >= low, freqs < high)
        band_power_data[band] = np.mean(psds[:, band_idx], axis=1)
    
    return pd.DataFrame(band_power_data), psds, freqs

def compute_spectral_entropy(raw: mne.io.Raw) -> dict:
    """Compute spectral entropy for each channel."""
    fs = int(raw.info['sfreq'])
    # n_fft = int(raw.info['sfreq'] * 2)
    # psds, freqs = raw.compute_psd(method='welch', fmin=fmin, fmax=fmax, n_fft = n_fft).get_data(return_freqs=True)
    
    psds, _ = raw.compute_psd(method='welch', fmin=0.5, n_per_seg = fs*2).get_data(return_freqs=True)
    spectral_entropy = -np.sum(psds * np.log(psds), axis=1)
    return pd.DataFrame(spectral_entropy)


def compute_statistic(raw: mne.io.Raw) -> dict:
    """Compute statistics (mean, std, skew, kurtosis) for each channel."""
    data = raw.get_data()
    statistic_data = []
    for i, ch in enumerate(raw.ch_names):
        statistic_data.append({
            'mean': np.mean(data[i]),
            'std': np.std(data[i]),
            'skew': skew(data[i]),
            'kurtosis': kurtosis(data[i])
        })
    return pd.DataFrame(statistic_data)


def compute_zero_crossing_rate(raw: mne.io.Raw) -> dict:
    """Compute zero crossing rate for each channel."""
    data = raw.get_data()
    zcr_data = []
    for i, ch in enumerate(raw.ch_names):
        zero_crossings = np.mean(np.abs(np.diff(np.sign(data[i])))) / 2
        zcr_data.append(zero_crossings)
    return pd.DataFrame(zcr_data)


def compute_waveform_length(raw: mne.io.Raw) -> dict:
    """Compute waveform length for each channel."""
    data = raw.get_data()
    waveform_length_data = []
    for i, ch in enumerate(raw.ch_names):
        waveform_length_data.append(np.sum(np.abs(np.diff(data[i]))))
    return pd.DataFrame(waveform_length_data)

# ===================================== CLEANING =====================================
def bandpass_filter(raw: mne.io.Raw, lowcut: float, highcut: float) -> mne.io.Raw:
    """Apply bandpass filter using MNE."""
    raw.filter(l_freq=lowcut, h_freq=highcut, method="fir")
    return raw


def apply_ica(raw: mne.io.Raw, n_components: int = None) -> mne.io.Raw:
    """Apply ICA to remove artifacts."""
    if n_components is None:
        n_components = mne.compute_rank(raw, tol=1e-4)['eeg']
    ica = mne.preprocessing.ICA(n_components=n_components, random_state=42, max_iter=500)
    ica.fit(raw)
    raw = ica.apply(raw)
    return raw

In [2]:
# def preprocess_eeg(raw_eeg, lowcut, highcut, n_components: int = None):
#     filter_eeg = copy.deepcopy(raw_eeg)
#     for i in range(len(filter_eeg)):
#         filter_eeg[i] = bandpass_filter(filter_eeg[i], lowcut, highcut)

#     ica_eeg = copy.deepcopy(filter_eeg)
#     for i in range(len(ica_eeg)):
#         ica_eeg[i] = apply_ica(ica_eeg[i], n_components)
    
#     return ica_eeg

# def extract_features(processed_eeg):
#     data_df = pd.DataFrame()
#     spectrograms = []
#     for i in range(len(processed_eeg)):
#         band_power, psds, freqs = compute_band_power(processed_eeg[i], 0.5, 50)
#         spectral_entropy = compute_spectral_entropy(processed_eeg[i])
#         statistic = compute_statistic(processed_eeg[i])
#         waveform_length = compute_waveform_length(processed_eeg[i])
#         zero_crossing_rate = compute_zero_crossing_rate(processed_eeg[i])

#         temp_df = pd.concat([band_power, spectral_entropy, statistic, waveform_length, zero_crossing_rate], axis=1)
#         temp_df.reset_index(inplace=True)
#         temp_df['state'] = [0, 1, 2][i]    

#         psds_dict = {
#             'focus': psds[0].tolist(),
#             'unfocus': psds[1].tolist(),
#             'drowsy': psds[2].tolist()
#         }

#         data_df = pd.concat([data_df, temp_df])
#         spectrograms.append({'power_spectral_density':psds_dict, 'frequency':freqs.tolist()})

#     data_df.columns = ['channel', 'delta', 'theta', 'alpha', 'beta', 'gamma', 
#                        'spectral_entropy', 'mean', 'variance', 'skewness', 
#                        'kurtosis', 'waveform_length', 'zero_crossing_rate', 'state']
#     return data_df, spectrograms

# final_df = pd.DataFrame()
# spectrograms_dict = {}
# num_files = range(1, 35)
# for i in num_files:
#     print(f'====================================Processing file {i}===============================================')
#     file_path = f'./data/eeg_record{i}.mat'
#     raw_eeg = extract_raw_data(file_path, channel_names, sfreq)
#     processed_eeg = preprocess_eeg(raw_eeg, low_f, high_f, 14)
    
#     data_df, spectrograms = extract_features(processed_eeg)
#     data_df['file'] = i
#     spectrograms_dict[i] = spectrograms
#     final_df = pd.concat([final_df, data_df])

# final_df.to_csv('data1.csv', index=False)
# with open('spectrograms1.json', 'w') as f:
#     json.dump(spectrograms_dict, f)


In [3]:
data = pd.read_csv('data1.csv')

In [16]:
data

Unnamed: 0,channel,delta,theta,alpha,beta,gamma,spectral_entropy,mean,variance,skewness,kurtosis,waveform_length,zero_crossing_rate,state,file
0,0,14.391609,3.679328,2.011425,0.637030,0.160284,-2.685009e+03,-0.000573,17.588064,-74.002214,8750.927935,1.412309e+05,0.356016,0,1
1,1,3829.449232,49.561224,7.994133,1.722802,0.425814,-1.940103e+06,0.152913,137.511126,-8.071308,153.667873,3.848728e+05,0.051576,0,1
2,2,4244.809777,60.327254,8.979475,1.877769,0.447044,-2.163930e+06,0.018262,138.733956,-13.249511,320.183854,3.815944e+05,0.065807,0,1
3,3,13.787026,3.284733,1.775865,0.560829,0.143492,-2.467271e+03,-0.004456,16.622731,-69.464901,7991.657354,1.374003e+05,0.330313,0,1
4,4,19.954494,3.519528,1.790933,0.553084,0.130425,-4.037107e+03,0.000522,18.816576,-48.613148,4724.875090,1.190909e+05,0.336432,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,9,0.233465,0.130543,0.118272,0.097184,0.064715,1.668580e+02,0.000191,2.239347,0.894692,20.202484,4.446874e+05,0.351538,2,34
1424,10,0.087677,0.029871,0.024199,0.017197,0.010680,5.819346e+01,-0.000070,1.030356,-7.161353,108.102008,3.600041e+04,0.042753,2,34
1425,11,0.404678,0.102484,0.084710,0.050222,0.036577,1.228906e+02,-0.001705,2.020326,0.543167,432.169430,3.202471e+05,0.385570,2,34
1426,12,0.167334,0.079250,0.064741,0.061547,0.050458,1.328667e+02,-0.001421,1.910101,-0.098318,113.990127,3.612670e+05,0.364613,2,34


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

target = 'state'
categorical_features = ['channel', 'file']
numreical_features = ['delta', 'theta', 'alpha', 'beta', 'gamma', 
                    'spectral_entropy', 'mean', 'variance', 'skewness', 'kurtosis', 
                    'waveform_length', 'zero_crossing_rate']

scaled_data = data.copy()
scaler_dict = {}
for feature in numreical_features:
    scaler = StandardScaler()
    scaled_data[feature] = scaler.fit_transform(data[feature].values.reshape(-1, 1))
    scaler_dict[feature] = scaler

In [5]:
dropped_columns = ['state', 'channel', 'file', 'mean', 'variance', 'skewness', 'kurtosis', 'waveform_length', 'zero_crossing_rate', 'spectral_entropy']

X = scaled_data.drop(columns=dropped_columns)
y = scaled_data['state']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,  random_state=42)
X_val, X_test , y_val, y_test = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

In [6]:
X

Unnamed: 0,delta,theta,alpha,beta,gamma
0,-0.200978,-0.096001,-0.189946,-0.089564,-0.091064
1,6.297714,1.741608,0.232303,0.409273,0.118242
2,7.005253,2.172796,0.301846,0.480469,0.134977
3,-0.202008,-0.111805,-0.206572,-0.124573,-0.104300
4,-0.191502,-0.102401,-0.205508,-0.128131,-0.114600
...,...,...,...,...,...
1423,-0.225096,-0.238133,-0.323562,-0.337586,-0.166397
1424,-0.225344,-0.242165,-0.330201,-0.374334,-0.208990
1425,-0.224804,-0.239257,-0.325931,-0.359161,-0.188576
1426,-0.225208,-0.240187,-0.327340,-0.353958,-0.177635


In [7]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

svc_param = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['rbf', 'linear']
}
svc_grid = GridSearchCV(SVC(), svc_param, refit=True, verbose=2)
svc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.

In [8]:
print('Best parameter: ', svc_grid.best_params_)

with open('./model/svc_model1.pkl', 'wb') as f:
    pickle.dump(svc_grid.best_estimator_, f)

with open('./model/svc_model1.pkl', 'rb') as f:
    svc_model = pickle.load(f)

from sklearn.svm import SVC
svc_model_non_ft = SVC()
svc_model_non_ft.fit(X_train, y_train)

print('Train accuracy: ', accuracy_score(y_train, svc_model.predict(X_train)))
print('Validation accuracy: ', accuracy_score(y_val, svc_model.predict(X_val)))
print('Test accuracy: ', accuracy_score(y_test, svc_model.predict(X_test)))

print('Train accuracy non-ft: ', accuracy_score(y_train, svc_model_non_ft.predict(X_train)))
print('Validation accuracy non-ft: ', accuracy_score(y_val, svc_model_non_ft.predict(X_val)))
print('Test accuracy non-ft: ', accuracy_score(y_test, svc_model_non_ft.predict(X_test)))

Best parameter:  {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Train accuracy:  0.5595595595595596
Validation accuracy:  0.5791583166332666
Test accuracy:  0.54
Train accuracy non-ft:  0.43843843843843844
Validation accuracy non-ft:  0.46693386773547096
Test accuracy non-ft:  0.41


In [9]:
knn_param = { 'n_neighbors' : [3, 4, 5, 6],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param, refit=True, verbose=2)
knn_grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ...metric=minkowski, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=minkowski, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=minkowski, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=minkowski, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ...metric=minkowski, n_neighbors=3, weights=uniform; total time=   0.0s
[CV] END ..metric=minkowski, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=minkowski, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=minkowski, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=minkowski, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=minkowski, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ...metric=minkowski, n_neighbors=4, weights=uniform; total time=   0.0s
[CV] END ...metric=minkowski, n_neighbors=4, we

In [10]:
with open('./model/knn_model1.pkl', 'wb') as f:
    pickle.dump(knn_grid.best_estimator_, f)
print('Best parameter: ', knn_grid.best_params_)

with open('./model/knn_model1.pkl', 'rb') as f:
    knn_model = pickle.load(f)

knn_model_non_ft = KNeighborsClassifier()
knn_model_non_ft.fit(X_train, y_train)

print('Train accuracy: ', accuracy_score(y_train, knn_model.predict(X_train)))
print('Validation accuracy: ', accuracy_score(y_val, knn_model.predict(X_val)))
print('Test accuracy: ', accuracy_score(y_test, knn_model.predict(X_test)))

print('Train accuracy non-ft: ', accuracy_score(y_train, knn_model_non_ft.predict(X_train)))
print('Validation accuracy non-ft: ', accuracy_score(y_val, knn_model_non_ft.predict(X_val)))
print('Test accuracy non-ft: ', accuracy_score(y_test, knn_model_non_ft.predict(X_test)))

Best parameter:  {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}
Train accuracy:  0.7037037037037037
Validation accuracy:  0.7154308617234469
Test accuracy:  0.692
Train accuracy non-ft:  0.6746746746746747
Validation accuracy non-ft:  0.6773547094188377
Test accuracy non-ft:  0.672


In [11]:
from sklearn.linear_model import LogisticRegression
lr_param = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear', 'saga'],  # Solvers supporting L1/L2
}
lr_grid = GridSearchCV(LogisticRegression(), lr_param, refit=True, verbose=2)
lr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .............C=0.0001, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ..................C=0.0001, penalty=l1, solver=saga; total time=   0.0s
[CV] END ..................C=0.0001, penalty=l1, solver=saga; total time=   0.0s
[CV] END ..................C=0.0001, penalty=l1, solver=saga; total time=   0.0s
[CV] END ..................C=0.0001, penalty=l1, solver=saga; total time=   0.0s
[CV] END ..................C=0.0001, penalty=l1, solver=saga; total time=   0.0s
[CV] END .............C=0.0001, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .............C=0.0001, penalty=l2, sol



[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ................C=0.1, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   0.0s
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   0.0s
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   0.0s
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   0.0s
[CV] END .....................C=0.1, penalty=l2, solver=saga; total time=   0.0s
[CV] END ..................C=1, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ..................C=1, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ..................C



[CV] END .......................C=1, penalty=l2, solver=saga; total time=   0.0s
[CV] END .................C=10, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l1, solver=liblinear; total time=   0.0s




[CV] END ......................C=10, penalty=l1, solver=saga; total time=   0.0s
[CV] END ......................C=10, penalty=l1, solver=saga; total time=   0.0s
[CV] END ......................C=10, penalty=l1, solver=saga; total time=   0.0s
[CV] END ......................C=10, penalty=l1, solver=saga; total time=   0.0s
[CV] END ......................C=10, penalty=l1, solver=saga; total time=   0.0s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .................C=10, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ......................C=10, penalty=l2, solver=saga; total time=   0.0s
[CV] END ......................C=10, penalty=l2, solver=saga; total time=   0.0s
[CV] END ...................



[CV] END ......................C=10, penalty=l2, solver=saga; total time=   0.0s
[CV] END ................C=100, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ................C=100, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ................C=100, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ................C=100, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ................C=100, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .....................C=100, penalty=l1, solver=saga; total time=   0.0s
[CV] END .....................C=100, penalty=l1, solver=saga; total time=   0.0s
[CV] END .....................C=100, penalty=l1, solver=saga; total time=   0.0s
[CV] END .....................C=100, penalty=l1, solver=saga; total time=   0.0s
[CV] END .....................C=100, penalty=l1, solver=saga; total time=   0.0s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ................C=1



In [12]:
print('Best parameter: ', lr_grid.best_params_)
with open('./model/lr_model1.pkl', 'wb') as f:
    pickle.dump(lr_grid.best_estimator_, f)

with open('./model/lr_model1.pkl', 'rb') as f:
    lr_model = pickle.load(f)

from sklearn.linear_model import LogisticRegression
lr_model_non_ft = LogisticRegression()
lr_model_non_ft.fit(X_train, y_train)


print('Train accuracy: ', accuracy_score(y_train, lr_model.predict(X_train)))
print('Validation accuracy: ', accuracy_score(y_val, lr_model.predict(X_val)))
print('Test accuracy: ', accuracy_score(y_test, lr_model.predict(X_test)))

print('Train accuracy non-ft: ', accuracy_score(y_train, lr_model_non_ft.predict(X_train)))
print('Validation accuracy non-ft: ', accuracy_score(y_val, lr_model_non_ft.predict(X_val)))
print('Test accuracy non-ft: ', accuracy_score(y_test, lr_model_non_ft.predict(X_test)))

Best parameter:  {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
Train accuracy:  0.47347347347347346
Validation accuracy:  0.48897795591182364
Test accuracy:  0.458
Train accuracy non-ft:  0.4744744744744745
Validation accuracy non-ft:  0.49298597194388777
Test accuracy non-ft:  0.456


In [13]:
from sklearn.ensemble import RandomForestClassifier
rf_param = {
    'n_estimators': [4, 8, 16, 32],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_param, refit=True, verbose=2)
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=4; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=4; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=4; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=4; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=4; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=8; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=8; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=8; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=8; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min

In [14]:
print('Best parameter: ', rf_grid.best_params_)
with open('./model/rf_model1.pkl', 'wb') as f:
    pickle.dump( rf_grid.best_estimator_, f)

with open('./model/rf_model1.pkl', 'rb') as f:
    rf_model = pickle.load(f)

from sklearn.ensemble import RandomForestClassifier

rf_model_non_ft = RandomForestClassifier()
rf_model_non_ft.fit(X_train, y_train)

print('Train accuracy: ', accuracy_score(y_train, rf_model.predict(X_train)))
print('Validation accuracy: ', accuracy_score(y_val, rf_model.predict(X_val)))
print('Test accuracy: ', accuracy_score(y_test, rf_model.predict(X_test)))

print('Train accuracy non-ft: ', accuracy_score(y_train, rf_model_non_ft.predict(X_train)))
print('Validation accuracy non-ft: ', accuracy_score(y_val, rf_model_non_ft.predict(X_val)))
print('Test accuracy non-ft: ', accuracy_score(y_test, rf_model_non_ft.predict(X_test)))

Best parameter:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 16}
Train accuracy:  0.988988988988989
Validation accuracy:  0.9899799599198397
Test accuracy:  0.988
Train accuracy non-ft:  1.0
Validation accuracy non-ft:  1.0
Test accuracy non-ft:  1.0


In [15]:
# from catboost import CatBoostClassifier, Pool

# # categorical_features = []
# # numerical_features = ['delta', 'theta', 'alpha', 'beta', 'gamma', 'spectral_entropy', 'mean', 'variance', 'skewness', 'kurtosis', 'waveform_length', 'zero_crossing_rate']

# # # Create Pool objects for CatBoost
# train_pool = Pool(X_train, y_train, cat_features=categorical_features)
# test_pool = Pool(X_test, y_test, cat_features=categorical_features)
# val_pool = Pool(X_val, y_val, cat_features=categorical_features)

# # # Initialize and train the CatBoost classifier
# # model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=100)
# # model.fit(train_pool)
# # model.save_model('./model/catboost_model.cbm')

# model = CatBoostClassifier()
# model.load_model('./model/catboost_model.cbm')

# print('Train accuracy: ', accuracy_score(y_train, model.predict(train_pool)))
# print('Validation accuracy: ', accuracy_score(y_val, model.predict(val_pool)))
# print('Test accuracy: ', accuracy_score(y_test, model.predict(test_pool)))