In [257]:
import os
import numpy as np
import pandas as pd

from scipy.signal import butter, sosfilt
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from lib import *

# Root of the project sources on the author's machine.
# NOTE(review): absolute, machine-specific path; consider deriving it from an
# environment variable or the notebook's location so others can run this.
BASE_PATH = r"C:\Users\wilso\Desktop\Sleep-Stage-Classification\src"
# Each path component is passed separately: the previous "data_norm\collated.npz"
# literal only worked because "\c" is not a recognised escape sequence, which
# Python 3.12+ reports as a SyntaxWarning.
DATA_PATH = os.path.join(BASE_PATH, "data_norm", "collated.npz")
# Directory holding the CSV files read by this notebook.
SAVE_DIR = os.path.join(BASE_PATH, "csv")

In [176]:
# Build the dataset object from DATA_PATH and report the shapes of its
# `x` and `y` attributes (presumably `Dataset` comes from `lib` - confirm).
collated_data = Dataset(DATA_PATH)
for array in (collated_data.x, collated_data.y):  # x shape, then y shape
    print(array.shape)

(42308, 3000)
(42308,)


In [184]:
# Load the feature table; its first column is the label, the rest are features.
csv_path = os.path.join(SAVE_DIR, "test.csv")
df = pd.read_csv(csv_path)
y_col = df.columns[0]
other_cols = df.columns[1:]
n_features = len(df.columns) - 1  # every column except the label
print(f"Number of features : {n_features}\nNumber of rows : {len(df)}\n")

Number of features : 234
Number of rows : 42308



# Utility

In [265]:
def update_train_test(selected_cols, X_train, X_test):
    """
    Restrict the train and test feature frames to the same set of columns.

    Use Case :
    X_train, X_test = update_train_test(selected_cols, X_train, X_test)

    Parameters
    ----------
    selected_cols : anything accepted by ``DataFrame.loc[:, ...]`` as a column
        selector, e.g. a list of column labels or a boolean mask with one
        entry per column.
    X_train, X_test : pandas.DataFrame
        Frames sharing the same columns.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(X_train, X_test)`` reduced to the selected columns.
    """
    # Select from both frames before reporting, so an invalid selector fails
    # without printing a misleading summary first.
    new_X_train = X_train.loc[:, selected_cols]
    new_X_test = X_test.loc[:, selected_cols]
    # Report as a list, not a set: keeps the DataFrame's column order and makes
    # the printed output identical from run to run.
    new_cols = list(new_X_test.columns)
    print(f"Selected cols | {len(new_cols)} : \n{new_cols}")
    return new_X_train, new_X_test

# Data Cleaning

In [185]:
# Drop every row that contains at least one NaN, then report what is left.
df.dropna(inplace=True)
n_features = len(df.columns) - 1  # label column excluded
n_rows = len(df)
print(f"After removing rows with NaN values :\nNumber of features : {n_features}\nNumber of rows : {n_rows}\n")
print(df.columns)

After removing rows with NaN values :
Number of features : 234
Number of rows : 37588

Index(['y', 'avg', 'std', 'skew', 'kurtosis', 'hjorth_activity',
       'hjorth_morbidity', 'hjorth_complexity', 'mmd', 'pfd',
       ...
       'gamma2FFTKurtosis', 'gamma2FFTHjorth_activity',
       'gamma2FFTHjorth_morbidity', 'gamma2FFTHjorth_complexity',
       'gamma2FFTMMD', 'gamma2FFTPFD', 'gamma2FFTKFD', 'gamma2FFTLRSSV',
       'gamma2FFTSE', 'gamma2FFTRE'],
      dtype='object', length=235)


In [186]:
# Convert complex numbers in str to float dtype
# Each object-dtype column is parsed element by element with eval() and then
# replaced by the absolute value (np.abs) of the parsed result.
# NOTE(review): eval() executes whatever expression the CSV cell contains; this
# is only acceptable while test.csv is a trusted, locally generated file.
# Consider parsing with complex() instead - confirm the stored string format first.
for col in df.select_dtypes(include=[object]).columns:
    df[col] = df[col].apply(eval).apply(np.abs)

In [187]:
# Min-max scale every column to [0, 1]; the label column is put back unscaled.
labels = df[y_col]
col_min = df.min()
col_range = df.max() - col_min
df = (df - col_min) / col_range
df[y_col] = labels  # restore the original class labels

df

Unnamed: 0,y,avg,std,skew,kurtosis,hjorth_activity,hjorth_morbidity,hjorth_complexity,mmd,pfd,...,gamma2FFTKurtosis,gamma2FFTHjorth_activity,gamma2FFTHjorth_morbidity,gamma2FFTHjorth_complexity,gamma2FFTMMD,gamma2FFTPFD,gamma2FFTKFD,gamma2FFTLRSSV,gamma2FFTSE,gamma2FFTRE
0,0,0.601328,0.230527,0.446438,0.072287,0.534005,0.362525,0.568944,0.335444,0.524244,...,1.212022e-09,0.001394,0.001972,0.037269,0.017904,0.898695,0.507954,0.310263,0.893196,0.353814
1,0,0.617582,0.299683,0.511990,0.059719,0.561128,0.337736,0.595446,0.465932,0.546216,...,3.355785e-08,0.000768,0.002684,0.027640,0.018810,0.906826,0.527519,0.362440,0.900954,0.299058
2,0,0.596793,0.387734,0.514166,0.042276,0.559385,0.339312,0.593749,0.585709,0.440155,...,3.724493e-08,0.000911,0.002458,0.030111,0.019986,0.913143,0.435611,0.363099,0.786683,0.314743
5,0,0.608910,0.262084,0.462756,0.059494,0.546229,0.351285,0.580912,0.419148,0.517161,...,2.946733e-08,0.000541,0.003213,0.023186,0.019055,0.923055,0.577408,0.448216,0.867893,0.392968
6,0,0.608497,0.302534,0.493460,0.063200,0.552926,0.345175,0.587452,0.445922,0.487139,...,6.848257e-08,0.000766,0.002688,0.027599,0.017410,0.890552,0.679775,0.475279,0.639573,0.415295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42303,0,0.402672,0.221976,0.613893,0.158196,0.347472,0.549369,0.381073,0.585356,0.556384,...,7.814901e-09,0.004752,0.001033,0.068864,0.031991,0.876050,0.633499,0.601937,0.859557,0.548366
42304,0,0.382468,0.232968,0.493603,0.096040,0.330811,0.567609,0.363783,0.471183,0.587537,...,1.071438e-08,0.003468,0.001223,0.058821,0.037889,0.527566,0.671631,0.451844,0.751784,0.437468
42305,0,0.397342,0.278316,0.604173,0.084674,0.351592,0.544901,0.385335,0.591743,0.528961,...,3.689894e-07,0.000464,0.003476,0.021462,0.024041,0.920354,0.490771,0.345937,0.869419,0.383506
42306,0,0.396681,0.211185,0.584091,0.114713,0.340510,0.556957,0.373859,0.438181,0.557556,...,1.110656e-08,0.001093,0.002238,0.032983,0.023900,0.908631,0.566749,0.466633,0.902437,0.331426


In [188]:
# Per-column descriptive statistics of df (count, mean, std, min, quartiles, max),
# kept in a variable and displayed as the cell output.
summary_stats = df.describe()
summary_stats

Unnamed: 0,y,avg,std,skew,kurtosis,hjorth_activity,hjorth_morbidity,hjorth_complexity,mmd,pfd,...,gamma2FFTKurtosis,gamma2FFTHjorth_activity,gamma2FFTHjorth_morbidity,gamma2FFTHjorth_complexity,gamma2FFTMMD,gamma2FFTPFD,gamma2FFTKFD,gamma2FFTLRSSV,gamma2FFTSE,gamma2FFTRE
count,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,...,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0,37588.0
mean,2.039853,0.437867,0.204286,0.545683,0.068868,0.38591,0.520333,0.416691,0.340369,0.38278,...,0.0001691109,0.001421,0.011153,0.016088,0.012603,0.802456,0.455648,0.246939,0.583131,0.244184
std,1.309525,0.172992,0.140087,0.048071,0.040435,0.162775,0.166671,0.164317,0.178968,0.161769,...,0.009339313,0.018279,0.01635,0.03406,0.036461,0.220828,0.101144,0.137984,0.221851,0.140018
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.387228,0.103276,0.522008,0.043413,0.322849,0.462011,0.35549,0.205842,0.271908,...,1.256718e-08,3.7e-05,0.004946,0.00602,0.003444,0.806681,0.398145,0.138922,0.406615,0.119603
50%,2.0,0.396228,0.164407,0.546889,0.057802,0.345792,0.551195,0.379334,0.317735,0.36231,...,4.705496e-08,9.2e-05,0.007914,0.009499,0.005486,0.905923,0.459577,0.2674,0.629342,0.290007
75%,3.0,0.483113,0.26995,0.570444,0.081887,0.43094,0.576426,0.466395,0.443435,0.477619,...,2.363818e-07,0.000232,0.012474,0.015159,0.009806,0.924856,0.516279,0.317405,0.751889,0.324548
max,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Basic Feature selection

In [266]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split, and every feature-selection result derived
# from it, is the same on each Restart & Run All.
SPLIT_SEED = 42
train, test = train_test_split(df, test_size=0.3, random_state=SPLIT_SEED)
# Separate the feature columns from the label column for both partitions.
X_train, y_train = train[other_cols], train[y_col]
X_test, y_test = test[other_cols], test[y_col]
print(X_train.shape)
print(X_test.shape)

(26311, 234)
(11277, 234)


In [267]:
# Sanity check: a zero-variance filter must not drop any column of X_train.
from sklearn.feature_selection import VarianceThreshold

sel_variance_threshold = VarianceThreshold(threshold=0.00)
X_train_remove_variance = sel_variance_threshold.fit_transform(X_train)
shape_unchanged = X_train_remove_variance.shape == X_train.shape
assert shape_unchanged
print("There are no features with 0 variance.")

There are no features with 0 variance.


In [268]:
from sklearn.feature_selection import chi2, SelectKBest

# Keep the half of the features that score highest on the chi-squared test
# against the labels, then apply the same column selection to train and test.
n_keep = len(other_cols) // 2
sel_chi2 = SelectKBest(score_func=chi2, k=n_keep)
X_train_chi2 = sel_chi2.fit_transform(X_train, y_train)
selected_features = sel_chi2.get_support()  # mask of the retained columns
X_train, X_test = update_train_test(selected_features, X_train, X_test)

Selected cols | 117 : 
{'gamma2Hjorth_complexity', 'thetaSTD', 'gamma1Hjorth_morbidity', 'beta1RE', 'deltaKurtosis', 'alphaFFTSTD', 'beta1FFTMMD', 'gamma2Hjorth_activity', 'alphaSTD', 'sigmaFFTSE', 'sigmaRE', 'gamma1FFTSTD', 'gamma1KFD', 'beta1FFTLRSSV', 'std', 'deltaFFTSE', 'gamma1Hjorth_activity', 'alphaHjorth_complexity', 'beta1FFTSTD', 'deltaSE', 'deltaSkew', 'beta2FFTLRSSV', 'thetaHjorth_morbidity', 'beta2FFTHjorth_complexity', 'beta2FFTHjorth_activity', 'deltaLRSSV', 'beta1Hjorth_activity', 'gamma2FFTSTD', 'thetaRE', 'sigmaHjorth_complexity', 'beta2FFTMMD', 'beta1Hjorth_complexity', 'beta2Hjorth_activity', 'sigmaKFD', 'thetaFFTHjorth_complexity', 'lrssv', 'beta1LRSSV', 'deltaKFD', 'deltaMMD', 'alphaFFTMMD', 'thetaKFD', 'beta2STD', 'gamma2FFTRE', 'alphaHjorth_activity', 'sigmaSE', 'beta1STD', 'alphaKFD', 'sigmaKurtosis', 'gamma1FFTHjorth_complexity', 'beta1Kurtosis', 'sigmaFFTHjorth_complexity', 'gamma1Hjorth_complexity', 'deltaSTD', 'sigmaLRSSV', 'beta2Kurtosis', 'gamma1LRSSV', '

In [263]:
from scipy.stats import kruskal # Kruskal-Wallis H-test

# For every feature, test whether its distribution differs between the classes.
# The samples handed to kruskal() are the feature's values split by class label.
# Previously the feature column and the label column themselves were passed as
# the two samples, which compares feature values with label values and so never
# flagged any feature.
df_y_col = df[y_col]
class_groups = [group for _, group in df.groupby(df_y_col)]
removed_cols = []
for col_to_test in other_cols:
    H, pval = kruskal(*(group[col_to_test] for group in class_groups))
    if pval > 0.01:
        # statistically non-significant feature, therefore discarded
        removed_cols.append(col_to_test)
        print(f"[!Remove] {col_to_test}")
len(removed_cols)

0

In [258]:
# Kruskal-Wallis H-test with every column of df passed as one sample.
# NOTE(review): df.columns includes the label column, so the labels are treated
# as one more group - confirm that this is intended.
H, pval = kruskal(*[df[col] for col in df.columns])
print(f"H-statistic:\t{H}\nP-value:\t{pval}\n")
if pval < 0.05:
    print("Reject NULL hypothesis - Significant differences exist between groups.")
else:
    # Also covers pval == 0.05, for which the two separate `if`s printed nothing.
    print("Accept NULL hypothesis - No significant difference between groups.")

H-statistic:	7338177.934852661
P-value:	0.0

Reject NULL hypothesis - Significant differences exist between groups.
