# Manual feature extraction

# Imports

In [2]:
import os
import warnings
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold
from sklearn.svm import SVC
import lightgbm as lgb
from utils import variance_thresholding, standardize, calculate_metrics, calculate_metrics_statistics

In [None]:
# Settings for Welch's method of power spectrum estimation.
WINDOW = "hann"                 # window function type
NPERSEG = 60                    # segment length
NFFT = NPERSEG                  # FFT length, tied to the segment length
NOVERLAP = int(NPERSEG * 0.75)  # consecutive segments overlap by 75%

# Directories and file-name prefixes used when reading and saving data.
MAIN_RESULTS_DIR = "results"
PROCESSED_DATA_DIR = "processed_data"
HYPERAKTIV_PREFIX = "manual_hyperaktiv"
DEPRESJON_PREFIX = "manual_depresjon"
PSYKOSE_PREFIX = "manual_psykose"

# Hour pair identifying the day/night variant of the data; the variants
# listed by the original author are (6, 22) and (8, 21).
DAY_NIGHT_HOURS = (8, 21)
# The two hours joined by an underscore ("6_22" / "8_21"); this tag is
# embedded in the names of the processed data files and the result files.
day_night_format = "_".join(str(hour) for hour in DAY_NIGHT_HOURS)

# Classification

## Classifiers, parameters, constants

In [33]:
# Base estimators, keyed by the short names the experiment loops use to
# select them. Each one is tuned by GridSearchCV over the entry of
# `param_grids` that has the same key.
classifiers = {
    "GBM": LGBMClassifier(
        objective="binary",
        metric=["auc", "binary_logloss"],
        force_col_wise=True,
        verbosity=-1,
    ),
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=5000,
    ),
    "SVM": SVC(
        kernel="poly",
        cache_size=512,
        max_iter=5000,
    ),
    "RF": RandomForestClassifier(
        n_estimators=500,
        criterion="entropy",
        # Seeded like the LR model so repeated runs give the same forest.
        random_state=0,
    ),
}


# Hyperparameter grids searched exhaustively for each classifier.
param_grids = {
    "GBM": {
        "num_leaves": [25, 50, 100, 250, 500],
        "max_depth": [5, 10, 15, 20, 25],
        "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
        "reg_alpha": [0, 0.1, 0.2, 0.3],
    },
    "LR": {
        "C": [0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 25, 50, 100, 500, 1000],
        "class_weight": [None, "balanced"],
        "l1_ratio": [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                     0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1],
    },
    "SVM": {
        # np.logspace takes base-10 *exponents*, not end values. The earlier
        # logspace(10e-3, 10e3) therefore spanned 10**0.01 .. 10**10000 and
        # all but the first two of the 50 values overflowed to inf.
        # NOTE(review): 1e-3 .. 1e3 is assumed to be the intended range (it
        # matches the span of the LR "C" grid) - confirm.
        "C": np.logspace(-3, 3, num=50),
        "gamma": np.logspace(-3, 3, num=50),
        "class_weight": [None, "balanced"],
    },
    "RF": {
        "class_weight": [None, "balanced", "balanced_subsample"],
    },
}

## Hyperaktiv classification

In [20]:
# Point the loading cell at the Hyperaktiv feature files and label file.
y_filename = "hyperaktiv_{}_y.csv".format(day_night_format)
dataset = HYPERAKTIV_PREFIX

In [22]:
# Read one feature matrix per part ("full_24h", "night", "day") into a dict
# keyed by part name; the first CSV row is treated as the header.
datasets = {}

for part in ("full_24h", "night", "day"):
    filename = "_".join([dataset, day_night_format, part]) + ".csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    datasets[part] = pd.read_csv(filepath, header=0).to_numpy()

# Labels are stored without a header row; flatten them to a 1-D int array.
y = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, y_filename),
    header=None,
    dtype=int,
).to_numpy().ravel()

In [25]:
# Nested cross-validation on the Hyperaktiv data. For every part and every
# classifier: split into 3 stratified outer folds, tune hyperparameters on
# the training fold with a leave-one-out grid search, score the refitted
# best estimator on the held-out fold, then save per-fold and aggregated
# scores under `results_directory`.
# NOTE(review): the directory name is spelled "hyperactiv" while the data
# files use "hyperaktiv"; left unchanged so existing result paths stay valid.
results_directory = os.path.join(".", MAIN_RESULTS_DIR, "hyperactiv")
# Create the output directory up front so the to_csv calls cannot fail on a
# missing path after a long grid search has already finished.
os.makedirs(results_directory, exist_ok=True)

# The splitter is seeded, so a single instance yields the same outer folds
# for every part and classifier.
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    X = datasets[part]

    for clf_type in ["GBM", "LR", "SVM", "RF"]:
        print(f"  {clf_type}")

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Preprocessing helpers receive the train and test parts of the
            # current fold together and return both transformed.
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            # The unused lgb.Dataset objects that used to be built here for
            # "GBM" were dropped: the scikit-learn wrapper is fitted on the
            # arrays directly, like every other classifier.
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut(),
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        # One row per outer fold.
        pd.DataFrame.from_records(test_scores).to_csv(
            os.path.join(results_directory, f"test_scores_{day_night_format}_{part}_{clf_type}"),
            index=False,
        )

        # final_scores maps each metric name to a (mean, stddev) pair.
        final_scores = calculate_metrics_statistics(test_scores)
        df = pd.DataFrame(
            [(key, *values) for key, values in final_scores.items()],
            columns=["Index", "Mean", "Stddev"],
        ).set_index("Index")
        # The index holds the metric names, so it must be written out;
        # index=False produced a file of unlabeled Mean/Stddev rows.
        df.to_csv(
            os.path.join(results_directory, f"final_scores_{day_night_format}_{part}_{clf_type}"),
            index=True,
        )

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        print()

PART: night
  GBM


KeyboardInterrupt: 

## Depresjon classification

In [30]:
# Point the loading cell at the Depresjon feature files and label file.
y_filename = "depresjon_{}_y.csv".format(day_night_format)
dataset = DEPRESJON_PREFIX

In [31]:
# Read one feature matrix per part ("full_24h", "night", "day") into a dict
# keyed by part name; the first CSV row is treated as the header.
datasets = {}

for part in ("full_24h", "night", "day"):
    filename = "_".join([dataset, day_night_format, part]) + ".csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    datasets[part] = pd.read_csv(filepath, header=0).to_numpy()

# Labels are stored without a header row; flatten them to a 1-D int array.
y = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, y_filename),
    header=None,
    dtype=int,
).to_numpy().ravel()

In [32]:
# Nested cross-validation on the Depresjon data. For every part and every
# classifier: split into 3 stratified outer folds, tune hyperparameters on
# the training fold with a leave-one-out grid search, score the refitted
# best estimator on the held-out fold, then save per-fold and aggregated
# scores under `results_directory`.
results_directory = os.path.join(".", MAIN_RESULTS_DIR, "depresjon")
# Create the output directory up front so the to_csv calls cannot fail on a
# missing path after a long grid search has already finished.
os.makedirs(results_directory, exist_ok=True)

# The splitter is seeded, so a single instance yields the same outer folds
# for every part and classifier.
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    X = datasets[part]

    for clf_type in ["GBM", "LR", "SVM", "RF"]:
        print(f"  {clf_type}")

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Preprocessing helpers receive the train and test parts of the
            # current fold together and return both transformed.
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            # The unused lgb.Dataset objects that used to be built here for
            # "GBM" were dropped: the scikit-learn wrapper is fitted on the
            # arrays directly, like every other classifier.
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut(),
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        # One row per outer fold.
        pd.DataFrame.from_records(test_scores).to_csv(
            os.path.join(results_directory, f"test_scores_{day_night_format}_{part}_{clf_type}"),
            index=False,
        )

        # final_scores maps each metric name to a (mean, stddev) pair.
        final_scores = calculate_metrics_statistics(test_scores)
        df = pd.DataFrame(
            [(key, *values) for key, values in final_scores.items()],
            columns=["Index", "Mean", "Stddev"],
        ).set_index("Index")
        # The index holds the metric names, so it must be written out;
        # index=False produced a file of unlabeled Mean/Stddev rows.
        df.to_csv(
            os.path.join(results_directory, f"final_scores_{day_night_format}_{part}_{clf_type}"),
            index=True,
        )

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: night
  GBM


KeyboardInterrupt: 

## Psykose classification

In [42]:
# Point the loading cell at the Psykose feature files and label file.
y_filename = "psykose_{}_y.csv".format(day_night_format)
dataset = PSYKOSE_PREFIX

In [43]:
# Read one feature matrix per part ("full_24h", "night", "day") into a dict
# keyed by part name; the first CSV row is treated as the header.
datasets = {}

for part in ("full_24h", "night", "day"):
    filename = "_".join([dataset, day_night_format, part]) + ".csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    datasets[part] = pd.read_csv(filepath, header=0).to_numpy()

# Labels are stored without a header row; flatten them to a 1-D int array.
y = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, y_filename),
    header=None,
    dtype=int,
).to_numpy().ravel()

In [44]:
# Nested cross-validation on the Psykose data. For every part and every
# classifier: split into 3 stratified outer folds, tune hyperparameters on
# the training fold with a leave-one-out grid search, score the refitted
# best estimator on the held-out fold, then save per-fold and aggregated
# scores under `results_directory`.
results_directory = os.path.join(".", MAIN_RESULTS_DIR, "psykose")
# Create the output directory up front so the to_csv calls cannot fail on a
# missing path after a long grid search has already finished.
os.makedirs(results_directory, exist_ok=True)

# The splitter is seeded, so a single instance yields the same outer folds
# for every part and classifier.
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for part in ["full_24h", "night", "day"]:
    print(f"PART: {part}")

    X = datasets[part]

    for clf_type in ["GBM", "LR", "SVM", "RF"]:
        print(f"  {clf_type}")

        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            # Preprocessing helpers receive the train and test parts of the
            # current fold together and return both transformed.
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)

            # The unused lgb.Dataset objects that used to be built here for
            # "GBM" were dropped: the scikit-learn wrapper is fitted on the
            # arrays directly, like every other classifier.
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                cv=LeaveOneOut(),
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_

            metrics = calculate_metrics(clf, X_test, y_test)
            print(metrics)
            test_scores.append(metrics)

        # One row per outer fold.
        pd.DataFrame.from_records(test_scores).to_csv(
            os.path.join(results_directory, f"test_scores_{day_night_format}_{part}_{clf_type}"),
            index=False,
        )

        # final_scores maps each metric name to a (mean, stddev) pair.
        final_scores = calculate_metrics_statistics(test_scores)
        df = pd.DataFrame(
            [(key, *values) for key, values in final_scores.items()],
            columns=["Index", "Mean", "Stddev"],
        ).set_index("Index")
        # The index holds the metric names, so it must be written out;
        # index=False produced a file of unlabeled Mean/Stddev rows.
        df.to_csv(
            os.path.join(results_directory, f"final_scores_{day_night_format}_{part}_{clf_type}"),
            index=True,
        )

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

PART: night
  GBM


KeyboardInterrupt: 