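# Preprocess
1. Standardize the data (per column: subtract the mean, divide by the standard deviation)
2. Fill in missing values with the population mean (the per-column mean over all rows)

Note: the cells below apply the mean imputation first and then standardize.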

In [22]:
import pandas as pd
import sys
sys.path.append("../")
from utils.load_data import load_eeg_data

In [23]:
# Load the dataset through the project helper; the result is used as a pandas
# DataFrame in the cells below (df.columns, df.head(), df.to_csv).
df = load_eeg_data('../data/')

In [24]:
# Name of the label column; it is appended to the feature columns when the
# final training frame is assembled.
LABEL = "main.disorder"

In [25]:
# Collect, in column order, every column stored as int64 or float64
band_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ('int64', 'float64')
]

In [26]:
# age, education and IQ are numeric but must not stay in band_cols;
# list.remove raises ValueError if any of them is unexpectedly absent.
for demographic_col in ('age', 'education', 'IQ'):
    band_cols.remove(demographic_col)
band_cols

['AB.A.delta.a.FP1',
 'AB.A.delta.b.FP2',
 'AB.A.delta.c.F7',
 'AB.A.delta.d.F3',
 'AB.A.delta.e.Fz',
 'AB.A.delta.f.F4',
 'AB.A.delta.g.F8',
 'AB.A.delta.h.T3',
 'AB.A.delta.i.C3',
 'AB.A.delta.j.Cz',
 'AB.A.delta.k.C4',
 'AB.A.delta.l.T4',
 'AB.A.delta.m.T5',
 'AB.A.delta.n.P3',
 'AB.A.delta.o.Pz',
 'AB.A.delta.p.P4',
 'AB.A.delta.q.T6',
 'AB.A.delta.r.O1',
 'AB.A.delta.s.O2',
 'AB.B.theta.a.FP1',
 'AB.B.theta.b.FP2',
 'AB.B.theta.c.F7',
 'AB.B.theta.d.F3',
 'AB.B.theta.e.Fz',
 'AB.B.theta.f.F4',
 'AB.B.theta.g.F8',
 'AB.B.theta.h.T3',
 'AB.B.theta.i.C3',
 'AB.B.theta.j.Cz',
 'AB.B.theta.k.C4',
 'AB.B.theta.l.T4',
 'AB.B.theta.m.T5',
 'AB.B.theta.n.P3',
 'AB.B.theta.o.Pz',
 'AB.B.theta.p.P4',
 'AB.B.theta.q.T6',
 'AB.B.theta.r.O1',
 'AB.B.theta.s.O2',
 'AB.C.alpha.a.FP1',
 'AB.C.alpha.b.FP2',
 'AB.C.alpha.c.F7',
 'AB.C.alpha.d.F3',
 'AB.C.alpha.e.Fz',
 'AB.C.alpha.f.F4',
 'AB.C.alpha.g.F8',
 'AB.C.alpha.h.T3',
 'AB.C.alpha.i.C3',
 'AB.C.alpha.j.Cz',
 'AB.C.alpha.k.C4',
 'AB.C.alpha.l

In [27]:
def mean_imputation(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """ Replace missing entries in the given columns by each column's mean.

    The frame is modified in place and the same object is returned, so the
    result can be used either way.

    Args:
        df (pd.DataFrame): Frame whose columns are filled.
        cols (list): Names of the columns to fill.

    Returns:
        pd.DataFrame: The input frame, with the listed columns filled.
    """
    for name in cols:
        series = df[name]
        # The mean is taken over the column's non-missing values.
        column_mean = series.mean()
        df[name] = series.fillna(column_mean)
    return df

def standardize(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """ Standardize the given columns to zero mean and unit standard deviation.

    Each listed column is centered on its mean and divided by its sample
    standard deviation (pandas' default, ddof=1). The frame is modified in
    place and the same object is returned.

    A column whose standard deviation is zero (constant column) or undefined
    (e.g. a single row) is only centered: dividing by that value would turn
    every entry into NaN, so a scale of 1 is used instead.

    Args:
        df (pd.DataFrame): Dataframe to be standardized.
        cols (list): List of columns to be standardized.

    Returns:
        pd.DataFrame: Standardized dataframe (the same object as ``df``).
    """
    for col in cols:
        series = df[col]
        centered = series - series.mean()
        scale = series.std()
        # Guard against 0/0 -> NaN for constant or single-row columns.
        if pd.isna(scale) or scale == 0:
            scale = 1.0
        df[col] = centered / scale
    return df

In [28]:
# Fill missing band values with their column means, then scale every band
# column to zero mean / unit standard deviation.
df = standardize(mean_imputation(df, band_cols), band_cols)

In [29]:
df.head()

Unnamed: 0_level_0,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
no.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,1.401843,0.044134,0.376737,...,-1.254627,-1.960368,-1.828336,-0.834235,-2.237575,-1.571887,-1.056004,-1.437106,-1.749512,-1.451175
2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,-0.59899,-0.831898,-0.580495,...,-1.909891,-1.920907,-1.646903,-1.802377,-0.71764,-0.786717,-0.380596,-1.221707,-0.460035,-0.693514
3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,0.864991,0.520585,-0.059918,...,1.486972,0.798541,-0.972797,0.508607,0.06964,-0.473845,0.180634,-0.582988,-0.074636,-1.304606
4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,0.116406,0.054719,-0.03851,...,-1.002612,0.448936,-1.117574,-0.75457,0.836607,-1.17539,-0.970917,-0.191512,0.755712,-0.811951
5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,1.55936,1.016306,0.411452,...,-0.909547,0.210674,-0.299794,-0.116533,0.612477,0.725193,0.531805,1.043846,1.177668,0.687519


In [30]:
# NOTE(review): mean imputation already ran on band_cols before standardization
# in an earlier cell, so this second pass appears to be redundant — confirm
# whether it is still needed.
df = mean_imputation(df, band_cols)

In [31]:
# Keep only the band feature columns followed by the label column
df_final = df[[*band_cols, LABEL]]

In [32]:
# Save the feature + label frame as CSV; index=False leaves the row index
# out of the written file.
df_final.to_csv('../data/band_training.csv', index=False)