## Importing Libraries

In [None]:
import os
import gc

import numpy as np
import pandas as pd


from statsmodels.stats.power import FTestAnovaPower
from statsmodels.stats.proportion import proportion_effectsize



## References
- [MindBigData Leaderboard](https://huggingface.co/spaces/DavidVivancos/MindBigData-Leaderboard)
    - [MindBigData 2022: A Large Dataset of Brain Signals ](https://arxiv.org/pdf/2212.14746)
        - [MindBigData](https://mindbigdata.com/opendb/)
            - [MindBigData-MW-v1.0.zip](https://mindbigdata.com/opendb/MindBigData-MW-v1.0.zip)
            - [MindBigData-EP-v1.0.zip](https://mindbigdata.com/opendb/MindBigData-EP-v1.0.zip)
            - [MindBigData-MU-v1.0.zip](https://mindbigdata.com/opendb/MindBigData-MU-v1.0.zip)
            - [MindBigData-IN-v1.06.zip](https://mindbigdata.com/opendb/MindBigData-IN-v1.06.zip)


## Refresh Memory

In [None]:
# Trigger a full garbage-collection pass. As the last expression of the cell,
# its return value (the number of unreachable objects found) is echoed as output.
gc.collect()

## Data Loading

In [None]:
# Root folder that holds the project's data (machine-specific absolute path).
data_dir = os.path.join('/mnt', 'd', 'work', 'Walsh', 'Capstone', 'Published', 'Data')
# Folder that receives the Parquet versions of the raw files.
prk_folder = os.path.join(data_dir, 'parquets')

# exist_ok=True creates the folder (and any missing parents) and is a no-op when
# it already exists, which avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(prk_folder, exist_ok=True)

## Data directories

In [None]:
# Folder with the raw MindBigData text files.
mbd_data_dir = os.path.join(data_dir, 'MindBigData')

# Raw input files, one per dataset (MW, MU, IN, EP).
mw_file, mu_file, in_file, ep_file = (
    os.path.join(mbd_data_dir, raw_name)
    for raw_name in ('MW.txt', 'MU.txt', 'IN.txt', 'EP.txt')
)

# Parquet output files, written into the parquet folder under the same dataset names.
mw_pk_file, mu_pk_file, in_pk_file, ep_pk_file = (
    os.path.join(prk_folder, parquet_name)
    for parquet_name in ('MW.parquet', 'MU.parquet', 'IN.parquet', 'EP.parquet')
)

# Column names applied to the header-less raw files, in file order.
col_headers = [
    'id',
    'event',
    'device',
    'channel',
    'code',
    'size',
    'data',
]

## Initializing dataframe variables

In [None]:
# Placeholders for the four dataset frames; nothing is loaded at this point.
mw_df = mu_df = in_df = ep_df = None

# Sampling rate per dataset key -- presumably in Hz; confirm against the
# MindBigData documentation.
SAMPLING_RATES = dict(MW=512, EP=128, MU=220, IN=128)


In [None]:
# Convert the raw MW text file to Parquet.
print("Processing MW.txt...")
# The file has no header row, so column names are supplied explicitly.
# The separator is the single tab character: a one-character separator lets
# pandas use its fast C parser, whereas the two-character string '\\t' is
# treated as a regex and forces a fallback to the slower Python engine
# (with a ParserWarning).
mw_df = pd.read_csv(mw_file, header=None, names=col_headers, sep='\t')
mw_df.to_parquet(mw_pk_file, index=False)
print(f"Saved MW.parquet with {len(mw_df)} rows.")
# Drop the frame and collect immediately so its memory can be reclaimed.
del mw_df
gc.collect()

In [None]:
# Convert the raw MU text file to Parquet.
print("Processing MU.txt...")
# The file has no header row, so column names are supplied explicitly.
# The separator is the single tab character: a one-character separator lets
# pandas use its fast C parser, whereas the two-character string '\\t' is
# treated as a regex and forces a fallback to the slower Python engine
# (with a ParserWarning).
mu_df = pd.read_csv(mu_file, header=None, names=col_headers, sep='\t')
mu_df.to_parquet(mu_pk_file, index=False)
print(f"Saved MU.parquet with {len(mu_df)} rows.")
# Drop the frame and collect immediately so its memory can be reclaimed.
del mu_df
gc.collect()

In [None]:
# Convert the raw IN text file to Parquet.
print("Processing IN.txt...")
# The file has no header row, so column names are supplied explicitly.
# The separator is the single tab character: a one-character separator lets
# pandas use its fast C parser, whereas the two-character string '\\t' is
# treated as a regex and forces a fallback to the slower Python engine
# (with a ParserWarning).
in_df = pd.read_csv(in_file, header=None, names=col_headers, sep='\t')
in_df.to_parquet(in_pk_file, index=False)
print(f"Saved IN.parquet with {len(in_df)} rows.")
# Drop the frame and collect immediately so its memory can be reclaimed.
del in_df
gc.collect()

In [None]:
# Convert the raw EP text file to Parquet.
print("Processing EP.txt...")
# The file has no header row, so column names are supplied explicitly.
# The separator is the single tab character: a one-character separator lets
# pandas use its fast C parser, whereas the two-character string '\\t' is
# treated as a regex and forces a fallback to the slower Python engine
# (with a ParserWarning).
ep_df = pd.read_csv(ep_file, header=None, names=col_headers, sep='\t')
ep_df.to_parquet(ep_pk_file, index=False)
print(f"Saved EP.parquet with {len(ep_df)} rows.")
# Drop the frame and collect immediately so its memory can be reclaimed.
del ep_df
gc.collect()

In [None]:
# Status banner for the end of the ingestion stage: both lines are emitted by a
# single call, joined with a newline.
print(
    "\n### Data ingestion is complete. ###",
    "All raw data has been converted to Parquet format in the 'Data/parquets' directory.",
    sep="\n",
)

### With the steps above, we have ingested the data from the external sources and converted it to a common format (Parquet).

## Minimum Sample Size Required

In [None]:

def calculate_total_sample_size_anova_power(
    num_classes: int,
    effect_size_f: float = 0.25,  # Cohen's f for ANOVA
    alpha: float = 0.05,
    power: float = 0.80
) -> int:
    """
    Estimate the total sample size for a balanced one-way ANOVA across classes.

    Solves the F-test power equation for the number of observations needed to
    detect a given effect size (Cohen's f) with the requested significance
    level and power, then rounds up to a whole number of samples per class.
    A short summary of the result is printed.

    Args:
        num_classes (int): The number of distinct classes (groups); must be > 1.
        effect_size_f (float): Cohen's f effect size; must be positive.
                               - 0.1: small effect
                               - 0.25: medium effect (default)
                               - 0.4: large effect
        alpha (float): Significance level (Type I error rate), e.g., 0.05.
                       Must lie strictly between 0 and 1.
        power (float): Desired statistical power (1 - Type II error rate),
                       e.g., 0.80. Must lie strictly between 0 and 1.

    Returns:
        int: The total estimated sample size, i.e. the per-class sample count
        (rounded up) multiplied by ``num_classes``.

    Raises:
        ValueError: If ``num_classes`` is not greater than 1, if ``alpha`` or
            ``power`` is outside the open interval (0, 1), or if
            ``effect_size_f`` is not positive.
    """
    if num_classes <= 1:
        raise ValueError("Number of classes must be greater than 1 for comparison.")
    if not (0 < alpha < 1) or not (0 < power < 1):
        raise ValueError("Alpha and power must be between 0 and 1.")
    if effect_size_f <= 0:
        raise ValueError("Effect size must be positive.")

    # Create a power analysis object for F-tests (ANOVA)
    power_calculator = FTestAnovaPower()

    # Passing nobs=None makes solve_power solve for the sample size.
    # NOTE: for FTestAnovaPower, `nobs` is the TOTAL number of observations
    # summed over all groups (the noncentrality is f**2 * nobs), not the count
    # per group, so the solved value must NOT be multiplied by the number of
    # classes again.
    nobs_total = power_calculator.solve_power(
        effect_size=effect_size_f,
        nobs=None,
        alpha=alpha,
        power=power,
        k_groups=num_classes
    )

    # Round up to whole samples per class, then rebuild the total from that so
    # the design stays balanced and the requested power is still met.
    samples_per_class = int(np.ceil(nobs_total / num_classes))
    total_samples = samples_per_class * num_classes

    print(f"ANOVA Power Analysis: To detect an effect size (Cohen's f) of {effect_size_f} ")
    print(f"  across {num_classes} classes with alpha={alpha} and power={power}:")
    print(f"  - Samples needed per class: {samples_per_class}")
    print(f"  - Total estimated sample size: {total_samples}")
    return total_samples

In [None]:
# Baseline scenario: 10 classes, medium effect (f = 0.25), alpha 0.05, power 0.80.
total_samples_anova = calculate_total_sample_size_anova_power(
    num_classes=10,
    effect_size_f=0.25,
    alpha=0.05,
    power=0.80,
)

# Alternative scenario: 10 classes, large effect (f = 0.4), alpha 0.05, power 0.90.
total_samples_anova_large_effect = calculate_total_sample_size_anova_power(
    num_classes=10,
    effect_size_f=0.4,
    alpha=0.05,
    power=0.90,
)