# ECoG Foundation Model Training
This is a minimal notebook capable of running model training on a free-tier Colab notebook. Feel free to change it as you see fit for your experiments.

In [None]:
# Clone repository.
!git clone https://github.com/leoniekerken/ECoG-foundation-model.git

Now, go into the repo you just downloaded and change the Hugging Face user access token in the Makefile to your personal access token. If you don't want to do this every time, you could also upload the code to your personal Drive and change the `path_to_github_repo` variable below, although then you risk your code being out of date.

In [None]:
# Download data: run the repo's `download-data` Makefile target from inside the
# cloned repository directory. The `cd` only applies to this one shell line.
!cd ECoG-foundation-model && make download-data

In [None]:
# Required pip installs.
!pip install accelerate
!pip install einops
!pip install mne
!pip install mne-bids
!pip install pyEDFlib

In [None]:
# Location of the cloned repository, relative to this notebook's working
# directory; it must be reachable from the notebook. The default matches the
# directory created by the clone cell above, so it works as-is if you simply
# ran that cell.
path_to_github_repo = "ECoG-foundation-model/"

In [None]:
# Make the project's code importable. The search path has to be extended
# before the `config` / `ecog_setup` / `loader` / `train` imports below run.
# NOTE(review): those four modules are presumably located in <repo>/ECoG_MAE — confirm.
import sys
import os
sys.path.append(os.path.join(path_to_github_repo, 'ECoG_MAE'))

# Other imports
# NOTE(review): `dataclass` is not used by any cell visible in this notebook.
from dataclasses import dataclass

# Config classes used to describe the experiment, plus the setup, data-loading
# and training entry points called in the cells that follow.
from config import VideoMAEExperimentConfig, VideoMAETaskConfig, ViTConfig, TrainerConfig, ECoGDataConfig
from ecog_setup import system_setup, model_setup
from loader import dl_setup
from train import train_model

In [None]:
# Experiment configuration. The possible values and docstrings for every field
# are on the corresponding config class definitions.
experiment_config = VideoMAEExperimentConfig(
    video_mae_task_config=VideoMAETaskConfig(
        vit_config=ViTConfig(
            dim=80,
            mlp_dim=80,
            patch_size=1,
            patch_dims=[1, 1, 1],
            frame_patch_size=4,
            use_cls_token=False,
        ),
        tube_mask_ratio=0.5,
        decoder_mask_ratio=0.0,
        use_contrastive_loss=False,
        running_cell_masking=False,
    ),
    trainer_config=TrainerConfig(
        # NOTE(review): a literal learning rate of 0.0 would leave the weights
        # unchanged; presumably the trainer treats 0.0 specially — confirm
        # against the TrainerConfig docstring.
        learning_rate=0.0,
        num_epochs=10,
        loss='patch',
    ),
    ecog_data_config=ECoGDataConfig(
        norm=None,
        # The exact memory limits are unknown: a batch size of 32 has worked
        # on the free tier T4 GPU, while a batch size of 64 leads to crashes.
        batch_size=8,
        data_size=1.0,
        env=False,
        # More bands can be trained on, e.g.
        #   bands=[[4, 8], [8, 13], [13, 30], [30, 55], [70, 200]],
        # but the encoding data currently only includes high gamma.
        bands=[[70, 200]],
        new_fs=20,
        dataset_path=os.path.join(path_to_github_repo, 'dataset'),
        train_data_proportion=0.9,
        sample_length=2,
        shuffle=True,
        test_loader=False,
    ),
    job_name='test_run',
)

In [None]:
# Set up the runtime. The four returned values are reused further down:
# `device` is passed to model_setup, and all four are passed to train_model.
accelerator, device, data_type, local_rank = system_setup()

In [None]:
# Create the train and test dataloaders from the experiment config. The third
# return value, the number of training samples, is handed to model_setup later.
train_dl, test_dl, num_train_samples = dl_setup(experiment_config)

# Optional: for a more minimal training run, uncomment the code below to limit
# the number of samples accessible by each dataset. With very few training
# batches this is currently inefficient, because summaries are produced
# frequently and each one takes more time than an individual training step.
# train_num_batches = 1
# test_num_batches = 1

# for dataset in train_dl.dataset.datasets:
#   dataset.max_samples = experiment_config.ecog_data_config.batch_size * train_num_batches
# # Can reuse same dataloader for test
# test_dl = train_dl
# # Or just limit test dataloader
# # for dataset in test_dl.dataset.datasets:
# #   dataset.max_samples = experiment_config.ecog_data_config.batch_size * test_num_batches

# num_train_samples = experiment_config.ecog_data_config.batch_size * train_num_batches

In [None]:
# The data is arranged in shape b*c*t*d*h*w, where
# b = batch size,
# c = freq bands,
# t = number of datapoints within a sample (ecog_data_config.new_fs samples per second)
# d = depth (currently 1)
# h = height of grid (currently 8)
# w = width of grid (currently 8)

# Fetch a single batch through the public iterator protocol (rather than the
# private DataLoader._get_iterator()) and show its shape.
print(next(iter(train_dl)).shape)

In [None]:
# Build the model together with its optimizer and learning-rate scheduler.
# `num_patches` is returned as well because train_model takes it as an argument.
(model, optimizer, lr_scheduler, num_patches) = model_setup(
    experiment_config,
    device,
    num_train_samples,
)

In [None]:
# Run training. `model` is rebound to whatever train_model returns, so
# re-running this cell passes the previously returned model back in.
model = train_model(
    experiment_config,
    device,
    model,
    train_dl,
    test_dl,
    num_patches,
    optimizer,
    lr_scheduler,
    accelerator,
    data_type,
    local_rank,
)

You can now view the results of the training in `results/`.

## Encoding

In [None]:
from downstream_tasks.encoding.config import EncodingExperimentConfig, EncodingTaskConfig, EncodingDataConfig
from downstream_tasks.encoding.utils import run_encoding_task

In [None]:
# Configuration for the downstream encoding task: where its input data lives
# and how the task itself is run.
encoding_experiment_config = EncodingExperimentConfig(
    encoding_data_config=EncodingDataConfig(
        conversation_data_df_path=os.path.join(
            path_to_github_repo, "word-embeddings/gpt2-layer-8-emb.pkl"
        ),
        encoding_neural_data_folder=os.path.join(
            path_to_github_repo, "preprocessed-highgamma"
        ),
        electrode_glob_path="NY*_*_Part*_conversation*_electrode_preprocess_file_{elec_id}.mat",
        lag=0,
    ),
    encoding_task_config=EncodingTaskConfig(
        model_path="",  # Unused here.
        embedding_device="cuda",
        embedding_batch_size=8,
        num_folds=2,
    ),
)

In [None]:
# Run the encoding task on the model from the training cell, reusing the ECoG
# data settings of the training experiment.
pearson_correlations, mspe = run_encoding_task(
    encoding_experiment_config,
    experiment_config.ecog_data_config,
    model,
)

In [None]:
# Display the first value returned by run_encoding_task.
pearson_correlations

In [None]:
# Display the second value returned by run_encoding_task.
mspe