%% [markdown]<br>
# Comprehensive Validation of Leak-Free TimeSeriesSplitter<br>
<br>
This notebook validates that our new TimeSeriesSplitter creates proper train/val/test splits <br>
with appropriate data flow allowances using the visualization tools from analysis.

In [None]:
# %%
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
from rastermap import Rastermap


parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils.data_splitter import TimeSeriesSplitter
from utils.load_meso_session import MesoscopeSession
from Activity_Data_Loader import Dataset_Activity


%%<br>
Load preprocessed session data

In [4]:
print("=== LOADING SESSION DATA ===")
session_path = "../DATA/session_61f260e7-b5d3-4865-a577-bcfc53fda8a8.h5"

=== LOADING SESSION DATA ===


Load via MesoscopeSession for trial data access

In [5]:
preprocessed_session = MesoscopeSession.from_preprocessed(session_path)
activity, timestamps = preprocessed_session.get_preprocessed_data()

Preprocessed session loaded from ../DATA/session_61f260e7-b5d3-4865-a577-bcfc53fda8a8.h5
Shape: (19081, 6903) (time_points x neurons)
EID: 61f260e7-b5d3-4865-a577-bcfc53fda8a8, Subject: SP066


Load covariate matrix directly from HDF5

In [6]:
with h5py.File(session_path, 'r') as f:
    covariate_matrix = f['covariate_matrix'][:]
    feature_names = [name.decode('utf-8') for name in f['covariate_metadata']['feature_names'][:]]

In [7]:
print(f"Activity shape: {activity.shape}")
print(f"Covariate shape: {covariate_matrix.shape}")  
print(f"Covariate features: {feature_names}")

Activity shape: (19081, 6903)
Covariate shape: (19081, 11)
Covariate features: ['wheel_velocity', 'stimulus_catch_trial', 'stimulus_left_100pct', 'stimulus_left_25pct', 'stimulus_left_12.5pct', 'stimulus_left_6.25pct', 'stimulus_right_100pct', 'stimulus_right_25pct', 'stimulus_right_12.5pct', 'stimulus_right_6.25pct', 'trial_phase']


In [8]:
# Create stimulus-based splits and TimeSeriesSplitter
print("\n=== CREATING STIMULUS-BASED SPLITS ===")

split_map = TimeSeriesSplitter.create_stimulus_based_splits(
    covariate_matrix=covariate_matrix,
    train_pct=0.7,
    val_pct=0.2,
    held_out_stimulus_types=[]  # No held-out types
)

# Test with realistic DLinear parameters
seq_len, pred_len, label_len = 48, 16, 4

splitter = TimeSeriesSplitter(
    split_map=split_map,
    seq_len=seq_len,
    pred_len=pred_len,
    label_len=label_len
)

summary = splitter.get_split_summary()
print(f"\nSplit Summary:")
for key, value in summary.items():
    if 'pct' in key:
        print(f"  {key}: {value:.1f}%")
    else:
        print(f"  {key}: {value}")


=== CREATING STIMULUS-BASED SPLITS ===
Stimulus type 0: 62 blocks -> 43 train, 12 val, 7 test
Stimulus type 1: 31 blocks -> 21 train, 6 val, 4 test
Stimulus type 2: 30 blocks -> 21 train, 6 val, 3 test
Stimulus type 3: 19 blocks -> 13 train, 3 val, 3 test
Stimulus type 4: 35 blocks -> 24 train, 7 val, 4 test
Stimulus type 5: 34 blocks -> 23 train, 6 val, 5 test
Stimulus type 6: 26 blocks -> 18 train, 5 val, 3 test
Stimulus type 7: 40 blocks -> 28 train, 8 val, 4 test
Stimulus type 8: 36 blocks -> 25 train, 7 val, 4 test
Generating leak-aware sample indices...
Found 13256 valid training samples.
Found 2944 valid validation samples.
Found 2818 valid test samples.

Split Summary:
  train_samples: 13256
  val_samples: 2944
  test_samples: 2818
  total_samples: 19018
  train_pct: 69.7%
  val_pct: 15.5%
  test_pct: 14.8%


In [18]:
train_indices = splitter.get_indices('train')
len(train_indices)

13256

In [22]:
stacked_indices = np.vstack((train_indices, val_indices, test_indices))


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 13256 and the array at index 1 has size 2944

In [None]:
print("\n=== VISUALIZING SPLIT MAP ===")

# Define time window for visualization (in sample indices)
xmin_idx = 0
xmax_idx = 1000

model = Rastermap(n_clusters=100, # number of clusters to compute
                  n_PCs=200, # number of PCs
                  locality=0.5, # locality in sorting is low here to get more global sorting (this is a value from 0-1)
                  time_lag_window=5, # use future timepoints to compute correlation
                  grid_upsample=10, # default value, 10 is good for large recordings
                  ).fit(activity.T)

y = model.embedding 
isort = model.isort

X_embedding = model.X_embedding
fig = plt.figure(figsize=(12,5), dpi=200)
ax = fig.add_subplot(111)
ax.imshow(X_embedding, vmin=0, vmax=0.8, cmap="gray_r", aspect="auto")

def create_split_overlay(split_maps, xmin_idx, xmax_idx):
    """Create a colored overlay for stimulus types"""
    # Get stimulus data for the time window
    stim_window = stimulus_onehot[xmin_idx:xmax_idx]
    
    # Define colors for each stimulus type
    # 0: Catch trials (purple)
    # 1-4: Left stimuli (different shades of red)
    # 5-8: Right stimuli (different shades of blue)
    colors = [
        [0.5, 0, 0.5, 0.5],  # 0: Purple (catch trials - no stimulus)
        [1, 0, 0, 0.5],      # 1: Left 100% - dark red
        [1, 0.3, 0.3, 0.5],  # 2: Left 25% - medium red
    ]
    
    # Convert one-hot to stimulus type indices
    stim_types = np.argmax(stim_window, axis=1)
    
    # Create RGB overlay
    nn = X_embedding.shape[0]  # Number of neurons
    overlay = np.zeros((nn, len(stim_types), 4))  # RGBA
    
    for t, stim_type in enumerate(stim_types):
        if np.any(stim_window[t]):  # Only color if there's a stimulus
            overlay[:, t] = colors[stim_type]
    
    return overlay

# Create figure for stimulus-colored rastermap with wheel velocity
fig = plt.figure(figsize=(15, 8), dpi=150)
grid = plt.GridSpec(12, 20, figure=fig, wspace=0.1, hspace=0.4)


# Plot rastermap (keep original scaling)
ax_raster = plt.subplot(grid[2:9, :-1])
ax_raster.imshow(X_embedding[:, xmin_idx:xmax_idx], cmap="gray_r", vmin=0, vmax=0.8, aspect="auto")

# Create and overlay stimulus colors
stim_overlay = create_split_overlay(split_indices, xmin_idx, xmax_idx)
ax_raster.imshow(stim_overlay, aspect="auto")

ax_raster.set_ylabel("Neurons (sorted)")
ax_raster.set_title("Rastermap with Stimulus Type Overlay")

ax_cbar = plt.subplot(grid[2:9, -1])
activity_gradient = np.linspace(0.8, 0, X_embedding.shape[0])[:, np.newaxis]  # High at top
ax_cbar.imshow(activity_gradient, cmap="gray_r", aspect="auto", vmin=0, vmax=0.8)
ax_cbar.yaxis.set_label_position("right")
ax_cbar.set_ylabel("Activity", rotation=270, labelpad=10)
ax_cbar.set_yticks([0, X_embedding.shape[0]//2, X_embedding.shape[0]-1])
ax_cbar.set_yticklabels(['0.80', '0.40', '0.00'])
ax_cbar.set_xticks([])
ax_cbar.yaxis.tick_right()

# Plot wheel velocity below the rastermap (bigger vertical space)
# ax_wheel = plt.subplot(grid[10:12, :-1])
# plot_wheel_velocity(ax_wheel, preprocessed_session.aligned_wheel_velocity, timestamps, xmin_idx, xmax_idx)

# Create legend for stimulus types
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=(0.5, 0, 0.5, 0.5), label='Catch Trial'),
    Patch(facecolor=(1, 0, 0, 0.5), label='Left 100%'),
    Patch(facecolor=(1, 0.3, 0.3, 0.5), label='Left 25%'),
]
ax_raster.legend(handles=legend_elements, loc='upper right', fontsize=8, ncol=2)

plt.show()



=== VISUALIZING SPLIT MAP ===


NameError: name 'Rastermap' is not defined

In [10]:
# Test with your session file
dataset = Dataset_Activity(
    root_path="../DATA",
    data_path="session_61f260e7-b5d3-4865-a577-bcfc53fda8a8.h5",
    flag='train'
)

print(f"\nDataset length: {len(dataset)}")
print(f"Feature names: {dataset.feature_names}")

i=504
seq_x, seq_y, seq_x_mark, seq_y_mark = dataset[i]

# print(f"\nSample {i}:")
# print(f"  seq_x (input neural) shape: {seq_x.shape}")
# print(f"  seq_y (target neural) shape: {seq_y.shape}")  
# print(f"  seq_x_mark (input covariates) shape: {seq_x_mark.shape}")
# print(f"  seq_y_mark (target covariates) shape: {seq_y_mark.shape}")
# #TODO: assert that shapes are correct.
# print(f"  Input wheel velocity range: {seq_x_mark[:, 0].min():.3f} to {seq_x_mark[:, 0].max():.3f}")
# print(f"  Input stimulus activity: {np.sum(seq_x_mark[:, 1:10].sum(axis=1) > 0)} samples with stimulus")
# print(f"  Target wheel velocity range: {seq_y_mark[:, 0].min():.3f} to {seq_y_mark[:, 0].max():.3f}")
# print(f"  Target stimulus activity: {np.sum(seq_y_mark[:, 1:10].sum(axis=1) > 0)} samples with stimulus")

# print("\n✓ Dataset_Activity successfully loads and provides covariate data!")

Loaded preprocessed data: (19081, 6903)
Loaded covariate matrix: (19081, 11)
Original neurons: 7673, Processed neurons: 6903
Data range: 0.000 to 1.000
Covariate features: ['wheel_velocity', 'stimulus_catch_trial', 'stimulus_left_100pct', 'stimulus_left_25pct', 'stimulus_left_12.5pct', 'stimulus_left_6.25pct', 'stimulus_right_100pct', 'stimulus_right_25pct', 'stimulus_right_12.5pct', 'stimulus_right_6.25pct', 'trial_phase']

Creating stimulus-based splits...
Stimulus type 0: 62 blocks -> 43 train, 6 val, 13 test
Stimulus type 1: 31 blocks -> 21 train, 3 val, 7 test
Stimulus type 2: 30 blocks -> 21 train, 3 val, 6 test
Stimulus type 3: 19 blocks -> 13 train, 1 val, 5 test
Stimulus type 4: 35 blocks -> 24 train, 3 val, 8 test
Stimulus type 5: 34 blocks -> 23 train, 3 val, 8 test
Stimulus type 6: 26 blocks -> 18 train, 2 val, 6 test
Stimulus type 7: 40 blocks -> 28 train, 4 val, 8 test
Stimulus type 8: 36 blocks -> 25 train, 3 val, 8 test
Generating leak-aware sample indices...
Found 1200