In [1]:
import numpy as np
import os
import pandas as pd
import tarfile
from src.file_ops import npy_loader, get_probe_signals
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
import matplotlib.pyplot as plt


In [2]:
# Set the current working directory.
# os.chdir() returns None, so the resulting directory is read back with
# os.getcwd() to give `cwd` a usable value.
# NOTE(review): this absolute path is machine-specific; adjust it per checkout.
os.chdir(r"C:\Python Work Directory\NMA_Impact_Scholars_Steinmetz")
cwd = os.getcwd()

# @title Data retrieval
# Folder (relative to the working directory) holding one .tar archive per session
data_directory = r'data\spikeAndBehavioralData'

# Session identifier in the form "<mouse name>_<experiment date>"
session_label_string = "Cori_2016-12-17"

# Split into [mouse name, experiment date]
session_label = session_label_string.split("_")
print(session_label)

# test_dataset: base name (without the .tar extension) of the archive to load
test_LFP = r"Cori_2016-12-17"

['Cori', '2016-12-17']


In [3]:
# Walk the data directory and print each folder, its sub-folders and its files
search_root = os.path.join(os.getcwd(), data_directory)
walker = os.walk(search_root)
for root, dirs, files in walker:
    print(root, dirs, sep="\n")
    for file in files:
        print(file)


C:\Python Work Directory\NMA_Impact_Scholars_Steinmetz\data\spikeAndBehavioralData
[]
Cori_2016-12-14.tar
Cori_2016-12-17.tar
Cori_2016-12-18.tar
Forssmann_2017-11-01.tar
Forssmann_2017-11-02.tar
Forssmann_2017-11-04.tar
Forssmann_2017-11-05.tar
Hench_2017-06-15.tar
Hench_2017-06-16.tar
Hench_2017-06-17.tar
Hench_2017-06-18.tar
Lederberg_2017-12-05.tar
Lederberg_2017-12-06.tar
Lederberg_2017-12-07.tar
Lederberg_2017-12-08.tar
Lederberg_2017-12-09.tar
Lederberg_2017-12-10.tar
Lederberg_2017-12-11.tar
Moniz_2017-05-15.tar
Moniz_2017-05-16.tar
Moniz_2017-05-18.tar
Muller_2017-01-07.tar
Muller_2017-01-08.tar
Muller_2017-01-09.tar
Radnitz_2017-01-08.tar
Radnitz_2017-01-09.tar
Radnitz_2017-01-10.tar
Radnitz_2017-01-11.tar
Radnitz_2017-01-12.tar
Richards_2017-10-29.tar
Richards_2017-10-30.tar
Richards_2017-10-31.tar
Richards_2017-11-01.tar
Richards_2017-11-02.tar
Tatum_2017-12-06.tar
Tatum_2017-12-07.tar
Tatum_2017-12-08.tar
Tatum_2017-12-09.tar
Theiler_2017-10-11.tar


In [4]:
# def extract_spikes_data(filename):
#     with tarfile.open(filename) as tar:
#         spikes = [name for name in tar.getnames() if name.startswith('spikes')]

In [5]:
@dataclass
class Clusters:
    depths: Optional[np.ndarray] = None
    original_ids: Optional[np.ndarray] = None
    site: Optional[np.ndarray] = None
    probes: Optional[np.ndarray] = None
    template_waveform_chans: Optional[np.ndarray] = None
    template_waveforms: Optional[np.ndarray] = None
    waveform_duration: Optional[np.ndarray] = None
    phy_annotation: Optional[np.ndarray] = None


    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert the cluster data to a pandas DataFrame.
        For multi-dimensional arrays, only the first dimension is used as the index,
        and the remaining dimensions are stored as array objects in the cells.
        """
        data_dict = {}
        base_length = None

        # Process each attribute
        for attr_name, value in self.__dict__.items():
            if value is not None:
                if len(value.shape) == 1:
                    # 1D arrays can be directly added
                    data_dict[attr_name] = value
                    if base_length is None:
                        base_length = len(value)
                else:
                    # For multi-dimensional arrays, store them as objects
                    # Each row will contain a slice of the array
                    data_dict[attr_name] = [value[i] for i in range(value.shape[0])]
                    if base_length is None:
                        base_length = value.shape[0]

        # Create DataFrame
        df = pd.DataFrame(data_dict)
        return df

    @classmethod
    def from_tar(cls, tar_path: str | Path) -> 'Clusters':
        """
        Load cluster data from a tar file containing numpy arrays.

        Args:
            tar_path: Path to the tar file containing cluster data

        Returns:
            ClusterData instance with loaded arrays

        Raises:
            FileNotFoundError: If tar file doesn't exist
            ValueError: If expected cluster files are missing
        """
        # if not os.path.exists(tar_path):
        #     raise FileNotFoundError(f"Tar file not found: {tar_path}")

        data = cls()

        with tarfile.open(tar_path, 'r') as tar:
            cluster_files = [name for name in tar.getnames() if name.startswith('clusters')]

            # Mapping between file names and dataclass attributes
            file_attr_map = {
                'clusters.depths.npy': 'depths',
                'clusters.originalIDs.npy': 'original_ids',
                'clusters.peakChannel.npy': 'site',
                'clusters.probes.npy': 'probes',
                'clusters.templateWaveformChans.npy': 'template_waveform_chans',
                'clusters.templateWaveforms.npy': 'template_waveforms',
                'clusters.waveformDuration.npy': 'waveform_duration',
                'clusters._phy_annotation.npy': 'phy_annotation'
            }

            # Extract and load each file
            for file_name, attr_name in file_attr_map.items():
                if file_name not in cluster_files:
                    print(f"Warning: {file_name} not found in tar archive")
                    continue

                # try:
                #     # Extract file to memory and load with numpy
                #     member = tar.extractfile(file_name)
                #     if member is None:
                #         raise ValueError(f"Could not extract {file_name}")

                array_data = np.squeeze(npy_loader(tar,file_name))
                setattr(data, attr_name, array_data)

                # except Exception as e:
                #     print(f"Error loading {file_name}: {str(e)}")

        return data

@dataclass
class Trials:
    feedback_type: Optional[np.ndarray] = None
    feedback_times: Optional[np.ndarray] = None
    gocue_times: Optional[np.ndarray] = None
    included: Optional[np.ndarray] = None
    intervals: Optional[np.ndarray] = None
    repNum: Optional[np.ndarray] = None
    response_choice: Optional[np.ndarray] = None
    response_times: Optional[np.ndarray] = None
    contrast_left: Optional[np.ndarray] = None
    constra_right: Optional[np.ndarray] = None
    stimulus_times: Optional[np.ndarray] = None


    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert the Trials data to a pandas DataFrame.
        For multi-dimensional arrays, only the first dimension is used as the index,
        and the remaining dimensions are stored as array objects in the cells.
        """
        data_dict = {}
        base_length = None

        # Process each attribute
        for attr_name, value in self.__dict__.items():
            if value is not None:
                if len(value.shape) == 1:
                    # 1D arrays can be directly added
                    data_dict[attr_name] = value
                    if base_length is None:
                        base_length = len(value)
                else:
                    # For multi-dimensional arrays, store them as objects
                    # Each row will contain a slice of the array
                    data_dict[attr_name] = [value[i] for i in range(value.shape[0])]
                    if base_length is None:
                        base_length = value.shape[0]

        # Create DataFrame
        df = pd.DataFrame(data_dict)
        return df

    @classmethod
    def from_tar(cls, tar_path: str | Path) -> 'Trials':
        """
        Load cluster data from a tar file containing numpy arrays.

        Args:
            tar_path: Path to the tar file containing cluster data

        Returns:
            ClusterData instance with loaded arrays

        Raises:
            FileNotFoundError: If tar file doesn't exist
            ValueError: If expected cluster files are missing
        """
        # if not os.path.exists(tar_path):
        #     raise FileNotFoundError(f"Tar file not found: {tar_path}")

        data = cls()

        with tarfile.open(tar_path, 'r') as tar:
            trial_files = [name for name in tar.getnames() if name.startswith('trials')]

            # Mapping between file names and dataclass attributes
            file_attr_map = {
                'trials.feedbackType.npy': 'feedback_type',
                'trials.feedback_times.npy': 'feedback_times',
                'trials.goCue_times.npy': 'gocue_times',
                'trials.included.npy': 'included',
                'trials.intervals.npy': 'intervals',
                'trials.repNum.npy': 'repNum',
                'trials.response_choice.npy': 'response_choice',
                'trials.response_times.npy': 'response_times',
                'trials.visualStim_contrastLeft.npy': 'contrast_left',
                'trials.visualStim_contrastRight.npy': 'contrast_right',
                'trials.visualStim_times.npy': 'stimulus_times',
            }

            # Extract and load each file
            for file_name, attr_name in file_attr_map.items():
                if file_name not in trial_files:
                    print(f"Warning: {file_name} not found in tar archive")
                    continue

                # try:
                #     # Extract file to memory and load with numpy
                #     member = tar.extractfile(file_name)
                #     if member is None:
                #         raise ValueError(f"Could not extract {file_name}")

                array_data = np.squeeze(npy_loader(tar,file_name))
                setattr(data, attr_name, array_data)

                # except Exception as e:
                #     print(f"Error loading {file_name}: {str(e)}")

        return data

@dataclass
class Spikes:
    amps: Optional[np.ndarray] = None
    clusters: Optional[np.ndarray] = None
    depths: Optional[np.ndarray] = None
    times: Optional[np.ndarray] = None

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert the Trials data to a pandas DataFrame.
        For multi-dimensional arrays, only the first dimension is used as the index,
        and the remaining dimensions are stored as array objects in the cells.
        """
        data_dict = {}
        base_length = None

        # Process each attribute
        for attr_name, value in self.__dict__.items():
            if value is not None:
                if len(value.shape) == 1:
                    # 1D arrays can be directly added
                    data_dict[attr_name] = value
                    if base_length is None:
                        base_length = len(value)
                else:
                    # For multi-dimensional arrays, store them as objects
                    # Each row will contain a slice of the array
                    data_dict[attr_name] = [value[i] for i in range(value.shape[0])]
                    if base_length is None:
                        base_length = value.shape[0]

        # Create DataFrame
        df = pd.DataFrame(data_dict)
        return df

    @classmethod
    def from_tar(cls, tar_path: str | Path) -> 'Spikes':
        """
        Load cluster data from a tar file containing numpy arrays.

        Args:
            tar_path: Path to the tar file containing cluster data

        Returns:
            ClusterData instance with loaded arrays

        Raises:
            FileNotFoundError: If tar file doesn't exist
            ValueError: If expected cluster files are missing
        """
        # if not os.path.exists(tar_path):
        #     raise FileNotFoundError(f"Tar file not found: {tar_path}")

        data = cls()

        with tarfile.open(tar_path, 'r') as tar:
            trial_files = [name for name in tar.getnames() if name.startswith('spikes')]

            # Mapping between file names and dataclass attributes
            file_attr_map = {
                'spikes.amps.npy': 'amps',
                'spikes.clusters.npy': 'clusters',
                'spikes.depths.npy': 'depths',
                'spikes.times.npy': 'times',
            }

            # Extract and load each file
            for file_name, attr_name in file_attr_map.items():
                if file_name not in trial_files:
                    print(f"Warning: {file_name} not found in tar archive")
                    continue

                # try:
                #     # Extract file to memory and load with numpy
                #     member = tar.extractfile(file_name)
                #     if member is None:
                #         raise ValueError(f"Could not extract {file_name}")

                array_data = np.squeeze(npy_loader(tar,file_name))
                setattr(data, attr_name, array_data)

                # except Exception as e:
                #     print(f"Error loading {file_name}: {str(e)}")

        return data

@dataclass
class BrainLocation:
    """Brain location columns read from channels.brainLocation.tsv, one array per column."""
    ccf_ap: np.ndarray  # AP position in CCF [µm]
    ccf_dv: np.ndarray  # DV position in CCF [µm]
    ccf_lr: np.ndarray  # LR position in CCF [µm]
    allen_ontology: np.ndarray  # Brain region acronyms

    @classmethod
    def from_tsv(cls, tsv_file) -> 'BrainLocation':
        """
        Build a BrainLocation from a tab-separated table.

        Args:
            tsv_file: Path or file-like object accepted by ``pd.read_csv``

        Returns:
            BrainLocation whose fields are the NumPy arrays of the
            identically named columns.
        """
        table = pd.read_csv(tsv_file, sep='\t')
        wanted_columns = ('ccf_ap', 'ccf_dv', 'ccf_lr', 'allen_ontology')
        # Each dataclass field is filled from the column carrying the same name
        return cls(**{column: table[column].to_numpy() for column in wanted_columns})

@dataclass
class Channels:
    """Main dataclass for channel-related data"""
    brain_location: Optional[BrainLocation] = None
    probes: Optional[np.ndarray] = None  # [integer] (nChannels)
    raw_row: Optional[np.ndarray] = None  # [integer] (nChannels)
    site: Optional[np.ndarray] = None  # [integer] (nChannels)
    site_positions: Optional[np.ndarray] = None  # [µm] (nChannels, 2)

    @classmethod
    def from_tar(cls, tar_path: str | Path) -> 'Channels':
        """
        Load channel data from a tar file containing the channel-related files.

        Args:
            tar_path: Path to the tar file containing channel data

        Returns:
            Channels instance; attributes whose file is missing or could not
            be loaded stay None.

        Raises:
            FileNotFoundError: If tar file doesn't exist

        Notes:
            Loading the NPY files is best-effort: a missing member prints a
            warning and a failure while loading one prints an error message,
            after which the remaining files are still processed.
        """
        if not os.path.exists(tar_path):
            raise FileNotFoundError(f"Tar file not found: {tar_path}")

        data = cls()

        # Mapping between NPY files and dataclass attributes
        npy_file_map = {
            'channels.probe.npy': 'probes',
            'channels.rawRow.npy': 'raw_row',
            'channels.site.npy': 'site',
            'channels.sitePositions.npy': 'site_positions'
        }

        with tarfile.open(tar_path, 'r') as tar:
            channel_files = {name for name in tar.getnames() if name.startswith('channels')}

            # Load brain location TSV file
            if 'channels.brainLocation.tsv' in channel_files:
                member = tar.extractfile('channels.brainLocation.tsv')
                if member is not None:
                    data.brain_location = BrainLocation.from_tsv(member)

            # Load NPY files
            for file_name, attr_name in npy_file_map.items():
                if file_name not in channel_files:
                    print(f"Warning: {file_name} not found in tar archive")
                    continue

                try:
                    # Confirm the member can be opened as a file before
                    # handing it to npy_loader
                    if tar.extractfile(file_name) is None:
                        raise ValueError(f"Could not extract {file_name}")

                    # np.squeeze drops every length-1 axis of the loaded array
                    setattr(data, attr_name, np.squeeze(npy_loader(tar, file_name)))

                except Exception as e:
                    # Deliberately best-effort: report and keep going
                    print(f"Error loading {file_name}: {str(e)}")

        return data

    def to_dataframe(self) -> pd.DataFrame:
        """
        Convert channel data to a pandas DataFrame.

        Brain location columns come first (when available), followed by the
        loaded channel arrays. ``site_positions`` is stored with one array
        object per row.
        """
        data_dict = {}

        # Add brain location data if available
        if self.brain_location is not None:
            data_dict.update({
                'ccf_ap': self.brain_location.ccf_ap,
                'ccf_dv': self.brain_location.ccf_dv,
                'ccf_lr': self.brain_location.ccf_lr,
                'allen_ontology': self.brain_location.allen_ontology
            })

        # Add other channel data
        if self.probes is not None:
            data_dict['probes'] = self.probes
        if self.raw_row is not None:
            data_dict['raw_row'] = self.raw_row
        if self.site is not None:
            data_dict['site'] = self.site
        if self.site_positions is not None:
            # Store 2D site positions as array objects, one per row
            data_dict['site_positions'] = list(self.site_positions)

        return pd.DataFrame(data_dict)

In [6]:
# Full path to the archive of the session selected above
alldata_tar_path = os.path.join(os.getcwd(), data_directory, test_LFP + r".tar")

with tarfile.open(alldata_tar_path, 'r') as tar:
    print(type(tar))

    # Group the archive member names by the prefix of the data object they belong to
    clusters = [member for member in tar.getnames() if member.startswith('clusters')]
    spikes = [member for member in tar.getnames() if member.startswith('spikes')]
    trials = [member for member in tar.getnames() if member.startswith('trials')]
    print(clusters, spikes, trials, sep="\n")

    wheel = [member for member in tar.getnames() if member.startswith('wheel.position')]
    print(wheel)
    wheel_pos = npy_loader(tar, wheel[0])

# Load each data object of the session into its dataclass container.
# Note: `trials` is rebound here from the list of member names to a Trials instance.
cluster_data = Clusters.from_tar(alldata_tar_path)
trials = Trials.from_tar(alldata_tar_path)
spike_data = Spikes.from_tar(alldata_tar_path)
channel_data = Channels.from_tar(alldata_tar_path)

<class 'tarfile.TarFile'>
['clusters.depths.npy', 'clusters.originalIDs.npy', 'clusters.peakChannel.npy', 'clusters.probes.npy', 'clusters.templateWaveformChans.npy', 'clusters.templateWaveforms.npy', 'clusters.waveformDuration.npy', 'clusters._phy_annotation.npy']
['spikes.amps.npy', 'spikes.clusters.npy', 'spikes.depths.npy', 'spikes.times.npy']
['trials.feedbackType.npy', 'trials.feedback_times.npy', 'trials.goCue_times.npy', 'trials.included.npy', 'trials.intervals.npy', 'trials.repNum.npy', 'trials.response_choice.npy', 'trials.response_times.npy', 'trials.visualStim_contrastLeft.npy', 'trials.visualStim_contrastRight.npy', 'trials.visualStim_times.npy']
['wheel.position.npy']


In [7]:
# Keep every cluster whose phy annotation is not 1.0.
# .copy() hands later cells an independent frame to add columns to, rather
# than a filtered selection of the to_dataframe() result (this avoids pandas'
# SettingWithCopyWarning on those column assignments).
cluster_df = cluster_data.to_dataframe().query('phy_annotation != 1.0').copy()
cluster_df.phy_annotation.value_counts()

phy_annotation
2.0    1069
3.0       1
Name: count, dtype: int64

In [8]:
# Tag every cluster row with the session's mouse name and experiment date
cluster_df['mouse_name'], cluster_df['date_exp'] = session_label
cluster_df.head(n=5)

Unnamed: 0,depths,original_ids,site,probes,template_waveform_chans,template_waveforms,waveform_duration,phy_annotation,mouse_name,date_exp
0,2094.092438,1,205.0,0.0,"[204.0, 203.0, 202.0, 201.0, 206.0, 205.0, 199...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",28.0,2.0,Cori,2016-12-17
1,3174.395215,2,310.0,0.0,"[309.0, 307.0, 311.0, 308.0, 313.0, 305.0, 304...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",17.0,2.0,Cori,2016-12-17
2,171.90866,3,18.0,0.0,"[17.0, 15.0, 13.0, 19.0, 16.0, 21.0, 12.0, 11....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",23.0,2.0,Cori,2016-12-17
3,482.968504,4,49.0,0.0,"[48.0, 46.0, 44.0, 50.0, 47.0, 43.0, 42.0, 45....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",18.0,2.0,Cori,2016-12-17
5,2178.953696,6,212.0,0.0,"[211.0, 209.0, 214.0, 210.0, 212.0, 213.0, 207...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",25.0,2.0,Cori,2016-12-17


In [9]:
# Trials table, tagged with the session's mouse name and experiment date
trial_df = trials.to_dataframe().assign(
    mouse_name=session_label[0],
    date_exp=session_label[1],
)
trial_df.head(n=5)

Unnamed: 0,feedback_type,feedback_times,gocue_times,included,intervals,repNum,response_choice,response_times,contrast_left,stimulus_times,contrast_right,mouse_name,date_exp
0,-1.0,89.889353,89.157155,True,"[86.36687946509481, 90.87699428563835]",1.0,-1.0,89.889353,1.0,88.405147,1.0,Cori,2016-12-17
1,1.0,95.213625,94.732019,True,"[91.87486989590016, 96.17468648244063]",1.0,1.0,95.172558,0.25,93.588406,0.0,Cori,2016-12-17
2,-1.0,99.188694,98.442062,True,"[97.17349547684987, 100.19191645857983]",1.0,-1.0,99.188694,0.5,97.738054,0.5,Cori,2016-12-17
3,1.0,105.490943,105.092538,True,"[101.18832980700331, 106.45731289113688]",1.0,1.0,105.455483,0.25,104.120927,0.0,Cori,2016-12-17
4,-1.0,114.204641,113.897839,True,"[107.45484467064871, 115.20800277418245]",1.0,1.0,114.204641,0.0,113.03703,0.0,Cori,2016-12-17


In [10]:
# Spikes table, tagged in place with the session's mouse name and experiment date
spike_df = spike_data.to_dataframe()
spike_df['mouse_name'], spike_df['date_exp'] = session_label
spike_df.head(n=5)

Unnamed: 0,amps,clusters,depths,times,mouse_name,date_exp
0,348.252763,73,2893.284912,0.003433,Cori,2016-12-17
1,172.982962,176,2327.768555,0.007033,Cori,2016-12-17
2,351.892773,254,2219.60083,0.007567,Cori,2016-12-17
3,495.461074,21,2159.393311,0.008167,Cori,2016-12-17
4,92.052296,69,3095.12793,0.0087,Cori,2016-12-17


In [11]:
# Channels table, tagged with the session's mouse name and experiment date
channel_df = channel_data.to_dataframe().assign(
    mouse_name=session_label[0],
    date_exp=session_label[1],
)
# Preview the channels whose region acronym is CA1
channel_df.loc[channel_df['allen_ontology'].eq('CA1')].head(n=5)

Unnamed: 0,ccf_ap,ccf_dv,ccf_lr,allen_ontology,probes,raw_row,site,site_positions,mouse_name,date_exp
150,8101.1,3187.8,2232.1,CA1,0.0,154,154,"[59.0, 1560.0]",Cori,2016-12-17
151,8119.9,3187.8,2258.0,CA1,0.0,155,155,"[27.0, 1560.0]",Cori,2016-12-17
152,8115.4,3168.7,2241.5,CA1,0.0,156,156,"[43.0, 1580.0]",Cori,2016-12-17
153,8134.2,3168.7,2267.4,CA1,0.0,157,157,"[11.0, 1580.0]",Cori,2016-12-17
154,8110.8,3149.6,2225.1,CA1,0.0,158,158,"[59.0, 1600.0]",Cori,2016-12-17


In [13]:
# Look up each cluster's brain region from the matching channel row.
#
# DataFrame.merge returns a frame with a fresh 0..n-1 index, while cluster_df
# still carries the row labels left over from filtering (0, 1, 2, 3, 5, ...).
# Assigning the merged Series directly would align on those labels, attaching
# regions to the wrong clusters and leaving NaN where a label has no match.
# Assigning the underlying array instead places the regions row for row.
merge_keys = ['mouse_name', 'date_exp', 'probes', 'site']
cluster_df['allen_ontology'] = (
    cluster_df
    # Dropping a previous result keeps the cell re-runnable: otherwise the
    # merge would produce allen_ontology_x / allen_ontology_y columns.
    .drop(columns='allen_ontology', errors='ignore')
    .merge(
        channel_df[merge_keys + ['allen_ontology']],
        on=merge_keys,
        how='left',
        # Fails loudly if a key matches several channel rows, which would
        # otherwise change the number of rows coming out of the merge.
        validate='many_to_one',
    )['allen_ontology']
    .to_numpy()
)

In [14]:
# Preview the clusters table with the region column added
cluster_df.head(n=5)

Unnamed: 0,depths,original_ids,site,probes,template_waveform_chans,template_waveforms,waveform_duration,phy_annotation,mouse_name,date_exp,allen_ontology
0,2094.092438,1,205.0,0.0,"[204.0, 203.0, 202.0, 201.0, 206.0, 205.0, 199...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",28.0,2.0,Cori,2016-12-17,CA1
1,3174.395215,2,310.0,0.0,"[309.0, 307.0, 311.0, 308.0, 313.0, 305.0, 304...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",17.0,2.0,Cori,2016-12-17,VISl
2,171.90866,3,18.0,0.0,"[17.0, 15.0, 13.0, 19.0, 16.0, 21.0, 12.0, 11....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",23.0,2.0,Cori,2016-12-17,root
3,482.968504,4,49.0,0.0,"[48.0, 46.0, 44.0, 50.0, 47.0, 43.0, 42.0, 45....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",18.0,2.0,Cori,2016-12-17,root
5,2178.953696,6,212.0,0.0,"[211.0, 209.0, 214.0, 210.0, 212.0, 213.0, 207...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",25.0,2.0,Cori,2016-12-17,CA1


In [15]:
def calculate_peak_to_trough_duration(waveforms, sampling_rate=30000):
    """
    Calculate the peak-to-trough duration of a waveform.

    Parameters:
    waveforms (array-like): Either a 1-D waveform, or an array with two or
        more dimensions, in which case column 0 (``waveforms[:, 0]``) is the
        trace that gets measured.
    sampling_rate (int): Sampling rate in Hz, default 30000 for typical ephys

    Returns:
    float: Absolute time between the maximum and the minimum sample of the
        trace, in milliseconds
    """
    samples = np.asarray(waveforms)

    # A plain 1-D waveform is measured as is; otherwise take column 0
    trace = samples if samples.ndim == 1 else samples[:, 0]

    # Find peak and trough indices
    peak_idx = np.argmax(trace)
    trough_idx = np.argmin(trace)

    # Distance in samples, regardless of which of the two comes first
    time_diff_samples = abs(peak_idx - trough_idx)

    # Convert samples -> seconds -> milliseconds
    duration_ms = (time_diff_samples / sampling_rate) * 1000

    return duration_ms

In [16]:
# Peak-to-trough duration of each cluster's template waveform, one value per row
cluster_df['peak_to_trough_duration'] = cluster_df['template_waveforms'].map(
    calculate_peak_to_trough_duration
)

In [27]:
# First rows once the clusters are ordered by probe, then by original id
cluster_df.sort_values(by=['probes', 'original_ids']).head(n=5)

Unnamed: 0,depths,original_ids,site,probes,template_waveform_chans,template_waveforms,waveform_duration,phy_annotation,mouse_name,date_exp,allen_ontology,peak_to_trough_duration
0,2094.092438,1,205.0,0.0,"[204.0, 203.0, 202.0, 201.0, 206.0, 205.0, 199...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",28.0,2.0,Cori,2016-12-17,CA1,0.833333
1,3174.395215,2,310.0,0.0,"[309.0, 307.0, 311.0, 308.0, 313.0, 305.0, 304...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",17.0,2.0,Cori,2016-12-17,VISl,0.566667
2,171.90866,3,18.0,0.0,"[17.0, 15.0, 13.0, 19.0, 16.0, 21.0, 12.0, 11....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",23.0,2.0,Cori,2016-12-17,root,0.733333
3,482.968504,4,49.0,0.0,"[48.0, 46.0, 44.0, 50.0, 47.0, 43.0, 42.0, 45....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",18.0,2.0,Cori,2016-12-17,root,0.166667
5,2178.953696,6,212.0,0.0,"[211.0, 209.0, 214.0, 210.0, 212.0, 213.0, 207...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",25.0,2.0,Cori,2016-12-17,CA1,0.766667


In [18]:
# cluster_df.sort_values('original_ids').query('probes == 0.0')[['original_ids','probes','site','allen_ontology']].head(100)

In [19]:
# channel_df.query('probes == 0.0').head(200)

In [20]:
# channel_df.query('probes== 1.0')[['probes','site','allen_ontology']].head()

In [21]:
# channel_df.query('probes==1.0').site.values

In [22]:
# cluster_df.query('probes== 1.0').sort_values('site').site.values

In [23]:
# cluster_df.query('probes== 0.0').sort_values('site').site.values

In [24]:
# channel_df.query('probes== 0.0').sort_values('site').site.values

In [25]:
# channel_df.query('probes== 1.0').sort_values('site').site.values

In [26]:
# channel_df.query('probes== 1.0').sort_values('raw_row').site.values