In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline  

In [2]:
import numpy as np
from os.path import dirname, join as pjoin
import scipy.io as sio
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split

from func2graph import data, baselines

In [4]:
# sessions are preprocessed by sliding windows, num_unqiue_cell_types and num_unique_neurons are known
session1 = torch.rand(32, 10, 100)
session2 = torch.rand(30, 13, 100)
session1_cell_type_ids = torch.rand(32, 10)
session2_cell_type_ids = torch.rand(30, 13)
session1_neuron_ids = torch.rand(32, 10)
session2_neuron_ids = torch.rand(30, 13)

all_sessions = [session1, session2]
all_sessions_cell_type_ids = [session1_cell_type_ids, session2_cell_type_ids]
all_sessions_neuron_ids = [session1_neuron_ids, session2_neuron_ids]


# all_sessions_activity_windows
# all_sessions_new_UniqueID_windows
# all_sessions_new_cell_type_windows
class Mouse_Session_Dataset(TensorDataset):
    def __init__(
        self, 
        all_sessions, 
        all_sessions_cell_type_ids, 
        all_sessions_neuron_ids,
        batch_size=3,                      # real batch size
    ):
        num_batch_per_session = [session.shape[0] // batch_size for session in all_sessions]

        self.all_batch = []
        self.all_batch_cell_type_ids = []
        self.all_batch_neuron_ids = []
        for i in range(len(num_batch_per_session)):      # for each session
            for j in range(num_batch_per_session[i]):      # for each batch
                self.all_batch.append(all_sessions[i][j*batch_size:(j+1)*batch_size])
                self.all_batch_cell_type_ids.append(all_sessions_cell_type_ids[i][j*batch_size:(j+1)*batch_size])
                self.all_batch_neuron_ids.append(all_sessions_neuron_ids[i][j*batch_size:(j+1)*batch_size])

    def __getitem__(self, index):
        return self.all_batch[index], self.all_batch_cell_type_ids[index], self.all_batch_neuron_ids[index]

    def __len__(self):
        return len(self.all_batch)



dataset = Mouse_Session_Dataset(all_sessions, all_sessions_cell_type_ids, all_sessions_neuron_ids)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)    # this is not real batch_size


for idx, (batch, batch_cell_type_ids, batch_neuron_ids) in enumerate(dataloader):
    batch = batch.squeeze(0)                 # remove the fake batch_size
    batch_cell_type_ids = batch_cell_type_ids.squeeze(0)
    batch_neuron_ids = batch_neuron_ids.squeeze(0)
    print(idx, batch.shape, batch_cell_type_ids.shape, batch_neuron_ids.shape)

    if idx == 11:
        print(batch_cell_type_ids[1])
        print(batch_neuron_ids[1])
        print(session2_cell_type_ids[4])
        print(session2_neuron_ids[4])

0 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
1 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
2 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
3 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
4 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
5 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
6 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
7 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
8 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
9 torch.Size([3, 10, 100]) torch.Size([3, 10]) torch.Size([3, 10])
10 torch.Size([3, 13, 100]) torch.Size([3, 13]) torch.Size([3, 13])
11 torch.Size([3, 13, 100]) torch.Size([3, 13]) torch.Size([3, 13])
tensor([0.0265, 0.6752, 0.0444, 0.0856, 0.0071, 0.8997, 0.9815, 0.9855, 0.1327,
        0.5707, 0.7414, 0.3893, 0.5196])
tensor([0.7752, 0.1585, 0.3136, 0.4390, 0.9119, 0.3721, 0.0682, 0.8728, 0

In [None]:
# preprocess sessions by sliding windows
# determine num_unqiue_cell_types and num_unique_neurons
# determine unique_cell_type_ids and unique_neuron_ids

# a list of sessions files
input_sessions_files = []
output_folder = '../../data/Mouse/preprocessed/'

directory = '../../data/Mouse/Bugeon/'
date_exp = 'SB025/2019-10-07/'
input_setting = 'Blank/01/'
session_normalization = 'session'

accumulate_UnqiueID = []
accumulate_cell_type = []
all_sessions_acitvity_TRAIN = []   # first 80% of the time
all_sessions_acitvity_VAL = []
num_neurons_per_session = []

activity, frame_times, UniqueID, neuron_ttypes = data.load_mouse_data_session(
    directory, date_exp, input_setting, session_normalization
)
accumulate_UnqiueID.append(UniqueID)
accumulate_cell_type.append(neuron_ttypes)
all_sessions_acitvity_TRAIN.append(activity[:, :int(activity.shape[1]*0.8)])
all_sessions_acitvity_VAL.append(activity[:, int(activity.shape[1]*0.8):])
num_neurons_per_session.append(activity.shape[0])

# print(UniqueID)
# print(neuron_ttypes)

directory = '../../data/Mouse/Bugeon/'
date_exp = 'SB025/2019-10-04/'
input_setting = 'Blank/01/'
session_normalization = 'session'

activity, frame_times, UniqueID, neuron_ttypes = data.load_mouse_data_session(
    directory, date_exp, input_setting, session_normalization
)
accumulate_UnqiueID.append(UniqueID)
accumulate_cell_type.append(neuron_ttypes)
all_sessions_acitvity_TRAIN.append(activity[:, :int(activity.shape[1]*0.8)])
all_sessions_acitvity_VAL.append(activity[:, int(activity.shape[1]*0.8):])
num_neurons_per_session.append(activity.shape[0])

directory = '../../data/Mouse/Bugeon/'
date_exp = 'SB025/2019-10-08/'
input_setting = 'Blank/01/'
session_normalization = 'session'

activity, frame_times, UniqueID, neuron_ttypes = data.load_mouse_data_session(
    directory, date_exp, input_setting, session_normalization
)
accumulate_UnqiueID.append(UniqueID)
accumulate_cell_type.append(neuron_ttypes)
all_sessions_acitvity_TRAIN.append(activity[:, :int(activity.shape[1]*0.8)])
all_sessions_acitvity_VAL.append(activity[:, int(activity.shape[1]*0.8):])
num_neurons_per_session.append(activity.shape[0])

directory = '../../data/Mouse/Bugeon/'
date_exp = 'SB025/2019-10-09/'
input_setting = 'Blank/01/'
session_normalization = 'session'

activity, frame_times, UniqueID, neuron_ttypes = data.load_mouse_data_session(
    directory, date_exp, input_setting, session_normalization
)
accumulate_UnqiueID.append(UniqueID)
accumulate_cell_type.append(neuron_ttypes)
all_sessions_acitvity_TRAIN.append(activity[:, :int(activity.shape[1]*0.8)])   ################
all_sessions_acitvity_VAL.append(activity[:, int(activity.shape[1]*0.8):])     ################
num_neurons_per_session.append(activity.shape[0])

# sort UniqueID
accumulate_UnqiueID = np.concatenate(accumulate_UnqiueID)
accumulate_cell_type = np.concatenate(accumulate_cell_type)
print(np.sort(accumulate_UnqiueID[~np.isnan(accumulate_UnqiueID)]))
# get the index of accumulated UniqueID = 111
# print(accumulate_cell_type[np.where(accumulate_UnqiueID == 304)])
# print(accumulate_UnqiueID[np.where(accumulate_cell_type == 'Vip-Ptprt-Pkp2')])

print(num_neurons_per_session)

[  6.   8.   9.  10.  11.  12.  13.  14.  15.  16.  17.  18.  19.  20.
  21.  23.  25.  28.  29.  30.  31.  32.  33.  34.  35.  37.  39.  40.
  41.  41.  42.  43.  45.  45.  46.  47.  47.  48.  48.  49.  50.  51.
  52.  53.  53.  54.  55.  56.  58.  59.  60.  61.  63.  65.  65.  66.
  67.  68.  72.  73.  74.  75.  76.  77.  79.  80.  81.  82.  83.  84.
  85.  85.  86.  87.  88.  89.  91.  91.  92.  93.  94.  96.  96.  97.
  98.  98.  99. 100. 101. 102. 103. 103. 105. 106. 107. 108. 108. 109.
 110. 111. 113. 114. 115. 116. 117. 117. 118. 119. 120. 121. 122. 124.
 125. 126. 127. 127. 128. 130. 131. 131. 132. 133. 134. 134. 136. 137.
 138. 139. 140. 141. 142. 144. 145. 146. 147. 148. 149. 149. 150. 151.
 152. 152. 154. 154. 155. 157. 158. 158. 161. 163. 164. 165. 166. 166.
 167. 169. 170. 171. 172. 173. 173. 174. 175. 176. 177. 177. 180. 181.
 181. 182. 184. 184. 185. 187. 189. 190. 191. 192. 193. 194. 197. 197.
 200. 204. 205. 206. 210. 211. 212. 213. 214. 215. 215. 217. 220. 221.
 222. 

In [None]:
def assign_unique_neuron_ids(all_sessions_original_UniqueID, num_neurons_per_session):
    """
    all_sessions_original_UniqueID: a concatenated list of the original UniqueID from all sessions

    Return:
    all_sessions_new_UniqueID: a list of sessions new UniqueID, each session is a 1D array of shape num_neurons
    """

    # first reassign ID starting from 0 to those non-NaN neurons
    # same IDs should be assigned to neurons that have the same original UniqueID
    non_nan_values = all_sessions_original_UniqueID[~np.isnan(all_sessions_original_UniqueID)]
    unique_non_nan_values = np.unique(non_nan_values)
    id_mapping = {unique_non_nan_values[i]: i for i in range(len(unique_non_nan_values))}

    new_ids = [id_mapping[non_nan_values[i]] for i in range(len(non_nan_values))]
    all_sessions_new_UniqueID = np.copy(all_sessions_original_UniqueID)
    all_sessions_new_UniqueID[~np.isnan(all_sessions_new_UniqueID)] = new_ids

    # then assign new IDs to those NaN neurons
    num_unique_non_nan = unique_non_nan_values.shape[0]     # new IDs start from num_unqiue_non_nan
    num_nan = np.sum(np.isnan(all_sessions_original_UniqueID))           # new IDs end with num_non_nan + num_nan -1

    new_ids = np.arange(num_unique_non_nan, num_unique_non_nan + num_nan)
    all_sessions_new_UniqueID[np.isnan(all_sessions_new_UniqueID)] = new_ids

    # Segment all_sessions_new_UniqueID into sessions
    all_sessions_new_UniqueID = np.split(all_sessions_new_UniqueID, np.cumsum(num_neurons_per_session)[:-1])

    num_unique_neurons = num_unique_non_nan + num_nan

    return all_sessions_new_UniqueID, num_unqiue_neurons    # shape: num_sessions x num_neurons_per_session

all_sessions_new_UniqueID, num_unqiue_neurons = assign_unique_neuron_ids(UniqueID, len(neuron_ttypes))
print(all_sessions_new_UniqueID.shape)
print(all_sessions_new_UniqueID[0][:10])
# calculate the number of unique neurons in each session
num_unique_neurons_per_session = [len(np.unique(all_sessions_new_UniqueID[i])) for i in range(len(all_sessions_new_UniqueID))]
print(num_unique_neurons_per_session)

# calulate the number of neurons in each session
num_neurons_per_session = [len(all_sessions_new_UniqueID[i]) for i in range(len(all_sessions_new_UniqueID))]
print(num_neurons_per_session)

[115. 116. 117. 118. 119. 120. 121. 122. 123. 124. 125. 126. 127. 128.
 129. 130. 131. 132. 133. 134. 135. 136. 137. 138. 139. 140. 141. 142.
 143. 144. 145. 146. 147. 148. 149. 150. 151. 152. 153.  56. 154.  58.
 155. 156. 157. 158. 159. 160. 161. 162. 163. 164. 165. 166. 167. 168.
 169. 170. 171. 172. 173. 174. 175. 176. 177. 178. 179. 180. 181. 182.
 183. 184. 185. 186. 187.  48. 188.  72.  73.  77. 189.  86. 190. 191.
 192. 193. 194. 195. 196. 197. 198. 199. 200.   1. 201. 202. 203.  11.
 204. 205.]
[ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan 188.  nan 198.
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan 169.  nan 240. 241. 260.  nan 279.  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan   1.  nan  nan  nan

In [7]:
def assign_unique_cell_type_ids(all_sessions_original_cell_type, num_neurons_per_session):
    """
    all_sessions_original_cell_type: a concatenated list of the original cell types from all sessions (raw cell types)

    Return:
    all_sessions_new_cell_type: a list of sessions new cell type, each session is a 1D array of shape num_neurons
    """
    # Get the first level of cell types
    neuron_types_result = []
    for i in range(len(all_sessions_original_cell_type)):
        # split by "-"
        neuron_types_result.append(all_sessions_original_cell_type[i].split("-")[0])
    all_sessions_original_cell_type = neuron_types_result
    print(all_sessions_original_cell_type[:100])

    unique_cell_types = list(set(all_sessions_original_cell_type))
    # Assign IDs to cell types
    cell_type2id = {unique_cell_types[i]: i for i in range(len(unique_cell_types))}

    # Get new cell type IDs
    all_sessions_new_cell_type = np.zeros(len(all_sessions_original_cell_type))
    for i in range(len(all_sessions_original_cell_type)):
        all_sessions_new_cell_type[i] = cell_type2id[all_sessions_original_cell_type[i]]

    # Segment all_sessions_new_UniqueID into sessions
    all_sessions_new_cell_type = np.split(all_sessions_new_cell_type, np.cumsum(num_neurons_per_session)[:-1])

    return all_sessions_new_cell_type, cell_type2id     # shape: num_sessions x num_neurons_per_session
    
all_sessions_new_cell_type, cell_type2id = assign_unique_cell_type_ids(neuron_ttypes, [len(neuron_ttypes)])
print(all_sessions_new_cell_type[0][:100])
print(neuron_ttypes[:100])
print(cell_type2id)

['EC', 'Pvalb', 'EC', 'EC', 'EC', 'EC', 'EC', 'EC', 'EC', 'IN', 'EC', 'EC', 'EC', 'IN', 'IN', 'IN', 'IN', 'EC', 'IN', 'Pvalb', 'Vip', 'Sncg', 'EC', 'EC', 'IN', 'EC', 'EC', 'EC', 'EC', 'IN', 'EC', 'EC', 'Pvalb', 'EC', 'EC', 'EC', 'IN', 'EC', 'EC', 'EC', 'EC', 'EC', 'EC', 'Vip', 'EC', 'EC', 'EC', 'IN', 'IN', 'EC', 'IN', 'IN', 'EC', 'EC', 'EC', 'IN', 'IN', 'EC', 'EC', 'EC', 'EC', 'EC', 'IN', 'EC', 'EC', 'EC', 'IN', 'Lamp5', 'EC', 'EC', 'IN', 'IN', 'IN', 'IN', 'EC', 'EC', 'EC', 'Lamp5', 'IN', 'EC', 'IN', 'IN', 'IN', 'EC', 'IN', 'Lamp5', 'Lamp5', 'EC', 'EC', 'EC', 'EC', 'EC', 'Pvalb', 'Lamp5', 'Lamp5', 'IN', 'EC', 'Lamp5', 'EC', 'IN']
['6' '4' '6' '6' '6' '6' '6' '6' '6' '2' '6' '6' '6' '2' '2' '2' '2' '6'
 '2' '4' '5' '7' '6' '6' '2' '6' '6' '6' '6' '2' '6' '6' '4' '6' '6' '6'
 '2' '6' '6' '6' '6' '6' '6' '5' '6' '6' '6' '2' '2' '6' '2' '2' '6' '6'
 '6' '2' '2' '6' '6' '6' '6' '6' '2' '6' '6' '6' '2' '1' '6' '6' '2' '2'
 '2' '2' '6' '6' '6' '1' '2' '6' '2' '2' '2' '6' '2' '1' '1' '6' '6' '

In [None]:
def sliding_windows(all_sessions_acitvity, all_sessions_new_UniqueID, all_sessions_new_cell_type, window_size):
    """
    (can be from TRAIN or VAL set)
    all_sessions_acitvity: a list of sessions activity, each session is a 2D array of shape num_neurons x num_frames
    all_sessions_new_UniqueID: a list of sessions new UniqueID, each session is a 1D array of shape num_neurons
    all_sessions_new_cell_type: a list of sessions new cell type, each session is a 1D array of shape num_neurons

    Return:
    - all_sessions_activity_windows:
        a list of sessions activity windows, each session is a 3D array of shape num_windows x num_neurons x window_size
    - all_sessions_new_UniqueID_windows:
        a list of sessions new UniqueID windows, each session is a 2D array of shape num_windows x num_neurons (each row should be the same)
    - all_sessions_new_cell_type_windows:
        a list of sessions new cell type windows, each session is a 2D array of shape num_windows x num_neurons (each row should be the same)
    """

    all_sessions_activity_windows = []
    all_sessions_new_UniqueID_windows = []
    all_sessions_new_cell_type_windows = []

    for i in range(len(all_sessions_acitvity)):
        num_neurons = all_sessions_acitvity[i].shape[0]
        num_frames = all_sessions_acitvity[i].shape[1]
        num_windows = num_frames - window_size + 1

        # activity
        activity_windows = np.zeros((num_windows, num_neurons, window_size))
        for j in range(num_windows):
            activity_windows[j] = all_sessions_acitvity[i][:, j:j+window_size]
        all_sessions_activity_windows.append(activity_windows)

        # UniqueID
        UniqueID_windows = np.zeros((num_windows, num_neurons))
        for j in range(num_windows):
            UniqueID_windows[j] = all_sessions_new_UniqueID[i]
        all_sessions_new_UniqueID_windows.append(UniqueID_windows)

        # cell type
        cell_type_windows = np.zeros((num_windows, num_neurons))
        for j in range(num_windows):
            cell_type_windows[j] = all_sessions_new_cell_type[i]
        all_sessions_new_cell_type_windows.append(cell_type_windows)

    return all_sessions_activity_windows, all_sessions_new_UniqueID_windows, all_sessions_new_cell_type_windows