In [44]:

import pickle 
import numpy as np 
import pandas 


## Preprocessing 4 datasets 

* Trial length is 3 seconds for V1/HPC and 4 seconds for the other three datasets
* Trial lengths vary across datasets -- what should be used as the observation window?


Files required 
* ../../datasets/V1HPC_spks_CQuinn.pkl
* ../../datasets/V1LGN_spks_CQuinn.pkl
* ../../datasets/V1LP_spks_CQuinn.pkl
* ../../datasets/V1RSC_spks_CQuinn.pkl

In [13]:

# Base names (without the .pkl extension) of the four pickled spike datasets.
datasets = [
    'V1HPC_spks_CQuinn',
    'V1LGN_spks_CQuinn',
    'V1LP_spks_CQuinn',
    'V1RSC_spks_CQuinn',
]
# Duration of trials in seconds; indexed with the same position as `datasets`.
times_per_trials = [3, 4, 4, 4]




def load_dataset(dataset_name):
    """Print the dataset name, then unpickle and return ``../../datasets/<dataset_name>.pkl``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    this on files from a trusted source.
    """
    print(f'{dataset_name}')
    pickle_path = f'../../datasets/{dataset_name}.pkl'
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

## dataset 1  - V1/HPC 

In [14]:
# Select the first entry of `datasets` (V1HPC) and load its pickle.
dataset_i = 0
data = load_dataset(datasets[dataset_i])

        
# modify id for dataset V1HPC_spks_CQuinn
def modify_id(x):
    """Rebuild a '<mouse>_<unit>_<region>_<stim>' string as '<MOUSE>_NA_<stim>_<region>_<unit>'.

    The mouse field is upper-cased and the stim field (taken from the last
    '_'-separated piece) is lower-cased. 'NA' stands for the hemisphere
    (left/right), which does not exist in this dataset.
    """
    fields = x.split('_')
    mouse, unit, region, stim = fields[0], fields[1], fields[2], fields[-1]
    return '_'.join((mouse.upper(), 'NA', stim.lower(), region, unit))


# Expose the 'trial_spikes' column under the name 'times' (the original column is kept).
data['times'] = data['trial_spikes']
# Print the largest 'trial_spikes' value next to the nominal trial duration for comparison.
print(data['trial_spikes'].max(), times_per_trials[dataset_i], " max times per trial ") 
data

V1HPC_spks_CQuinn
2.9999666666666673 3  max times per trial 


Unnamed: 0,trial,trial_spikes,spikes,cuid,stim,depth,region,et,cc,cluster_id,r_group_type,times
0.0,0.0,0.014200,0.014200,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.014200
0.0,0.0,0.882167,0.882167,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.882167
0.0,0.0,0.923733,0.923733,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.923733
0.0,0.0,1.652933,1.652933,et323_292,pre,1060,hippo,et323,cc033663,292,excited,1.652933
1.0,1.0,0.283867,3.283867,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.283867
...,...,...,...,...,...,...,...,...,...,...,...,...
48.0,48.0,1.714300,145.714300,HP11_443,novel,2000,v1,HP11,CC082263,443,other,1.714300
48.0,48.0,1.886167,145.886167,HP11_443,novel,2000,v1,HP11,CC082263,443,other,1.886167
48.0,48.0,2.250533,146.250533,HP11_443,novel,2000,v1,HP11,CC082263,443,other,2.250533
48.0,48.0,2.517000,146.517000,HP11_443,novel,2000,v1,HP11,CC082263,443,other,2.517000


In [16]:
# Join cuid, region and stim with '_' per row, then let modify_id reorder the
# pieces into the final id (mouse id, unit_id, unit group, pre/post/novel).
data['id'] = (data['cuid'] + '_' + data['region'] + '_' + data['stim']).apply(modify_id)

data['stimn'] = 0.0  # stimn=0 is the only stimulus considered
data

Unnamed: 0,trial,trial_spikes,spikes,cuid,stim,depth,region,et,cc,cluster_id,r_group_type,times,id,stimn
0.0,0.0,0.014200,0.014200,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.014200,ET323_NA_pre_hippo_292,0.0
0.0,0.0,0.882167,0.882167,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.882167,ET323_NA_pre_hippo_292,0.0
0.0,0.0,0.923733,0.923733,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.923733,ET323_NA_pre_hippo_292,0.0
0.0,0.0,1.652933,1.652933,et323_292,pre,1060,hippo,et323,cc033663,292,excited,1.652933,ET323_NA_pre_hippo_292,0.0
1.0,1.0,0.283867,3.283867,et323_292,pre,1060,hippo,et323,cc033663,292,excited,0.283867,ET323_NA_pre_hippo_292,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48.0,48.0,1.714300,145.714300,HP11_443,novel,2000,v1,HP11,CC082263,443,other,1.714300,HP11_NA_novel_v1_443,0.0
48.0,48.0,1.886167,145.886167,HP11_443,novel,2000,v1,HP11,CC082263,443,other,1.886167,HP11_NA_novel_v1_443,0.0
48.0,48.0,2.250533,146.250533,HP11_443,novel,2000,v1,HP11,CC082263,443,other,2.250533,HP11_NA_novel_v1_443,0.0
48.0,48.0,2.517000,146.517000,HP11_443,novel,2000,v1,HP11,CC082263,443,other,2.517000,HP11_NA_novel_v1_443,0.0


In [17]:
# remove non-excited neurons: keep only rows whose r_group_type equals 'excited'
data = data[data['r_group_type'] == 'excited']
# data['r_group_type'].unique()

In [18]:
# Show which region labels remain after the 'excited' filter.
data['region'].unique()

array(['hippo', 'v1'], dtype=object)

# filter data by region 

In [19]:
# filter data by region: keep only rows recorded in 'hippo' or 'v1'
data = data[data['region'].isin(['hippo', 'v1'])]

# Row counts per (region, stim). This has to be computed before 'region' is
# dropped below. Previously the result was built and silently discarded because
# it was not the cell's last expression; it is now kept and displayed at the end.
region_stim_counts = (
    data[['id', 'times', 'stimn', 'region', 'stim']]
    .groupby(['region', 'stim'])
    .count()
)

# keep only the columns needed downstream (this is a column selection, not a
# filter on trial times)
data = data[['times', 'trial', 'stimn', 'id', 'stim']]

region_stim_counts

In [20]:

# Split rows by the 'stim' label. reset_index() gives each part a fresh
# 0..n-1 index and keeps the previous index as a regular column.
pre = data[data['stim'] == 'pre'].reset_index()
post = data[data['stim'] == 'post'].reset_index()
novel = data[data['stim'] == 'novel'].reset_index()

In [21]:
# The three parts together must account for every row of `data`.
len(data) == len(pre) + len(post) + len(novel)

True

In [22]:
# Values are listed in order of first appearance; in the recorded output trial 14
# is present but shows up last rather than between 13 and 15.
pre['trial'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 14.])

In [34]:
# Write the pre and post tables next to the source pickle as feather files.
# The novel table is not written here.
pre.to_feather(f'../../datasets/{datasets[dataset_i]}_pre.feather')
post.to_feather(f'../../datasets/{datasets[dataset_i]}_post.feather')


## dataset 2 - V1/LGN

In [23]:
# Select the second entry of `datasets` (V1LGN) and load its pickle.
dataset_i = 1
data = load_dataset(datasets[dataset_i])


# Expose the 'trial_spikes' column under the name 'times'.
data['times'] = data['trial_spikes']
# Print the largest 'trial_spikes' value next to the nominal trial duration.
print(data['trial_spikes'].max(), times_per_trials[dataset_i], " max times per trial ") 

# filter out non-'excited' ones.
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignments made later ('id', 'stimn') do not raise
# SettingWithCopyWarning.
data = data[data['r_group_type'] == 'excited'].copy()
data 

V1LGN_spks_CQuinn
3.9999000000000002 4  max times per trial 


Unnamed: 0,condition,cuid,depth,location,situ,spikes,trial,trial_spikes,r_group_type,times
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.022333,0.0,0.022333,excited,0.022333
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.118800,0.0,0.118800,excited,0.118800
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.557733,0.0,0.557733,excited,0.557733
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.561233,0.0,0.561233,excited,0.561233
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.873533,0.0,0.873533,excited,0.873533
...,...,...,...,...,...,...,...,...,...,...
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,77.659467,19.0,1.659467,excited,1.659467
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,78.218500,19.0,2.218500,excited,2.218500
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,78.234733,19.0,2.234733,excited,2.234733
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,79.461833,19.0,3.461833,excited,3.461833


In [24]:
# double checking the naming for novel recordings: the rows whose condition
# label contains 'G-N-G' must be exactly as many as the rows with situ == 'novel',
# and likewise for the complement.
conditions = data['condition'].unique()
assert data['condition'].isin([label for label in conditions if 'G-N-G' in label]).sum() == (data['situ'] == 'novel').sum()
assert data['condition'].isin([label for label in conditions if 'G-N-G' not in label]).sum() == (data['situ'] != 'novel').sum()

In [25]:

import re 

def modify_id(x):
    """Convert a V1LGN ``cuid`` into ``'<mouse>_NA_<pre|post|novel>_<region>_<unit>'``.

    Example: ``'019ET#L06id45postLGN'`` -> ``'ET#L06_NA_post_LGN_45'``.

    Each component is the first match of its pattern in ``x``; an IndexError
    is raised when a component cannot be found. 'NA' is a fixed placeholder
    in the second slot.
    """
    # Raw strings are used for every pattern: a backslash-d inside a plain
    # string literal is an invalid escape sequence and triggers a warning.
    mouse_id = re.findall(r'ET#L\d+', x)[0]
    unit_id = re.findall(r'id(\d+)', x)[0]  # capture only the digits that follow 'id'
    group = re.findall(r'LGN|V1', x)[0]
    pre_post = re.findall(r'pre|post|novel', x)[0]

    return f'{mouse_id}_NA_{pre_post}_{group}_{unit_id}'


# Derive the normalized id from each cuid.
data['id'] = data['cuid'].apply(modify_id)
# Constant stimulus index 0 for every row.
data['stimn'] = 0. 


# Spot-check the id format on one cuid (displayed as the cell output).
modify_id('019ET#L06id45postLGN')

'ET#L06_NA_post_LGN_45'

In [26]:
# Display the frame, now including the 'id' and 'stimn' columns.
data 

Unnamed: 0,condition,cuid,depth,location,situ,spikes,trial,trial_spikes,r_group_type,times,id,stimn
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.022333,0.0,0.022333,excited,0.022333,ET#L06_NA_post_LGN_45,0.0
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.118800,0.0,0.118800,excited,0.118800,ET#L06_NA_post_LGN_45,0.0
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.557733,0.0,0.557733,excited,0.557733,ET#L06_NA_post_LGN_45,0.0
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.561233,0.0,0.561233,excited,0.561233,ET#L06_NA_post_LGN_45,0.0
0.0,GroupWTpostG-N-G---GLGN,019ET#L06id45postLGN,175,LGN,novel,0.873533,0.0,0.873533,excited,0.873533,ET#L06_NA_post_LGN_45,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,77.659467,19.0,1.659467,excited,1.659467,ET#L03_NA_pre_V1_20,0.0
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,78.218500,19.0,2.218500,excited,2.218500,ET#L03_NA_pre_V1_20,0.0
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,78.234733,19.0,2.234733,excited,2.234733,ET#L03_NA_pre_V1_20,0.0
19.0,GroupWTpreG-1-G---GV1,001ET#L03id20preV1,100,V1,pre,79.461833,19.0,3.461833,excited,3.461833,ET#L03_NA_pre_V1_20,0.0


In [28]:
# asserting unique id mappings: distinct cuids must map to distinct ids.
# The two counts have to be compared with `==`. Written as `assert a, b`, the
# second value is only the failure message and the check passes whenever `a`
# is non-zero.
assert len(data['cuid'].unique()) == len(data['id'].unique()), \
    'modify_id mapped distinct cuids onto the same id'

In [29]:
# Keep only the columns used by the split and export cells below.
data = data[['id', 'stimn', 'times', 'trial', 'situ']] # slicing columns 

In [30]:
# Split rows by the 'situ' label. reset_index() gives each part a fresh
# 0..n-1 index and keeps the previous index as a regular column.
pre = data[data['situ'] == 'pre'].reset_index()
post = data[data['situ'] == 'post'].reset_index()
novel = data[data['situ'] == 'novel'].reset_index()

In [303]:
# Write the pre and post tables as feather files; the novel table is not written here.
pre.to_feather(f'../../datasets/{datasets[dataset_i]}_pre.feather')
post.to_feather(f'../../datasets/{datasets[dataset_i]}_post.feather')


In [32]:
# double checking pre/post/novel split is done correctly:
# the three parts together must account for every row of `data`.
len(data) == len(pre) + len(post) + len(novel)

True

## dataset 3  -- V1/LP

In [33]:
# Select the third entry of `datasets` (V1LP) and load its pickle.
dataset_i = 2
data = load_dataset(datasets[dataset_i])


# Expose the 'trial_spikes' column under the name 'times'.
data['times'] = data['trial_spikes']
# Print the largest 'trial_spikes' value next to the nominal trial duration.
print(data['trial_spikes'].max(), times_per_trials[dataset_i], " max times per trial ") 

# dataset 3 seems unaffected, all units are "excited"
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignments made later ('id', 'stimn') do not raise
# SettingWithCopyWarning.
data = data[data['r_group_type'] == 'excited'].copy()
data 

V1LP_spks_CQuinn
3.9999666666666673 4  max times per trial 


Unnamed: 0,condition,cuid,depth,location,situ,spikes,trial,trial_spikes,n_type,layer,r_group_type,times
0.0,GroupWTpostG-1-G---GLP,001ET#000Aid6postLP,575,LP,post,0.216067,0.0,0.216067,rs,L4,excited,0.216067
0.0,GroupWTpostG-1-G---GLP,001ET#000Aid6postLP,575,LP,post,0.568633,0.0,0.568633,rs,L4,excited,0.568633
0.0,GroupWTpostG-1-G---GLP,001ET#000Aid6postLP,575,LP,post,0.686633,0.0,0.686633,rs,L4,excited,0.686633
0.0,GroupWTpostG-1-G---GLP,001ET#000Aid6postLP,575,LP,post,0.704967,0.0,0.704967,rs,L4,excited,0.704967
0.0,GroupWTpostG-1-G---GLP,001ET#000Aid6postLP,575,LP,post,0.757767,0.0,0.757767,rs,L4,excited,0.757767
...,...,...,...,...,...,...,...,...,...,...,...,...
38.0,GroupWTpostG-N-G---GV1,003ET#014Aid28postV1,250,V1,novel,155.394900,38.0,3.394900,rs,L5,excited,3.394900
38.0,GroupWTpostG-N-G---GV1,003ET#014Aid28postV1,250,V1,novel,155.417400,38.0,3.417400,rs,L5,excited,3.417400
39.0,GroupWTpostG-N-G---GV1,003ET#014Aid28postV1,250,V1,novel,156.603700,39.0,0.603700,rs,L5,excited,0.603700
39.0,GroupWTpostG-N-G---GV1,003ET#014Aid28postV1,250,V1,novel,156.704367,39.0,0.704367,rs,L5,excited,0.704367


In [34]:

import re 

def modify_id(x):
    """Convert a V1LP ``cuid`` into ``'<mouse>_NA_<pre|post|novel>_<region>_<unit>'``.

    Example: ``'001ET#000Aid6postLP'`` -> ``'ET#000A_NA_post_LP_6'``.

    Each component is the first match of its pattern in ``x``; an IndexError
    is raised when a component cannot be found. 'NA' is a fixed placeholder
    in the second slot.
    """
    # Raw strings are used for every pattern: a backslash-d inside a plain
    # string literal is an invalid escape sequence and triggers a warning.
    mouse_id = re.findall(r'ET#\d+A', x)[0]
    unit_id = re.findall(r'id(\d+)', x)[0]  # capture only the digits that follow 'id'
    group = re.findall(r'LP|V1', x)[0]
    pre_post = re.findall(r'pre|post|novel', x)[0]

    return f'{mouse_id}_NA_{pre_post}_{group}_{unit_id}'


# Spot-check call on one cuid; its result is not displayed because it is not
# the last expression of the cell.
modify_id('001ET#000Aid6postLP')

# Derive the normalized id from each cuid and set a constant stimulus index 0.
data['id'] = data['cuid'].apply(modify_id)
data['stimn'] = 0. 


In [35]:
# Within each condition, the number of distinct ids must equal the number of
# distinct cuids.
assert all(
    len(data.loc[data['situ'] == situ_name, 'id'].unique())
    == len(data.loc[data['situ'] == situ_name, 'cuid'].unique())
    for situ_name in ('pre', 'post', 'novel')
)

In [36]:
# Rows whose condition label contains 'G-N-G' must be exactly as many as the
# rows with situ == 'novel', and likewise for the complement.
conditions = data['condition'].unique()
assert data['condition'].isin([label for label in conditions if 'G-N-G' in label]).sum() == (data['situ'] == 'novel').sum()
assert data['condition'].isin([label for label in conditions if 'G-N-G' not in label]).sum() == (data['situ'] != 'novel').sum()

In [37]:
# Keep only the columns needed downstream, then split rows by the 'situ' label.
data = data[['stimn', 'id', 'trial', 'times', 'situ']]
pre, post, novel = data[data['situ'] == 'pre'], data[data['situ'] == 'post'], data[data['situ']=='novel']
# reset_index() gives each part a fresh 0..n-1 index before export and keeps
# the previous index as a regular column.
pre, post, novel = pre.reset_index(), post.reset_index(), novel.reset_index()

# Verify that the split accounts for every row BEFORE anything is written, so
# an incomplete split can never reach disk. Previously this was only an
# unenforced expression evaluated after the files had been written.
split_is_complete = data.shape[0] == pre.shape[0] + post.shape[0] + novel.shape[0]
assert split_is_complete, "some rows have a 'situ' value other than pre/post/novel"

# Only the pre and post tables are exported; the novel table is not written here.
pre.to_feather(f'../../datasets/{datasets[dataset_i]}_pre.feather')
post.to_feather(f'../../datasets/{datasets[dataset_i]}_post.feather')


split_is_complete

True

## dataset 4 - V1/RSC

In [38]:
# Select the fourth entry of `datasets` (V1RSC) and load its pickle.
# load_dataset already opens and unpickles the file (and prints its name), so
# the second, inline open()/pickle.load() of the same path was redundant and
# has been removed.
dataset_i = 3
data = load_dataset(datasets[dataset_i])


# Expose the 'trial_spikes' column under the name 'times'.
data['times'] = data['trial_spikes']
# Print the largest 'trial_spikes' value next to the nominal trial duration.
print(data['trial_spikes'].max(), times_per_trials[dataset_i], " max times per trial ") 

data 

V1RSC_spks_CQuinn
3.9999666666666656 4  max times per trial 


Unnamed: 0,condition,cuid,depth,location,situ,spikes,trial,trial_spikes,n_type,layer,r_group_type,times
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.127900,0.0,0.127900,rs,L2/3,excited,0.127900
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.137467,0.0,0.137467,rs,L2/3,excited,0.137467
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.351133,0.0,0.351133,rs,L2/3,excited,0.351133
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.361300,0.0,0.361300,rs,L2/3,excited,0.361300
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.372867,0.0,0.372867,rs,L2/3,excited,0.372867
...,...,...,...,...,...,...,...,...,...,...,...,...
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id72postV1,200,V1,novel,156.038933,39.0,0.038933,rs,L6,other,0.038933
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id72postV1,200,V1,novel,156.144033,39.0,0.144033,rs,L6,other,0.144033
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id72postV1,200,V1,novel,156.230100,39.0,0.230100,rs,L6,other,0.230100
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id72postV1,200,V1,novel,156.436567,39.0,0.436567,rs,L6,other,0.436567


In [39]:

import re 

def modify_id(x):
    """Convert a V1RSC ``cuid`` into ``'<mouse>_NA_<pre|post|novel>_<region>_<unit>'``.

    Example: ``'001ET#R000id166preV1'`` -> ``'ET#R000_NA_pre_V1_166'``.

    Each component is the first match of its pattern in ``x``; an IndexError
    is raised when a component cannot be found. 'NA' is a fixed placeholder
    in the second slot.
    """
    # Raw strings are used for every pattern: a backslash-d inside a plain
    # string literal is an invalid escape sequence and triggers a warning.
    mouse_id = re.findall(r'ET#[A-Z]+\d+', x)[0]
    unit_id = re.findall(r'id(\d+)', x)[0]  # capture only the digits that follow 'id'
    group = re.findall(r'RSC|V1', x)[0]
    pre_post = re.findall(r'pre|post|novel', x)[0]

    return f'{mouse_id}_NA_{pre_post}_{group}_{unit_id}'


# Spot-check call on one cuid; its result is not displayed because it is not
# the last expression of the cell.
modify_id('001ET#R000id166preV1')

# Derive the normalized id from each cuid and set a constant stimulus index 0.
data['id'] = data['cuid'].apply(modify_id)
data['stimn'] = 0. 


In [40]:
# Compare the number of distinct ids with the number of distinct cuids among the 'post' rows.
data[data.situ == 'post'].id.unique().shape , data[data.situ == 'post'].cuid.unique().shape 

((912,), (912,))

In [41]:
# Keep only rows whose r_group_type equals 'excited'; in this dataset the
# filter runs after the 'id' and 'stimn' columns were added.
data = data[data['r_group_type'] == 'excited'] 
data 

Unnamed: 0,condition,cuid,depth,location,situ,spikes,trial,trial_spikes,n_type,layer,r_group_type,times,id,stimn
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.127900,0.0,0.127900,rs,L2/3,excited,0.127900,ET#R000_NA_pre_V1_166,0.0
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.137467,0.0,0.137467,rs,L2/3,excited,0.137467,ET#R000_NA_pre_V1_166,0.0
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.351133,0.0,0.351133,rs,L2/3,excited,0.351133,ET#R000_NA_pre_V1_166,0.0
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.361300,0.0,0.361300,rs,L2/3,excited,0.361300,ET#R000_NA_pre_V1_166,0.0
0.0,GroupWTpreG-1-G---GV1,001ET#R000id166preV1,800,V1,pre,0.372867,0.0,0.372867,rs,L2/3,excited,0.372867,ET#R000_NA_pre_V1_166,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id48postV1,900,V1,novel,156.945667,39.0,0.945667,rs,L2/3,excited,0.945667,ET#RB03_NA_post_V1_48,0.0
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id48postV1,900,V1,novel,156.952200,39.0,0.952200,rs,L2/3,excited,0.952200,ET#RB03_NA_post_V1_48,0.0
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id48postV1,900,V1,novel,156.959800,39.0,0.959800,rs,L2/3,excited,0.959800,ET#RB03_NA_post_V1_48,0.0
39.0,GroupWTpostG-N-G---GV1,005ET#RB03id48postV1,900,V1,novel,156.969467,39.0,0.969467,rs,L2/3,excited,0.969467,ET#RB03_NA_post_V1_48,0.0


In [42]:


# Within each condition, the number of distinct ids must equal the number of
# distinct cuids.
assert all(
    len(data.loc[data['situ'] == situ_name, 'id'].unique())
    == len(data.loc[data['situ'] == situ_name, 'cuid'].unique())
    for situ_name in ('pre', 'post', 'novel')
)


In [46]:
# Keep only the columns needed downstream, then split rows by the 'situ' label.
data = data[['stimn', 'id', 'trial', 'times', 'situ']]

pre, post, novel = data[data['situ'] == 'pre'], data[data['situ'] == 'post'], data[data['situ']=='novel']
# reset_index() gives each part a fresh 0..n-1 index before export and keeps
# the previous index as a regular column.
pre, post, novel = pre.reset_index(), post.reset_index(), novel.reset_index()

# Verify that the split accounts for every row BEFORE anything is written, so
# an incomplete split can never reach disk. Previously this was only an
# unenforced expression evaluated after the files had been written.
split_is_complete = data.shape[0] == pre.shape[0] + post.shape[0] + novel.shape[0]
assert split_is_complete, "some rows have a 'situ' value other than pre/post/novel"

# Only the pre and post tables are exported; the novel table is not written here.
pre.to_feather(f'../../datasets/{datasets[dataset_i]}_pre.feather')
post.to_feather(f'../../datasets/{datasets[dataset_i]}_post.feather')


split_is_complete

True