<a href="https://colab.research.google.com/github/yeticheese/NMA_2024_Zealous_Sedum_CN/blob/main/Copy_of_A_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title Install packages
!pip install pip --upgrade --quiet
!pip install brain_observatory_utilities --upgrade --quiet
!pip install pandas --quiet
!pip install seaborn --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.0/336.0 kB[0m [31m19.7 MB/s[0m eta [36m0:0

In [3]:
#@title Import Libraries
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.manifold import TSNE

import brain_observatory_utilities.datasets.optical_physiology.data_formatting as ophys_formatting
import brain_observatory_utilities.utilities.general_utilities as utilities

from allensdk.brain_observatory.behavior.behavior_project_cache import VisualBehaviorOphysProjectCache

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# The commented-out line below may be needed if you run into an error in the pandas query function.
# Otherwise, set the engine to python in the queries made throughout the notebook.
# NOTE(review): as written, the lambda calls self.query, which would resolve to the
# patched method itself and recurse; keep a reference to the original
# pd.DataFrame.query and call that instead before enabling this line.
# pd.DataFrame.query = lambda self, expr, **kwargs: self.query(expr, engine='python', **kwargs)

In [4]:
# @title Data retrieval
import os
import requests

# Local file name for the dataset and the URL it is downloaded from.
fname = "allen_visual_behavior_2p_change_detection_familiar_novel_image_sets.parquet"
url = "https://ndownloader.figshare.com/files/28470255"

# Skip the download when the file is already present in the working directory.
if not os.path.isfile(fname):
  try:
    # The timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
  except requests.RequestException:
    # RequestException covers connection errors as well as timeouts.
    print("!!! Failed to download data !!!")
  else:
    if r.status_code != requests.codes.ok:
      print("!!! Failed to download data !!!")
    else:
      # Only write the file once a complete, successful response is in hand.
      with open(fname, "wb") as fid:
        fid.write(r.content)

In [5]:
# Read the downloaded parquet file into a DataFrame.
filename = "allen_visual_behavior_2p_change_detection_familiar_novel_image_sets.parquet"
data = pd.read_parquet(path=filename)

In [11]:
# Preview the first ten rows of the loaded table.
data.head(n=10)

Unnamed: 0,stimulus_presentations_id,cell_specimen_id,trace,trace_timestamps,mean_response,baseline_response,image_name,image_index,is_change,omitted,mean_running_speed,mean_pupil_area,response_latency,rewarded,ophys_experiment_id,imaging_depth,targeted_structure,cre_line,session_type,session_number,mouse_id,ophys_session_id,ophys_container_id,behavior_session_id,full_genotype,reporter_line,driver_line,indicator,sex,age_in_days,exposure_level
85,5,1086496689,"[0.06366926431655884, 0.002476318972185254, -0...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.006115,-0.084113,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
86,5,1086501573,"[0.0637083426117897, 0.0919334813952446, -0.09...","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.018832,-0.025578,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
87,5,1086500533,"[0.0223990511149168, 0.22551870346069336, 0.11...","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.041275,-0.032698,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
88,5,1086497438,"[0.01795899122953415, 0.15271763503551483, 0.0...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.039527,-0.02675,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
89,5,1086497908,"[0.059766266494989395, 0.03602207452058792, 0....","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.006628,-0.039277,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
90,5,1086501720,"[0.1370878517627716, 0.06181945651769638, 0.14...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.010061,-0.113912,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
91,5,1086497129,"[-0.01640218496322632, -0.05918422341346741, -...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.087167,-0.084272,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
92,5,1086497034,"[0.12177210301160812, 0.07332494109869003, -0....","[-1.228133913303992, -1.1958145997959921, -1.1...",0.00165,0.019924,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
93,5,1086496500,"[-0.1013624370098114, -0.06485532969236374, 0....","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.003985,-0.011439,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar
94,5,1086498071,"[0.0267901923507452, -0.04602086544036865, 0.0...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.016455,-0.013882,im075,2,True,False,6e-06,5822.220145,0.5004,True,994278281,275,VISp,Sst-IRES-Cre,OPHYS_3_images_B,3.0,476970,994199725,991913453,994210659,Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt,Ai148(TIT2L-GC6f-ICL-tTA2),[Sst-IRES-Cre],GCaMP6f,M,196.0,familiar


In [12]:
# List the distinct values of the targeted_structure column.
data["targeted_structure"].unique()

array(['VISp'], dtype=object)

## Dimensions to consider
- `cell_specimen_id`
- `trace`
- `trace_timestamps`
- `image_name`
- `image_index` (maybe)
- `omitted`
- `mean_running_speed`
- `mean_pupil_area`
- `rewarded`
- `imaging_depth`
- `cre_line`
- `exposure_level`

## Identification labels to consider
- `mouse_id`

In [23]:
# Columns carried forward into the trimmed table.
columns_to_keep = [
    'cell_specimen_id',
    'trace',
    'trace_timestamps',
    'mean_response',
    'image_index',
    'omitted',
    'mean_running_speed',
    'mean_pupil_area',
    'rewarded',
    'imaging_depth',
    'cre_line',
    'exposure_level',
    'mouse_id',
]
trimmed_data = data[columns_to_keep]
trimmed_data.head(10)

Unnamed: 0,cell_specimen_id,trace,trace_timestamps,mean_response,image_index,omitted,mean_running_speed,mean_pupil_area,rewarded,imaging_depth,cre_line,exposure_level,mouse_id
85,1086496689,"[0.06366926431655884, 0.002476318972185254, -0...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.006115,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
86,1086501573,"[0.0637083426117897, 0.0919334813952446, -0.09...","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.018832,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
87,1086500533,"[0.0223990511149168, 0.22551870346069336, 0.11...","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.041275,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
88,1086497438,"[0.01795899122953415, 0.15271763503551483, 0.0...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.039527,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
89,1086497908,"[0.059766266494989395, 0.03602207452058792, 0....","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.006628,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
90,1086501720,"[0.1370878517627716, 0.06181945651769638, 0.14...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.010061,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
91,1086497129,"[-0.01640218496322632, -0.05918422341346741, -...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.087167,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
92,1086497034,"[0.12177210301160812, 0.07332494109869003, -0....","[-1.228133913303992, -1.1958145997959921, -1.1...",0.00165,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
93,1086496500,"[-0.1013624370098114, -0.06485532969236374, 0....","[-1.228133913303992, -1.1958145997959921, -1.1...",-0.003985,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970
94,1086498071,"[0.0267901923507452, -0.04602086544036865, 0.0...","[-1.228133913303992, -1.1958145997959921, -1.1...",0.016455,2,False,6e-06,5822.220145,True,275,Sst-IRES-Cre,familiar,476970


In [24]:
# Number of distinct cells in the trimmed table (missing values, if any, count as one).
trimmed_data["cell_specimen_id"].nunique(dropna=False)

223

In [25]:
# Number of distinct row labels in the trimmed table.
trimmed_data.index.nunique(dropna=False)

147695

## Resulting dataset
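The resulting dataset should have at least 223 columns, one per `cell_specimen_id`, and one row per `stimulus_presentations_id`. Note that 147695 is the number of rows in the current long-format table, where each row is one cell × stimulus-presentation pair, so the pivoted table will have fewer rows than that.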

In [26]:
# Reshape the long-format table (one row per cell x stimulus presentation) into
# a wide matrix with one column per cell_specimen_id. `data` is pivoted instead
# of `trimmed_data` because the trimmed frame does not keep
# `stimulus_presentations_id`.
# NOTE(review): `ophys_experiment_id` is part of the row key on the assumption
# that presentation ids restart in every experiment - confirm. Cells that were
# not recorded in a given experiment end up as NaN in that experiment's rows.
response_matrix = data.pivot_table(
    index=['ophys_experiment_id', 'stimulus_presentations_id'],
    columns='cell_specimen_id',
    values='mean_response',  # pivot_table averages duplicates by default
)
response_matrix.head()