**Description:** Extract trial information from the raw data filenames of the larval schooling project, then count the number of trials of each kind (grouped by population, age, and number of fish).

In [None]:
import platform, os, sys, datetime, re, itertools
from os.path import join
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Latex

# Optional: uncomment to set the figure resolution (dots per inch).
# plt.rcParams['figure.dpi'] = 150
# Default figure size as (width, height) in inches.
plt.rcParams['figure.figsize'] = 9,6

In [None]:
# Is this running on google drive (Colab) or on a local computer?
# On google drive you have to authenticate and copy/paste a code, twice.
on_google_drive = True # False # 

if on_google_drive:
    # Mount the shared drive so the project files are reachable as paths.
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    project_dir = '/content/drive/Shared drives/larval_schooling'
    # Authenticate separately to read the trial list spreadsheet.
    from google.colab import auth
    import gspread
    from oauth2client.client import GoogleCredentials
    auth.authenticate_user()
    gc    = gspread.authorize(GoogleCredentials.get_application_default())
    # sheet = gc.open('trial_list')
    sheet = gc.open_by_url('https://docs.google.com/spreadsheets/d/1kkuDMjHZF4sRYKsrLFapVVNqYphUL9ywCUNcAZ_bmNE')
    rows  = sheet.sheet1.get_all_values()
    # The first row of the sheet holds the column names.
    df    = pd.DataFrame.from_records(rows[1:],columns=rows[0])
else:
    project_dir = '../..'
    # NOTE(review): `trial_dir` was used here without ever being defined
    # (NameError). Assuming the spreadsheet sits in the project directory;
    # confirm and adjust this line if it lives elsewhere.
    trial_dir = project_dir
    df = pd.read_excel(join(trial_dir,'trial_list.xlsx'))

# Work on an explicit copy: assigning into a column of a slice of the original
# frame triggers pandas' SettingWithCopyWarning.
df = df[['n','Group','Tracking (Ethovision)','Video File Name']].copy()
# Sheet values may be strings; anything non-numeric (e.g. blank) becomes NaN.
df['Tracking (Ethovision)'] = pd.to_numeric(df['Tracking (Ethovision)'], errors='coerce').astype(float)
# Keep only the video file names of trials flagged 1 in the tracking column.
trial_list = df.loc[df['Tracking (Ethovision)']==1,'Video File Name']


In [None]:
def parse_filename(name,mode=''):
    """Extract trial information from a video or raw data file name.

    Once the directory and extension are stripped, the name must consist of
    at least five underscore-separated fields: population, day, age (a
    number followed by a three-character suffix such as 'dpf'), group, and
    a field containing the number of fish. Any further fields are ignored.

    Parameters:
        name: file name or path.
        mode: pass 'etho' for ethovision raw data files, where only the
            text between the first and second dash is parsed.

    Returns:
        A dict with keys 'name' (the stripped name, original case), 'pop'
        and 'group' (lower case), 'age' and 'n' (ints). Ages 43 and 71 are
        reported as 42 and 70, and population 'rc' as 'sf'. An empty dict
        is returned when the name cannot be parsed (including non-string
        input such as None or NaN).
    """
    try:
        # Remove directory names and extension.
        name = os.path.splitext(os.path.basename(name))[0]
        # For ethovision raw data files, only keep the bit between the dashes.
        if mode == 'etho':
            name = name.split('-')[1]
        # Extract trial info; the day field is not used.
        pop, _day, age, group, n = name.lower().split('_')[:5]
        # The fish-count field may carry a prefix (e.g. 'n5'): take the
        # first run of digits.
        n = int(re.findall(r'\d+', n)[0])
        # Drop the three-character age suffix (e.g. '43dpf' -> 43).
        age = int(age[:-3])
    except (TypeError, ValueError, IndexError):
        # TypeError: name is not a path-like string (missing spreadsheet cell).
        # ValueError: too few fields, or a non-numeric age.
        # IndexError: no dash in 'etho' mode, or no digits in the count field.
        return {}
    # Merge 43dpf with 42dpf and 71dpf with 70dpf.
    age = {43: 42, 71: 70}.get(age, age)
    # Merge RC and SF.
    if pop == 'rc':
        pop = 'sf'
    return {'name': name, 'pop': pop, 'age': age, 'group': group, 'n': n}

def count_trials(df):
    """Tabulate the number of trials per population, fish count and age.

    Parameters:
        df: DataFrame with columns 'name', 'pop', 'age' and 'n'.

    Returns:
        An integer DataFrame indexed by ('pop', 'n') with one column per
        age; combinations with no trials hold 0.
    """
    by_name = df.set_index('name')
    per_group = by_name.groupby(['pop','age','n'])['n'].count().rename('count')
    # Move the age level (position 1) from the index into the columns.
    table = pd.DataFrame(per_group).unstack(1)
    # Drop the outer 'count' level so the columns are just the ages.
    table.columns = table.columns.droplevel()
    # Combinations that never occur come out of unstack as NaN.
    return table.fillna(0).astype(int)

def _parsed_trials(filenames, mode=''):
    """Parse every file name and return the successfully parsed ones as a DataFrame.

    parse_filename returns an empty dict for names it cannot parse. Such rows
    are dropped here: left in, they become all-NaN rows that turn the integer
    'age' and 'n' columns into floats.
    """
    parsed = (parse_filename(f, mode=mode) for f in filenames)
    return pd.DataFrame([row for row in parsed if row])

# One table of parsed trials per source, keyed by a short source name.
trials = {}

# Raw videos found in the project directory.
raw_files      = glob(join(project_dir,'raw_videos/*.avi'))
trials['raw']  = _parsed_trials(raw_files)

# Ethovision raw data exports found in the project directory.
etho_files     = glob(join(project_dir,'ethovision/Raw_Data/*.xlsx'))
trials['etho'] = _parsed_trials(etho_files, mode='etho')

# Video file names selected from the trial list (missing names are dropped).
trials['list'] = _parsed_trials(trial_list)

# Trial counts for each source.
breakdown      = { k:count_trials(t) for k,t in trials.items() }

def show_breakdown(k):
    """Display the trial counts stored under `breakdown[k]`.

    Shows a heading and the count table, then draws one line plot per
    population with age on the x axis and one line per number of fish.
    """
    display(Markdown(f'## {k.capitalize()}'))
    table = breakdown[k]
    display(table)
    populations = table.index.get_level_values('pop').unique()
    for pop in populations:
        # Transpose so that ages run along the x axis.
        by_age = table.loc[pop].T
        by_age.plot(marker='o',figsize=(9,4))
        plt.title(pop)
        plt.xlabel('age (dpf)')
        plt.ylabel('number of trials')
        # Place the legend to the right of the axes.
        plt.legend(loc=(1.1,0.4),title='n')
        plt.show()

In [None]:
# Trial counts parsed from the raw video file names.
show_breakdown('raw')

In [None]:
# Trial counts parsed from the ethovision raw data file names.
show_breakdown('etho')

In [None]:
# Trial counts parsed from the video file names in the trial list.
show_breakdown('list')