# Methods Notebook: Extracting Call Samples for KMeans Clustering

## Imports Section:

In [1]:
import numpy as np
import pandas as pd
import random
import scipy
from scipy import stats
import datetime as dt
import dask.dataframe as dd


import re
import suncalc
import matplotlib.pyplot as plt
from matplotlib import colors
import soundfile as sf
import matplotlib.patches as patches
from pathlib import Path

In [2]:
import sys

# Make the project's src/ packages importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate
# "../src" entries to sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

In [3]:
from core import SITE_NAMES, FREQ_GROUPS, SEATTLE_LATITUDE, SEATTLE_LONGITUDE
import bout.assembly as bt
import bout.clustering as bt_clustering
import bout.plot as bt_plt
import activity.subsampling as ss
import activity.activity_assembly as dh

import calls.compute_features as cf
import calls.call_extraction as cextract

from cli import get_file_paths

['/Users/adityakrishna/duty-cycle-investigation/daily_notebook', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python311.zip', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python3.11', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python3.11/lib-dynload', '', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python3.11/site-packages', '../src', '../src', '../src', '../src/bout', '../src', '/Users/adityakrishna/duty-cycle-investigation/daily_notebook/../src/bout', '/Users/adityakrishna/duty-cycle-investigation/daily_notebook/../src']


In [4]:
# Read every CSV matching the glob into one in-memory frame; every column is loaded as a string.
ubna_data_df = dd.read_csv('../data/ubna_data_*_mir_collected_audio_records.csv', dtype=str).compute()
# Discard the 'Unnamed: 0' column when present (presumably a saved positional index -- TODO confirm).
if 'Unnamed: 0' in ubna_data_df.columns:
    ubna_data_df = ubna_data_df.drop(columns='Unnamed: 0')
# Parse the UTC timestamps and promote them to the index so the frame can be filtered by time.
ubna_data_df = ubna_data_df.assign(
    datetime_UTC=pd.DatetimeIndex(ubna_data_df["datetime_UTC"])
).set_index("datetime_UTC")

In [5]:
def filter_df_with_location(ubna_data_df, cfg):
    """Select one site's recordings for a given month and daily time window.

    Parameters
    ----------
    ubna_data_df : pandas.DataFrame
        Audio-record table indexed by a DatetimeIndex, with at least the
        ``site_name`` and ``file_duration`` columns.
    cfg : dict
        Must provide ``site`` (compared against ``site_name``), ``year``
        (parsed with ``%Y``), ``month`` (full month name, parsed with ``%B``),
        ``recording_start`` and ``recording_end`` (passed to ``between_time``).

    Returns
    -------
    pandas.DataFrame
        Rows for the requested site/year/month whose timestamp falls exactly
        on minute 0 or 30 with second 0, whose ``file_duration`` is not one of
        the three error-marker strings, sorted by index and restricted to
        ``recording_start <= time < recording_end``.
    """
    timestamps = ubna_data_df.index
    target_year = dt.datetime.strptime(cfg['year'], '%Y').year
    target_month = dt.datetime.strptime(cfg['month'], '%B').month
    # Strings stored in file_duration in place of a number when a file could not be read.
    unreadable_markers = [
        'File has no comment due to error!',
        'File has no Audiomoth-related comment',
        'Is empty!',
    ]

    at_site = ubna_data_df["site_name"] == cfg['site']
    in_requested_month = (timestamps.year == target_year) & (timestamps.month == target_month)
    on_half_hour = timestamps.minute.isin([0, 30]) & (timestamps.second == 0)
    readable = ~ubna_data_df["file_duration"].isin(unreadable_markers)

    selected = ubna_data_df.loc[at_site & on_half_hour & in_requested_month & readable].sort_index()
    # inclusive="left": the start time is kept, the end time is excluded.
    return selected.between_time(cfg['recording_start'], cfg['recording_end'], inclusive="left")

In [6]:
# Which deployment session to analyse: site, month, daily time window and expected file duration.
cfg = {
    'site': 'Carp Pond',
    'year': '2022',
    'month': 'October',
    'recording_start': '00:00',
    'recording_end': '16:00',
    'duration': 1795,
}

files_from_location = filter_df_with_location(ubna_data_df, cfg)

# Parameters handed to the project's path helper; the site tag is the first word of the site name.
data_params = {
    'site_tag': cfg['site'].split()[0],
    'site_name': cfg['site'],
    'type_tag': '',
    'cur_dc_tag': '30of30',
    'bin_size': '30',
    'recording_start': cfg['recording_start'],
    'recording_end': cfg['recording_end'],
}

file_paths = get_file_paths(data_params)

# Reference list: every file in the selected window, regardless of status.
data_params['ref_audio_files'] = sorted(Path(p) for p in files_from_location["file_path"])
# Good list: files flagged usable whose duration matches the expected value.
file_status_cond = files_from_location["file_status"] == "Usable for detection"
file_duration_cond = np.isclose(files_from_location["file_duration"].astype('float'), cfg['duration'])
good_location_df = files_from_location.loc[file_status_cond & file_duration_cond]
data_params['good_audio_files'] = sorted(Path(p) for p in good_location_df["file_path"])

session_is_clean = data_params['good_audio_files'] == data_params['ref_audio_files']
print("All files from deployment session good!" if session_is_clean else "Error files exist!")

print(f"Will be looking at {len(data_params['good_audio_files'])} files from {cfg['site']}")

# Write the good file paths, one per line, to a text file in the working directory.
paths = good_location_df['file_path'].values
filelist_name = f"{cfg['site'].split()[0].lower()}_{cfg['month'].lower()}__filelist.txt"
with open(filelist_name, 'w') as txt_file:
    for line in paths:
        # Each entry is a path string; joining its characters reproduces the same string.
        txt_file.write("".join(line) + "\n")

Error files exist!
Will be looking at 542 files from Carp Pond


In [7]:
def convert_kaleidoscopedf_to_bd2df(df):
    """Re-shape a Kaleidoscope cluster table into the bd2-style column layout.

    The result carries, in this order: ``start_time`` (``OFFSET``),
    ``end_time`` (``OFFSET + DURATION``), ``low_freq`` (``Fmin``),
    ``high_freq`` (``Fmax``), ``input_file`` (``IN FILE``), ``input_dir``
    (``INDIR`` + '/' + ``FOLDER``), ``mean_freq`` (``Fmean``), whichever of
    ``TOP1MATCH*`` / ``TOP1MATCH`` are present, and the remaining
    ``TOP1DIST`` ... ``TOP3DIST`` columns copied through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the Kaleidoscope column names listed above.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``input_file``; the original index labels are kept.
    """
    renamed = {
        'OFFSET': 'start_time',
        'Fmin': 'low_freq',
        'Fmax': 'high_freq',
        'IN FILE': 'input_file',
        'Fmean': 'mean_freq',
    }
    # The top-match column is spelled with or without a trailing '*'; carry over whichever exist.
    top1_labels = [label for label in ('TOP1MATCH*', 'TOP1MATCH') if label in df.columns]
    passthrough = top1_labels + ['TOP1DIST', 'TOP2MATCH', 'TOP2DIST', 'TOP3MATCH', 'TOP3DIST']

    bd2_df = df[list(renamed) + passthrough].rename(columns=renamed)
    # Slot the two derived columns into their positions in the bd2 layout.
    bd2_df.insert(1, 'end_time', df['OFFSET'] + df['DURATION'])
    folder_paths = np.char.add(
        np.char.add(df['INDIR'].values.astype(str), '/'),
        df['FOLDER'].values.astype(str),
    )
    bd2_df.insert(5, 'input_dir', folder_paths)

    return bd2_df.sort_values('input_file')

def sort_file_group(file_group):
    """Return the rows of ``file_group`` ordered by ascending ``start_time``.

    The input frame is not modified; a sorted copy is returned.
    """
    ordered = file_group.sort_values(by='start_time')
    return ordered

In [8]:
# Collect every cluster.csv found under the Carp download folder into one in-memory frame.
# NOTE(review): the path points outside the repository, so it only resolves on the author's machine.
kd_cluster_glob = '../../Downloads/Carp/**/cluster.csv'
location_kd_df = dd.read_csv(kd_cluster_glob).compute()
location_kd_df

Unnamed: 0,INDIR,FOLDER,IN FILE,CHANNEL,OFFSET,DURATION,Fmin,Fmean,Fmax,DATE,...,TOP2DIST,TOP3MATCH,TOP3DIST,VOCALIZATIONS,MANUAL ID,ORGID,USERID,REVIEW ORGID,REVIEW USERID,INPATHMD5
0,/mnt,ubna_data_01_mir/recover-20220801/UBNA_004,20220801_000000.WAV,0,807.981934,0.001464,21390.375,21858.773,22160.836,2022-08-01,...,0,,0,1,,,,,,jIumorBSrPVXoX4QiHTf7A==
1,/mnt,ubna_data_01_mir/recover-20220801/UBNA_004,20220801_070000.WAV,0,27.638887,0.014711,25806.451,26103.635,26316.930,2022-08-01,...,0,,0,1,,,,,,e2bITztG5t0RjBNgkH+wXg==
2,/mnt,ubna_data_01_mir/recover-20220801/UBNA_004,20220801_063000.WAV,0,3.867346,0.001636,24096.385,24450.600,25161.215,2022-08-01,...,0,,0,1,,,,,,ehm9ZJrZ/nz5B599IQ7LmQ==
3,/mnt,ubna_data_01_mir/recover-20220801/UBNA_004,20220801_063000.WAV,0,4.620121,0.001623,24391.150,24645.752,25158.227,2022-08-01,...,0,,0,1,,,,,,ehm9ZJrZ/nz5B599IQ7LmQ==
4,/mnt,ubna_data_01_mir/recover-20220801/UBNA_004,20220801_063000.WAV,0,6.914619,0.001319,23809.523,24260.234,25000.977,2022-08-01,...,0,,0,1,,,,,,ehm9ZJrZ/nz5B599IQ7LmQ==
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104238,/mnt,ubna_data_02_mir/recover-20221003/UBNA_010,20220930_120000.WAV,0,1699.890381,0.001287,43190.523,46631.738,51290.480,2022-09-30,...,0,,0,1,,,,,,MMJGcF1St3fDht0i7cM88g==
104239,/mnt,ubna_data_02_mir/recover-20221003/UBNA_010,20220930_120000.WAV,0,1700.959961,0.001408,43244.508,45466.867,49412.840,2022-09-30,...,0,,0,1,,,,,,MMJGcF1St3fDht0i7cM88g==
104240,/mnt,ubna_data_02_mir/recover-20221003/UBNA_010,20220930_120000.WAV,0,1701.278076,0.001156,43956.043,44980.062,47084.891,2022-09-30,...,0,,0,1,,,,,,MMJGcF1St3fDht0i7cM88g==
104241,/mnt,ubna_data_02_mir/recover-20221003/UBNA_010,20220930_120000.WAV,0,1701.682861,0.001448,39409.824,41445.309,43961.352,2022-09-30,...,0,,0,1,,,,,,MMJGcF1St3fDht0i7cM88g==


In [9]:
# Convert the Kaleidoscope table to the bd2-style layout, then order the
# detections by start time within each recording (one group per input file).
location_df = convert_kaleidoscopedf_to_bd2df(location_kd_df)
location_df['file_group'] = location_df['input_file']
location_df = location_df.groupby('file_group', group_keys=False).apply(sort_file_group)

# Ensure the time/frequency columns are float64 before doing arithmetic on them.
location_df = location_df.astype({
    'start_time': 'float64',
    'end_time': 'float64',
    'low_freq': 'float64',
    'high_freq': 'float64',
})

# The recording's start timestamp is embedded in the file name (YYYYmmdd_HHMMSS);
# exact=False lets the format match even though the name carries a ".WAV" suffix.
file_dts = pd.to_datetime(location_df['input_file'], format='%Y%m%d_%H%M%S', exact=False)
# Lowercase 's' is the supported seconds alias; the uppercase 'S' unit is
# deprecated in pandas >= 2.2 and emits a FutureWarning.
anchor_start_times = file_dts + pd.to_timedelta(location_df['start_time'], unit='s')
anchor_end_times = file_dts + pd.to_timedelta(location_df['end_time'], unit='s')

# Inserting at position 0 in reverse order yields: ref_time, call_start_time, call_end_time.
location_df.insert(0, 'call_end_time', anchor_end_times)
location_df.insert(0, 'call_start_time', anchor_start_times)
# NOTE(review): ref_time is set to the call start time here, so it duplicates call_start_time -- confirm intended.
location_df.insert(0, 'ref_time', anchor_start_times)
# Keep the previous index as a regular column before renumbering the rows.
location_df = location_df.reset_index().rename(columns={'index': 'index_in_file'})
location_df

Unnamed: 0,index_in_file,ref_time,call_start_time,call_end_time,start_time,end_time,low_freq,high_freq,input_file,input_dir,mean_freq,TOP1MATCH,TOP1DIST,TOP2MATCH,TOP2DIST,TOP3MATCH,TOP3DIST,file_group
0,0,2022-07-13 01:01:10.959251,2022-07-13 01:01:10.959251,2022-07-13 01:01:10.963812,70.959251,70.963812,61157.797,80128.203,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,72792.836,nocluster,0,,0,,0,20220713_010000.WAV
1,1,2022-07-13 01:01:17.038521,2022-07-13 01:01:17.038521,2022-07-13 01:01:17.039521,77.038521,77.039521,74772.883,92063.492,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,83963.656,nocluster,0,,0,,0,20220713_010000.WAV
2,2,2022-07-13 01:03:23.501175,2022-07-13 01:03:23.501175,2022-07-13 01:03:23.502590,203.501175,203.502590,27681.992,29198.637,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,28270.480,nocluster,0,,0,,0,20220713_010000.WAV
3,5,2022-07-13 01:16:27.368286,2022-07-13 01:16:27.368286,2022-07-13 01:16:27.369423,987.368286,987.369423,10378.247,10612.768,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,10558.699,nocluster,0,,0,,0,20220713_010000.WAV
4,6,2022-07-13 01:19:59.530151,2022-07-13 01:19:59.530151,2022-07-13 01:19:59.532535,1199.530151,1199.532535,10025.062,10139.824,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,13422.763,nocluster,0,,0,,0,20220713_010000.WAV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610510,22538,2022-10-17 13:40:29.671753,2022-10-17 13:40:29.671753,2022-10-17 13:40:29.673693,629.671753,629.673693,47065.336,57154.523,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,51536.633,nocluster,0,,0,,0,20221017_133000.WAV
610511,22539,2022-10-17 13:42:52.847473,2022-10-17 13:42:52.847473,2022-10-17 13:42:52.848656,772.847473,772.848656,26405.229,27587.520,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,27048.941,nocluster,0,,0,,0,20221017_133000.WAV
610512,22545,2022-10-17 14:12:37.142395,2022-10-17 14:12:37.142395,2022-10-17 14:12:37.143893,757.142395,757.143893,10568.051,10754.631,20221017_140000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,10678.930,nocluster,0,,0,,0,20221017_140000.WAV
610513,22546,2022-10-17 14:12:47.538147,2022-10-17 14:12:47.538147,2022-10-17 14:12:47.540140,767.538147,767.540140,10004.002,10102.041,20221017_140000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,10035.919,nocluster,0,,0,,0,20221017_140000.WAV


In [10]:
# Make sure the label column exists (as the first column) before filling it in.
if 'freq_group' not in location_df.columns:
    location_df.insert(0, 'freq_group', '')
# Frequency bounds for this site; each entry is indexed as [0] (lower) and [1] (upper).
groups = FREQ_GROUPS[data_params['site_tag']]
blue_group = groups['LF1']
red_group = groups['HF1']
yellow_group = groups['HF2']

# A call belongs to a group when its whole [low_freq, high_freq] span lies inside the group's bounds.
call_is_yellow = (location_df['low_freq']>=yellow_group[0])&(location_df['high_freq']<=yellow_group[1])
call_is_red = (location_df['low_freq']>=red_group[0])&(location_df['high_freq']<=red_group[1])
call_is_blue = (location_df['low_freq']>=blue_group[0])&(location_df['high_freq']<=blue_group[1])

# Priority when bounds overlap: HF2 first, then HF1, then LF1.
location_df.loc[call_is_yellow, 'freq_group'] = 'HF2'
location_df.loc[call_is_red&(~(call_is_yellow)), 'freq_group'] = 'HF1'
location_df.loc[call_is_blue&(~(call_is_red | call_is_yellow)), 'freq_group'] = 'LF1'
# Calls matching no group are marked missing and dropped.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
location_df.loc[(~(call_is_red | call_is_yellow | call_is_blue)), 'freq_group'] = np.nan
location_df = location_df.loc[location_df['freq_group'].notna()].copy()
location_df

Unnamed: 0,freq_group,index_in_file,ref_time,call_start_time,call_end_time,start_time,end_time,low_freq,high_freq,input_file,input_dir,mean_freq,TOP1MATCH,TOP1DIST,TOP2MATCH,TOP2DIST,TOP3MATCH,TOP3DIST,file_group
0,HF2,0,2022-07-13 01:01:10.959251,2022-07-13 01:01:10.959251,2022-07-13 01:01:10.963812,70.959251,70.963812,61157.797,80128.203,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,72792.836,nocluster,0,,0,,0,20220713_010000.WAV
1,HF2,1,2022-07-13 01:01:17.038521,2022-07-13 01:01:17.038521,2022-07-13 01:01:17.039521,77.038521,77.039521,74772.883,92063.492,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,83963.656,nocluster,0,,0,,0,20220713_010000.WAV
2,LF1,2,2022-07-13 01:03:23.501175,2022-07-13 01:03:23.501175,2022-07-13 01:03:23.502590,203.501175,203.502590,27681.992,29198.637,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,28270.480,nocluster,0,,0,,0,20220713_010000.WAV
8,LF1,4,2022-07-13 01:37:39.950562,2022-07-13 01:37:39.950562,2022-07-13 01:37:39.951646,459.950562,459.951646,24922.602,28128.701,20220713_013000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,25840.555,nocluster,0,,0,,0,20220713_013000.WAV
11,LF1,3,2022-07-13 03:36:44.457794,2022-07-13 03:36:44.457794,2022-07-13 03:36:44.458915,404.457794,404.458915,31250.000,32657.957,20220713_033000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,32100.697,nocluster,0,,0,,0,20220713_033000.WAV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610508,HF2,22536,2022-10-17 13:40:28.351562,2022-10-17 13:40:28.351562,2022-10-17 13:40:28.352661,628.351562,628.352661,47919.656,55968.688,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,50958.004,nocluster,0,,0,,0,20221017_133000.WAV
610509,HF2,22537,2022-10-17 13:40:28.546326,2022-10-17 13:40:28.546326,2022-10-17 13:40:28.547833,628.546326,628.547833,52343.484,62019.230,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,55728.887,nocluster,0,,0,,0,20221017_133000.WAV
610510,HF2,22538,2022-10-17 13:40:29.671753,2022-10-17 13:40:29.671753,2022-10-17 13:40:29.673693,629.671753,629.673693,47065.336,57154.523,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,51536.633,nocluster,0,,0,,0,20221017_133000.WAV
610511,LF1,22539,2022-10-17 13:42:52.847473,2022-10-17 13:42:52.847473,2022-10-17 13:42:52.848656,772.847473,772.848656,26405.229,27587.520,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,27048.941,nocluster,0,,0,,0,20221017_133000.WAV


In [11]:
# Persist the labelled detections in the site's folder; the frame's index is written as the first column.
kd_summary_csv = f'{file_paths["SITE_folder"]}/kd__{data_params["type_tag"]}{data_params["site_tag"]}_2022.csv'
location_df.to_csv(kd_summary_csv)

In [12]:
def relabel_drivenames_to_mirrors(filepaths):
    """Rewrite ``ubna_data_0N/`` path components to their ``ubna_data_0N_mir/`` form.

    Each path is searched for the first ``ubna_data_0<digit>/`` component;
    when one is found, every occurrence of that exact component in the path is
    replaced by the same name with ``_mir`` appended. Paths without such a
    component (including ones already ending in ``_mir/``) are left unchanged.

    Parameters
    ----------
    filepaths : mutable indexable sequence of str
        Updated in place.

    Returns
    -------
    The same ``filepaths`` object that was passed in.
    """
    drive_pattern = re.compile(r'ubna_data_0[0-9]/')
    for position, path in enumerate(filepaths):
        hit = drive_pattern.search(path)
        if hit is None:
            continue
        original_drive = hit.group()
        # Drop the trailing '/' before appending the mirror suffix.
        mirror_drive = f'{original_drive[:-1]}_mir/'
        filepaths[position] = filepaths[position].replace(original_drive, mirror_drive)

    return filepaths

def get_params_relevant_to_data_at_location(cfg):
    """Load a site's saved detection summary and assemble its data parameters.

    Parameters
    ----------
    cfg : dict
        ``cfg['site']`` is used both as the site tag and as the key into
        ``SITE_NAMES``.
        NOTE(review): the notebook-level ``cfg`` stores the full site name under
        ``'site'``; confirm which form this function expects.

    Returns
    -------
    (pandas.DataFrame, dict)
        The summary table read from the site's CSV, with ``index_in_file``
        (the saved index) and ``index_in_summary`` (row position) added as
        columns, and the parameter dict including ``good_audio_files`` and
        ``bout_params``.
    """
    data_params = {
        "type_tag": '',
        "cur_dc_tag": "1800of1800",
        "site_tag": cfg['site'],
        'site_name': SITE_NAMES[cfg['site']],
    }
    print(f"Searching for files from {data_params['site_name']}")

    file_paths = get_file_paths(data_params)
    summary_csv = f'{file_paths["SITE_folder"]}/{file_paths["bd2_TYPE_SITE_YEAR"]}.csv'
    # First reset keeps the saved index as 'index_in_file'; the second records row order.
    location_sum_df = (
        pd.read_csv(summary_csv, low_memory=False, index_col=0)
        .reset_index()
        .rename(columns={'index': 'index_in_file'})
        .reset_index()
        .rename(columns={'index': 'index_in_summary'})
    )
    # unique() returns a fresh array, so relabelling in place does not touch the frame.
    site_filepaths = relabel_drivenames_to_mirrors(location_sum_df['input_file'].unique())
    bout_params = bt.get_bout_params_from_location(location_sum_df, data_params)

    data_params['good_audio_files'] = site_filepaths
    data_params['bout_params'] = bout_params
    print(f"Will be looking at {len(data_params['good_audio_files'])} files from {data_params['site_name']}")

    return location_sum_df, data_params

In [13]:
# Keep the pre-filter row labels as 'index_in_summary' and renumber the rows from 0.
# NOTE: re-running this cell without rebuilding location_df adds a second 'index_in_summary' column.
location_df = location_df.reset_index().rename(columns={'index': 'index_in_summary'})
location_df

Unnamed: 0,index_in_summary,freq_group,index_in_file,ref_time,call_start_time,call_end_time,start_time,end_time,low_freq,high_freq,input_file,input_dir,mean_freq,TOP1MATCH,TOP1DIST,TOP2MATCH,TOP2DIST,TOP3MATCH,TOP3DIST,file_group
0,0,HF2,0,2022-07-13 01:01:10.959251,2022-07-13 01:01:10.959251,2022-07-13 01:01:10.963812,70.959251,70.963812,61157.797,80128.203,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,72792.836,nocluster,0,,0,,0,20220713_010000.WAV
1,1,HF2,1,2022-07-13 01:01:17.038521,2022-07-13 01:01:17.038521,2022-07-13 01:01:17.039521,77.038521,77.039521,74772.883,92063.492,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,83963.656,nocluster,0,,0,,0,20220713_010000.WAV
2,2,LF1,2,2022-07-13 01:03:23.501175,2022-07-13 01:03:23.501175,2022-07-13 01:03:23.502590,203.501175,203.502590,27681.992,29198.637,20220713_010000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,28270.480,nocluster,0,,0,,0,20220713_010000.WAV
3,8,LF1,4,2022-07-13 01:37:39.950562,2022-07-13 01:37:39.950562,2022-07-13 01:37:39.951646,459.950562,459.951646,24922.602,28128.701,20220713_013000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,25840.555,nocluster,0,,0,,0,20220713_013000.WAV
4,11,LF1,3,2022-07-13 03:36:44.457794,2022-07-13 03:36:44.457794,2022-07-13 03:36:44.458915,404.457794,404.458915,31250.000,32657.957,20220713_033000.WAV,/mnt/ubna_data_01_mir/recover-20220715/UBNA_008,32100.697,nocluster,0,,0,,0,20220713_033000.WAV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608584,610508,HF2,22536,2022-10-17 13:40:28.351562,2022-10-17 13:40:28.351562,2022-10-17 13:40:28.352661,628.351562,628.352661,47919.656,55968.688,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,50958.004,nocluster,0,,0,,0,20221017_133000.WAV
608585,610509,HF2,22537,2022-10-17 13:40:28.546326,2022-10-17 13:40:28.546326,2022-10-17 13:40:28.547833,628.546326,628.547833,52343.484,62019.230,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,55728.887,nocluster,0,,0,,0,20221017_133000.WAV
608586,610510,HF2,22538,2022-10-17 13:40:29.671753,2022-10-17 13:40:29.671753,2022-10-17 13:40:29.673693,629.671753,629.673693,47065.336,57154.523,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,51536.633,nocluster,0,,0,,0,20221017_133000.WAV
608587,610511,LF1,22539,2022-10-17 13:42:52.847473,2022-10-17 13:42:52.847473,2022-10-17 13:42:52.848656,772.847473,772.848656,26405.229,27587.520,20221017_133000.WAV,/mnt/ubna_data_02_mir/recover-20221017/UBNA_010,27048.941,nocluster,0,,0,,0,20221017_133000.WAV
