# Methods Notebook: Extracting Call Samples for KMeans Clustering

## Imports Section:

In [1]:
import numpy as np
import pandas as pd
import random
import scipy
from scipy import stats
import datetime as dt
import dask.dataframe as dd


import glob
import json
import matplotlib.pyplot as plt
from matplotlib import colors
import soundfile as sf
import matplotlib.patches as patches
from pathlib import Path

In [2]:
import sys

# Make the project's ../src packages importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate
# "../src" entries to sys.path.
if "../src" not in sys.path:
    sys.path.append("../src")

In [3]:
from core import SITE_NAMES, FREQ_GROUPS
import bout.assembly as bt
import bout.clustering as bt_clustering
import bout.plot as bt_plt
import activity.subsampling as ss
import activity.activity_assembly as dh

import calls.compute_features as cf
import calls.call_extraction as cextract

from cli import get_file_paths

['/Users/adityakrishna/duty-cycle-investigation/daily_notebook', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python311.zip', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python3.11', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python3.11/lib-dynload', '', '/Users/adityakrishna/miniconda3/envs/dc-study/lib/python3.11/site-packages', '../src', '../src', '../src', '../src/bout', '../src', '/Users/adityakrishna/duty-cycle-investigation/daily_notebook/../src/bout', '/Users/adityakrishna/duty-cycle-investigation/daily_notebook/../src']


In [4]:
# Read every matching audio-record CSV as strings and materialise the result
# as a single pandas DataFrame.
csv_pattern = '../data/ubna_data_*_mir_collected_audio_records.csv'
ubna_data_df = dd.read_csv(csv_pattern, dtype=str).compute()

# Discard the 'Unnamed: 0' column when the CSVs carry one.
ubna_data_df = ubna_data_df.drop(columns='Unnamed: 0', errors='ignore')

# Parse the timestamp strings and use them as the DataFrame index.
ubna_data_df = ubna_data_df.assign(
    datetime_UTC=pd.DatetimeIndex(ubna_data_df["datetime_UTC"])
).set_index("datetime_UTC")

In [11]:
def filter_df_with_location(ubna_data_df, cfg):
    """
    Select the records for one site, year and month that fall inside a daily time window.

    A row is kept when all of the following hold:
      * its "site_name" equals cfg['site'];
      * its index timestamp is in the year cfg['year'] (parsed with '%Y') and
        the month cfg['month'] (parsed with '%B', i.e. a full month name);
      * its index timestamp has second == 0 and minute == 0 or 30;
      * its "file_duration" is none of the three error placeholder strings.

    The surviving rows are sorted by index and then restricted with
    between_time(cfg['recording_start'], cfg['recording_end'], inclusive="left"),
    so the start time is included and the end time is excluded.

    Parameters
    ----------
    ubna_data_df : DataFrame indexed by datetimes, with "site_name" and
        "file_duration" columns.
    cfg : mapping with the keys 'site', 'year', 'month', 'recording_start'
        and 'recording_end'.

    Returns
    -------
    DataFrame containing the filtered rows in index order.
    """
    stamps = ubna_data_df.index
    durations = ubna_data_df["file_duration"]

    target_year = dt.datetime.strptime(cfg['year'], '%Y').year
    target_month = dt.datetime.strptime(cfg['month'], '%B').month

    at_site = ubna_data_df["site_name"] == cfg['site']
    in_target_month = (stamps.year == target_year) & (stamps.month == target_month)
    on_half_hour = (stamps.second == 0) & ((stamps.minute == 30) | (stamps.minute == 0))

    # Rows whose duration field holds one of these messages are rejected.
    has_duration = durations != 'Is empty!'
    for error_comment in ('File has no comment due to error!',
                          'File has no Audiomoth-related comment'):
        has_duration = has_duration & (durations != error_comment)

    keep = at_site & on_half_hour & in_target_month & has_duration

    return (
        ubna_data_df.loc[keep]
        .sort_index()
        .between_time(cfg['recording_start'], cfg['recording_end'], inclusive="left")
    )

In [18]:
# Selection settings for this extraction run.
cfg = {
    'site': 'Telephone Field',
    'year': '2022',
    'month': 'October',
    'recording_start': '00:00',
    'recording_end': '16:00',
    'duration': 1795,
}

files_from_location = filter_df_with_location(ubna_data_df, cfg)

# Reference list: every file that passed the site/date/time filter, as sorted Path objects.
data_params = {
    'ref_audio_files': sorted(Path(p) for p in files_from_location["file_path"]),
}

# Keep files marked usable whose duration is close to cfg['duration']
# (np.isclose with its default tolerances).
file_status_cond = files_from_location["file_status"] == "Usable for detection"
file_duration_cond = np.isclose(
    files_from_location["file_duration"].astype('float'),
    cfg['duration'],
)
good_location_df = files_from_location.loc[file_status_cond & file_duration_cond]
good_location_df

Unnamed: 0_level_0,site_name,recover_folder,audiomoth_num,sd_card_num,file_path,file_metadata,file_status,audiomoth_temperature,audiomoth_battery,sample_rate,audiomoth_artist_ID,file_duration,Deployment notes
datetime_UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-10-01 00:00:00,Telephone Field,recover-20221003,A,007,/mnt/ubna_data_02_mir/recover-20221003/UBNA_00...,Recorded at 00:00:00 01/10/2022 (UTC) by Audio...,Usable for detection,27.4C,4.0V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-01 00:30:00,Telephone Field,recover-20221003,A,007,/mnt/ubna_data_02_mir/recover-20221003/UBNA_00...,Recorded at 00:30:00 01/10/2022 (UTC) by Audio...,Usable for detection,26.7C,4.0V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-01 01:00:00,Telephone Field,recover-20221003,A,007,/mnt/ubna_data_02_mir/recover-20221003/UBNA_00...,Recorded at 01:00:00 01/10/2022 (UTC) by Audio...,Usable for detection,26.6C,4.0V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-01 01:30:00,Telephone Field,recover-20221003,A,007,/mnt/ubna_data_02_mir/recover-20221003/UBNA_00...,Recorded at 01:30:00 01/10/2022 (UTC) by Audio...,Usable for detection,26.0C,4.0V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-01 02:00:00,Telephone Field,recover-20221003,A,007,/mnt/ubna_data_02_mir/recover-20221003/UBNA_00...,Recorded at 02:00:00 01/10/2022 (UTC) by Audio...,Usable for detection,25.3C,4.0V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-17 13:30:00,Telephone Field,recover-20221017,A,007,/mnt/ubna_data_02_mir/recover-20221017/UBNA_00...,Recorded at 13:30:00 17/10/2022 (UTC) by Audio...,Usable for detection,18.3C,3.7V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-17 14:00:00,Telephone Field,recover-20221017,A,007,/mnt/ubna_data_02_mir/recover-20221017/UBNA_00...,Recorded at 14:00:00 17/10/2022 (UTC) by Audio...,Usable for detection,17.4C,3.7V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-17 14:30:00,Telephone Field,recover-20221017,A,007,/mnt/ubna_data_02_mir/recover-20221017/UBNA_00...,Recorded at 14:30:00 17/10/2022 (UTC) by Audio...,Usable for detection,17.5C,3.7V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA
2022-10-17 15:00:00,Telephone Field,recover-20221017,A,007,/mnt/ubna_data_02_mir/recover-20221017/UBNA_00...,Recorded at 15:00:00 17/10/2022 (UTC) by Audio...,Usable for detection,17.4C,3.7V,192000,AudioMoth 24F3190361CBE990,1795.00127083333,IKEA


In [19]:
# Sorted Path list of the files that passed the status and duration checks.
data_params['good_audio_files'] = sorted(Path(p) for p in good_location_df["file_path"])

# The two lists are equal only when no file was rejected by those checks.
all_files_good = data_params['good_audio_files'] == data_params['ref_audio_files']
print("All files from deployment session good!" if all_files_good else "Error files exist!")

print(f"Will be looking at {len(data_params['good_audio_files'])} files from {cfg['site']}")

# Write the accepted file paths to a text file, one per line, in DataFrame order.
paths = good_location_df['file_path'].values
filelist_name = f"{cfg['site'].split()[0].lower()}_{cfg['month'].lower()}__filelist.txt"
with open(filelist_name, 'w') as txt_file:
    txt_file.writelines("".join(line) + "\n" for line in paths)

Error files exist!
Will be looking at 538 files from Telephone Field
