# **Complete subsampling pipeline breakdown**

## 1) Start with our imports:

### The cell below contains the imports needed for `src/pipeline/pipeline.py`

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from torch import multiprocessing
from tqdm import tqdm

### The cell below contains the imports needed for `src/pipeline/audio_segmentor.py`

In [None]:
import os
import soundfile as sf

### The cell below contains the imports needed for `src/cli.py`

In [None]:
import sys

# Append the parent directory and the source folders
# to the module search path
sys.path.append('..')
sys.path.append('../src/')
sys.path.append('../src/models/bat_call_detector/batdetect2/')

import src.subsampling as ss

## 2) Write any custom methods below:

### The method below is the subsampling implementation we used to generate the detections used in the Symposium results
#### a) How it works:
##### &nbsp;&nbsp;&nbsp; i) Takes in segmented_file_paths from MSDS pipeline
##### &nbsp;&nbsp;&nbsp; ii) Filters out segmented_file_paths generated from MSDS pipeline. 
##### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; - Removes paths that would not exist if the recorder had been duty cycled with the given `percent_on` and `cycle_length` parameters.

In [None]:
def subsample_withpaths(segmented_file_paths, cfg, cycle_length, percent_on):
    """Select the segments a duty-cycled recorder would have captured.

    A segment is kept when either of the following holds:

    * its ``'offset'`` is an exact multiple of ``cycle_length``; or
    * its end, ``offset + cfg['segment_duration']``, taken modulo
      ``cycle_length``, is greater than 0 and at most
      ``int(cycle_length * percent_on)``.

    Args:
        segmented_file_paths: Iterable of mappings, each with an ``'offset'``
            entry.
        cfg: Mapping providing ``'segment_duration'``. It is only read for
            segments whose offset is not a multiple of ``cycle_length``.
        cycle_length: Length of one duty cycle, in the same unit as the
            offsets (presumably seconds; the notebook's example uses 1800
            for a 30-minute cycle).
        percent_on: Fraction of each cycle during which the recorder is on.

    Returns:
        A new list holding the kept entries (the same objects, in input
        order).
    """

    def _is_recorded(segment):
        start = segment['offset']
        # A segment that starts exactly on a cycle boundary is always kept;
        # the end-of-segment check is skipped for it.
        if start % cycle_length == 0:
            return True
        end_in_cycle = (start + cfg['segment_duration']) % cycle_length
        # An end position of 0 means the segment finishes exactly on the next
        # cycle boundary, which is outside the "on" window.
        return 0 < end_in_cycle <= int(cycle_length * percent_on)

    return [segment for segment in segmented_file_paths if _is_recorded(segment)]

## 3) Now we begin our actual dissection of the subsampling pipeline:

### Let's start with an audio file to demonstrate how we used our `subsampling.py` script

In [None]:
# Folder under the current user's home directory that holds the example recording.
filepath = f"{Path.home()}/Documents/Research/Lab_related/example/original_recording"
# File name of the example recording inside that folder.
filename = "20210910_030000.WAV"

### The below command is the command line invocation of the subsampling pipeline.

Command: `python src/subsampling.py ../Documents/Research/Lab_related/example/original_recording '5min_every_30min__Central_20210910_030000.csv' 'output_dir' 'output/tmp' 1800 0.167`

- `../Documents/Research/Lab_related/example/original_recording` is the folder path that contains our recording. 
   - Our pipeline takes in a folder and generates detections for every recording in that folder
- `5min_every_30min__Central_20210910_030000.csv` is the name of the output detections .csv.
   - For multiple consecutive recordings, we've labelled the output file as "...030000to130000.csv"
- `output_dir` is the repository folder where output detections .csv files will be saved.
- `output/tmp` is the repository folder where generated segment recordings will be saved and deleted after detections have been generated.
- `1800` is the provided cycle_length to generate duty cycled detections.
- `0.167` is the provided percent_on to generate duty cycled detections.

### Calling the above command runs the following code:

- `args = parse_args()`
**Takes in the command line positional arguments**

- `run_subsampling_pipeline(args['input_dir'], args['cycle_length'], args['percent_on'], args['csv_filename'], args['output_dir'], args['temp_dir'])`

In [None]:
def run_subsampling_pipeline(input_dir, cycle_length, percent_on, csv_name, output_dir, tmp_dir):
    """Run the detection pipeline on the duty-cycled subset of a folder.

    Steps, in order:
      1. Build the configuration via ``ss.get_params``.
      2. List every entry of ``input_dir`` in sorted order and turn the
         entries into segment descriptors with
         ``ss.generate_segmented_paths``.
      3. When ``percent_on`` is below 1.0, filter the segments with
         ``subsample_withpaths``; otherwise keep them all.
      4. Hand the remaining segments to ``ss.initialize_mappings`` and then
         ``ss.run_models``.

    Args:
        input_dir: Folder whose entries are treated as recordings.
        cycle_length: Duty-cycle length forwarded to ``subsample_withpaths``.
        percent_on: Fraction of each cycle that is "on"; values of 1.0 or
            more disable subsampling.
        csv_name: Name of the output detections .csv, forwarded to
            ``ss.run_models``.
        output_dir: Output folder forwarded to ``ss.get_params``.
        tmp_dir: Temporary segment folder forwarded to ``ss.get_params``.

    Returns:
        Whatever ``ss.run_models`` returns.
    """
    # NOTE(review): 4 and 30.0 are passed positionally; their meaning is not
    # visible here (30.0 is presumably the segment duration) - confirm
    # against ss.get_params.
    cfg = ss.get_params(output_dir, tmp_dir, 4, 30.0)

    recordings = sorted(Path(input_dir).iterdir())
    all_segments = ss.generate_segmented_paths(recordings, cfg)

    # Only subsample when the recorder is not on for the whole cycle.
    selected_segments = (
        subsample_withpaths(all_segments, cfg, cycle_length, percent_on)
        if percent_on < 1.0
        else all_segments
    )

    mappings = ss.initialize_mappings(selected_segments, cfg)
    return ss.run_models(mappings, cfg, csv_name)