In [116]:
import pandas as pd
from pydub import AudioSegment
import os
from birdclassification.preprocessing.filtering import filter_recordings_30

# Path configuration. For each non-bird dataset: the original metadata CSV,
# the folder with the source audio, and the folder that receives the converted audio.
# NOTE: inputs live under ".../Not_birds/..." while outputs go to ".../NotBirds/..."
# (different directory names).

# ESC50
esc50_original_csv = "/Users/zosia/Desktop/Not_birds/Original_csv/esc50.csv"
esc50_audio = "/Users/zosia/Desktop/Not_birds/ESC50/audio"
esc50_output_audio = "/Users/zosia/Desktop/NotBirds/ESC50"

# Audioset (clip metadata plus a separate CSV with the label definitions)
audioset_original_csv = "/Users/zosia/Desktop/Not_birds/Original_csv/audioset.csv"
audioset_original_csv_labels = "/Users/zosia/Desktop/Not_birds/Original_csv/audioset_labels.csv"
audioset_audio = "/Users/zosia/Desktop/Not_birds/Audioset/audio"
audioset_output_audio = "/Users/zosia/Desktop/NotBirds/Audioset"

# Warblrb
warblrb_original_csv = "/Users/zosia/Desktop/Not_birds/Original_csv/warblrb.csv"
warblrb_audio = "/Users/zosia/Desktop/Not_birds/Warblrb/audio"
warblrb_output_audio = "/Users/zosia/Desktop/NotBirds/Warblrb"

# Freefield
freefield_original_csv = "/Users/zosia/Desktop/Not_birds/Original_csv/freefield.csv"
freefield_audio = "/Users/zosia/Desktop/Not_birds/Freefield/audio"
freefield_output_audio = "/Users/zosia/Desktop/NotBirds/Freefield"

# Output CSV. Despite its name, `output_folder` is the path of the CSV *file*
# that the merged table is written to, not a directory.
output_folder = "/Users/zosia/Desktop/data.csv"

In [117]:
def convert(recordings_folder, output_folder, frame_rate=32000):
    """Convert every WAV file in a folder to OGG at a fixed sample rate.

    Each ``*.wav`` file found directly inside ``recordings_folder`` (sub-folders
    are not visited) is loaded, resampled to ``frame_rate`` and exported as
    ``<same stem>.ogg`` into ``output_folder``. Entries without a ``.wav``
    extension are ignored. The extension check is case-insensitive.

    Args:
        recordings_folder: Directory containing the source WAV files.
        output_folder: Directory that receives the OGG files. It is created
            (including parents) when it does not exist yet.
        frame_rate: Target sample rate in Hz. Defaults to 32000.

    Returns:
        The number of files that were converted.
    """
    # Exporting into a missing directory fails, so make sure it exists first.
    os.makedirs(output_folder, exist_ok=True)

    converted = 0
    # Sorted so the processing order (and the progress output) is deterministic.
    for filename in sorted(os.listdir(recordings_folder)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".wav":
            continue

        input_path = os.path.join(recordings_folder, filename)
        output_path = os.path.join(output_folder, stem + ".ogg")

        sound = AudioSegment.from_wav(input_path).set_frame_rate(frame_rate)
        # pydub's export() returns the opened output file; close it right away
        # so handles do not pile up while converting thousands of recordings.
        sound.export(output_path, format="ogg").close()

        # Count only files that were really converted, so the progress number
        # is not inflated by non-WAV directory entries.
        converted += 1
        if converted % 100 == 0:
            print(converted)

    return converted

In [118]:
def delete_files(bird_files, path, row_name, extension=""):
    """Delete the files named in one column of a DataFrame from a folder.

    Deletion is best-effort: a missing file or any other error is reported on
    stdout and the loop continues with the next entry.

    Args:
        bird_files: DataFrame whose ``row_name`` column holds the file names
            (or ids) to delete.
        path: Directory that contains the files.
        row_name: Name of the column with the file names.
        extension: Optional suffix appended to every value, e.g. ``".wav"``
            when the column stores bare ids. Defaults to no suffix.
    """
    for filename in bird_files[row_name]:
        # Values may be numeric ids; format them as text so os.path.join does
        # not raise TypeError before the error handling below is reached.
        file_path = os.path.join(path, f"{filename}{extension}")
        try:
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except FileNotFoundError:
            # Name the missing file so the message is actionable.
            print(f"Not found: {file_path}")
        except Exception as e:
            print(f"Error deleting file {file_path}: {e}")

### ESC50

In [119]:
# Read the comma-separated ESC50 metadata and report how many rows it has.
df = pd.read_csv(esc50_original_csv, sep=",")
print("Overall size: ", len(df))

Overall size:  2000


In [120]:
# ESC50 categories treated as bird sounds; their recordings are left out of
# the "not a bird" table.
categories_to_delete = ['rooster', 'hen', 'crow', 'chirping_birds']
bird_files = df[df['category'].isin(categories_to_delete)]

# Destructive (removes the source audio from disk), hence disabled by default.
#delete_files(bird_files, esc50_audio, 'filename')

# Build the result on a fresh frame instead of assigning columns to a slice of
# `df`, which triggers SettingWithCopyWarning.
new_df = (
    df.loc[~df['filename'].isin(bird_files['filename']), ['filename']]
    .assign(
        # Strip only a trailing ".wav". The pattern is anchored and the dot is
        # escaped, so the result does not depend on the pandas default for
        # `regex` and nothing inside the name is touched.
        filename=lambda frame: frame['filename'].str.replace(r'\.wav$', '', regex=True),
        isBird=0,
        folder='ESC50',
    )
)
esc50_df = new_df

In [121]:
# Number of ESC50 recordings that remain after the bird categories are removed.
print("Size after excluding chosen categories: ", len(esc50_df))

Size after excluding chosen categories:  1840


In [122]:
# Count only converted .ogg files (other files in the output tree must not
# count as converted audio) and convert only while fewer of them exist than
# the table lists. A strict "<" avoids re-running the conversion when the
# output is already complete.
esc50_converted = sum(
    name.endswith(".ogg")
    for _, _, names in os.walk(esc50_output_audio)
    for name in names
)
if esc50_converted < len(esc50_df):
    convert(esc50_audio, esc50_output_audio)

### WARBLRB

In [123]:
# Read the comma-separated Warblrb metadata and report how many rows it has.
df = pd.read_csv(warblrb_original_csv, sep=",")
print("Overall size: ", len(df))

Overall size:  8000


In [124]:
# Rows flagged as containing a bird; only needed by the (disabled) deletion below.
bird_files = df[df['hasbird'] == 1]

# Destructive (removes the source audio from disk), hence disabled by default.
#delete_files(bird_files, warblrb_audio, 'itemid')

# Keep the recordings without birds. The chain produces a new frame, so no
# column is assigned on a slice of `df` (avoids SettingWithCopyWarning) and no
# in-place rename is needed.
new_df = (
    df.loc[df['hasbird'] == 0, ['itemid', 'hasbird']]
    .rename(columns={'itemid': 'filename', 'hasbird': 'isBird'})
    .assign(folder='Warblrb')
)
warblrb_df = new_df

In [125]:
# Number of Warblrb recordings without birds.
print("Size after excluding chosen categories: ", len(warblrb_df))

Size after excluding chosen categories:  1955


In [126]:
# Count only converted .ogg files (other files in the output tree must not
# count as converted audio) and convert only while fewer of them exist than
# the table lists. A strict "<" avoids re-running the conversion when the
# output is already complete.
warblrb_converted = sum(
    name.endswith(".ogg")
    for _, _, names in os.walk(warblrb_output_audio)
    for name in names
)
if warblrb_converted < len(warblrb_df):
    convert(warblrb_audio, warblrb_output_audio)

### AUDIOSET

In [127]:
# Audioset ships two tables: the clip list (read into df_classes) and the
# label definitions (read into df_labels).
df_classes = pd.read_csv(audioset_original_csv, sep=',')
df_labels = pd.read_csv(audioset_original_csv_labels, sep=',')

In [128]:
# Split the comma-separated label ids and give every (clip, label id) pair its own row.
df_classes = df_classes.assign(
    positive_labels=df_classes['positive_labels'].str.split(',')
).explode('positive_labels')

# Match each label id against the 'mid' column of the label table, then
# collect the display names of every clip (YTID) into one list.
merged_df = df_classes.merge(df_labels, left_on='positive_labels', right_on='mid')
df = merged_df.groupby('YTID')['display_name'].agg(list).reset_index()
print("Overall size: ", len(df))

Overall size:  19644


In [129]:
# Audioset display names treated as bird-related; a clip carrying any of them
# is left out of the "not a bird" table.
# NOTE(review): the official AudioSet label appears to be
# 'Crowing, cock-a-doodle-doo' (with a comma) - confirm the entry below matches
# the display names in the labels CSV.
categories_to_delete = [
    'Bird', 'Bird vocalization, bird call, bird song', 'Chirp, tweet', 'Squawk',
    'Pigeon, dove', 'Coo', 'Crow', 'Caw', 'Owl', 'Hoot',
    'Bird flight, flapping wings', 'Fowl', 'Chicken, rooster', 'Cluck',
    'Crowing cock-a-doodle-doo', 'Turkey', 'Gobble', 'Duck', 'Quack', 'Goose',
    'Honk', 'Field recording', 'Croak', 'Squeal', 'Chirp tone', 'Wild animals',
]

# True for clips whose label list contains at least one excluded category.
has_bird_label = df['display_name'].apply(
    lambda labels: any(cat in labels for cat in categories_to_delete)
).astype(bool)
bird_files = df[has_bird_label]

# Destructive (removes the source audio from disk), hence disabled by default.
#delete_files(bird_files, audioset_audio, 'YTID')

# Negate the mask directly. Calling isin() on the list-valued 'display_name'
# column compares unhashable lists, which is slow and depends on pandas
# internals, while selecting exactly the same rows.
new_df = (
    df[~has_bird_label]
    .rename(columns={'YTID': 'filename'})
    .assign(folder='Audioset', isBird=0)
)
audioset_df = new_df[['filename', 'isBird', 'folder']].copy()

In [130]:
# Number of Audioset clips left after the bird-related labels are removed.
print("Size after excluding chosen categories: ", len(audioset_df))

Size after excluding chosen categories:  18779


In [131]:
# Count only converted .ogg files (other files in the output tree must not
# count as converted audio) and convert only while fewer of them exist than
# the table lists. A strict "<" avoids re-running the conversion when the
# output is already complete.
audioset_converted = sum(
    name.endswith(".ogg")
    for _, _, names in os.walk(audioset_output_audio)
    for name in names
)
if audioset_converted < len(audioset_df):
    convert(audioset_audio, audioset_output_audio)

### Freefield

In [132]:
# Read the comma-separated Freefield metadata and report how many rows it has.
df = pd.read_csv(freefield_original_csv, sep=",")
print("Overall size: ", len(df))

Overall size:  7690


In [133]:
# Destructive (removes the source audio from disk), hence disabled by default.
#bird_files = df[df['hasbird'] == 1]
#delete_files(bird_files, freefield_audio, 'itemid')

# Keep the recordings without birds. The chain produces a new frame, so no
# column is assigned on a slice of `df` (avoids SettingWithCopyWarning) and no
# in-place rename is needed.
new_df = (
    df.loc[df['hasbird'] == 0, ['itemid', 'hasbird']]
    .rename(columns={'itemid': 'filename', 'hasbird': 'isBird'})
    .assign(folder='Freefield')
)
freefield_df = new_df

In [134]:
# Number of Freefield recordings without birds.
print("Size after excluding chosen categories: ", len(freefield_df))

Size after excluding chosen categories:  5755


In [135]:
# Count only converted .ogg files (other files in the output tree must not
# count as converted audio) and convert only while fewer of them exist than
# the table lists. A strict "<" avoids re-running the conversion when the
# output is already complete.
freefield_converted = sum(
    name.endswith(".ogg")
    for _, _, names in os.walk(freefield_output_audio)
    for name in names
)
if freefield_converted < len(freefield_df):
    convert(freefield_audio, freefield_output_audio)

### Bird sounds

In [136]:
# Bird recordings come from the project helper filter_recordings_30(); this
# cell only reads its 'id' and 'Latin name' columns.
df = filter_recordings_30()

# Same three columns as the non-bird tables: the recording id becomes the
# file name, the Latin name becomes the folder, and every row is labelled as a bird.
birds_df = (
    df[['id', 'Latin name']]
    .rename(columns={'id': 'filename', 'Latin name': 'folder'})
    .assign(isBird=1)
)

  recordings = pd.read_csv(filepath_recordings)


### Merge datasets

In [137]:
# Stack the four non-bird tables and the bird table into one frame with a
# fresh 0..n-1 index; columns are aligned by name.
result_df = pd.concat(
    [esc50_df, warblrb_df, audioset_df, freefield_df, birds_df],
    axis=0,
    ignore_index=True,
)

# `output_folder` holds the path of the CSV file; with the default settings
# the index is written as the first column.
result_df.to_csv(output_folder)

#### Duplicates

In [138]:
# Mark every row whose 'filename' occurs more than once; keep=False flags all
# occurrences, not only the repeats.
# NOTE(review): only 'filename' is compared, so equal names under different
# 'folder' values are counted as duplicates too - confirm this is intended.
duplicates = result_df.duplicated(subset=['filename'], keep=False)
duplicate_count = duplicates.sum()
print("Duplicate count: ", duplicate_count)

Duplicate count:  296
