In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

import torchaudio

from tqdm import tqdm
tqdm.pandas()

In [2]:
RANDOM_STATE = 42
SHUFFLE = True

DATA_PATH = "../data"

# Data Preparation

In [3]:
train_df = pd.read_csv(DATA_PATH + "/train.csv")

In [4]:
train_df.head()

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0


## Create duration column

In [5]:
def get_audio_duration(path):
    metadata = torchaudio.info(path)
    return metadata.num_frames
    # return metadata.num_frames / metadata.sample_rate

train_df['num_frames'] = train_df['filename'].progress_apply(lambda x: get_audio_duration(f"{DATA_PATH}/train_audio/{x}"))

100%|██████████| 28564/28564 [05:46<00:00, 82.33it/s]


## Check for semi-duplicates

In [6]:
same_columns = ['primary_label', 'latitude', 'longitude', 'author', 'num_frames']
diff_column = 'collection'

In [7]:
def get_duplicate_rows_with_diff(df, same_columns, diff_column):
    """
    Returns all rows from the DataFrame 'df' that share the same values in the columns specified in 'columns_check'
    and have different values in 'diff_column'.
    """
    grouped = df.groupby(same_columns)
    filtered = grouped.filter(lambda x: x[diff_column].nunique() > 1)
    
    return filtered

In [8]:
filtered = get_duplicate_rows_with_diff(
    df=train_df,
    same_columns=same_columns, diff_column=diff_column
)


print(filtered.shape)
filtered.head()

(44, 14)


Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license,num_frames
9,126247,"['65448', '22976', '476538']",['advertisement call'],126247/XC941297.ogg,XC,3.5,https://xeno-canto.org/941297,9.0465,-79.3024,Leptodactylus insularum,Spotted Foam-nest Frog,Chris Harrison,cc-by-nc-sa 4.0,517007
10,126247,[''],[''],126247/iNat1109254.ogg,iNat,0.0,https://static.inaturalist.org/sounds/1109254.wav,9.0465,-79.3024,Leptodactylus insularum,Spotted Foam-nest Frog,Chris Harrison,cc-by-nc 4.0,517007
20,134933,[''],['advertisement call'],134933/XC941298.ogg,XC,4.0,https://xeno-canto.org/941298,8.626,-80.1392,Espadarana prosoblepon,Emerald Glass Frog,Chris Harrison,cc-by-nc-sa 4.0,1544717
22,134933,[''],[''],134933/iNat1160199.ogg,iNat,0.0,https://static.inaturalist.org/sounds/1160199.wav,8.626,-80.1392,Espadarana prosoblepon,Emerald Glass Frog,Chris Harrison,cc-by-nc 4.0,1544717
184,22973,[''],['advertisement call'],22973/XC892927.ogg,XC,5.0,https://xeno-canto.org/892927,11.1631,-60.827,Leptodactylus fuscus,Whistling Grass Frog,Chris Harrison,cc-by-nc-sa 4.0,1191936


In [9]:
filtered[diff_column].value_counts()

collection
XC      22
iNat    22
Name: count, dtype: int64

## Drop semi-duplicates

In [14]:
# drop iNat
train_df_nod = train_df.drop(index=filtered[filtered['collection'] == 'iNat'].index).reset_index()

In [15]:
# recheck
filtered_nod = get_duplicate_rows_with_diff(
    df=train_df_nod,
    same_columns=same_columns, diff_column=diff_column
)


print(filtered_nod.shape)

(0, 15)


# Split

In [16]:
splitter = StratifiedKFold(
    n_splits=5,
    shuffle=SHUFFLE,
    random_state=RANDOM_STATE
)

split = list(splitter.split(X=train_df_nod, y=train_df_nod['primary_label']))



In [17]:
split_df = train_df_nod[['primary_label', 'filename']].copy()
split_df['fold'] = -1

for ii, (train_idx, test_idx) in enumerate(split):
    split_df.loc[test_idx, 'fold'] = ii

In [18]:
split_df['fold'].value_counts()

fold
1    5709
0    5709
2    5708
3    5708
4    5708
Name: count, dtype: int64

In [19]:
all_labels = set(split_df['primary_label'].values)
for ii in range(5):
    fold_labels = set(split_df.loc[split_df['fold'] == ii, 'primary_label'].values)

    diff = all_labels - fold_labels
    print(f"fold_{ii}: {diff}")

fold_0: {'21038', '528041', '1462711', '42113', '476537', '66016', '1194042', '66531', '66893', '64862'}
fold_1: {'21038', '528041', '1462711', '1192948', '42113', '47067', '41778', '868458', '66531', '66893'}
fold_2: {'21038', '67082', '528041', '476538', '1139490', '47067', '134933', '41778', '81930', '21116', '42087', '66531', '66578'}
fold_3: {'67082', '1139490', '47067', '134933', '41778', '65419', '81930', '476537', '523060', '66016', '21116', '24292', '42087', '64862', '66578'}
fold_4: {'67082', '42113', '1139490', '65419', '81930', '476537', '66016', '21116', '1194042', '42087', '24292', '64862', '66578'}


> ... at least one CV split with each species in train AND val splits.

In [21]:
# fold_0 for validation
val_split = 0

fold_labels = set(split_df.loc[split_df['fold'] == val_split, 'primary_label'].values)
missed_labels = all_labels - fold_labels

split_df.loc[split_df['primary_label'].isin(missed_labels), 'primary_label'].value_counts()

primary_label
1194042    3
1462711    3
66893      3
21038      2
42113      2
476537     2
528041     2
64862      2
66016      2
66531      2
Name: count, dtype: int64

In [22]:
split_df.loc[
    split_df['primary_label'].isin(missed_labels) & \
    ~split_df['primary_label'].duplicated(),
    'fold'
] = val_split

In [None]:
# recheck
all_labels = set(split_df['primary_label'].values)
for ii in range(5):
    fold_labels = set(split_df.loc[split_df['fold'] == ii, 'primary_label'].values)

    diff = all_labels - fold_labels
    print(f"fold_{ii}: {diff}")

fold_0: set()
fold_1: {'21038', '528041', '1462711', '1192948', '42113', '47067', '41778', '476537', '1194042', '868458', '66531', '66893', '64862'}
fold_2: {'21038', '67082', '528041', '1462711', '476538', '1139490', '47067', '134933', '41778', '81930', '66016', '21116', '42087', '66531', '66578'}
fold_3: {'67082', '528041', '42113', '1139490', '47067', '134933', '41778', '65419', '81930', '476537', '523060', '66016', '21116', '24292', '42087', '64862', '66578'}
fold_4: {'21038', '67082', '42113', '1139490', '65419', '81930', '476537', '66016', '21116', '1194042', '42087', '24292', '66531', '66893', '64862', '66578'}


In [24]:
split_df.to_csv('cv_split.csv', index=False)