In [141]:
import os
import shutil

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [142]:
# current working directory of the kernel; the relative "data/..." paths used below are resolved against it
ROOT_DIR = os.getcwd()
print("Root Dir: ", ROOT_DIR)

Root Dir:  /scratch/IOSZ/waveformer/multimod-sound-separation/multimod-waveformer


## Mozilla CV data

In [143]:
# metadata file name and the folder of the English CV-13 delta release it lives in
tsv_file = "validated.tsv"
folder_path = os.path.join(
    "data",
    "mozilla-cv",
    "cv-13-delta",
    "cv-corpus-13.0-delta-2023-03-09",
    "en",
)

In [144]:
# load the tab-separated metadata file into a dataframe
tsv_path = os.path.join(folder_path, tsv_file)
df_tsv = pd.read_csv(tsv_path, sep="\t")

In [145]:
# quick look at the frame: (rows, columns) first, then the column names
for detail in (df_tsv.shape, df_tsv.columns):
    print(detail)

(6983, 11)
Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'variant', 'locale', 'segment'],
      dtype='object')


In [146]:
# class balance of the gender label (value_counts leaves out missing values by default)
df_tsv["gender"].value_counts()

gender
male      4336
female    1514
other       97
Name: count, dtype: int64

### Data cleaning

In [147]:
# keep only the samples that have a gender label (drop rows where it is missing)
has_gender = df_tsv["gender"].notna()
df_tsv = df_tsv[has_gender]

In [148]:
# how many down-votes the remaining samples received
df_tsv["down_votes"].value_counts()

down_votes
0    5892
2      39
1      10
3       4
4       1
5       1
Name: count, dtype: int64

In [149]:
# keep only samples without downvotes
no_down_votes = df_tsv["down_votes"].eq(0)
df_tsv = df_tsv[no_down_votes]

In [150]:
# check: after filtering, 0 should be the only down-vote count left
df_tsv["down_votes"].value_counts()

down_votes
0    5892
Name: count, dtype: int64

In [151]:
# keep only male and female labels
kept_labels = ["male", "female"]
is_kept_label = df_tsv["gender"].isin(kept_labels)
df_tsv = df_tsv[is_kept_label]

In [152]:
# check: only the "male" and "female" labels should remain
df_tsv["gender"].value_counts()  

gender
male      4291
female    1506
Name: count, dtype: int64

In [153]:
# renumber the rows 0..n-1 after filtering; the old index is discarded
df_tsv = df_tsv.reset_index(drop=True)

### Replace .mp3 with .wav

In [154]:
# Point each clip path at its .wav counterpart.
# Only a trailing ".mp3" extension (any letter case) is rewritten, so a path that does not
# end in ".mp3" is left untouched instead of having its last three characters blindly replaced.
df_tsv["path"] = df_tsv["path"].str.replace(r"\.mp3$", ".wav", case=False, regex=True)

In [155]:
# preview the first rows; the "path" column should now end in .wav
df_tsv.head(3)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,1a2cacd4c67ba9001d44e3cf0ec327ab480e79b7ff0fd3...,common_voice_en_36951323.wav,"Restricted breathing, joint stiffness, and hea...",4,0,twenties,male,United States English,,en,
1,1ee465e99f453ac7b20193d2cb28986bf0d553b2ca6789...,common_voice_en_37251984.wav,The German population was either evacuated or ...,2,0,fourties,female,United States English,,en,
2,2432927a3acd24b583f0f759819eb4e1b7150bf681b938...,common_voice_en_37028857.wav,The Church said Sony did not obtain permission...,2,0,sixties,female,United States English,,en,


### Split data into train, val and test

In [156]:
# folder the train/val/test label CSVs are written to
csv_output_folder = os.path.join("data", "CVSoundScapes", "CV-13-mini")

In [157]:
# Split the cleaned samples into train / val / test label files.
# Test is 15% of all rows; val is 5% of the remaining 85% (about 4.25% of all rows).
# NOTE(review): the split is neither stratified by gender nor grouped by client_id, so the
# class balance can differ between splits and one speaker may land in several of them -
# confirm this is acceptable for the downstream task.
df_train, df_test = train_test_split(df_tsv, test_size=0.15, random_state=42)
df_train, df_val = train_test_split(df_train, test_size=0.05, random_state=42)


def _to_label_frame(df_split):
    """Return the 'gender' and 'path' columns renamed to 'label' and 'fname', with a fresh 0..n-1 index."""
    return (
        df_split[["gender", "path"]]
        .rename(columns={"gender": "label", "path": "fname"})
        .reset_index(drop=True)
    )


df_train = _to_label_frame(df_train)
df_val = _to_label_frame(df_val)
df_test = _to_label_frame(df_test)

# to_csv does not create missing directories, so make sure the output folder exists first
os.makedirs(csv_output_folder, exist_ok=True)

# save each dataframe to a csv; the index is written as well to match the original data
for split_name, df_split in (("train", df_train), ("val", df_val), ("test", df_test)):
    df_split.to_csv(os.path.join(csv_output_folder, f"{split_name}.csv"), index=True)

In [158]:
# sanity check: read the test split back from the folder it was written to
pd.read_csv(os.path.join(csv_output_folder, "test.csv"))

Unnamed: 0.1,Unnamed: 0,label,fname
0,0,male,common_voice_en_37256628.wav
1,1,male,common_voice_en_36837088.wav
2,2,female,common_voice_en_37184046.wav
3,3,male,common_voice_en_36884787.wav
4,4,male,common_voice_en_37017938.wav
...,...,...,...
865,865,female,common_voice_en_37206583.wav
866,866,female,common_voice_en_36927020.wav
867,867,male,common_voice_en_36735221.wav
868,868,male,common_voice_en_36735495.wav


In [106]:
# reference: an existing label file (FSDKaggle2018 train split) to compare the CSV layout against
fsd_train_csv = os.path.join("data", "FSDSoundScapes", "FSDKaggle2018", "train.csv")
df_temp = pd.read_csv(fsd_train_csv)
df_temp

Unnamed: 0.1,Unnamed: 0,label,fname
0,0,Saxophone,FSDKaggle2018.audio_train/b93d6988.wav
1,1,Saxophone,FSDKaggle2018.audio_train/99411449.wav
2,2,Saxophone,FSDKaggle2018.audio_train/7470ab2c.wav
3,3,Saxophone,FSDKaggle2018.audio_train/dcd37383.wav
4,4,Saxophone,FSDKaggle2018.audio_train/b120dc90.wav
...,...,...,...
3355,3355,Applause,FSDKaggle2018.audio_train/e08db496.wav
3356,3356,Applause,FSDKaggle2018.audio_train/d3ede893.wav
3357,3357,Applause,FSDKaggle2018.audio_train/ad2bb540.wav
3358,3358,Applause,FSDKaggle2018.audio_train/a5a7b9a9.wav


### Copy selected files to a specified location

In [68]:
# folder the selected clips are copied into
destination_folder = os.path.join("data", "mozilla-cv", "cv-13-selected", "clips")
destination_folder

'data/mozilla-cv/cv-13-selected/clips'

In [69]:
# Copy every selected clip into destination_folder.
# Create the folder first: if it is missing, shutil.copy treats the destination as a
# file name, so every clip would overwrite one single file called "clips" (or the copy
# would fail outright when the parent folder does not exist either).
os.makedirs(destination_folder, exist_ok=True)

clips_folder = os.path.join(folder_path, "clips")
for fname in df_tsv["path"]:
    file_path = os.path.join(clips_folder, fname)

    if os.path.isfile(file_path):
        # destination is a directory, so the clip keeps its base file name
        shutil.copy(file_path, destination_folder)
    else:
        # report missing clips instead of aborting the whole copy
        print(f"File does not exist: {file_path}")

In [71]:
# store the cleaned metadata (header row, no index column) in the cv-13-selected folder
selected_csv = os.path.join("data", "mozilla-cv", "cv-13-selected", "selected.csv")
df_tsv.to_csv(selected_csv, index=False, header=True)