In [201]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from pydub import AudioSegment

import os, shutil

In [181]:
ROOT_DIR = os.getcwd()
print("Root Dir: ", ROOT_DIR)

Root Dir:  /scratch/IOSZ/waveformer/multimod-sound-separation/multimod-waveformer


## Mozilla CV data

In [182]:
SEED = 42

cv_corpus = "cv-corpus-13.0-delta-2023-03-09"  # "cv-corpus-14.0-delta-2023-06-23"
tsv_file = "validated.tsv"
folder_path = os.path.join("data", "mozilla-cv", cv_corpus, "en")

csv_output_folder = os.path.join("data", "CVSoundScapes", "cv-files")

### Read metadata

In [144]:
# read tsv file
df_tsv = pd.read_csv(os.path.join(folder_path, tsv_file), sep='\t')

In [145]:
# dataframe details
print(df_tsv.shape)
print(df_tsv.columns)

(6983, 11)
Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',
       'gender', 'accents', 'variant', 'locale', 'segment'],
      dtype='object')


In [146]:
df_tsv["gender"].value_counts()

gender
male      4336
female    1514
other       97
Name: count, dtype: int64

### Data cleaning

In [147]:
# get only samples with 
df_tsv = df_tsv[~df_tsv["gender"].isna()]

In [148]:
df_tsv["down_votes"].value_counts()

down_votes
0    5892
2      39
1      10
3       4
4       1
5       1
Name: count, dtype: int64

In [149]:
df_tsv = df_tsv[df_tsv["down_votes"] == 0]  # keep only samples without downvotes

In [150]:
df_tsv["down_votes"].value_counts()

down_votes
0    5892
Name: count, dtype: int64

In [151]:
df_tsv = df_tsv[df_tsv["gender"].isin(["male", "female"])]  # keep only male and female labels

In [152]:
df_tsv["gender"].value_counts()  

gender
male      4291
female    1506
Name: count, dtype: int64

In [153]:
df_tsv.reset_index(drop=True, inplace=True)

### (Optional) Convert mp3 files to wav - manual choice

In [154]:
# define the directory that contains the mp3 files
input_dir = os.path.join(folder_path, 'clips')
output_dir = os.path.join('data', 'mozilla-cv', 'cv-13-selected')  # MANUALLY SELECT LOCATION

print(input_dir)
print(output_dir)

data/mozilla-cv/cv-corpus-13.0-delta-2023-03-09/en/clips
data/mozilla-cv/cv-13-selected


In [155]:
def convert_mp3_to_wav(mp3_path, wav_path):
    audio = AudioSegment.from_mp3(mp3_path)
    audio.export(wav_path, format='wav')

In [156]:
# make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# iterate over the files in the above selected df
for idx, row in df_tsv.iterrows():
    
    # get the path to the given file
    file_name = row["path"]
    
    # check if the file is an mp3
    if file_name.endswith('.mp3'):
        mp3_path = os.path.join(input_dir, file_name)
        wav_file_name = os.path.splitext(file_name)[0] + '.wav'  # replace .mp3 with .wav
        wav_path = os.path.join(output_dir, wav_file_name)
        
        convert_mp3_to_wav(mp3_path, wav_path)  # run the conversion

### Replace .mp3 with .wav

In [157]:
df_tsv["path"] = df_tsv["path"].apply(lambda row: row[:-3] + "wav")

In [158]:
df_tsv.head(3)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,1a2cacd4c67ba9001d44e3cf0ec327ab480e79b7ff0fd3...,common_voice_en_36951323.wav,"Restricted breathing, joint stiffness, and hea...",4,0,twenties,male,United States English,,en,
1,1ee465e99f453ac7b20193d2cb28986bf0d553b2ca6789...,common_voice_en_37251984.wav,The German population was either evacuated or ...,2,0,fourties,female,United States English,,en,
2,2432927a3acd24b583f0f759819eb4e1b7150bf681b938...,common_voice_en_37028857.wav,The Church said Sony did not obtain permission...,2,0,sixties,female,United States English,,en,


### (Optional) Join multiple CV datasets into a single csv file

#### This is done by running the above code one time/cv dataset and storing the df into a temp file as below to join them afterwards

In [163]:
# df_tsv_13 = df_tsv.copy()
# df_tsv_13.to_csv(os.path.join(ROOT_DIR, "data", "mozilla-cv", "cv-13-selected.csv"), header=True, index=False)

In [142]:
# df_tsv_14 = df_tsv.copy()
# df_tsv_14.to_csv(os.path.join(ROOT_DIR, "data", "mozilla-cv", "cv-14-selected.csv"), header=True, index=False)

In [176]:
# df_tsv = pd.concat([df_tsv, df_tsv_14], axis=0).reset_index(drop=True)
# df_tsv.to_csv(os.path.join(ROOT_DIR, "data", "mozilla-cv", "cv-all-selected.csv"), index=False, header=True)

In [None]:
# to read the file back
# df_tsv = pd.read_csv(os.path.join(ROOT_DIR, "data", "mozilla-cv", "cv-all-selected.csv"))
# df_tsv.head(3)

### Split data into train, val and test

In [170]:
# create the train/test split
df_train, df_test = train_test_split(df_tsv, test_size=0.15, random_state=SEED)

# get the val split
df_train, df_val = train_test_split(df_train, test_size=0.05, random_state=SEED)

# reordering the columns
df_train = df_train[['gender', 'path']].reset_index(drop=True, inplace=False)
df_train.columns = ['label', 'fname']

df_val = df_val[['gender', 'path']].reset_index(drop=True, inplace=False)
df_val.columns = ['label', 'fname']

df_test = df_test[['gender', 'path']].reset_index(drop=True, inplace=False)
df_test.columns = ['label', 'fname']

# save each dataframe to a csv
df_train.to_csv(os.path.join(csv_output_folder, 'train.csv'), index=True)  # index to match original data
df_val.to_csv(os.path.join(csv_output_folder, 'val.csv'), index=True)
df_test.to_csv(os.path.join(csv_output_folder, 'test.csv'), index=True)

In [172]:
pd.read_csv(os.path.join(csv_output_folder, "train.csv"))

Unnamed: 0.1,Unnamed: 0,label,fname
0,0,male,common_voice_en_37289426.wav
1,1,female,common_voice_en_37206851.wav
2,2,male,common_voice_en_36657414.wav
3,3,male,common_voice_en_36808767.wav
4,4,female,common_voice_en_36906367.wav
...,...,...,...
7378,7378,male,common_voice_en_36912887.wav
7379,7379,male,common_voice_en_37343524.wav
7380,7380,male,common_voice_en_37539766.wav
7381,7381,female,common_voice_en_37976327.wav


In [89]:
# # example of existing data
# df_temp = pd.read_csv(os.path.join("data", "FSDSoundScapes", "FSDKaggle2018", "train.csv"))
# df_temp

### Copy selected files to the new dataset location

In [206]:
DATA_SPLIT = "val"  # MANUALLY CHOOSE THE SPLIT
DATA_INPUT_LOCATION = os.path.join(ROOT_DIR, "data", "mozilla-cv", "cv-all-selected")

# switch the folder loccations
if DATA_SPLIT == "train":
    destination_folder = os.path.join(csv_output_folder, "train")
    df_to_use = df_train
elif DATA_SPLIT == "test":
    destination_folder = os.path.join(csv_output_folder, "test")
    df_to_use = df_test
else:
    destination_folder = os.path.join(csv_output_folder, "val")
    df_to_use = df_val
    
print("Data input: ", DATA_INPUT_LOCATION)
print("Destination: ", destination_folder, " : ", df_to_use.shape)

Data input:  /scratch/IOSZ/waveformer/multimod-sound-separation/multimod-waveformer/data/mozilla-cv/cv-all-selected
Destination:  data/CVSoundScapes/cv-files/val  :  (389, 2)


In [207]:
# parse all rows
for idx, row in df_to_use.iterrows():
    file_path = os.path.join(DATA_INPUT_LOCATION, row['fname'])
    
    # Check if file exists
    if os.path.isfile(file_path):
        # Copy the file to the destination folder
        shutil.copy(file_path, os.path.join(destination_folder, row['label']))
    else:
        print(f"File does not exist: {file_path}")

## Following code is optional post-hoc analysis

### Check amplitude values

#### Turns out FSDKaggle2018 has many files going outside the [-1, 1] amplitude range

In [19]:
import librosa, soundfile

In [6]:
curr_path = os.path.join("data", "CVSoundScapes", "CV-13-mini", "train", "male", "common_voice_en_36705411.wav")

In [7]:
# Load audio file
y, sr = librosa.load(curr_path)

# Check max and min amplitude
max_amp = max(y)
min_amp = min(y)

print('Max amplitude:', max_amp)
print('Min amplitude:', min_amp)

Max amplitude: 0.53837484
Min amplitude: -0.41033322


In [34]:
audio_dir = os.path.join("data", "CVSoundScapes", "CV-13-mini", "test", "female")
#audio_dir = os.path.join("data", "CVSoundScapes", "TAU-acoustic-sounds", "TAU-urban-acoustic-scenes-2019-development", "audio")
#audio_dir = os.path.join("data", "FSDSoundScapes", "FSDKaggle2018", "train", "Applause")

print(audio_dir)

data/CVSoundScapes/CV-13-mini/test/female


In [35]:
# Get all audio files in the directory
audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]

ct = 0
# Check each file
for audio_file in audio_files:
    
    # Load audio file
    y, sr = librosa.load(os.path.join(audio_dir, audio_file))
    
    # Check max and min amplitude
    max_amp = max(y)
    min_amp = min(y)

    # If amplitude is out of range, print file name and amplitude
    if max_amp > 1 or min_amp < -1:
        # print('File:', audio_file)
        # print('Max amplitude:', max_amp)
        # print('Min amplitude:', min_amp)
        # print('---')
        
        # to normalize and override those files
        y = librosa.util.normalize(y)
        
        soundfile.write(os.path.join(audio_dir, audio_file), y, sr)
        
        ct += 1

print("Files changed: ", ct)

Files changed:  3


### Check files from multiple downloads

In [36]:
os.getcwd()

'/scratch/IOSZ/waveformer/multimod-sound-separation/multimod-waveformer'

In [44]:
# folder locations
dir1 = os.path.join('data', 'mozilla-cv', 'cv-corpus-14.0-delta-2023-06-23', 'en', 'clips')
dir2 = os.path.join('data', 'mozilla-cv', 'cv-corpus-13.0-delta-2023-03-09', 'en', 'clips')

# get filenames in each directory (not including the path)
files1 = set(os.listdir(dir1))
files2 = set(os.listdir(dir2))

# find common filenames
common_files = files1.intersection(files2)

# print common filenames
for file in common_files:
    print(file)