In [2]:
import os
from glob import glob
from shutil import copyfile, move
from collections import defaultdict
import pandas as pd

2022-12-01 18:16:26.033868: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Preprocessing dilemmas:
    

1. Should we balance the dataset?
2. Should we filter by country (for Germany that leaves only 35 audio files)?
3. All tracks must have the same length - should we trim every track to the shortest duration in the dataset, or zero-pad the shorter ones?
4. We could chop tracks into fixed-length segments, which would increase the dataset size and also solve #3 (tracks of equal length) -> but will this help in the case of accents?



In [3]:
# Move the raw per-language folders into ./data/original.
# This only happens when neither ./data/original nor ./data/filtered exists yet.
print(os.listdir("./data/"))
if not os.path.exists("./data/original") and not os.path.exists("./data/filtered"):
    # os.listdir returns entries in arbitrary order, so select the folders to
    # move explicitly instead of assuming the first entry is the one to skip.
    folders = [
        entry for entry in os.listdir("./data/")
        if not entry.startswith(".") and os.path.isdir(f"./data/{entry}")
    ]
    # Create the destination first so every folder has an existing parent to move into.
    os.makedirs("./data/original", exist_ok=True)
    for folder in folders:
        move(f"./data/{folder}", f"./data/original/{folder}")

print(os.listdir("./data/"))

['.DS_Store', 'original', 'stft_data_16sec.json', 'filtered', 'processed']
['.DS_Store', 'original', 'stft_data_16sec.json', 'filtered', 'processed']


In [4]:
# Collect the path of every .wav file located one folder below ./data/original.
audio_files = glob("./data/original/*/*.wav")

# Dataset statistics

In [5]:
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a single-channel waveform at 16 kHz.

    Args:
        filename: path of the WAV file to read.

    Returns:
        The decoded waveform, resampled from the file's own sample rate
        to 16000 Hz.
    """
    # Raw, still-encoded bytes of the file.
    raw_bytes = tf.io.read_file(filename)
    # Decode to one channel; the file's sample rate comes back alongside the samples.
    decoded, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    # Drop the trailing channel axis left over from decoding.
    mono = tf.squeeze(decoded, axis=-1)
    # Resample from whatever rate the file was recorded at down/up to 16 kHz.
    return tfio.audio.resample(
        mono,
        rate_in=tf.cast(native_rate, dtype=tf.int64),
        rate_out=16000,
    )

In [6]:
def get_lang_dict(dirpath):
    """Map each language to the lengths of its audio clips.

    Args:
        dirpath: glob pattern selecting the audio files, laid out as
            ``<anything>/<language>/<file>.wav``.

    Returns:
        defaultdict(list): language name (the name of the file's parent
        folder) -> list holding ``len()`` of each waveform returned by
        ``load_wav_16k_mono``.
    """
    lang_dict = defaultdict(list)
    for file in glob(dirpath):
        # Take the language from the parent folder name rather than from a
        # fixed position in the path, so the pattern may point at any
        # directory depth and works with any path separator.
        lang = os.path.basename(os.path.dirname(file))
        lang_dict[lang].append(len(load_wav_16k_mono(file)))
    return lang_dict

In [7]:
def get_stats(dirpath, sample_rate=16000):
    """Compute per-language summary statistics of clip length.

    Args:
        dirpath: glob pattern forwarded to ``get_lang_dict``.
        sample_rate: number of samples per second used to convert lengths
            to seconds. Defaults to 16000.

    Returns:
        A ``(df_stats_frames, df_stats_duration)`` tuple of DataFrames with
        one column per language and the rows produced by ``describe()``:
        df_stats_frames   = clip length measured in number of samples
        df_stats_duration = clip length measured in seconds
    """
    stat_rows = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    lang_dict = get_lang_dict(dirpath)
    languages_found = list(lang_dict)
    df_stats_frames = pd.DataFrame(index=stat_rows, columns=languages_found)
    df_stats_duration = pd.DataFrame(index=stat_rows, columns=languages_found)
    for lang, lengths in lang_dict.items():
        # Same lengths expressed in seconds instead of samples.
        duration_len = [length / sample_rate for length in lengths]
        df_stats_frames[lang] = pd.Series(lengths).describe()
        df_stats_duration[lang] = pd.Series(duration_len).describe()
    return df_stats_frames, df_stats_duration

In [8]:
# Glob pattern matching every .wav file one folder below ./data/original.
ORIGINAL_DATASET_PATH = "./data/original/*/*.wav"
# Per-language length statistics for the unfiltered dataset (in samples and in seconds).
df_stats_frames, df_stats_duration = get_stats(ORIGINAL_DATASET_PATH)

2022-12-01 18:16:46.138396: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-01 18:16:46.347351: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: AVX2 FMA




In [9]:
df_stats_duration

Unnamed: 0,german,mandarin,russian,english
count,45.0,156.0,81.0,321.0
mean,24.603707,30.816357,29.573198,22.602022
std,3.010575,6.798004,8.871542,3.430256
min,19.229125,18.92425,18.896687,16.4605
25%,22.824687,25.486313,23.447125,20.28675
50%,24.412375,29.745625,27.43325,22.119812
75%,26.038625,34.685141,33.418312,24.244187
max,34.014812,55.59,66.142,41.670375


The number of files per language varies widely - from 45 audio clips for German to 321 for English.
Clip duration also varies considerably - from about 16 sec (shortest English clip) up to 66 sec (longest Russian clip).

Preprocessing decisions:
1. limit number of audio files per language to 45 to avoid imbalanced dataset
2. ensure the file duration is 16 seconds (skip those that are less than 16, trim if longer)

In [10]:
df_stats_frames

Unnamed: 0,german,mandarin,russian,english
count,45.0,156.0,81.0,321.0
mean,393659.311111,493061.711538,473171.2,361632.352025
std,48169.197338,108768.058246,141944.7,54884.094414
min,307666.0,302788.0,302347.0,263368.0
25%,365195.0,407781.0,375154.0,324588.0
50%,390598.0,475930.0,438932.0,353917.0
75%,416618.0,554962.25,534693.0,387907.0
max,544237.0,889440.0,1058272.0,666726.0


# Filter by country

In [13]:
def filter_by_country(countries, audio_files, dst_root="./data/filtered"):
    """Copy the audio files recorded in the given countries into ``dst_root``.

    Each file is expected at ``<anything>/<language>/<name>_<country>.wav``.
    Matching files are copied to ``<dst_root>/<language>/<file name>``;
    the source files are left untouched.

    Args:
        countries: collection of country tags to keep.
        audio_files: iterable of paths to the candidate .wav files.
        dst_root: directory that receives the per-language folders.
            Defaults to "./data/filtered".
    """
    for file in audio_files:
        # Language = name of the parent folder, independent of path depth
        # and path separator.
        lang = os.path.basename(os.path.dirname(file))
        filename = os.path.basename(file)
        # Country = last "_"-separated part of the file name, without ".wav".
        country = filename.split("_")[-1].split(".wav")[0]
        if country not in countries:
            continue
        dst = os.path.join(dst_root, lang)
        # exist_ok avoids a separate existence check before creating the folder.
        os.makedirs(dst, exist_ok=True)
        copyfile(file, os.path.join(dst, filename))

In [14]:
# Country tags to keep; filter_by_country matches them against the last
# "_"-separated part of each file name.
countries = ["russia", "germany", "usa", "china"]
audio_files = glob("./data/original/*/*.wav")

# Only filter when ./data/filtered does not exist yet, so re-running this
# cell does not copy the files again.
if not os.path.exists("./data/filtered"):
    filter_by_country(countries, audio_files)

In [15]:
# Glob pattern matching every .wav file one folder below ./data/filtered.
FILTERED_DATASET_PATH = "./data/filtered/*/*.wav"
# Recompute the per-language statistics on the filtered files; this rebinds
# df_stats_frames and df_stats_duration.
df_stats_frames, df_stats_duration = get_stats(FILTERED_DATASET_PATH)

In [16]:
df_stats_duration

Unnamed: 0,german,mandarin,russian,english
count,35.0,35.0,35.0,35.0
mean,24.662941,29.9251,30.194914,23.623271
std,3.140721,7.596608,8.254043,3.492605
min,19.997688,19.7515,18.896687,16.474438
25%,22.68125,24.804094,24.393062,21.428312
50%,24.412375,29.044375,27.834125,23.266375
75%,26.252812,32.165219,36.628219,25.771375
max,34.014812,55.59,53.257375,31.362063


## Downsample for balanced dataset

In [19]:
# Entry names inside ./data/original, used as the language list by downsample().
# NOTE(review): os.listdir also returns stray files (e.g. .DS_Store) if any are present.
languages = os.listdir("./data/original")
languages

['german', 'mandarin', 'russian', 'english']

In [20]:
# Smallest per-language clip count, taken from the 'count' row of the statistics table.
min_num = int(df_stats_duration.loc['count'].min())
min_num

35

In [21]:
def downsample(min_num):
    """Delete surplus files so each language keeps at most ``min_num`` clips.

    For every entry of the notebook-level ``languages`` list, the .wav files
    in ``./data/filtered/<language>/`` are sorted by path and every file
    after the first ``min_num`` is removed from disk. This is destructive.

    Args:
        min_num: maximum number of files to keep per language.
    """
    # A non-positive limit keeps nothing; clamp so the slice below cannot
    # wrap around with a negative index.
    keep = max(min_num, 0)
    for lang in languages:
        # glob returns paths in arbitrary order; sort so the same files are
        # kept on every run and on every machine.
        files_per_lang = sorted(glob(f"./data/filtered/{lang}/*.wav"))
        for file in files_per_lang[keep:]:
            os.remove(file)

In [None]:
# downsample(min_num)

In [10]:
# Recompute the statistics for ./data/filtered; this repeats the computation done
# right after filtering and rebinds df_stats_frames and df_stats_duration.
FILTERED_DATASET_PATH = "./data/filtered/*/*.wav"
df_stats_frames, df_stats_duration = get_stats(FILTERED_DATASET_PATH)

In [11]:
df_stats_duration

Unnamed: 0,german,mandarin,russian,english
count,35.0,35.0,35.0,35.0
mean,24.662941,29.9251,30.194914,23.623271
std,3.140721,7.596608,8.254043,3.492605
min,19.997688,19.7515,18.896687,16.474438
25%,22.68125,24.804094,24.393062,21.428312
50%,24.412375,29.044375,27.834125,23.266375
75%,26.252812,32.165219,36.628219,25.771375
max,34.014812,55.59,53.257375,31.362063


In [None]:
# Shortest clip across all languages, in whole seconds (int() drops the fractional part).
min_duration = int(df_stats_duration.loc['min'].min())
min_duration