In [1]:
# assign directory
import git
from pathlib import Path
import os
ROOT_DIR = Path(git.Repo('.', search_parent_directories=True).working_tree_dir)
os.chdir(os.path.join(ROOT_DIR, "utilities"))
from transform import *
from plotting import *
os.chdir(os.path.join(ROOT_DIR, "dataset-preparation"))

data_dir  = os.path.join(ROOT_DIR, 'raw-data', 'ravdess', 'full-ravdess-wav')
# iterate over files in that directory
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir) if filename != ".DS_Store"]
file_names = os.listdir(data_dir)

## Audio Processing using Librosa and soundfile

* Detects audio files that contain clipping and removes them from the data set
* Filters out beginning and end silence 
* Normalizes loudness

In [2]:
import librosa #Need to pip install librosa 
import soundfile as sf #Need to pip install soundfile
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm

In [3]:
#Creates new wav files that have been processed

CLIP_THRESHOLD = 1 
NORMALIZE = 0 # 1 for pitch normalization, 0 for no pitch normalization

if NORMALIZE == 1:
    OUTPUT_DIR = os.path.join(ROOT_DIR, 'raw-data', 'ravdess', 'full-ravdess-wav-processed-pitch-normalized')
else:
    OUTPUT_DIR = os.path.join(ROOT_DIR, 'raw-data', 'ravdess', 'full-ravdess-wav-processed')



def is_clipped(y, threshold = CLIP_THRESHOLD):
    return np.any(np.abs(y) >= threshold)

def normalize_peaks(y):
    return (y - np.mean(y))/ np.std(y) if np.std(y) > 0 else y

def normalize_pitch_shifting(y, sr, target_hz=150.0):
    f0, voiced_flag, _ = librosa.pyin(y, 
                                      fmin=librosa.note_to_hz('C2'), 
                                      fmax=librosa.note_to_hz('C7'))
    
    voiced_f0 = f0[voiced_flag]
    voiced_f0 = voiced_f0[~np.isnan(voiced_f0)]

    current_pitch_hz = np.median(voiced_f0)
    if current_pitch_hz <= 0:
        return y
    n_steps = 12 * np.log2(target_hz / current_pitch_hz)

    y_shifted = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps)
    
    return y_shifted

def process_audio_file_pitch_normalized(filename):
    if filename.lower().endswith('wav'):
        path_in = os.path.join(data_dir, filename)
        path_out = os.path.join(OUTPUT_DIR, filename)
        
        y, sr = librosa.load(path_in, sr=None)

        if is_clipped(y):
            return

        y_trimmed, _ = librosa.effects.trim(y, top_db=20)
        
        # pitch normalization
        y_normalized = normalize_pitch_shifting(y_trimmed, sr, target_hz=150.0)

        y_loudness_normalized = normalize_peaks(y_normalized) 

        sf.write(path_out, y_loudness_normalized, sr)

def process_audio_file(filename):
    if filename.lower().endswith('wav'):
        path_in = os.path.join(data_dir, filename)
        path_out = os.path.join(OUTPUT_DIR, filename)
        
        y, sr = librosa.load(path_in, sr=None)

        if is_clipped(y):
            return

        y_trimmed, _ = librosa.effects.trim(y, top_db=20)
        
        y_loudness_normalized = normalize_peaks(y_trimmed) 

        sf.write(path_out, y_loudness_normalized, sr)

try:
    os.mkdir(OUTPUT_DIR)
except FileExistsError:
    pass

total_files = len(file_names)
print(f"Starting processing for {total_files} files...")

if NORMALIZE == 1:
    Parallel(n_jobs=-1)(delayed(process_audio_file_pitch_normalized)(filename) for filename in tqdm(file_names, desc="Processing audio files"))
else:
    Parallel(n_jobs=-1)(delayed(process_audio_file)(filename) for filename in tqdm(file_names, desc="Processing audio files"))

print("\nProcessing complete. All files have been handled.")


Starting processing for 1441 files...


Processing audio files: 100%|██████████| 1441/1441 [00:14<00:00, 100.64it/s]



Processing complete. All files have been handled.


In [4]:
#Changes dataset to post-processed dataset
data_dir = OUTPUT_DIR
file_list = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir) if filename != ".DS_Store"]
file_names = os.listdir(data_dir)

## numpy representation

using `scipy.io.wavfile.read(filename)` to intake .wav audio files 

> returns 
> * rate, registing the hertz of the audio 
> * data, an array with each index representing a point in time for the audio and its value representing amplitude at said time 
>   * index position divided by the length of array represents its temporal occurence in seconds

In [5]:
from scipy.io import wavfile

In [6]:
wav_rates, wav_coeffs = zip(*[wavfile.read(file) for file in file_list])
wav_rates = np.array(wav_rates)
wav_coeffs = np.array(wav_coeffs, dtype="object")

In [7]:
assert len(wav_rates) == len(wav_rates), "Check for lossy data in .wav scipy numpy representation"
n = len(wav_rates)
n

1440

In [8]:
wav_lengths = pd.Series(wav_coeffs).apply(len) / wav_rates[0]

## info parsing to .csv

file naming conventions (from RAVDESS)

In [9]:
modalities = {
    "01": "full-AV", 
    "02": "video-only",
    "03": "audio-only"
} 

channels = {
    "01": "speech",
    "02": "song"
}

emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "suprised",
} 

intensities = {
    "01": "normal",
    "02": "strong"
}

statements = {
    "01": "Kids are talking by the door",
    "02": "Dogs are sitting by the door"
}

speaker_sex = ["female", "male"] 

In [10]:
ravdess_cols = ["modality", "channel", "emotion", "intensity", "statement", "repetition", "actor", "sex", "filename"]
rows = []

for f in file_names:
    if f != ".DS_Store":
        parsed_info = f[:-4].split("-")

        modality = modalities[parsed_info[0]]
        channel = channels[parsed_info[1]]
        emotion = emotions[parsed_info[2]]
        intensity = intensities[parsed_info[3]]
        sentence = statements[parsed_info[4]]
        rep = int(parsed_info[5])
        no1 = int(parsed_info[6])
        sx = speaker_sex[no1 % 2]

        rows.append({
            "modality": modality,
            "channel": channel,
            "emotion": emotion,
            "intensity": intensity,
            "statement": sentence,
            "repetition": rep,
            "actor": no1,
            "sex": sx,
            "filename": f
        })

ravdess_df = pd.DataFrame(rows, columns=ravdess_cols)


ravdess_df["length (s)"] = wav_lengths
ravdess_df["rate (Hz)"] = wav_rates
ravdess_df["amplitudes"] = wav_coeffs

ravdess_df.head()

Unnamed: 0,modality,channel,emotion,intensity,statement,repetition,actor,sex,filename,length (s),rate (Hz),amplitudes
0,audio-only,speech,neutral,normal,Kids are talking by the door,1,1,male,03-01-01-01-01-01-01.wav,1.194667,48000,"[-11609, -4083, 1787, -6191, -11760, -3481, 49..."
1,audio-only,speech,neutral,normal,Kids are talking by the door,1,2,female,03-01-01-01-01-01-02.wav,1.493333,48000,"[1791, -2783, -7258, -9843, -11036, -10241, -8..."
2,audio-only,speech,neutral,normal,Kids are talking by the door,1,3,male,03-01-01-01-01-01-03.wav,1.344,48000,"[-2182, -867, 1236, 3251, 4916, 4916, 2813, 27..."
3,audio-only,speech,neutral,normal,Kids are talking by the door,1,4,female,03-01-01-01-01-01-04.wav,1.269333,48000,"[6352, 5197, -3321, -2888, -3610, -14150, -134..."
4,audio-only,speech,neutral,normal,Kids are talking by the door,1,5,male,03-01-01-01-01-01-05.wav,1.525333,48000,"[-5947, 1003, 2202, 1722, 6995, 4359, -5468, -..."


In [11]:
xport_name = "full-ravdess.csv"
EXPORT_DIR = os.path.join(ROOT_DIR, "raw-data", "ravdess")
EXPORT_PATH = os.path.join(EXPORT_DIR, xport_name)

ravdess_df.to_csv(EXPORT_PATH)

## npz representation export

In [12]:
EXPORT_DIR = os.path.join(ROOT_DIR, 'raw-data', 'ravdess', 'full-ravdess-npz')

try:
    os.mkdir(EXPORT_DIR)
except FileExistsError:
    pass

for f in np.arange(len(file_list)):
    coef = wav_coeffs[f]
    file_name = file_list[f][-24:-4] ## removes the .wav naming convention

    np.savez(os.path.join(EXPORT_DIR, file_name))