In [4]:
# modules
from tqdm import  tqdm
from glob import glob
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import basename as bn, join, split as sp

import librosa
import parselmouth
from parselmouth.praat import call
from scipy.io.wavfile import write

import praat_formants_python as pfp


# DATASET Path and Constants

In [5]:
# Read paths
# NOTE(review): machine-specific absolute path; adjust when running on another machine.
ROOT_TIMIT_DATA_PATH = "/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT"

# Write paths
ALL_EXP_FOLDER = "./exports/"
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(ALL_EXP_FOLDER, exist_ok=True)  # make export folder

# Vowel info Export CSV filename
ALL_TIMIT_VOWELS_IMP_FILENAME = "a_all-timit_vowels.csv"
ALL_TIMIT_VOWELS_IMP_FILEPATH = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_IMP_FILENAME)

# Vowel subset Export CSV filename ("LIM" is a placeholder replaced by the vowel limit at use sites)
SUBSET_TIMIT_VOWELS_IMP_FILENAME = "b_subset-timit_vowels_vowlimLIM.csv"
SUBSET_TIMIT_VOWELS_IMP_FILEPATH = join(ALL_EXP_FOLDER, SUBSET_TIMIT_VOWELS_IMP_FILENAME)

# Vowel formant estimation Export CSV filename ("LIM" placeholder as above)
TIMIT_VOWEL_FORMANT_ESTIMATION_EXP_FILENAME = "c_timit-vowels_formant_estimation_vowlimLIM.csv"
TIMIT_VOWEL_FORMANT_ESTIMATION_EXP_FILEPATH = join(ALL_EXP_FOLDER, TIMIT_VOWEL_FORMANT_ESTIMATION_EXP_FILENAME)


# TMP Audio Export folder
TEMP_AUDIO_EXP_FOLDER = "./audio_exports"
os.makedirs(TEMP_AUDIO_EXP_FOLDER, exist_ok=True)  # make temp audio folder

# TIMIT SAMPLING RATE
TIMIT_AUDIO_FS = 16000

### Import SUBSET TIMIT Vowel Info dataframe

In [6]:
VOWEL_LIMIT = 10

# The subset CSV name carries a "LIM" placeholder; fill it with the vowel limit before reading.
SUBSET_TIMIT_VOWELS_DF = pd.read_csv(
    SUBSET_TIMIT_VOWELS_IMP_FILEPATH.replace("LIM", f"{VOWEL_LIMIT}")
)
# SUBSET_TIMIT_VOWELS_DF.set_index("index", inplace=True)

# Distinct vowel labels, in order of first appearance
ALL_TIMIT_VOWEL_LIST = SUBSET_TIMIT_VOWELS_DF["vowel_name"].unique()

# Number of rows per vowel label, then the frame itself as the cell output
print(SUBSET_TIMIT_VOWELS_DF["vowel_name"].value_counts())
SUBSET_TIMIT_VOWELS_DF

iy      20
ae      20
uh      20
ey      20
ah      20
aw      20
ux      20
ax      20
ay      20
oy      20
eh      20
ix      20
ow      20
axr     20
ao      20
ih      20
uw      20
aa      20
er      20
ax-h    20
Name: vowel_name, dtype: int64


Unnamed: 0,index,audio_filepath,wav_file,person_id,sex,start_sample,end_sample,duration_sample,start_second,end_second,duration_second,vowel_name
0,0,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA2.WAV,MTRT0,M,19040,20720,1680,1.190000,1.295000,0.105000,iy
1,1,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA1.WAV,MMGG0,M,31768,33269,1501,1.985500,2.079312,0.093812,iy
2,2,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX62.WAV,MPPC0,M,27195,28365,1170,1.699688,1.772812,0.073125,iy
3,3,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SA2.WAV,MKAH0,M,9800,10778,978,0.612500,0.673625,0.061125,iy
4,4,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI1421.WAV,MRML0,M,79791,82630,2839,4.986937,5.164375,0.177437,iy
...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI648.WAV,FCJF0,F,1507,2154,647,0.094187,0.134625,0.040438,ax-h
396,396,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX206.WAV,FVMH0,F,29660,30400,740,1.853750,1.900000,0.046250,ax-h
397,397,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SX271.WAV,FSMA0,F,15838,16318,480,0.989875,1.019875,0.030000,ax-h
398,398,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,SI849.WAV,FCAJ0,F,42411,42680,269,2.650688,2.667500,0.016813,ax-h


## FUNCTION: Measure Pitch of audio chunk | PARSELMOUTH

In [7]:
def measure_pitch(
    audio_path: str,
    start_sec: float = 0,
    end_sec: float = 0,
    f0min: float = 75,
    f0max: float = 600,
) -> float:
    """Return the mean pitch (Hz) of an audio file, measured with Praat via Parselmouth.

    Args:
        audio_path: Path of the audio file to analyse.
        start_sec: Start of the time range passed to Praat's "Get mean".
        end_sec: End of the time range passed to Praat's "Get mean". With the
            defaults (0, 0) Praat averages over the whole Pitch object, which is
            the behaviour this function had before the range was configurable.
        f0min: Pitch floor in Hz for the "To Pitch" analysis.
        f0max: Pitch ceiling in Hz for the "To Pitch" analysis.

    Returns:
        The value reported by Praat's "Get mean" in Hertz (expected to be NaN
        when the range contains no voiced frames -- TODO confirm).
    """
    sound = parselmouth.Sound(audio_path)  # read the sound
    # Time step 0 asks Praat to pick its default step for the pitch analysis.
    pitch = call(sound, "To Pitch", 0, f0min, f0max)  # create a praat pitch object
    return call(pitch, "Get mean", start_sec, end_sec, "Hertz")  # get mean pitch

## FUNCTION: Measure formants of audio chunk | PARSELMOUTH

In [8]:
def measure_formants_psm(audio_path: str, vowel_name: str, start_sec: float, end_sec: float):
    """Estimate mean pitch and F1-F4 for one vowel segment using Parselmouth (Praat).

    The segment ``[start_sec, end_sec]`` is cut out with librosa, written to
    ``TEMP_AUDIO_EXP_FOLDER/<vowel_name>.wav`` and re-read by Parselmouth.
    Formants (Burg method) are sampled at every point of a periodic point
    process computed on that chunk, and summarised by mean and median.

    Args:
        audio_path: Path of the full recording.
        vowel_name: Vowel label; only used to name the temporary WAV file, so
            successive calls for the same vowel overwrite that file.
        start_sec: Segment start within the recording, in seconds.
        end_sec: Segment end within the recording, in seconds.

    Returns:
        dict with ``pitch_mean_praat_base`` plus ``F{1..4}_mean_praat_base`` and
        ``F{1..4}_median_praat_base``. A formant statistic is NaN when no
        defined value was found for it (e.g. the chunk yields no pulses).
    """
    f0min, f0max = [75, 600]
    formant_numbers = (1, 2, 3, 4)

    # NOTE(review): pitch is averaged over the WHOLE recording, not just the
    # vowel segment -- confirm this is intended.
    sound = parselmouth.Sound(audio_path)  # read the sound
    pitch = call(sound, "To Pitch", 0.0001, f0min, f0max)
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz")  # get mean pitch

    # Cut the vowel out and round-trip it through a temporary WAV file so that
    # Parselmouth analyses only the chunk.
    audio_chunk, fs = librosa.load(audio_path, sr=None, offset=start_sec, duration=(end_sec - start_sec))
    tmp_audio_file = os.path.join(TEMP_AUDIO_EXP_FOLDER, f"{vowel_name}.wav")
    write(tmp_audio_file, fs, audio_chunk)
    sound_frm = parselmouth.Sound(tmp_audio_file)

    pointProcess = call(sound_frm, "To PointProcess (periodic, cc)", f0min, f0max)
    formants = call(sound_frm, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    numPoints = call(pointProcess, "Get number of points")

    # Measure formants only at glottal pulses (Praat point indices are 1-based),
    # keeping only values that Praat reports as defined (not NaN).
    tracks = {number: [] for number in formant_numbers}
    for point in range(1, numPoints + 1):
        t = call(pointProcess, "Get time from index", point)
        for number in formant_numbers:
            value = call(formants, "Get value at time", number, t, 'Hertz', 'Linear')
            if not np.isnan(value):
                tracks[number].append(value)

    def _summarise(values, reducer):
        # Return NaN directly for an empty track instead of letting NumPy emit
        # an empty-slice RuntimeWarning.
        return reducer(values) if values else np.nan

    # Key order matches the original layout: pitch, all means, then all medians.
    estimates = {"pitch_mean_praat_base": mean_pitch}
    for number in formant_numbers:
        estimates[f"F{number}_mean_praat_base"] = _summarise(tracks[number], np.mean)
    for number in formant_numbers:
        estimates[f"F{number}_median_praat_base"] = _summarise(tracks[number], np.median)
    return estimates

## FUNCTION: Measure formants of audio chunk | PRAAT FORMANTS

In [9]:
def measure_formants_pfp(audio_path, start_sec, end_sec):
    """Estimate mean pitch and F1-F3 for a segment using praat_formants_python.

    Args:
        audio_path: Path of the recording.
        start_sec: Segment start in seconds.
        end_sec: Segment end in seconds.

    Returns:
        dict with ``pitch_mean_praat_base`` and the mean / median of the first
        three formant columns, all rounded to 2 decimals.
    """
    # presumably one row per analysis frame with time in column 0 -- verify against pfp
    formants = pfp.formants_at_interval(
        audio_path, start_sec, end_sec, maxformant=5500, winlen=0.025, preemph=50
    )

    # NOTE(review): measure_pitch is given only the file path, so the pitch is
    # averaged over the whole recording rather than [start_sec, end_sec] -- confirm.
    pitch_mean = np.round(measure_pitch(audio_path), 2)

    # Column-wise statistics; the leading (time) column is dropped before rounding.
    column_means = list(formants.mean(axis=0))
    column_medians = list(np.median(formants, axis=0))
    formants_mean = np.round(column_means[1:], 2)
    formants_median = np.round(column_medians[1:], 2)

    return {
        "pitch_mean_praat_base": pitch_mean,

        "F1_mean_praat_base": formants_mean[0],
        "F2_mean_praat_base": formants_mean[1],
        "F3_mean_praat_base": formants_mean[2],

        "F1_median_praat_base": formants_median[0],
        "F2_median_praat_base": formants_median[1],
        "F3_median_praat_base": formants_median[2],
    }


### FUNCTION TESTS

In [10]:
# Smoke test: pick one random row (unseeded, so the row differs between runs)
# and run the pfp-based estimator on it.
audf, start, end, v = SUBSET_TIMIT_VOWELS_DF.loc[
    np.random.randint(0, len(SUBSET_TIMIT_VOWELS_DF)),
    ["audio_filepath", "start_second", "end_second", "vowel_name"],
]
print(audf, start, end)
print(measure_formants_pfp(audf, start, end))

/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT/TRAIN/DR3/MDHS0/SX270.WAV 1.7231875 1.804125
{'pitch_mean_praat_base': 138.52, 'F1_mean_praat_base': 498.61, 'F2_mean_praat_base': 1166.96, 'F3_mean_praat_base': 2796.55, 'F1_median_praat_base': 493.35, 'F2_median_praat_base': 1098.81, 'F3_median_praat_base': 2767.2}


## FUNCTION: TIMIT Vowel PITCH, FORMANT Estimation

In [11]:
def estimate_vowel_formants(vowel_info):
    """Attach pitch/formant estimates to one vowel record.

    Args:
        vowel_info: Mapping-like record (e.g. a DataFrame row) providing
            ``audio_filepath``, ``start_second``, ``end_second`` and ``vowel_name``.

    Returns:
        A new dict holding every field of ``vowel_info`` plus the entries
        returned by ``measure_formants_pfp``; on a key clash the estimate wins.
    """
    audio_file = vowel_info["audio_filepath"]
    start_sec, end_sec = vowel_info["start_second"], vowel_info["end_second"]
    vowel_name = vowel_info["vowel_name"]  # only needed by the Parselmouth variant below

    formant_estimates = measure_formants_pfp(audio_file, start_sec, end_sec)
    # formant_estimates = measure_formants_psm(audio_file, vowel_name, start_sec, end_sec)

    return {**dict(vowel_info), **formant_estimates}

# Spot-check the estimator on one randomly chosen row (unseeded, varies between runs).
vinfo = SUBSET_TIMIT_VOWELS_DF.loc[
    np.random.randint(0, len(SUBSET_TIMIT_VOWELS_DF))
]
estimate_vowel_formants(vinfo)

{'index': 363,
 'audio_filepath': '/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT/TRAIN/DR5/MRML0/SX161.WAV',
 'wav_file': 'SX161.WAV',
 'person_id': 'MRML0',
 'sex': 'M',
 'start_sample': 42240,
 'end_sample': 43391,
 'duration_sample': 1151,
 'start_second': 2.64,
 'end_second': 2.7119375,
 'duration_second': 0.0719375,
 'vowel_name': 'uh',
 'pitch_mean_praat_base': 94.99,
 'F1_mean_praat_base': 772.22,
 'F2_mean_praat_base': 1508.79,
 'F3_mean_praat_base': 2385.5,
 'F1_median_praat_base': 755.69,
 'F2_median_praat_base': 1502.12,
 'F3_median_praat_base': 2425.85}

### Create and export Vowel Formant Estimation Result dataframe

In [12]:
# Run the estimator on every row of the subset. `iterrows()` is a generator with
# no length, so pass `total` explicitly to give tqdm a real progress bar and ETA.
VOWELS_FORMANT_DF = pd.DataFrame(
    [
        estimate_vowel_formants(v_i)
        for _, v_i in tqdm(
            SUBSET_TIMIT_VOWELS_DF.iterrows(),
            total=len(SUBSET_TIMIT_VOWELS_DF),
        )
    ]
)

# Columns kept in the exported / displayed result
columns = ['index',  'person_id', 'sex', 'duration_second', 'vowel_name', 
           'pitch_mean_praat_base', 
           'F1_mean_praat_base', 'F2_mean_praat_base', 'F3_mean_praat_base', 
           'F1_median_praat_base', 'F2_median_praat_base', 'F3_median_praat_base']

# An existing export is never overwritten: delete the CSV by hand to refresh it.
csv_path = TIMIT_VOWEL_FORMANT_ESTIMATION_EXP_FILEPATH.replace("LIM", f"{VOWEL_LIMIT}")
if not os.path.exists(csv_path):
    VOWELS_FORMANT_DF.to_csv(csv_path, columns=columns, index=False)

VOWELS_FORMANT_DF = VOWELS_FORMANT_DF.loc[:, columns]
VOWELS_FORMANT_DF

400it [01:04,  6.18it/s]


Unnamed: 0,index,person_id,sex,duration_second,vowel_name,pitch_mean_praat_base,F1_mean_praat_base,F2_mean_praat_base,F3_mean_praat_base,F1_median_praat_base,F2_median_praat_base,F3_median_praat_base
0,0,MTRT0,M,0.105000,iy,97.40,456.21,1906.70,2309.29,454.47,1921.02,2346.71
1,1,MMGG0,M,0.093812,iy,122.09,561.03,2072.62,2771.38,314.55,2110.85,2724.62
2,2,MPPC0,M,0.073125,iy,142.78,466.98,1948.16,2673.15,463.02,1958.39,2700.85
3,3,MKAH0,M,0.061125,iy,126.96,480.70,2161.41,2724.09,481.49,2157.63,2737.17
4,4,MRML0,M,0.177437,iy,86.21,880.29,2167.21,2948.52,355.54,2210.83,2897.05
...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,FCJF0,F,0.040438,ax-h,206.60,1095.70,1784.74,3048.24,1018.81,1767.53,2987.76
396,396,FVMH0,F,0.046250,ax-h,215.07,758.37,2162.47,3188.13,541.80,2153.17,3176.38
397,397,FSMA0,F,0.030000,ax-h,207.91,395.79,1155.12,2881.54,386.55,1252.54,2838.76
398,398,FCAJ0,F,0.016813,ax-h,190.54,511.05,1749.78,2994.59,511.05,1749.78,2994.59


### Create and export Vowel Formant Estimation Result dataframe: JSON

In [15]:
# JSON export path: the formant-estimation CSV name with the limit filled in
# and the extension swapped to .json.
json_fp = (
    TIMIT_VOWEL_FORMANT_ESTIMATION_EXP_FILEPATH
    .replace("LIM", f"{VOWEL_LIMIT}")
    .replace(".csv", ".json")
)

# Written only when the file is not already on disk (an existing file is left untouched).
if not os.path.exists(json_fp):
    VOWELS_FORMANT_DF.to_json(json_fp, index=False, orient="table")

print(json_fp)

./exports/c_timit-vowels_formant_estimation_vowlim10.json
