In [1]:
import subprocess
import pandas as pd
import os
import glob
from tqdm import tqdm
from os.path import basename as bn, join, split as sp
import parselmouth
import numpy as np
from parselmouth.praat import call
from scipy.io.wavfile import write
import praat_formants_python as pfp


# DATASET Path and Constants

In [2]:
# Folder holding every CSV this notebook reads or writes
ALL_EXP_FOLDER = "./exports/"
# exist_ok=True makes this idempotent (safe on re-run) and avoids the
# check-then-create race of a manual os.path.exists() test.
os.makedirs(ALL_EXP_FOLDER, exist_ok=True)

# Synth vowel info CSV that this notebook IMPORTS.
# "LIM" is a placeholder that is substituted with VOWEL_LIMIT where the path is used.
SYNTH_VOWEL_INFO_IMP_FILENAME = "d_base_formants_synth_vowels_vowlimLIM.csv"
SYNTH_VOWEL_INFO_IMP_FILEPATH = join(ALL_EXP_FOLDER, SYNTH_VOWEL_INFO_IMP_FILENAME)

# Formant estimation results CSV that this notebook EXPORTS (same "LIM" placeholder).
SYNTH_VOWEL_FORMANT_ESTIMATION_EXP_FILENAME = "e_synth-vowels_formant_estimation_vowlimLIM.csv"
SYNTH_VOWEL_FORMANT_ESTIMATION_EXP_FILEPATH = join(ALL_EXP_FOLDER, SYNTH_VOWEL_FORMANT_ESTIMATION_EXP_FILENAME)


### Import Synth Vowel (BASE) Formant Info dataframe

In [3]:
# Value substituted for the "LIM" placeholder in the CSV file paths
VOWEL_LIMIT = 500

# Load the base formant info of the synth vowels from the import CSV
SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF = pd.read_csv(
    SYNTH_VOWEL_INFO_IMP_FILEPATH.replace("LIM", str(VOWEL_LIMIT))
)
SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF

Unnamed: 0,index,person_id,sex,duration_second,vowel_name,pitch_mean_praat_base,F1_mean_praat_base,F2_mean_praat_base,F3_mean_praat_base,F1_median_praat_base,F2_median_praat_base,F3_median_praat_base,synth_vowel_path
0,0,MMDM2,M,0.095000,iy,93.30,423.78,1792.19,2124.79,400.73,1837.84,2131.05,./audio_exports/vowlim500/iy_0_MMDM2_M_93.wav
1,1,MWJG0,M,0.135750,iy,92.23,331.04,2084.30,2817.60,326.05,2067.98,2838.16,./audio_exports/vowlim500/iy_1_MWJG0_M_92.wav
2,2,MMEB0,M,0.155938,iy,128.79,376.17,2157.42,2572.08,378.88,2172.93,2483.87,./audio_exports/vowlim500/iy_2_MMEB0_M_129.wav
3,3,MDLR1,M,0.215562,iy,119.88,451.73,2158.56,2721.53,466.94,2184.98,2736.78,./audio_exports/vowlim500/iy_3_MDLR1_M_120.wav
4,4,MCTT0,M,0.152312,iy,143.62,378.22,2125.59,2932.89,380.77,2119.58,2908.19,./audio_exports/vowlim500/iy_4_MCTT0_M_144.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15372,15372,MSFV0,M,0.174937,ax-h,99.52,765.19,1876.51,3195.29,600.50,1647.24,2806.36,./audio_exports/vowlim500/ax-h_15372_MSFV0_M_1...
15373,15373,MJEE0,M,0.141000,ax-h,113.59,818.58,2064.01,3234.50,414.79,1780.08,2888.72,./audio_exports/vowlim500/ax-h_15373_MJEE0_M_1...
15374,15374,MTMR0,M,0.129938,ax-h,97.79,917.57,2128.07,3541.92,1219.34,2358.93,3806.64,./audio_exports/vowlim500/ax-h_15374_MTMR0_M_9...
15375,15375,MRJM3,M,0.099812,ax-h,112.14,488.53,2069.92,2919.50,510.21,2057.41,2742.00,./audio_exports/vowlim500/ax-h_15375_MRJM3_M_1...


## FUNCTION: Measure formants of synth vowels | PRAAT FORMANTS

In [4]:
def measure_pitch(audio_path: str, f0min: float = 75, f0max: float = 600) -> float:
    """Return the mean pitch of an audio file as measured by Praat.

    Args:
        audio_path: Path of the audio file; it is loaded with ``parselmouth.Sound``.
        f0min: Pitch floor in Hz handed to Praat's "To Pitch" command.
        f0max: Pitch ceiling in Hz handed to Praat's "To Pitch" command.

    Returns:
        The result of Praat's "Get mean" query on the pitch object, in Hertz.
    """
    sound = parselmouth.Sound(audio_path)  # read the sound
    # Leading 0 is the "To Pitch" time-step argument (0 lets Praat choose it,
    # per the Praat manual).
    pitch = call(sound, "To Pitch", 0, f0min, f0max)
    # Time range 0, 0 asks Praat for the mean over the whole pitch object
    # (per the Praat manual).
    return call(pitch, "Get mean", 0, 0, "Hertz")
    
def measure_formants_pfp(audio_path: str, start_sec: float, end_sec: float) -> dict:
    """Estimate mean pitch and mean/median F1-F3 of an audio file with Praat.

    Formants come from ``pfp.formants_at_interval`` restricted to
    ``[start_sec, end_sec]``. Pitch comes from ``measure_pitch``, which is
    given only the file path, so it is measured over the whole file rather
    than over the interval.

    Args:
        audio_path: Path of the audio file to analyse.
        start_sec: Start of the formant analysis interval, in seconds.
        end_sec: End of the formant analysis interval, in seconds.

    Returns:
        A dict with ``pitch_mean_synthvow_praat`` plus
        ``F{1,2,3}_mean_synthvow_praat`` and ``F{1,2,3}_median_synthvow_praat``,
        every value rounded to 2 decimals.
    """
    # One row per analysis frame: column 0 is the frame time, the remaining
    # columns are the formants. np.asarray makes the array operations below
    # valid even if a plain sequence of rows is returned.
    track: np.ndarray = np.asarray(
        pfp.formants_at_interval(
            audio_path, start_sec, end_sec, maxformant=5500, winlen=0.025, preemph=50
        )
    )
    formant_columns = track[:, 1:]  # skip time

    pitch_mean = np.round(measure_pitch(audio_path), 2)
    formants_mean = np.round(formant_columns.mean(axis=0), 2)
    formants_median = np.round(np.median(formant_columns, axis=0), 2)

    return {
        "pitch_mean_synthvow_praat": pitch_mean,

        "F1_mean_synthvow_praat": formants_mean[0],
        "F2_mean_synthvow_praat": formants_mean[1],
        "F3_mean_synthvow_praat": formants_mean[2],

        "F1_median_synthvow_praat": formants_median[0],
        "F2_median_synthvow_praat": formants_median[1],
        "F3_median_synthvow_praat": formants_median[2],
    }


## FUNCTION: Measure formants of synth vowels | DEEPFORMANTS

In [5]:
def get_deepformants(
    audio_file: str,
    begin: float,
    end: float,
    python_bin: str = "/home/jeevan/dev/anaconda3/envs/pytorch/bin/python",
    deepformants_dir: str = "/home/jeevan/Desktop/DeepFormants/",
) -> list:
    """Run the DeepFormants ``formants.py`` script on one audio interval.

    The script is executed with ``deepformants_dir`` as working directory and
    writes its result to ``test.csv`` inside that directory; the CSV is then
    read back.

    Args:
        audio_file: Path of the audio file. The script runs with
            ``cwd=deepformants_dir``, so pass an absolute path.
        begin: Value passed to the script's ``--begin`` option.
        end: Value passed to the script's ``--end`` option.
        python_bin: Python interpreter used to run the script.
        deepformants_dir: Directory that contains ``formants.py``.

    Returns:
        The first row of the result CSV without its first column, as a list.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    output_csv = os.path.join(deepformants_dir, "test.csv")
    # Argument list instead of a concatenated shell string: paths containing
    # spaces or shell metacharacters are passed through verbatim and nothing
    # is interpreted by a shell.
    command = [
        python_bin, "formants.py", audio_file, output_csv,
        "--begin", str(begin),
        "--end", str(end),
    ]
    # check_output captures the script's stdout; the captured text is not needed.
    subprocess.check_output(command, cwd=deepformants_dir)
    estimates = pd.read_csv(output_csv)
    return estimates.iloc[0, 1:].tolist()
   

def measure_formants_df(audio_path: str,  start_sec: float, end_sec: float):
    """Estimate F1-F4 of an audio interval with DeepFormants.

    Delegates to ``get_deepformants`` and rounds its first four values to
    2 decimals.

    Returns:
        A dict keyed ``F1_mean_synthvow_deepformant`` through
        ``F4_mean_synthvow_deepformant``.
    """
    result_keys = (
        "F1_mean_synthvow_deepformant",
        "F2_mean_synthvow_deepformant",
        "F3_mean_synthvow_deepformant",
        "F4_mean_synthvow_deepformant",
    )
    rounded = np.round(get_deepformants(audio_path, start_sec, end_sec), 2)
    # Positional lookup keeps the IndexError if fewer than four values come back
    return {key: rounded[pos] for pos, key in enumerate(result_keys)}


### FUNCTION TESTS

In [7]:
# Smoke test: run both estimators on one randomly chosen synth vowel
row_label = np.random.randint(0, len(SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF))
v, audf = SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF.loc[row_label, ["vowel_name", "synth_vowel_path"]]
audf = os.path.realpath(audf)

# Analysis window: 0.5 s on either side of the point dur
dur = 1.7/2
start, end = (dur - 0.5), (dur + 0.5)
print(audf, start, end )

for estimator in (measure_formants_pfp, measure_formants_df):
    print(estimator(audf, start, end))


/home/jeevan/Jeevan_K/Projects/Asquire/Vowtiar-Quest/vowtiar-formant_estimation/audio_exports/vowlim500/aa_3482_MRGS0_M_122.wav 0.35 1.35
{'pitch_mean_synthvow_praat': 121.96, 'F1_mean_synthvow_praat': 685.31, 'F2_mean_synthvow_praat': 846.94, 'F3_mean_synthvow_praat': 1453.09, 'F1_median_synthvow_praat': 705.45, 'F2_median_synthvow_praat': 847.03, 'F3_median_synthvow_praat': 1453.53}
{'F1_mean_synthvow_deepformant': 566.58, 'F2_mean_synthvow_deepformant': 1481.19, 'F3_mean_synthvow_deepformant': 2520.54, 'F4_mean_synthvow_deepformant': 3990.71}


## FUNCTION: Synth Vowel Pitch and Formant Estimation | Praat, DeepFormants (more estimators: TODO)

In [8]:
def estimate_synth_vowel_formants(vowel_info, mid_sec: float = 1.7 / 2, half_window_sec: float = 0.5):
    """Run every formant estimator on one synth vowel and merge the results.

    Args:
        vowel_info: Mapping (e.g. a DataFrame row) that has at least a
            ``"synth_vowel_path"`` entry pointing at the audio file.
        mid_sec: Centre of the analysis window, in seconds.
        half_window_sec: Half-width of the analysis window, in seconds; the
            window is ``[mid_sec - half_window_sec, mid_sec + half_window_sec]``.

    Returns:
        A new dict holding every entry of ``vowel_info`` plus the outputs of
        ``measure_formants_pfp`` and ``measure_formants_df``. On a key clash
        the later estimator's value wins (dict union semantics).
    """
    # Resolve to an absolute path before handing it to the estimators
    audio_file = os.path.realpath(vowel_info["synth_vowel_path"])

    start_sec, end_sec = (mid_sec - half_window_sec), (mid_sec + half_window_sec)

    formant_estimates_pfp = measure_formants_pfp(audio_file, start_sec, end_sec) # estimator 1
    formant_estimates_df = measure_formants_df(audio_file, start_sec, end_sec) # estimator 2
    # TODO: estimators 3, 4, 5

    return dict(vowel_info) | formant_estimates_pfp | formant_estimates_df

# Try the combined estimator on one randomly chosen row
syvinfo = SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF.loc[
    np.random.randint(0, len(SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF))
]
estimate_synth_vowel_formants(syvinfo)

{'index': 15311,
 'person_id': 'FHXS0',
 'sex': 'F',
 'duration_second': 0.09975,
 'vowel_name': 'uh',
 'pitch_mean_praat_base': 180.3,
 'F1_mean_praat_base': 520.11,
 'F2_mean_praat_base': 1892.24,
 'F3_mean_praat_base': 2761.4,
 'F1_median_praat_base': 515.39,
 'F2_median_praat_base': 1889.85,
 'F3_median_praat_base': 2749.84,
 'synth_vowel_path': './audio_exports/vowlim500/uh_15311_FHXS0_F_180.wav',
 'pitch_mean_synthvow_praat': 180.0,
 'F1_mean_synthvow_praat': 533.62,
 'F2_mean_synthvow_praat': 930.7,
 'F3_mean_synthvow_praat': 1882.82,
 'F1_median_synthvow_praat': 533.48,
 'F2_median_synthvow_praat': 941.55,
 'F3_median_synthvow_praat': 1882.93,
 'F1_mean_synthvow_deepformant': 492.76,
 'F2_mean_synthvow_deepformant': 1767.31,
 'F3_mean_synthvow_deepformant': 2646.57,
 'F4_mean_synthvow_deepformant': 4009.41}

### Create and export Synth Vowel Formant Estimation Result dataframe

In [9]:
# Run every estimator on every synth vowel and collect one record per row.
# total= gives tqdm the row count: iterrows() is a generator without a length,
# so the bar would otherwise show neither percentage nor ETA.
SYNTH_VOWELS_FORMANT_DF = pd.DataFrame(
    [
        estimate_synth_vowel_formants(syv_i)
        for _, syv_i in tqdm(
            SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF.iterrows(),
            total=len(SYNTH_VOWEL_AUDIO_FORMANT_INFO_DF),
        )
    ]
)

csv_path = SYNTH_VOWEL_FORMANT_ESTIMATION_EXP_FILEPATH.replace("LIM", f"{VOWEL_LIMIT}")
print(csv_path)
# An existing export is never overwritten; say so instead of skipping silently.
if not os.path.exists(csv_path):
    SYNTH_VOWELS_FORMANT_DF.to_csv(csv_path, index=False)
else:
    print("Export skipped: file already exists (delete it to re-export)")

SYNTH_VOWELS_FORMANT_DF

26it [00:42,  1.78s/it]

In [11]:
# Summary statistics of the numeric columns: base values next to the Praat and DeepFormants estimates
SYNTH_VOWELS_FORMANT_DF.describe()

Unnamed: 0,index,duration_second,pitch_mean_praat_base,F1_mean_praat_base,F2_mean_praat_base,F3_mean_praat_base,F1_median_praat_base,F2_median_praat_base,F3_median_praat_base,pitch_mean_synthvow_praat,F1_mean_synthvow_praat,F2_mean_synthvow_praat,F3_mean_synthvow_praat,F1_median_synthvow_praat,F2_median_synthvow_praat,F3_median_synthvow_praat,F1_mean_synthvow_deepformant,F2_mean_synthvow_deepformant,F3_mean_synthvow_deepformant,F4_mean_synthvow_deepformant
count,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0,3821.0
mean,1910.0,0.11688,164.518642,610.393455,1596.547228,2614.085854,609.84402,1586.787592,2604.047092,164.526349,595.354415,968.253224,1645.894331,599.023054,969.342073,1629.912232,524.955883,1588.785198,2531.574666,3954.801795
std,1103.172017,0.045302,44.306388,128.547243,355.117779,324.429094,140.804466,376.711775,350.90381,44.310575,113.789776,108.423864,317.527335,115.255663,120.228222,336.537985,60.622122,161.135142,151.16111,69.097117
min,0.0,0.064,82.94,255.42,747.82,1433.48,259.07,657.22,1382.01,83.0,249.11,663.42,966.19,248.93,613.32,942.97,396.95,1276.0,2028.6,3520.88
25%,955.0,0.081875,124.32,521.86,1335.09,2430.67,512.52,1310.6,2416.27,124.05,514.65,891.72,1408.49,516.6,892.67,1370.06,479.12,1450.64,2422.63,3910.9
50%,1910.0,0.105,162.58,603.16,1567.44,2630.73,600.96,1556.01,2616.66,162.96,590.54,957.89,1610.08,595.49,958.91,1603.23,520.03,1554.03,2529.62,3954.5
75%,2865.0,0.140187,202.3,686.36,1822.48,2825.76,690.98,1815.51,2831.64,202.04,667.48,1033.09,1841.46,672.61,1035.75,1847.29,567.09,1723.09,2633.13,4003.05
max,3820.0,0.424375,290.68,1854.18,2874.72,3842.22,2095.86,2891.46,4034.36,291.25,961.15,1851.12,2820.2,960.66,2123.96,2819.98,709.99,1999.08,2978.47,4149.25


# TEST

In [35]:
# :)

