In [64]:
import os
from fnmatch import fnmatch

import numpy as np
import pandas as pd
import parselmouth
from parselmouth.praat import call
from tqdm.notebook import tqdm

In [65]:
#to extract mfcc features as well
flag_extract_mfcc = False

#which of the following datasets you want to extract features
flag_italian_db = True
flag_mdvr_db = True
flag_ah_db = True
flag_czech_db = True



In [66]:
audios_path = "../data/audios/"

italian_pd_audiofiles_path = audios_path + 'Italian_PD'
italian_control_audiofiles_path = audios_path + 'Italian_HC'
mdvr_pd_audiofiles_path = audios_path + 'MDVR_PD'
mdvr_control_audiofiles_path = audios_path + 'MDVR_HC'
ah_pd_audiofiles_path = audios_path + 'AH_PD/'
ah_control_audiofiles_path = audios_path + 'AH_HC/'
czech_pd_audiofiles_path = audios_path + "Czech_PD"
czech_control_audiofiles_path = audios_path + "Czech_HC"

In [67]:
def get_features(voiceID, f0min, f0max, unit):
    try:
        sound = parselmouth.Sound(voiceID) # read the sound
        duration = call(sound, "Get total duration") # duration
        pitch = call(sound, "To Pitch", 0.0, f0min, f0max) #create a praat pitch object

        meanF0 = call(pitch, "Get mean", 0, 0, unit) #MDVP:Fo(Hz)
        # get standard deviation
        stdevF0 = call(pitch, "Get standard deviation", 0 ,0, unit) 
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
        hnr = call(harmonicity, "Get mean", 0, 0) #HNR
        pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
        localJitter = call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3) #MDVP:Jitter(%)
        localabsoluteJitter = call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3) #MDVP:Jitter(Abs)
        rapJitter = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3) #MDVP:RAP
        ppq5Jitter = call(pointProcess, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3) #MDVP:PPQ
        ddpJitter = call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3) #Jitter:DDP
        localShimmer =  call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6) #MDVP:Shimmer
        localdbShimmer = call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6) #MDVP:Shimmer(dB)
        apq3Shimmer = call([sound, pointProcess], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6) #Shimmer:APQ3
        aqpq5Shimmer = call([sound, pointProcess], "Get shimmer (apq5)", 0, 0, 0.0001, 0.02, 1.3, 1.6) #Shimmer:APQ5
        apq11Shimmer =  call([sound, pointProcess], "Get shimmer (apq11)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
        ddaShimmer = call([sound, pointProcess], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        
        #print("file " + voiceID + " was read successfuly")
        return duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer
    except Exception as exc:
        #print(exc)
        print("file " + voiceID + " was not read because it is not a sound file - ")
        return -1

In [68]:
def extract_mfcc(voiceID):
        try:
        
            """
            Extracts the mel frequency ceptral coefficients from the voice sample

            Parameters:
            voiceID : .wav file
                the voice sample we want to extract the features from
            """

            sound = parselmouth.Sound(voiceID)
            mfcc_object = sound.to_mfcc(number_of_coefficients=12) #the optimal number of coeefficient used is 12
            mfcc = mfcc_object.to_array()
            mfcc_mean = np.mean(mfcc.T,axis=0)
            [mfcc0, mfcc1, mfcc2, mfcc3, mfcc4, mfcc5, mfcc6, mfcc7, mfcc8, mfcc9, mfcc10, mfcc11, mfcc12] = mfcc_mean
            return mfcc0, mfcc1, mfcc2, mfcc3, mfcc4, mfcc5, mfcc6, mfcc7, mfcc8, mfcc9, mfcc10, mfcc11, mfcc12
        except Exception as exc:
            #print(exc)
            print("file " + voiceID + " was not read because it is not a sound file - ")
            return -1

In [69]:
def get_list_of_audiofiles(dir_path):
    res = []
    pattern = "*.wav"

    for name in os.listdir(dir_path):
        file_path = os.path.join(dir_path, name)
        if os.path.isfile(file_path) and fnmatch(name, pattern):
            res.append(name)

    return res

In [70]:
def get_features_from_audiofiles(pd_audiofiles_path, control_audiofiles_path):


    pd_audiofiles_names = get_list_of_audiofiles(pd_audiofiles_path)
    print(pd_audiofiles_path, "PD audiofiles: ", pd_audiofiles_names)
    control_audiofiles_names = get_list_of_audiofiles(control_audiofiles_path)
    print(control_audiofiles_path, "HC audiofiles: ", control_audiofiles_names)

    if flag_extract_mfcc:

        original_features_df = pd.DataFrame(columns=['Duration', 'meanF0', 'stdevF0', 'hnr', 'localJitter', 'localabsoluteJitter', 'rapJitter', 'ppq5Jitter', 'ddpJitter', 'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'aqpq5Shimmer', 'apq11Shimmer', 'ddaShimmer', 'mfcc0', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12','PD'])

    else:
        original_features_df = pd.DataFrame(columns=['Duration', 'meanF0', 'stdevF0', 'hnr', 'localJitter', 'localabsoluteJitter', 'rapJitter', 'ppq5Jitter', 'ddpJitter', 'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'aqpq5Shimmer', 'apq11Shimmer', 'ddaShimmer','PD'])


    for i in (range(len(pd_audiofiles_names)-1)):
        values = get_features(pd_audiofiles_path+'/'+pd_audiofiles_names[i], 75, 500, "Hertz")

        if flag_extract_mfcc:
            mfcc_values = extract_mfcc(pd_audiofiles_path+'/'+pd_audiofiles_names[i])
        
        if (values!=-1):

            duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer = values
            
            if (flag_extract_mfcc and mfcc_values!=-1):
                mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12 = mfcc_values
                original_features_df.loc[len(original_features_df)] = [duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer, mfcc0, mfcc1, mfcc2, mfcc3, mfcc4, mfcc5, mfcc6, mfcc7, mfcc8, mfcc9, mfcc10, mfcc11, mfcc12,1]
            
            else:
                original_features_df.loc[len(original_features_df)] = [duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer,1]

    for i in (range(len(control_audiofiles_names)-1)):
        values = get_features(control_audiofiles_path+'/'+control_audiofiles_names[i], 75, 500, "Hertz")
        if flag_extract_mfcc:
            mfcc_values = extract_mfcc(control_audiofiles_path+'/'+control_audiofiles_names[i])

        if (values!=-1):
            duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer = values

            if (flag_extract_mfcc and mfcc_values!=-1):
                mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12 = mfcc_values
                original_features_df.loc[len(original_features_df)] = [duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer, mfcc0, mfcc1, mfcc2, mfcc3, mfcc4, mfcc5, mfcc6, mfcc7, mfcc8, mfcc9, mfcc10, mfcc11, mfcc12,0]
            
            else:
                original_features_df.loc[len(original_features_df)] = [duration, meanF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer, 0]

    return original_features_df


In [71]:
if flag_mdvr_db:
    mdvr_original_features_df = get_features_from_audiofiles(mdvr_pd_audiofiles_path, mdvr_control_audiofiles_path)
    print(mdvr_original_features_df)

../data/audios/MDVR_PD PD audiofiles:  ['id29chunk6.wav', 'id17chunk3.wav', 'id34chunk20.wav', 'id24chunk5.wav', 'id30chunk5.wav', 'id06chunk19.wav', 'id06chunk31.wav', 'id06chunk25.wav', 'id06chunk24.wav', 'id06chunk30.wav', 'id06chunk18.wav', 'id30chunk4.wav', 'id24chunk4.wav', 'id34chunk21.wav', 'id17chunk2.wav', 'id29chunk7.wav', 'id17chunk0.wav', 'id29chunk5.wav', 'id34chunk23.wav', 'id24chunk6.wav', 'id30chunk6.wav', 'id17chunk18.wav', 'id06chunk26.wav', 'id06chunk32.wav', 'id20chunk18.wav', 'id20chunk19.wav', 'id06chunk27.wav', 'id17chunk19.wav', 'id30chunk7.wav', 'id24chunk7.wav', 'id34chunk22.wav', 'id29chunk4.wav', 'id17chunk1.wav', 'id17chunk5.wav', 'id29chunk0.wav', 'id24chunk3.wav', 'id34chunk26.wav', 'id30chunk3.wav', 'id17chunk21.wav', 'id06chunk23.wav', 'id20chunk20.wav', 'id06chunk22.wav', 'id17chunk20.wav', 'id30chunk2.wav', 'id34chunk27.wav', 'id24chunk2.wav', 'id29chunk1.wav', 'id17chunk4.wav', 'id27chunk8.wav', 'id06chunk9.wav', 'id29chunk3.wav', 'id17chunk6.wav', 

     Duration      meanF0    stdevF0        hnr  localJitter  \
0    0.260998  114.559712  11.523052  14.191471     0.028422   
1    0.632993  244.628674  18.861410  20.751485     0.011556   
2    0.844989  223.685833  62.412158  15.327212     0.020620   
3    1.274989  132.029628  12.062114  12.557775     0.017816   
4    3.637007  232.768539  37.333866  13.153182     0.021496   
..        ...         ...        ...        ...          ...   
809  0.959002  193.209988  22.001734  13.987776     0.016422   
810  1.102993  117.653228   8.975167  12.096987     0.023460   
811  1.466009  209.829420  75.861320   7.536627     0.027930   
812  1.248005  153.326690  19.930532  12.334277     0.033035   
813  0.916009  183.911653  19.803819  12.777012     0.018326   

     localabsoluteJitter  rapJitter  ppq5Jitter  ddpJitter  localShimmer  \
0               0.000247   0.013444    0.013165   0.040333      0.125351   
1               0.000047   0.004619    0.005859   0.013858      0.053902   
2  

In [72]:
if flag_italian_db:
    italian_original_features_df = get_features_from_audiofiles(italian_pd_audiofiles_path, italian_control_audiofiles_path)
    print(italian_original_features_df)


../data/audios/Italian_PD PD audiofiles:  ['VA1VSIOTLOP47M100220171331.wav', 'VU1sncihcio44M1606161724.wav', 'VI2cdaopmoe67M2605161911.wav', 'VI1rlouscsi77F2605161827.wav', 'VE2ssacvhei61M1606161745.wav', 'VI2GLIAUDLO50F100220171303.wav', 'VU1VSIPTIOZ46M240120171932.wav', 'VE2VSIPTIOZ46M240120171927.wav', 'VE2GNIEOGVL47M100220171217.wav', 'VO1GPIUUGLL63F100220171026.wav', 'VI1sncihcio44M1606161722.wav', 'VI1AGNUTGOL52F100220171051.wav', 'VA2lloeroun56F2605161927.wav', 'VE2NCIICAOC52M100220171240.wav', 'VO2lbuairgo52M1606161817.wav', 'VI2RROIBVEI49M240120171909.wav', 'VE1DLAARCII37F100220171115.wav', 'VO1ABNINSAC46F240120171802.wav', 'VA1rlouscsi77F2605161825.wav', 'VI2VSIOTLOP47M100220171334.wav', 'VO2RROIBVEI49M240120171910.wav', 'VE2NSICCHOI44M240120171847.wav', 'VI2lloeroun56F2605161930.wav', 'VA2cdaopmoe67M2605161907.wav', 'VU2RLOABREE42M240120171952.wav', 'VI1MCIICLHL46M240120171831.wav', 'VU2ABNINSAC46F240120171804.wav', 'VU1NCIICAOC52M100220171243.wav', 'VA2ABNINSAC46F2401201717

In [73]:
if flag_ah_db:
        ah_original_features_df = get_features_from_audiofiles(ah_pd_audiofiles_path, ah_control_audiofiles_path)
        print(ah_original_features_df)

../data/audios/AH_PD/ PD audiofiles:  ['AH_545713223-E6D59EE5-4C3F-4B40-AE8F-0657EF94DB66.wav', 'AH_545812844-DFBCDA22-CADB-444A-9623-16A39D45E9E7.wav', 'AH_545789682-7554E0C7-4E25-49C3-9E6C-04D525455E28.wav', 'AH_545789671-794D2256-DDFF-4009-8BA8-8A306C8FA14F.wav', 'AH_545841222-DE5AEF27-7F4E-45A4-BF7D-9E87E7A786AE.wav', 'AH_545622719-52C23861-6E0D-41E0-A3D8-9358C28C019B.wav', 'AH_545841227-5C77713A-66F1-49D0-BC8A-702C152E668D.wav', 'AH_545713221-1E77C030-4558-4A88-B1A2-6AB777ACAE61.wav', 'AH_545841226-C699FC9E-1E0C-474D-A12A-936DD92B8980.wav', 'AH_545692309-EA8C4DC0-9B2A-4CC7-A490-851A2129A733.wav', 'AH_545616858-3A749CBC-3FEB-4D35-820E-E45C3E5B9B6A.wav', 'AH_545834603-857E007F-1CCF-4249-8160-3A0F3F5AB58D.wav', 'AH_545713222-DA13DC3A-F24B-454E-984F-19DF19328D39.wav', 'AH_545880204-EE87D3E2-0D4C-4EAA-ACD7-C3F177AFF62F.wav', 'AH_545789677-D381D801-B073-4945-BE0D-E250126EA6B1.wav', 'AH_545789670-C297FD53-BF71-4183-86A0-58E5E1EB0DF8.wav', 'AH_545622720-E1486AF6-8C95-47EB-829B-4D62698C987

In [74]:
if flag_czech_db:
        czech_original_features_df = get_features_from_audiofiles(czech_pd_audiofiles_path, czech_control_audiofiles_path)
        print(czech_original_features_df)

../data/audios/Czech_PD PD audiofiles:  ['PD2a1_clean.wav', 'PD16a1_clean.wav', 'PD9a2_clean.wav', 'PD14a2_clean.wav', 'PD13a1_clean.wav', 'PD11a2_clean.wav', 'PD21a1_clean.wav', 'PD7a1_clean.wav', 'PD3a3_clean.wav', 'PD18a2_clean.wav', 'PD5a2_clean.wav', 'PD6a2_clean.wav', 'PD4a1_clean.wav', 'PD19a1_clean.wav', 'PD22a1_clean.wav', 'PD12a2_clean.wav', 'PD10a1_clean.wav', 'PD20a2_clean.wav', 'PD17a2_clean.wav', 'PD15a1_clean.wav', 'PD8a1_clean.wav', 'PD3a2_clean.wav', 'PD1a1_clean.wav', 'PD5a3_clean.wav', 'PD4a2_clean.wav', 'PD19a2_clean.wav', 'PD6a1_clean.wav', 'PD20a1_clean.wav', 'PD10a2_clean.wav', 'PD12a1_clean.wav', 'PD22a2_clean.wav', 'PD15a2_clean.wav', 'PD8a2_clean.wav', 'PD17a1_clean.wav', 'PD1a2_clean.wav', 'PD3a1_clean.wav', 'PD2a2_clean.wav', 'PD9a1_clean.wav', 'PD14a1_clean.wav', 'PD10a3_clean.wav', 'PD16a2_clean.wav', 'PD22a3_clean.wav', 'PD21a2_clean.wav', 'PD11a1_clean.wav', 'PD13a2_clean.wav', 'PD18a1_clean.wav', 'PD5a1_clean.wav', 'PD7a2_clean.wav']
../data/audios/Czec

In [75]:
def save_file(df, outdir, outname):
    df.to_csv(os.path.join(outdir, outname),index=False)

In [76]:
general_outname = "_original_features.csv"

italian_outname = 'italian' + general_outname
mdvr_outname = 'mdvr' + general_outname
ah_outname = 'ah' + general_outname
czech_outname = "czech" + general_outname

outdir = '../data/tables/'
if not os.path.exists(outdir):
    os.mkdir(outdir)

if flag_italian_db:
    save_file(italian_original_features_df, outdir, italian_outname)

if flag_mdvr_db:
    save_file(mdvr_original_features_df, outdir, mdvr_outname)


if flag_ah_db:
    save_file(ah_original_features_df, outdir, ah_outname)

if flag_czech_db:
    save_file(czech_original_features_df, outdir, czech_outname)
