# Speech-Based Feature Extraction

In [13]:
from parselmouth.praat import call

import pandas as pd
import parselmouth

## Load `Train` and `Test` Data

In [14]:
train_csv = pd.read_csv('../data/train.csv')
train_csv.head(3)

Unnamed: 0,dialog_id,speaker,transcript,da_tag,start_time,end_time,function,pronoun,ppron,i,...,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler
0,sw2005,A,okay,"fo_o_fw_""""_by_bc",0.0,1.31597,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,sw2005,B,SIL,x,0.0,10.94882,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,sw2005,A,uh first um i need to know uh how do you feel ...,qo,1.31597,10.93013,0.384615,0.076923,0.076923,0.038462,...,0.076923,0.0,0.0,0.0,0.230769,0.0,0.0,0.0,0.230769,0.0


In [15]:
test_csv = pd.read_csv('../data/test.csv')
test_csv.head(3)

Unnamed: 0,dialog_id,speaker,transcript,da_tag,start_time,end_time,function,pronoun,ppron,i,...,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler
0,sw2015,A,SIL,x,0.0,2.36986,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sw2015,B,SIL,x,0.0,24.29833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,sw2015,A,have you ever gotten one of those calls that i...,qy,2.36986,7.66596,0.619048,0.190476,0.047619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Extract Features

In [24]:
def get_acoustic_features(row, sound):
    sound = sound.extract_part(row['start_time'], row['end_time'])
    
    # Pitch
    pitch = call(sound, 'To Pitch', 0.0, 75, 500)
    pitch_min = call(pitch, "Get minimum", 0, 0, "hertz", "Parabolic")
    pitch_max = call(pitch, "Get maximum", 0, 0, "hertz", "Parabolic")
    pitch_mean = call(pitch, "Get mean", 0, 0, "hertz")
    pitch_sd = call(pitch, "Get standard deviation", 0, 0, "hertz")

    # Intensity
    intensity = call(sound, 'To Intensity', 75, False)
    intensity_min = call(intensity, "Get minimum", 0, 0, "Parabolic")
    intensity_max = call(intensity, "Get maximum", 0, 0, "Parabolic")
    intensity_mean = call(intensity, "Get mean", 0, 0, "energy")
    intensity_sd = call(intensity, "Get standard deviation", 0, 0)

    # Speaking Rate
    duration = row['end_time'] - row['start_time']
    num_of_words = len(row['transcript'].split())
    speaking_rate = num_of_words / duration

    # Jitter and Shimmer
    point_process = call(sound, 'To PointProcess (periodic, cc)...', 75, 500)
    jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    shimmer = call([sound, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    # HNR
    harmonicity = call(sound, 'To Harmonicity (cc)', 0.01, 75, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)

    # Return features.
    return [row['dialog_id'], row['speaker'], row['start_time'], row['end_time'], \
            pitch_min, pitch_max, pitch_mean, pitch_sd, \
            intensity_min, intensity_max, intensity_mean, intensity_sd, \
            speaking_rate, jitter, shimmer, hnr]

In [34]:
def get_speech_features(df):
    columns = ['dialog_id', 'speaker', 'start_time', 'end_time', \
           'Min Pitch', 'Max Pitch', 'Mean Pitch', 'Sd Pitch', \
           'Min Intensity', 'Max Intensity', 'Mean Intensity', 'Sd Intensity', \
           'Speaking Rate', 'Jitter', 'Shimmer', 'HNR'] 
    data = []
    
    for row, item in df.groupby(['dialog_id', 'speaker']):
        dialog_id = row[0]
        speaker = row[1]

        sound_file = '../data/wav/{}_{}.wav'.format(dialog_id, speaker) 
        sound = parselmouth.Sound(sound_file)

        item.apply(lambda x: data.append(get_acoustic_features(x, sound)), axis=1)

        break
    
    return pd.DataFrame(data, columns=columns)

In [35]:
get_speech_features(test_csv)

Unnamed: 0,dialog_id,speaker,start_time,end_time,Min Pitch,Max Pitch,Mean Pitch,Sd Pitch,Min Intensity,Max Intensity,Mean Intensity,Sd Intensity,Speaking Rate,Jitter,Shimmer,HNR
0,sw2015,A,0.00000,2.36986,174.046508,174.066746,174.054179,0.007531,27.397755,78.526451,61.632826,8.104074,0.421966,0.000111,0.030335,13.731803
1,sw2015,A,2.36986,7.66596,93.313373,256.387580,127.352474,35.210170,27.913918,62.486222,55.339087,9.229627,3.965182,0.016218,0.090979,10.927442
2,sw2015,A,7.66596,11.57304,75.651108,148.327094,117.173620,16.502379,27.797807,65.919881,55.229903,12.941730,1.535674,0.015932,0.075782,12.392446
3,sw2015,A,11.57304,16.74866,82.132057,151.844450,117.921692,11.688333,27.548523,63.903176,56.596460,14.108906,1.738922,0.014186,0.064153,12.949125
4,sw2015,A,16.74866,20.80178,74.854123,407.131028,129.184825,47.330238,27.540912,65.497511,57.328283,13.808277,2.467235,0.015583,0.069807,11.831661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,sw2015,A,381.56188,383.24834,163.752770,348.172476,223.554222,40.054687,27.316917,48.392368,39.999992,5.758162,0.592958,0.020688,0.108311,9.751818
69,sw2015,A,383.24834,384.42294,83.653259,147.365353,123.271998,16.443490,27.638665,56.784067,49.217052,7.984649,4.256768,0.019994,0.098310,11.530293
70,sw2015,A,384.42294,384.92729,166.953633,301.245847,239.398509,49.535087,26.988529,53.151807,47.732195,7.509400,1.982750,0.023686,0.134450,5.398692
71,sw2015,A,384.92729,385.32225,91.995897,239.272015,143.160936,59.336251,28.351365,54.527380,49.673930,8.396001,5.063804,0.030805,0.134529,5.887310


In [26]:
data = []

for row, item in test_csv.groupby(['dialog_id', 'speaker']):
    dialog_id = row[0]
    speaker = row[1]
    
    sound_file = '../data/wav/{}_{}.wav'.format(dialog_id, speaker) 
    sound = parselmouth.Sound(sound_file)
    
    item.apply(lambda x: data.append(get_acoustic_features(x, sound)), axis=1)
    
    break
    
    

In [31]:
columns = ['dialog_id', 'speaker', 'start_time', 'end_time', \
           'Min Pitch', 'Max Pitch', 'Mean Pitch', 'Sd Pitch', \
           'Min Intensity', 'Max Intensity', 'Mean Intensity', 'Sd Intensity', \
           'Speaking Rate', 'Jitter', 'Shimmer', 'HNR']
df = pd.DataFrame(data, columns=columns)

In [32]:
df

Unnamed: 0,dialog_id,speaker,start_time,end_time,Min Pitch,Max Pitch,Mean Pitch,Sd Pitch,Min Intensity,Max Intensity,Mean Intensity,Sd Intensity,Speaking Rate,Jitter,Shimmer,HNR
0,sw2015,A,0.00000,2.36986,174.046508,174.066746,174.054179,0.007531,27.397755,78.526451,61.632826,8.104074,0.421966,0.000111,0.030335,13.731803
1,sw2015,A,2.36986,7.66596,93.313373,256.387580,127.352474,35.210170,27.913918,62.486222,55.339087,9.229627,3.965182,0.016218,0.090979,10.927442
2,sw2015,A,7.66596,11.57304,75.651108,148.327094,117.173620,16.502379,27.797807,65.919881,55.229903,12.941730,1.535674,0.015932,0.075782,12.392446
3,sw2015,A,11.57304,16.74866,82.132057,151.844450,117.921692,11.688333,27.548523,63.903176,56.596460,14.108906,1.738922,0.014186,0.064153,12.949125
4,sw2015,A,16.74866,20.80178,74.854123,407.131028,129.184825,47.330238,27.540912,65.497511,57.328283,13.808277,2.467235,0.015583,0.069807,11.831661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,sw2015,A,381.56188,383.24834,163.752770,348.172476,223.554222,40.054687,27.316917,48.392368,39.999992,5.758162,0.592958,0.020688,0.108311,9.751818
69,sw2015,A,383.24834,384.42294,83.653259,147.365353,123.271998,16.443490,27.638665,56.784067,49.217052,7.984649,4.256768,0.019994,0.098310,11.530293
70,sw2015,A,384.42294,384.92729,166.953633,301.245847,239.398509,49.535087,26.988529,53.151807,47.732195,7.509400,1.982750,0.023686,0.134450,5.398692
71,sw2015,A,384.92729,385.32225,91.995897,239.272015,143.160936,59.336251,28.351365,54.527380,49.673930,8.396001,5.063804,0.030805,0.134529,5.887310


In [5]:
# extract a segment

sound_file = "../data/wav/{}_{}.wav".format("sw2005", 'A')
sound = parselmouth.Sound(sound_file)


In [6]:
call(sound, "sel")

PraatError: Command "sel" not available for given objects.

In [19]:
sound_part = sound.extract_part(0, 1)
pitch = call(sound_part, 'To Pitch', 0.0, 75, 500)
pitch_mean = call(pitch, "Get minimum", 0, 0, "hertz", "Parabolic")
pitch_mean

116.06667736442307

In [12]:
sound_part_2 = call(sound, "Extract part", 0, 1)

PraatError: Command requires more than the given 2 arguments: no value for argument "Window shape".