# Speech-Based Feature Extraction

In [13]:
from parselmouth.praat import call

import pandas as pd
import parselmouth

## Load `Train` and `Test` Data

In [14]:
train_csv = pd.read_csv('../data/train.csv')
train_csv.head(3)

Unnamed: 0,dialog_id,speaker,transcript,da_tag,start_time,end_time,function,pronoun,ppron,i,...,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler
0,sw2005,A,okay,"fo_o_fw_""""_by_bc",0.0,1.31597,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,sw2005,B,SIL,x,0.0,10.94882,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,sw2005,A,uh first um i need to know uh how do you feel ...,qo,1.31597,10.93013,0.384615,0.076923,0.076923,0.038462,...,0.076923,0.0,0.0,0.0,0.230769,0.0,0.0,0.0,0.230769,0.0


In [15]:
test_csv = pd.read_csv('../data/test.csv')
test_csv.head(3)

Unnamed: 0,dialog_id,speaker,transcript,da_tag,start_time,end_time,function,pronoun,ppron,i,...,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler
0,sw2015,A,SIL,x,0.0,2.36986,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sw2015,B,SIL,x,0.0,24.29833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,sw2015,A,have you ever gotten one of those calls that i...,qy,2.36986,7.66596,0.619048,0.190476,0.047619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Extract Features

In [24]:
def get_acoustic_features(row, sound):
    sound = sound.extract_part(row['start_time'], row['end_time'])
    
    # Pitch
    pitch = call(sound, 'To Pitch', 0.0, 75, 500)
    pitch_min = call(pitch, "Get minimum", 0, 0, "hertz", "Parabolic")
    pitch_max = call(pitch, "Get maximum", 0, 0, "hertz", "Parabolic")
    pitch_mean = call(pitch, "Get mean", 0, 0, "hertz")
    pitch_sd = call(pitch, "Get standard deviation", 0, 0, "hertz")

    # Intensity
    intensity = call(sound, 'To Intensity', 75, False)
    intensity_min = call(intensity, "Get minimum", 0, 0, "Parabolic")
    intensity_max = call(intensity, "Get maximum", 0, 0, "Parabolic")
    intensity_mean = call(intensity, "Get mean", 0, 0, "energy")
    intensity_sd = call(intensity, "Get standard deviation", 0, 0)

    # Speaking Rate
    duration = row['end_time'] - row['start_time']
    num_of_words = len(row['transcript'].split())
    speaking_rate = num_of_words / duration

    # Jitter and Shimmer
    point_process = call(sound, 'To PointProcess (periodic, cc)...', 75, 500)
    jitter = call(point_process, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    shimmer = call([sound, point_process], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    # HNR
    harmonicity = call(sound, 'To Harmonicity (cc)', 0.01, 75, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)

    # Return features.
    return [row['dialog_id'], row['speaker'], row['start_time'], row['end_time'], \
            pitch_min, pitch_max, pitch_mean, pitch_sd, \
            intensity_min, intensity_max, intensity_mean, intensity_sd, \
            speaking_rate, jitter, shimmer, hnr]

In [26]:
data = []

for row, item in test_csv.groupby(['dialog_id', 'speaker']):
    dialog_id = row[0]
    speaker = row[1]
    
    sound_file = '../data/wav/{}_{}.wav'.format(dialog_id, speaker) 
    sound = parselmouth.Sound(sound_file)
    
    item.apply(lambda x: data.append(get_acoustic_features(x, sound)), axis=1)
    
    break
    
    

In [29]:
data

[['sw2015',
  'A',
  0.0,
  2.36986,
  174.04650823013716,
  174.06674614912873,
  174.05417923179394,
  0.0075313808715490455,
  27.397755268631183,
  78.52645112617813,
  61.63282562663639,
  8.104073724126238,
  0.42196585452305196,
  0.00011084342218634985,
  0.030335294014862426,
  13.731802749838227],
 ['sw2015',
  'A',
  2.36986,
  7.66596,
  93.31337303997032,
  256.3875799280019,
  127.35247369673222,
  35.21017006281883,
  27.913918381846745,
  62.48622159407356,
  55.339086904589514,
  9.22962736169861,
  3.9651819263231434,
  0.01621763935264894,
  0.09097900523436497,
  10.92744196631329],
 ['sw2015',
  'A',
  7.66596,
  11.57304,
  75.65110814732344,
  148.32709422476393,
  117.17362006220988,
  16.502379155695586,
  27.797807252722578,
  65.91988132573047,
  55.229902701887006,
  12.94172995501146,
  1.5356737000522127,
  0.015931788647952097,
  0.0757815136452784,
  12.3924463022302],
 ['sw2015',
  'A',
  11.57304,
  16.74866,
  82.13205720844904,
  151.84444951606713,


In [30]:
columns = ['dialog_id', 'speaker', 'start_time', 'end_time', \
                 'Speech File', 'Min Pitch', 'Max Pitch', 'Mean Pitch', 'Sd Pitch', \
                 'Min Intensity', 'Max Intensity', 'Mean Intensity', 'Sd Intensity', \
                 'Speaking Rate', 'Jitter', 'Shimmer', 'HNR']
df = pd.DataFrame(data, columns=columns)

ValueError: 17 columns passed, passed data had 16 columns

In [27]:
data

[['sw2015',
  'A',
  0.0,
  2.36986,
  174.04650823013716,
  174.06674614912873,
  174.05417923179394,
  0.0075313808715490455,
  27.397755268631183,
  78.52645112617813,
  61.63282562663639,
  8.104073724126238,
  0.42196585452305196,
  0.00011084342218634985,
  0.030335294014862426,
  13.731802749838227],
 ['sw2015',
  'A',
  2.36986,
  7.66596,
  93.31337303997032,
  256.3875799280019,
  127.35247369673222,
  35.21017006281883,
  27.913918381846745,
  62.48622159407356,
  55.339086904589514,
  9.22962736169861,
  3.9651819263231434,
  0.01621763935264894,
  0.09097900523436497,
  10.92744196631329],
 ['sw2015',
  'A',
  7.66596,
  11.57304,
  75.65110814732344,
  148.32709422476393,
  117.17362006220988,
  16.502379155695586,
  27.797807252722578,
  65.91988132573047,
  55.229902701887006,
  12.94172995501146,
  1.5356737000522127,
  0.015931788647952097,
  0.0757815136452784,
  12.3924463022302],
 ['sw2015',
  'A',
  11.57304,
  16.74866,
  82.13205720844904,
  151.84444951606713,


In [5]:
# extract a segment

sound_file = "../data/wav/{}_{}.wav".format("sw2005", 'A')
sound = parselmouth.Sound(sound_file)


In [6]:
call(sound, "sel")

PraatError: Command "sel" not available for given objects.

In [19]:
sound_part = sound.extract_part(0, 1)
pitch = call(sound_part, 'To Pitch', 0.0, 75, 500)
pitch_mean = call(pitch, "Get minimum", 0, 0, "hertz", "Parabolic")
pitch_mean

116.06667736442307

In [12]:
sound_part_2 = call(sound, "Extract part", 0, 1)

PraatError: Command requires more than the given 2 arguments: no value for argument "Window shape".