In [16]:
import pydub 
import numpy as np
import sys
modulePath = '../../ChristiansPythonLibrary/src' 
sys.path.append(modulePath)
import generalUtility
import dspUtil
import praatUtil
import generalUtility
import matplotlibUtil
import parselmouth 

# Get Jitter
def calculateJitter(data):
    """Return the mean absolute sample-to-sample change as a percentage of the mean.

    Parameters
    ----------
    data : array-like of numbers
        Consecutive measurements. In this notebook the caller passes pitch
        periods computed as ``1000 / F0``. Entries equal to 0 are discarded
        before anything is computed.

    Returns
    -------
    float
        ``100 * mean(|x[i] - x[i-1]|) / mean(x)`` over the non-zero entries.
        NaN is returned when fewer than two non-zero entries remain, because
        no consecutive difference exists in that case.
    """
    values = np.asarray(data, dtype=float)
    values = values[values != 0]
    # At least two samples are needed to form one consecutive difference;
    # dividing by (n - 1) or n would otherwise be a division by zero.
    if values.size < 2:
        return float('nan')
    mean_abs_delta = np.mean(np.abs(np.diff(values)))
    return 100 * (mean_abs_delta / np.mean(values))


#Get Shimmer
def calculateShimmer(data):
    """Return the mean absolute sample-to-sample change as a percentage of the mean.

    The formula is the same one ``calculateJitter`` uses; only the input
    differs. In this notebook the caller passes the per-frame ``'strength'``
    values of a pitch object.

    Parameters
    ----------
    data : array-like of numbers
        Consecutive measurements. Entries equal to 0 are discarded before
        anything is computed.

    Returns
    -------
    float
        ``100 * mean(|x[i] - x[i-1]|) / mean(x)`` over the non-zero entries.
        NaN is returned when fewer than two non-zero entries remain, because
        no consecutive difference exists in that case.
    """
    samples = np.asarray(data, dtype=float)
    samples = samples[samples != 0]
    # Guard the degenerate cases that would otherwise divide by zero.
    if samples.size < 2:
        return float('nan')
    consecutive_change = np.abs(np.diff(samples))
    return 100 * (np.mean(consecutive_change) / np.mean(samples))

def getStatistic(numpy_arr):
    """Summarise the non-zero entries of a sequence with eight statistics.

    Parameters
    ----------
    numpy_arr : array-like of numbers
        Input values. Entries equal to 0 are removed first. NaN entries are
        not removed and therefore propagate into the results.

    Returns
    -------
    numpy.ndarray
        Eight values in this fixed order: maximum, minimum, range (max - min),
        mean, median, 25th percentile, 75th percentile, standard deviation
        (``np.std`` default, i.e. population form). When no non-zero entry
        remains, all eight values are NaN instead of raising.
    """
    values = np.asarray(numpy_arr)
    values = values[values != 0]
    # np.max / np.min raise ValueError on an empty array, so report
    # "undefined" explicitly while keeping the output length stable.
    if values.size == 0:
        return np.full(8, np.nan)
    max_v = np.max(values)
    min_v = np.min(values)
    return np.array([
        max_v,
        min_v,
        max_v - min_v,
        np.mean(values),
        np.median(values),
        np.percentile(values, 25),
        np.percentile(values, 75),
        np.std(values),
    ])


def estimate_voiced_unvoiced_and_breaks(file, THRESHOLD_UNVOICED = 0.5 ):
    """Split a recording into voiced runs, long unvoiced runs and short breaks.

    A pitch object is computed for ``file`` with a 0.01 s time step, and its
    per-frame ``'strength'`` values are scanned in order. Consecutive frames
    are grouped into runs of non-zero strength (voiced) and runs of zero
    strength. Every frame, including the first and the last, belongs to
    exactly one run.

    Parameters
    ----------
    file : object accepted by ``parselmouth.Sound``
        The recording to analyse.
    THRESHOLD_UNVOICED : float, default 0.5
        Duration in seconds. A zero-strength run at least this long is
        reported as unvoiced; a shorter one is reported as a break.

    Returns
    -------
    tuple of three lists of float
        ``(num_voiced, num_unvoiced, num_breaks)``. Each list holds the
        duration in seconds (frame count * 0.01) of every run of that kind,
        in order of appearance. All three lists are empty when the pitch
        object has no frames.
    """
    time_step = 0.01
    snd = parselmouth.Sound(file)
    pitch = snd.to_pitch(voicing_threshold=0.45, silence_threshold=0.03, time_step=time_step)
    amplitude_arr = pitch.selected_array['strength']
    # Diagnostic output: number of analysis frames.
    print(len(amplitude_arr))

    num_voiced = []
    num_unvoiced = []
    num_breaks = []
    min_unvoiced_frames = THRESHOLD_UNVOICED / time_step

    def record_run(is_voiced, n_frames):
        """File one finished run under voiced, unvoiced or break."""
        duration = n_frames * time_step
        if is_voiced:
            num_voiced.append(duration)
        elif n_frames >= min_unvoiced_frames:
            num_unvoiced.append(duration)
        else:
            num_breaks.append(duration)

    run_is_voiced = None
    run_length = 0
    for strength in amplitude_arr:
        is_voiced = bool(strength != 0)
        # A change of class closes the current run; the frame that caused the
        # change then becomes the first frame of the next run.
        if run_length and is_voiced != run_is_voiced:
            record_run(run_is_voiced, run_length)
            run_length = 0
        run_is_voiced = is_voiced
        run_length += 1

    # The last run has no following frame to close it, so flush it here.
    if run_length:
        record_run(run_is_voiced, run_length)

    return num_voiced, num_unvoiced, num_breaks


def getAllFeatures(file):
    """Extract a flat acoustic feature vector from one audio file.

    Parameters
    ----------
    file : object accepted by ``parselmouth.Sound``
        The recording to analyse.

    Returns
    -------
    numpy.ndarray
        One-dimensional array whose entries are, in order:

        1. sound energy (1 value)
        2. ``getStatistic`` of the F0 track
        3. ``getStatistic`` of the per-frame pitch ``intensity``
        4. for formants 1, 2, 3 in turn: ``getStatistic`` of the formant
           values, then ``getStatistic`` of the formant bandwidths
        5. ``getStatistic`` of F2 / (F1 + 1), then of F3 / (F1 + 1)
        6. jitter, computed from the periods ``1000 / F0`` of non-zero F0 frames
        7. shimmer, computed from the pitch ``'strength'`` values
        8. sound duration
        9. unvoiced fraction: (unvoiced + break time) / duration
        10. break degree: total break time / total voiced time
        11. longest break and average break (both 0.0 when there are no breaks)

        With ``getStatistic`` returning 8 values this is 88 entries.
    """
    # Pitch frames and their time stamps (parselmouth frame numbers start at 1).
    snd = parselmouth.Sound(file)
    pitch = snd.to_pitch(time_step = 0.01)
    formants = snd.to_formant_burg()
    frame_numbers = range(1, pitch.get_number_of_frames() + 1)
    frames = [pitch.get_frame(i) for i in frame_numbers]
    times = [pitch.get_time_from_frame_number(i) for i in frame_numbers]

    # Feature groups are collected in order and flattened once at the end.
    parts = [snd.get_energy()]

    # F0 statistics
    f0_arr = np.array(pitch.selected_array['frequency'])
    parts.append(getStatistic(f0_arr))

    # Intensity statistics
    parts.append(getStatistic([frame.intensity for frame in frames]))

    # Formant value and bandwidth statistics, sampled at the pitch frame times.
    formant_tracks = {}
    for number in (1, 2, 3):
        values = [formants.get_value_at_time(number, time) for time in times]
        bandwidths = [formants.get_bandwidth_at_time(number, time) for time in times]
        parts.append(getStatistic(values))
        parts.append(getStatistic(bandwidths))
        formant_tracks[number] = np.array(values)

    # F2/F1 and F3/F1 statistics; the +1 in the denominator presumably guards
    # against division by zero where F1 is 0.
    f1_plus_one = formant_tracks[1] + 1
    parts.append(getStatistic(formant_tracks[2] / f1_plus_one))
    parts.append(getStatistic(formant_tracks[3] / f1_plus_one))

    # Jitter from the periods of frames with non-zero F0.
    voiced_f0 = f0_arr[f0_arr != 0]
    parts.append(calculateJitter(1000 / voiced_f0))

    # Shimmer from the per-frame pitch strength values.
    parts.append(calculateShimmer(pitch.selected_array['strength']))

    # Duration
    duration = snd.duration
    parts.append(duration)

    # Voiced / unvoiced / break periods: a zero-strength run of at least 0.5 s
    # counts as unvoiced, a shorter one as a break.
    voices, unvoices, breaks = estimate_voiced_unvoiced_and_breaks(file, THRESHOLD_UNVOICED = 0.5)
    unvoiced_percent = (np.sum(unvoices) + np.sum(breaks)) / duration
    breaks_degree = np.sum(breaks) / np.sum(voices)
    if len(breaks) > 0:
        max_dur_pause = np.max(breaks)
        average_dur_pause = np.average(breaks)
    else:
        # np.max raises on an empty list; a recording without breaks has no
        # pause time at all, so both pause statistics are reported as 0.
        max_dur_pause = 0.0
        average_dur_pause = 0.0
    parts.extend([unvoiced_percent, breaks_degree, max_dur_pause, average_dur_pause])

    return np.concatenate([np.ravel(part) for part in parts])

# Run the feature extraction on a sample recording. As the last expression of
# the cell, the returned feature array is shown as the cell output.
getAllFeatures("video.wav")
    
    
    








2699


array([1.44175843e-02, 5.86906206e+02, 9.06415217e+01, 4.96264685e+02,
       1.22418843e+02, 1.09018819e+02, 1.01396781e+02, 1.33471929e+02,
       4.31461743e+01, 9.37612558e-01, 1.42526300e-07, 9.37612415e-01,
       2.55928919e-01, 2.25038824e-01, 7.58316631e-02, 4.01655646e-01,
       1.89909261e-01, 1.96613516e+03, 8.29369029e+01, 1.88319826e+03,
       6.06754306e+02, 5.33331882e+02, 4.10733261e+02, 6.76814977e+02,
       3.08010929e+02, 4.17067624e+03, 1.97762392e+01, 4.15090001e+03,
       4.72876718e+02, 3.77961932e+02, 1.91026155e+02, 6.50877734e+02,
       3.93641096e+02, 3.96926030e+03, 5.17375250e+02, 3.45188505e+03,
       1.81333838e+03, 1.77827626e+03, 1.63697769e+03, 1.95547476e+03,
       3.69778082e+02, 6.86650619e+03, 3.13862822e+01, 6.83511991e+03,
       5.21818621e+02, 4.00644888e+02, 2.46661113e+02, 6.22061913e+02,
       5.08749227e+02, 5.43713725e+03, 1.70775632e+03, 3.72938092e+03,
       2.87735429e+03, 2.82232345e+03, 2.51327353e+03, 3.13365407e+03,
      

In [None]:
# Exploratory cell: load one recording and compute its pitch and formant objects.
# Note that silence_threshold is 0.00 here, whereas
# estimate_voiced_unvoiced_and_breaks uses 0.03.
snd = parselmouth.Sound("ses01.wav")
pitch = snd.to_pitch(voicing_threshold=0.45 ,silence_threshold = 0.00, time_step = 0.01)
formants = snd.to_formant_burg()
num_frames = pitch.get_number_of_frames()
# Frame objects and frame times for frame numbers 1..num_frames.
# NOTE(review): formants, frames and times are not used again in this cell.
frames = [pitch.get_frame(i) for i in range(1, num_frames+1)]
times = [pitch.get_time_from_frame_number(i) for i in range(1, num_frames+1)]
#print("len times: ", len(times))

# Fraction of pitch frames that are not voiced. The printed label says
# "number", but the value is a ratio (non-voiced frames / all frames).
unvoice_per = (num_frames - pitch.count_voiced_frames())/ num_frames
print("real number of unvoice: ", unvoice_per)

# Per-frame 'strength' values of the pitch object, kept for inspection.
amplitude_arr = pitch.selected_array['strength']
#print("amplitude_arr: ", amplitude_arr[0:500])


                
                
            
            
            
        
        
    
    

    

In [None]:
# Scratch calculation: 0.32 divided by 0.01. Presumably a 0.32 s duration
# expressed as a count of 0.01 s frames; TODO confirm or delete this cell.
0.32/0.01