In [1]:
# import packages

import os # read system path 
import csv

import matplotlib as mpl
import matplotlib.pyplot as plt

import pandas as pd
import soundfile as sf
from gudhi.point_cloud import timedelay
import numpy as np
from numpy import argmax
import math
from ripser import ripser
from persim import plot_diagrams
import umap
%matplotlib qt5

from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# function define

# wav_fraction_finder is to find the corresponding wav signal according to interval
def wav_fraction_finder(start_time, end_time,sig):
    sig_fraction=sig[int(start_time*samplerate):int(end_time*samplerate)]
    return sig_fraction

# principle_frequency_finder is to find the period of a speech signal
def principle_frequency_finder(sig):
    t=int(len(sig)/2)
    corr=np.zeros(t)

    for index in np.arange(t):
        ACF_delay=sig[index:]
        L=(t-index)/2
        m = np.sum(sig[int(t-L):int(t+L+1)]**2) + np.sum(ACF_delay[int(t-L):int(t+L+1)]**2)
        r = np.sum(sig[int(t-L):int(t+L+1)]*ACF_delay[int(t-L):int(t+L+1)])
        corr[index] = 2*r/m

    zc = np.zeros(corr.size-1)
    zc[(corr[0:-1] < 0)*(corr[1::] > 0)] = 1
    zc[(corr[0:-1] > 0)*(corr[1::] < 0)] = -1

    admiss = np.zeros(corr.size)
    admiss[0:-1] = zc
    for i in range(1, corr.size):
        if admiss[i] == 0:
            admiss[i] = admiss[i-1]

    maxes = np.zeros(corr.size)
    maxes[1:-1] = (np.sign(corr[1:-1] - corr[0:-2])==1)*(np.sign(corr[1:-1] - corr[2::])==1)
    maxidx = np.arange(corr.size)
    maxidx = maxidx[maxes == 1]
    max_index = 0
    if len(corr[maxidx]) > 0:
        max_index = maxidx[np.argmax(corr[maxidx])]

    return (max_index, corr)

In [3]:
# Path is where the voiced/voicedless wav file located
voicedPath="/Users/pfeng3/Documents/research/TopCap/revise3/LJSpeech_seg_add500/voiced/"
voicedlessPath="/Users/pfeng3/Documents/research/TopCap/revise3/LJSpeech_seg_add500/voiceless/"

# Parameter for embedding
M=100 # embed dimension

In [7]:
# Retrive features from persistent diagram 
# For voiced data
for fn in os.listdir(voicedPath):
    # Subsample dataset, retrieve 1 in 10 among dataset
    randNum=np.random.randint(10)
    if randNum !=0:
        continue

    # Read wav file as "sig"
    fileName,ext=os.path.splitext(fn)
    wavFile=voicedPath+fileName+".wav"
    sig,samplerate=sf.read(wavFile)
    
    # Find the principle frequency, delay of sig
    T_voiced,corr=principle_frequency_finder(sig)
    delay_voiced=round(T_voiced*6/M)
    if delay_voiced==0:
        delay_voiced=1

    if delay_voiced*M>len(sig):
        delay_voiced=int(np.floor(len(sig)/M))

    # Write result in a csv file
    with open("Persistent_Diag.csv","a",newline="") as csvfile:
        writer=csv.writer(csvfile)

        # Time-delay embedding of voiced data
        point_Cloud=timedelay.TimeDelayEmbedding(M, delay_voiced, 5)
        Points=point_Cloud(sig)
        if len(Points)<40:               
            continue
        
        # Compute persistent diagram of piont cloud
        dgms = ripser(Points,maxdim=1)['dgms']
        dgms=dgms[1]
        if dgms.size==0:
            continue
        persistent_time=[ele[1]-ele[0] for ele in dgms]            
        index=argmax(persistent_time)

        # Compute birth time and lifetime 
        # Write them into csv file
        # 0 indicate voiced data
        birth_date=dgms[index][0]
        lifetime=persistent_time[index]
        writer.writerow((birth_date,lifetime,0))
        

# For voicedless data

for fn in os.listdir(voicedlessPath):
    # Subsample dataset, retrieve 1 in 10 among dataset
    randNum=np.random.randint(10)
    if randNum !=0:
        continue
    
    # Read wav file as "sig"
    fileName,ext=os.path.splitext(fn)
    wavFile=voicedlessPath+fileName+".wav"
    sig,samplerate=sf.read(wavFile)
    
    # Find the principle frequency, delay of sig
    T_voicedless,corr=principle_frequency_finder(sig)
    delay_voicedless=round(T_voicedless*6/M)
    if delay_voicedless==0:
        delay_voicedless=1

    if delay_voicedless*M>len(sig):
        delay_voicedless=int(np.floor(len(sig)/M))

    # Write result in a csv file
    with open("Persistent_Diag.csv","a",newline="") as csvfile:
        writer=csv.writer(csvfile)

        # Time-delay embedding of voiced data
        point_Cloud=timedelay.TimeDelayEmbedding(M, delay_voicedless, 5)
        Points=point_Cloud(sig)
        if len(Points)<40:               
            continue
        
        # Compute persistent diagram of piont cloud
        dgms = ripser(Points,maxdim=1)['dgms']
        dgms=dgms[1]
        if dgms.size==0:
            continue
        persistent_time=[ele[1]-ele[0] for ele in dgms]            
        index=argmax(persistent_time)

        # Compute birth time and lifetime 
        # Write them into csv file
        # 1 indicate voicedless data
        birth_date=dgms[index][0]
        lifetime=persistent_time[index]
        writer.writerow((birth_date,lifetime,1))

    


    



In [5]:
# Read the csv file into DataFrame
df=pd.read_csv('Persistent_Diag.csv', names=['birth_date','lifetime','type'],header=None)
df

Unnamed: 0,birth_date,lifetime,type
0,0.154053,0.174512,0
1,0.122578,0.173930,0
2,0.321282,0.522316,0
3,0.161613,0.372393,0
4,0.248990,0.968884,0
...,...,...,...
20331,0.983024,0.098277,1
20332,0.018481,0.003236,1
20333,0.142768,0.010035,1
20334,0.212146,0.019144,1


In [6]:
# Shuffle 9000 samples for each voice and voiceless type

# Shuffle 9000 rows from each class
df_0 = df[df['type'] == 0].sample(n=9000, random_state=42)
df_1 = df[df['type'] == 1].sample(n=9000, random_state=42)

# Combine and shuffle again (optional)
df_feature = pd.concat([df_0, df_1]).sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Normalization: min-max
normalized_df=(df_feature[['birth_date','lifetime']]-df_feature[['birth_date','lifetime']].min())/(df_feature[['birth_date','lifetime']].max()-df_feature[['birth_date','lifetime']].min())
normalized_df['type']=df_feature['type']

In [None]:
# Read/load data if needed
#normalized_df.to_pickle('topcap_df_feature_normalized')
#normalized_df=pd.read_pickle('topcap_df_feature_normalized')

In [9]:
# Check the number of samples in each class
(normalized_df['type']==0).sum()

9000

In [None]:
# Plot the normalized result
# Set up plot configuration
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 15

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels

# Group the data based on voiced/ voicedless
groups = normalized_df.groupby('type')

# Plot
fig, ax = plt.subplots(figsize=(6,6))
ax.margins(0.05)
typeDict= {1:'voicedless',0:'voiced'}
for type, group in groups:
    if type==1:
        ax.plot(group.birth_date, group.lifetime, marker='o', linestyle='', ms=2, label=typeDict[type],alpha=0.5, color='#4d4dff')
    if type==0:
        ax.plot(group.birth_date, group.lifetime, marker='o', linestyle='', ms=2, label=typeDict[type],alpha=0.5, color='#ff5c33')
legend=ax.legend(fontsize=15,markerscale=4)
plt.xlabel('Birth Time')
plt.ylabel('Lifetime')

## Save figure as pdf file
#plt.savefig("/Users/pfeng3/Documents/research/TopCap/picture/featureAna_pd1.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Plot individual 
plt.figure(figsize=(6, 3))

for type, group in groups:
    if type==0:
        plt.subplot(1, 2, 1)
        plt.plot(group.birth_date, group.lifetime, marker='o', linestyle='', ms=2, label=typeDict[type], alpha=0.5, color='#ff5c33')
        plt.legend(['voiced'],fontsize=10,markerscale=4,loc='upper right')
        plt.xlabel('Birth Time')
        plt.ylabel('Lifetime')
        plt.xlim([-0.1,1.1])
        plt.ylim([-0.1,1.1])
        plt
    if type==1:
        plt.subplot(1, 2, 2)
        plt.plot(group.birth_date, group.lifetime, marker='o', linestyle='', ms=2, label=typeDict[type], alpha=0.5, color='#4d4dff')
        plt.legend(['voiceless'],fontsize=10,markerscale=4,loc='upper right')
        plt.xlabel('Birth Time')
        plt.ylabel('Lifetime')
        plt.xlim([-0.1,1.1])
        plt.ylim([-0.1,1.1])

plt.tight_layout()
plt.show()

## Save figure as pdf file
#plt.savefig("/Users/pfeng3/Documents/research/TopCap/picture/featureAna_pd2.pdf", format="pdf", bbox_inches="tight")

In [6]:
X_train, X_test, y_train, y_test = train_test_split(normalized_df[['birth_date','lifetime']], normalized_df['type'], test_size=0.2, random_state=42)

In [8]:
# Classification model knn 

knn = KNeighborsClassifier(n_neighbors=20) # Set the number of neighbors
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9088888888888889


In [9]:
# Train a linear model to classify the data, and plot the decision boundary

# Create a RidgeClassifier object
ridge = RidgeClassifier(alpha=1.0)

# Fit the model to the training data
ridge.fit(X_test, y_test.values.ravel())

# Predict the labels for the test data
y_pred = ridge.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8663888888888889


In [10]:
# SVM classification
from sklearn import svm

# Create an SVM classifier
clf = svm.SVC(kernel='linear')
# Train the classifier
clf.fit(X_test, y_test.values.ravel())
# Predict on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8816666666666667


In [11]:
# LDA model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis()
# Train the classifier
clf.fit(X_train, y_train.values.ravel())
# Predict on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.8733333333333333


In [12]:
# Logistic classification
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model
model = LogisticRegression()

# Train the model using the training data
model.fit(X_train, y_train.values.ravel())

# Predict class labels for the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8986111111111111


In [20]:
# compute statistical information from data

normalized_df['birth_date'].mean()
normalized_df['birth_date'].std()

0.11195624909487463

In [22]:
normalized_df['lifetime'].mean()
normalized_df['lifetime'].std()

0.13107378500328268

In [23]:
nor_voice_df=normalized_df[normalized_df['type']==0]
print(f'Mean for voiced data: {nor_voice_df.mean()}')
print(f'Std for voiced data: {nor_voice_df.std()}')

Mean for voiced data: birth_date    0.065522
lifetime      0.211008
type          0.000000
dtype: float64
Std for voiced data: birth_date    0.047321
lifetime      0.126436
type          0.000000
dtype: float64


In [24]:
nor_voiceless_df=normalized_df[normalized_df['type']==1]
print(f'Mean for voiceless data: {nor_voiceless_df.mean()}')
print(f'Std for voiceless data: {nor_voiceless_df.std()}')

Mean for voiceless data: birth_date    0.157663
lifetime      0.041533
type          1.000000
dtype: float64
Std for voiceless data: birth_date    0.144997
lifetime      0.050460
type          0.000000
dtype: float64
