In [None]:
# import packages

import os # read system path 
import csv

import matplotlib as mpl
import matplotlib.pyplot as plt
import librosa

import pandas as pd
from scipy import signal
import soundfile as sf
import numpy as np
from numpy import argmax
import math

from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import umap
from sklearn.preprocessing import StandardScaler
%matplotlib qt5

# Directories containing the voiced / voiceless wav files (give each path with a trailing separator, e.g. "data/voiced/")
voicedPath=".."
voicedlessPath=".."

In [None]:
def append_mfcc_rows(wav_dir, label, csv_path="MFCC_Diag3.csv", keep_one_in=10, n_mfcc=10):
    """Extract MFCC features from a random subset of wav files and append them to a CSV.

    Each kept file contributes one row: the flattened absolute MFCC values
    followed by ``label`` as the last value.

    Parameters
    ----------
    wav_dir : str
        Directory scanned (non-recursively) for ``.wav`` files.
    label : int
        Class label stored in the last column (0 = voiced, 1 = voiceless).
    csv_path : str
        CSV file the rows are appended to.
    keep_one_in : int
        Subsampling factor: a file is kept when ``np.random.randint(keep_one_in)``
        returns 0, i.e. roughly one file in ``keep_one_in``.
    n_mfcc : int
        Number of MFCC coefficients requested from librosa.

    Returns
    -------
    int
        Number of rows written to ``csv_path``.
    """
    rows_written = 0
    # Open the CSV once per directory instead of once per audio file.
    with open(csv_path, "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        for fn in os.listdir(wav_dir):
            # Only wav files can be read below; skip anything else in the folder.
            if os.path.splitext(fn)[1].lower() != ".wav":
                continue

            # Subsample dataset, retrieve 1 in `keep_one_in` among dataset
            if np.random.randint(keep_one_in) != 0:
                continue

            # os.path.join is correct whether or not wav_dir ends with a separator.
            sig, samplerate = sf.read(os.path.join(wav_dir, fn))

            # Window and hop sizes are derived from the clip length, presumably so
            # that every clip yields a feature vector of the same length regardless
            # of its duration (later cells expect a fixed number of columns).
            mfccs = librosa.feature.mfcc(
                y=sig,
                sr=samplerate,
                n_mfcc=n_mfcc,
                n_fft=int(len(sig) / 2),
                hop_length=int(len(sig) / 4),
            )
            mfccs = np.abs(mfccs.flatten())

            # The class label goes last in the row.
            writer.writerow(np.append(mfccs, label))
            rows_written += 1
    return rows_written


# NOTE: rows are appended, so re-running this cell adds another copy of the
# features to MFCC_Diag3.csv; delete the file first for a clean rebuild.
# 0 indicates the phone is voiced, 1 indicates it is voiceless.
n_voiced_rows = append_mfcc_rows(voicedPath, 0)
n_voiceless_rows = append_mfcc_rows(voicedlessPath, 1)


In [None]:
# Load the MFCC rows written by the extraction step. The file has no header
# line, so the columns are addressed by integer position.
df = pd.read_csv(
    'MFCC_Diag3.csv',
    header=None,
)
df

In [None]:
# Draw a balanced random sample: the same number of rows from each class,
# at most 9000 per class.

# The label is the last value of every CSV row, so its column index is derived
# from the table instead of being hard-coded; `n` is also the feature count.
n = df.shape[1] - 1

voiced_rows = df[df[n] == 0]
voiceless_rows = df[df[n] == 1]

# Cap the sample size at the smaller class so that .sample() cannot fail when
# fewer than 9000 rows of a class are available.
samples_per_class = min(9000, len(voiced_rows), len(voiceless_rows))
if samples_per_class == 0:
    raise ValueError(
        "Both classes need at least one row: found "
        f"{len(voiced_rows)} voiced and {len(voiceless_rows)} voiceless rows"
    )

df_0 = voiced_rows.sample(n=samples_per_class, random_state=42)
df_1 = voiceless_rows.sample(n=samples_per_class, random_state=42)

# Combine the two classes and shuffle so the labels are interleaved
df = pd.concat([df_0, df_1]).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Use UMAP to reduce the feature dimension to 2
# UMAP is stochastic; a fixed random_state makes the embedding (and every plot
# and score derived from it) reproducible across runs, matching the seed used
# for sampling and splitting elsewhere in this notebook.
reducer = umap.UMAP(n_components=2, random_state=42)

# Columns 0..n-1 are the MFCC features; column n is the class label
data = df.iloc[:, 0:n]

# Standardize each feature before computing the embedding
scaled_data = StandardScaler().fit_transform(data)
embedding = reducer.fit_transform(scaled_data)
embedding.shape

In [None]:
# Wrap the 2-D embedding in a DataFrame and attach the class label
feature_cols = ['feature1', 'feature2']
df_feature = pd.DataFrame(embedding, columns=feature_cols)
df_feature['type'] = df[n]

# Min-max normalization of the two embedded features
features = df_feature[feature_cols]
col_min = features.min()
col_range = features.max() - col_min
normalized_df = (features - col_min) / col_range
normalized_df['type'] = df_feature['type']

# Optional: save / reload the normalized features
#normalized_df.to_pickle('mfcc_df_feature_nor')
#normalized_df=pd.read_pickle('mfcc_df_feature_nor')

# Number of samples labelled 1 (voiceless)
normalized_df['type'].eq(1).sum()

In [None]:
# Plot the normalized embedding, both classes in one figure

# Set up plot configuration
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 15

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)   # fontsize of the x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)   # fontsize of the y tick labels

# Group the data based on voiced / voiceless
groups = normalized_df.groupby('type')

# Display name and marker color for each class label
typeDict = {1: 'voiceless', 0: 'voiced'}
typeColors = {1: '#4d4dff', 0: '#ff5c33'}

# Plot
fig, ax = plt.subplots(figsize=(6, 6))
ax.margins(0.05)
# The loop variable is deliberately not called `type`: at notebook top level it
# would replace the builtin type() for every cell executed afterwards.
for class_id, group in groups:
    if class_id not in typeDict:
        continue  # only the two known classes are drawn
    ax.plot(group.feature1, group.feature2, marker='o', linestyle='', ms=2,
            label=typeDict[class_id], alpha=0.5, color=typeColors[class_id])
legend = ax.legend(fontsize=15, markerscale=4, loc='upper right')
ax.set_xlabel('UMAP_1', fontsize=15)
ax.set_ylabel('UMAP_2', fontsize=15)

## Save figure as pdf file
#plt.savefig("..", format="pdf", bbox_inches="tight")

In [None]:
# Plot each class in its own panel, side by side
fig_panels = plt.figure(figsize=(6, 3))

# class label -> (subplot position, legend text, marker color)
panel_specs = {
    0: (1, 'voiced', '#ff5c33'),
    1: (2, 'voiceless', '#4d4dff'),
}

# The loop variable is deliberately not called `type`: at notebook top level it
# would replace the builtin type() for every cell executed afterwards.
for class_id, group in groups:
    if class_id not in panel_specs:
        continue  # only the two known classes are drawn
    position, class_name, color = panel_specs[class_id]
    panel = fig_panels.add_subplot(1, 2, position)
    panel.plot(group.feature1, group.feature2, marker='o', linestyle='', ms=2,
               label=class_name, alpha=0.5, color=color)
    panel.legend([class_name], fontsize=10, markerscale=4, loc='upper right')
    panel.set_xlabel('UMAP_1')
    panel.set_ylabel('UMAP_2')
    # Same limits on both panels so the two classes can be compared directly;
    # the features were min-max scaled, so they lie within [0, 1].
    panel.set_xlim([-0.1, 1.1])
    panel.set_ylim([-0.1, 1.1])

fig_panels.tight_layout()

## Save figure as pdf file (do this before plt.show())
#fig_panels.savefig("..", format="pdf", bbox_inches="tight")

plt.show()

In [None]:
# Hold out 20% of the embedded samples as a test set
# NOTE(review): the UMAP embedding and the min-max scaling were both fitted on
# all samples before this split, so the test accuracy may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_df[['feature1', 'feature2']],
    normalized_df['type'],
    test_size=0.2,
    random_state=42,
)

# Logistic regression classifier on the two embedded features
from sklearn.linear_model import LogisticRegression

# Build the model and fit it on the training split
model = LogisticRegression().fit(X_train, y_train.values.ravel())

# Predicted class labels for the held-out samples
y_pred = model.predict(X_test)

# Fraction of held-out samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')