In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import librosa
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 
import tensorflow as tf
from scipy.io import wavfile as wav

In [2]:
audio_dataset_path='splitSound/'
metadata=pd.read_csv('data.csv')
mfcc = 60
metadata.head()

Unnamed: 0,file_name,class_name
0,2_kirgizca_56.wav,kirgizca
1,2_kirgizca_42.wav,kirgizca
2,3_azerbaycan_20.wav,azerbaycan
3,6_turkmence_46.wav,turkmence
4,6_turkmence_52.wav,turkmence


In [3]:
def features_extractor(filename):
    audio, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=mfcc)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    
    return mfccs_scaled_features

In [4]:
extracted_features=[]
for index_num,row in tqdm(metadata.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path),str(row["file_name"]))
    final_class_labels=row["class_name"]
    data=features_extractor(file_name)
    extracted_features.append([data,final_class_labels])

4632it [02:46, 27.78it/s]


In [5]:
extracted_features_df = pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-348.59406, 101.22915, 17.085117, 37.934284, ...",kirgizca
1,"[-265.22748, 100.98364, 15.282236, 15.134626, ...",kirgizca
2,"[-316.61432, 136.03163, 15.960988, 25.720314, ...",azerbaycan
3,"[-254.13887, 134.00977, -27.66474, 49.05436, -...",turkmence
4,"[-270.3439, 135.30635, -0.20712668, 54.928783,...",turkmence


In [6]:
X=np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())
print(X.shape)
print(y.shape)

(4632, 60)
(4632,)


In [7]:
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [9]:
num_labels = 7

In [10]:
model=Sequential()
# 1. hidden layer
model.add(Dense(125,input_shape=(mfcc,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# 2. hidden layer
model.add(Dense(250))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# 3. hidden layer
model.add(Dense(125))
model.add(Activation('relu'))
model.add(Dropout(0.5))

# output layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

Metal device set to: Apple M1


2022-03-25 10:01:56.053219: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-25 10:01:56.053573: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 125)               7625      
                                                                 
 activation (Activation)     (None, 125)               0         
                                                                 
 dropout (Dropout)           (None, 125)               0         
                                                                 
 dense_1 (Dense)             (None, 250)               31500     
                                                                 
 activation_1 (Activation)   (None, 250)               0         
                                                                 
 dropout_1 (Dropout)         (None, 250)               0         
                                                                 
 dense_2 (Dense)             (None, 125)               3

In [12]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
epochscount = 50
num_batch_size = 32

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=epochscount, validation_data=(X_test, y_test), verbose=1)

In [18]:
validation_test_set_accuracy = model.evaluate(X_test,y_test,verbose=0)
print(validation_test_set_accuracy[1])
print(validation_test_set_accuracy[0])

0.9730313420295715
0.07995288074016571


In [19]:
model.predict(X_test)

2022-03-25 10:05:15.165673: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


array([[0.0000000e+00, 1.2810433e-35, 2.4074748e-27, ..., 5.2381563e-30,
        1.0000000e+00, 0.0000000e+00],
       [5.7457906e-11, 5.1263827e-10, 4.5218374e-11, ..., 8.9220897e-10,
        3.7254437e-14, 1.0000000e+00],
       [8.5289276e-10, 2.4072997e-05, 2.1360377e-07, ..., 2.3177108e-05,
        7.3220356e-08, 3.6700069e-06],
       ...,
       [9.2358130e-04, 2.2045854e-03, 2.6213692e-03, ..., 6.0413452e-03,
        1.7356714e-04, 9.8739129e-01],
       [4.7762384e-08, 9.9661583e-01, 1.4190709e-03, ..., 9.7282154e-06,
        2.0017394e-05, 1.3978879e-03],
       [8.9941655e-13, 9.6028074e-08, 4.1929360e-10, ..., 1.4443974e-07,
        3.7639183e-10, 7.5438663e-08]], dtype=float32)

In [31]:
result_classes = ["azerbaycan","kazakca", "uygurca","kirgizca","tatarca","turkmence","ozbekce"]
file_name = "ozbekce"
for i in range(1,11):
    filename="testSound/"+file_name+"_"+str(i)+".wav"
    sound_signal, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
    mfccs_features = librosa.feature.mfcc(y=sound_signal, sr=sample_rate, n_mfcc=mfcc)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    mfccs_scaled_features = mfccs_scaled_features.reshape(1,-1)
    result_array = model.predict(mfccs_scaled_features)
    result = np.argmax(result_array[0])
    print(result_classes[result]) 

ozbekce
turkmence
ozbekce
ozbekce
ozbekce
ozbekce
ozbekce
ozbekce
uygurca
turkmence


In [27]:
result_classes = ["azerbaycan","kazakca", "uygurca","ozbekce","tatarca","turkmence","kirgizca"]
filename="testSound/kirgizca_6.wav"
sound_signal, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
mfccs_features = librosa.feature.mfcc(y=sound_signal, sr=sample_rate, n_mfcc=mfcc)
mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
mfccs_scaled_features = mfccs_scaled_features.reshape(1,-1)
result_array = model.predict(mfccs_scaled_features)
for a in range(0,len(result_array[0])):
    print("Sınıf : {0} -- Doğruluk : % {1}".format(result_classes[a],int(round(float(result_array[0][a]),2)*100)))
result = np.argmax(result_array[0])
print(result_classes[result]) 

Sınıf : azerbaycan -- Doğruluk : % 1
Sınıf : kazakca -- Doğruluk : % 2
Sınıf : uygurca -- Doğruluk : % 87
Sınıf : ozbekce -- Doğruluk : % 1
Sınıf : tatarca -- Doğruluk : % 7
Sınıf : turkmence -- Doğruluk : % 2
Sınıf : kirgizca -- Doğruluk : % 1
uygurca
