### **EXTRACTING MFCC'S FOR EVERY AUDIO FILE**

In [None]:
# The dataset archive is read from '/content/drive/MyDrive/...' in the next
# cell, so Google Drive has to be mounted before anything else can run.
# (The previous interactive `files.upload()` call failed with a TypeError and
# its result was never used.)
from google.colab import drive, files
drive.mount('/content/drive')

TypeError: ignored

Mounted at /content/drive


In [None]:
from zipfile import ZipFile
# Open the dataset archive stored on Google Drive and unpack every member
# into the current working directory.
archive_path = '/content/drive/MyDrive/Audio-Classification-master.zip'
with ZipFile(archive_path, mode='r') as dataset_archive:
    dataset_archive.extractall()

In [None]:
import pandas as pd
import os
import librosa
import numpy as np
from tqdm import tqdm


Creating the feature extractor function. It extracts the MFCC features from the samples of a single audio file and then averages each coefficient over time, so that every file is represented by one fixed-length vector.


In [None]:
def features_extractor(file, n_mfcc=40):
    """Return a fixed-length MFCC summary vector for one audio file.

    The file is loaded with librosa, its MFCC matrix is computed, and every
    coefficient is averaged over the time axis so that clips of different
    durations all map to a vector of the same length.

    Parameters
    ----------
    file : str
        Path of the audio file; passed straight to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute. The default keeps the
        original behaviour (40 coefficients per file).

    Returns
    -------
    numpy.ndarray
        1-D array with ``n_mfcc`` entries: the per-coefficient mean over time.
    """
    # No sampling rate is passed, so librosa resamples to its default rate
    # (22050 Hz); 'kaiser_fast' selects a faster resampling method.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')

    # MFCC matrix of shape (n_mfcc, t), where t depends on the clip length.
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

    # Transpose to (t, n_mfcc) and average along axis 0 (time). This is an
    # average, not a scaling step: it collapses the variable-length time axis.
    mfccs_mean_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_mean_features


Now we iterate through every audio file and extract features using Mel-Frequency Cepstral Coefficients


In [None]:
import os
import IPython.display as ipd
# Walk the dataset (one sub-directory per instrument class) and build
#   extracted_features: list of [mfcc_vector, class_name] pairs
#   extracted_classes:  sorted list of class names
extracted_features = []
extracted_classes = []

directory = "/content/Audio-Classification-master/wavfiles"

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the fixed-seed train/test split) the same on every run.
# The loop variable is deliberately not called `files`, which would shadow
# the `google.colab.files` module imported at the top of the notebook.
for class_name in sorted(os.listdir(directory)):
    class_dir = os.path.join(directory, class_name)

    # Skip stray files (e.g. hidden metadata files); only directories are classes.
    if not os.path.isdir(class_dir):
        continue
    extracted_classes.append(class_name)

    for audio_name in sorted(os.listdir(class_dir)):
        data = features_extractor(os.path.join(class_dir, audio_name))

        # Pair the input feature vector with its class label.
        extracted_features.append([data, class_name])

# Class names were appended in sorted order, so the list is already sorted.
# NOTE(review): the prediction cell indexes this list with the argmax of the
# model output, which presumably relies on the one-hot columns being in the
# same (sorted) order - confirm.
print(extracted_classes)

['Acoustic_guitar', 'Bass_drum', 'Cello', 'Clarinet', 'Double_bass', 'Flute', 'Hi_hat', 'Saxophone', 'Snare_drum', 'Violin_or_fiddle']


In [None]:
# Show how many samples were extracted and a short preview instead of dumping
# every feature vector into the cell output.
print(len(extracted_features))
print(extracted_features[:2])


[[array([-3.8206662e+02,  7.2313347e+01, -8.1685974e+01, -2.6692736e+01,
       -1.7759129e+01, -1.3218628e+01, -3.1354174e+01,  1.2372264e+00,
        1.3034053e+01,  3.5484715e+00, -2.6881053e+00, -2.0505705e+01,
        2.6869371e+00,  2.2849117e+01,  3.2609516e+01,  4.7344402e+01,
        1.4553745e+01, -7.3202972e+00, -2.3842472e+01, -9.3493481e+00,
        4.9498959e+00,  2.1952568e+01, -5.7899013e+00, -3.1125599e+01,
       -2.2197512e+01, -6.9755588e+00,  1.3369696e+00,  2.1299772e+00,
        2.3979000e+01,  5.2398403e+01,  2.8739120e+01, -1.4640336e+01,
       -1.3670593e+01, -4.9723396e+00, -8.1617838e-01, -3.8519592e+00,
        2.0035655e+00,  2.2296729e-03, -2.7652714e+01, -2.6683884e+01],
      dtype=float32), 'Saxophone'], [array([-276.16223   ,   86.11617   ,  -93.51989   ,  -34.993095  ,
        -38.751656  ,  -15.703917  ,  -38.80898   ,  -26.847971  ,
        -23.36265   ,  -16.872835  ,   -5.5813036 ,   15.0104685 ,
         25.84262   ,   53.73405   ,   49.122227 

Converting extracted_features to Pandas dataframe


In [None]:
# Build a two-column table: 'feature' holds the MFCC vector of each audio
# file and 'class' holds the name of the instrument it belongs to.
feature_table_columns = ['feature', 'class']
extracted_features_df = pd.DataFrame(data=extracted_features,
                                     columns=feature_table_columns)
extracted_features_df


Unnamed: 0,feature,class
0,"[-382.06662, 72.31335, -81.685974, -26.692736,...",Saxophone
1,"[-276.16223, 86.11617, -93.51989, -34.993095, ...",Saxophone
2,"[-376.70572, 60.207825, -41.89704, 13.174442, ...",Saxophone
3,"[-193.67361, 61.228775, -31.578365, 16.687138,...",Saxophone
4,"[-299.81885, 45.86282, -98.06471, -19.532494, ...",Saxophone
...,...,...
295,"[-469.41678, 31.971645, -22.451328, -23.43602,...",Acoustic_guitar
296,"[-411.09253, 74.60266, 15.648073, 18.556211, -...",Acoustic_guitar
297,"[-526.0049, 25.052528, 20.250063, 16.071043, 1...",Acoustic_guitar
298,"[-533.758, 22.485113, -8.706329, -17.043005, -...",Acoustic_guitar


In [None]:
# Pull the two DataFrame columns out as plain Python lists first, then turn
# each into its own NumPy array: X for the inputs, y for the class names.
feature_rows = extracted_features_df['feature'].tolist()
class_labels = extracted_features_df['class'].tolist()

X = np.array(feature_rows)
y = np.array(class_labels)

**TRAINING THE MODEL**

ONE-HOT ENCODING THE LABELS


In [None]:
# One-hot encode the class names: one column per class, 1.0 in the column of
# the sample's class and 0.0 elsewhere.
# The encoding is derived from the DataFrame column rather than from `y`
# itself, so re-running this cell gives the same result instead of trying to
# encode an already-encoded 2-D array. The explicit float dtype avoids the
# pandas-version-dependent default (uint8 vs. bool) for the dummy columns.
y = pd.get_dummies(extracted_features_df['class'], dtype='float32').to_numpy()

**TRAIN TEST SPLIT**

In [None]:
# Split the dataset into a training part and a test part with scikit-learn.
# test_size=0.25 puts 25% of the samples into the test set.
# The split is still a random shuffle; fixing random_state=0 only makes that
# shuffle repeat identically every time the cell is run.

from sklearn.model_selection import train_test_split
split_arrays = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_arrays

In [None]:
# Build the classifier.
# Despite the earlier "CNN" label, this is a fully connected network: it
# contains only Dense, Activation and Dropout layers, no convolutions.
# - Sequential: layers are stacked one after another.
# - Dense: every unit computes a weighted sum of all of its inputs.
# - Activation: the weighted sum is passed through a non-linearity
#   (ReLU in the hidden layers, softmax on the output).
# - Dropout: randomly ignores a fraction of the hidden outputs during
#   training to reduce overfitting.

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

# Seed TensorFlow so weight initialisation and dropout masks are repeatable.
tf.random.set_seed(0)

# Input width follows the feature matrix instead of a hard-coded 40, so the
# model stays consistent if the number of MFCC coefficients is changed.
num_features = X.shape[1]
# One output unit per one-hot column, i.e. per instrument class.
num_labels = y.shape[1]

hidden_layer_units = (100, 200, 100)
dropout_rate = 0.2

model = Sequential()
for layer_index, units in enumerate(hidden_layer_units):
    if layer_index == 0:
        # The first layer must be told the shape of one input sample.
        model.add(Dense(units, input_shape=(num_features,)))
    else:
        model.add(Dense(units))
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))

# Final layer: softmax over the classes gives one probability per class.
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [None]:
# Multi-class problem with one-hot targets, so the loss is categorical
# cross-entropy; accuracy is tracked as the reported metric and Adam is the
# optimizer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

# Batch size and number of training epochs.
num_epochs = 500
num_batch_size = 40

# Save the weights to a dedicated file. The previous path was the name of the
# extracted dataset directory with no extension, which is ambiguous and is
# rejected by newer Keras versions for weights-only checkpoints.
checkpoint_path = '/content/best_instrument_classifier.weights.h5'

# save_best_only=True: weights are written only when the monitored validation
# loss improves; verbose=1 prints a message each time that happens.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True, save_weights_only=True)
start = datetime.now()

# Train on the training split. The test split is passed as validation data.
# NOTE(review): because the checkpoint is selected on the test split, any
# accuracy later reported on that same split is optimistic; consider a
# separate validation split.
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/500

Epoch 00001: val_loss improved from inf to 8.63328, saving model to /content/Audio-Classification-master
Epoch 2/500

Epoch 00002: val_loss improved from 8.63328 to 4.20154, saving model to /content/Audio-Classification-master
Epoch 3/500

Epoch 00003: val_loss improved from 4.20154 to 2.91803, saving model to /content/Audio-Classification-master
Epoch 4/500

Epoch 00004: val_loss did not improve from 2.91803
Epoch 5/500

Epoch 00005: val_loss improved from 2.91803 to 2.74025, saving model to /content/Audio-Classification-master
Epoch 6/500

Epoch 00006: val_loss improved from 2.74025 to 2.19172, saving model to /content/Audio-Classification-master
Epoch 7/500

Epoch 00007: val_loss improved from 2.19172 to 2.10543, saving model to /content/Audio-Classification-master
Epoch 8/500

Epoch 00008: val_loss did not improve from 2.10543
Epoch 9/500

Epoch 00009: val_loss did not improve from 2.10543
Epoch 10/500

Epoch 00010: val_loss did not improve from 2.10543
Epoch 11/500

E

Evaluating the model on the test set

In [None]:

# Restore the best weights saved by the checkpoint callback before evaluating;
# otherwise the model still holds the weights from the last training epoch.
# load_weights needs the checkpoint's path, not the callback object itself.
model.load_weights(checkpointer.filepath)

# evaluate returns [loss, accuracy]; the checkpoint callback has no role
# during evaluation, so it is not passed here.
test_accuracy=model.evaluate(X_test,y_test,verbose=1)
print(test_accuracy[1])

0.7333333492279053


Predicting the class of a single audio file

In [None]:
filename="/content/Audio-Classification-master/wavfiles/Acoustic_guitar/0eeaebcb.wav"

# Extract the MFCC vector and reshape it to a batch holding one sample,
# i.e. shape (1, number_of_features), which is what the model expects.
prediction_feature=features_extractor(filename).reshape(1,-1)
print(prediction_feature)

# Run the model once and reuse the result for both the printout and the
# argmax (previously the forward pass was executed twice).
class_probabilities = model(prediction_feature)
print(class_probabilities)

# Index of the most probable class, mapped back to its name.
x1 = np.argmax(class_probabilities, axis=-1)[0]
print(extracted_classes[x1])

[[-2.1355525e+02  1.0364144e+02  1.8248293e+01  2.4165234e+01
  -5.9177189e+00  2.9075947e+01 -3.8614960e+01  3.1781249e+00
  -2.1478634e+01 -7.2651238e+00 -1.9626860e+01 -2.2713785e+01
  -1.8616789e+01 -2.0836485e+01 -3.9578562e+00 -1.1293378e+00
   8.8198578e-01  8.5589495e+00  5.2567034e+00  8.1535530e+00
  -1.3278853e+01 -1.7417551e+01 -4.1785808e+00 -2.8397921e-01
   6.2119322e+00 -3.5135534e+00 -7.5655165e+00 -1.1666251e+01
  -1.9165748e+00  8.3102465e+00  1.1909724e+01 -7.6757026e-01
   1.1942604e-01  1.0390764e+01  7.2981133e+00  1.2639936e+00
  -6.6707897e+00  1.7244997e+00  4.2608431e-01 -1.5742726e+01]]
tf.Tensor(
[[1.0000000e+00 3.8853489e-21 1.0701011e-22 9.9294366e-26 6.0580025e-15
  4.1006877e-32 5.1987551e-36 3.0869452e-30 1.4653227e-24 5.0379873e-33]], shape=(1, 10), dtype=float32)
Acoustic_guitar
