In [1]:
# !pip install --upgrade librosa
# !pip install --upgrade pydub
# !pip install --upgrade pip

In [2]:
import glob
import os
from pydub import AudioSegment
import pandas as pd
import librosa
import librosa.display
import numpy as np
from sklearn import preprocessing

In [3]:
def get_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        mfccs = get_mfcc(audio, sample_rate)
        flux = get_spectral_features(audio, sample_rate)[2]
        chroma_features = get_chroma_features(audio, sample_rate)
        return [mfccs, flux] + chroma_features
    except Exception as e:
        print("Error encountered while parsing file: ", file)
        return None 

In [4]:
def get_all_features(folder):
    folders = os.listdir(folder)
    folders.sort()
    all_features = []
    for label, sub_dir in enumerate(folders): #label {0; 4}
        for file_name in glob.glob(os.getcwd() + '/' + folder +'/' + sub_dir + '/*.wav'):
#             print("Extracting file ", file_name)
            try:
                features = get_features(file_name)
            except Exception as e:
                print("Extraction error")
                continue
            sub_strs= file_name.split('/')
            all_features.append(features +  [label + 1]+[sub_strs[-1]])
    data = pd.DataFrame(all_features, columns=['mfccs', 'flux', 'chroma_features1', 'chroma_features2', 'chroma_features3', 'class_label','author'])
    return data

In [5]:
def get_mfcc(audio, sr):
    #Mel Frequency Cepstral Coefficient
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=100)
    mfccs_scaled = np.mean(mfccs.T,axis=0)
    return mfccs_scaled

In [6]:
def get_zero_crossing_rate(audio):
    #ZERO_CROSSING_RATE
    rate = librosa.feature.zero_crossing_rate(audio)
    rate_scaled = np.mean(rate.T,axis=0)
    return rate_scaled

In [7]:
def get_spectral_features(audio, sr):
    sp = librosa.feature.spectral_centroid(audio)
    S, phase = librosa.magphase(librosa.stft(audio))
    a = librosa.feature.spectral_rolloff(S=S, sr=sr)
    onset_env = librosa.onset.onset_strength(y=audio, sr=sr)
    onset_env.shape = (onset_env.shape[0], 1)
    sp_scaled = np.mean(sp.T,axis=0)
    a_scaled = np.mean(a.T,axis=0)
    onset_env_scaled = np.mean(onset_env,axis=0)
    return [sp_scaled, a_scaled, onset_env_scaled]

In [8]:
def get_chroma_features(audio, sr):
    chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_cens = librosa.feature.chroma_cens(y=audio, sr=sr)
    chroma_cq = librosa.feature.chroma_cqt(y=audio, sr=sr)
    chroma_stft_scaled = np.mean(chroma_stft.T,axis=0)
    chroma_cens_scaled = np.mean(chroma_cens.T,axis=0)
    chroma_cq_scaled = np.mean(chroma_cq.T,axis=0)
    return [chroma_stft_scaled, chroma_cens_scaled, chroma_cq_scaled]

In [9]:
data = get_all_features('data')
data

Unnamed: 0,mfccs,flux,chroma_features1,chroma_features2,chroma_features3,class_label,author
0,"[-524.52057, 52.767735, 2.273679, 35.936493, 6...",[1.530465],"[0.5613639, 0.51607424, 0.4942565, 0.559985, 0...","[0.36901940466972843, 0.2956498396853704, 0.31...","[0.73921776, 0.6329696, 0.6403303, 0.6548654, ...",1,zurab-khut-1c-new.wav
1,"[-375.04742, 71.76099, -9.058271, 58.62941, -2...",[1.567517],"[0.43075806, 0.71236825, 0.5354362, 0.37864283...","[0.2824099578816442, 0.2805342458267515, 0.207...","[0.6885597, 0.6979703, 0.56097203, 0.63815755,...",1,aleksandre-pert-1a.wav
2,"[-602.8608, 40.34575, 19.960438, 24.787498, 6....",[1.0779346],"[0.5120637, 0.4240948, 0.37835118, 0.40419602,...","[0.386308371152053, 0.2761394137069292, 0.2321...","[0.73146373, 0.547114, 0.52066433, 0.5054258, ...",1,shota-noza-1c.wav
3,"[-549.5745, 47.821518, -18.677393, 27.991554, ...",[1.7496176],"[0.48344243, 0.4247605, 0.31991082, 0.27700663...","[0.23877964415573058, 0.24710025871539226, 0.2...","[0.5377745, 0.55522126, 0.5620581, 0.51723933,...",1,aleksandre-pert-1-new.wav
4,"[-551.16956, 59.801514, -0.14260375, 24.958803...",[1.3302921],"[0.3357782, 0.3583633, 0.372898, 0.32006225, 0...","[0.3013156769196653, 0.2589461289581806, 0.298...","[0.62065524, 0.5347672, 0.59509873, 0.5396492,...",1,levan-gela-1j.wav
...,...,...,...,...,...,...,...
928,"[-428.7131, 72.103806, 8.8743925, 34.941143, 1...",[1.2077314],"[0.33085436, 0.36756566, 0.45109984, 0.3520728...","[0.2891750401201622, 0.2933941475703626, 0.332...","[0.5578847, 0.56241864, 0.58177984, 0.54885334...",5,nika-onia-5c.wav
929,"[-505.51352, 74.166504, 19.460085, 22.2807, -2...",[0.87001157],"[0.34257028, 0.39673063, 0.43497545, 0.3187953...","[0.2987257023857266, 0.29833079334143936, 0.25...","[0.61127925, 0.6083719, 0.5614367, 0.5630839, ...",5,levan-gela-5i.wav
930,"[-353.27307, 102.400566, 6.734233, 49.48563, -...",[2.5484147],"[0.61872715, 0.6274767, 0.7156743, 0.65416723,...","[0.24468334748028872, 0.3216160909607349, 0.41...","[0.6220891, 0.7412361, 0.85163176, 0.8437252, ...",5,lasha-kiti-5a.wav
931,"[-622.4822, 107.89334, 3.9235218, 20.481346, 0...",[1.3867838],"[0.4030363, 0.33613327, 0.31981707, 0.2961784,...","[0.3066639204790587, 0.28096365678488133, 0.25...","[0.59908, 0.5834185, 0.5612741, 0.50064546, 0....",5,devi-khos-5.wav


In [10]:
labels = [[i] for i in data['class_label']]
y_train = np.array(labels)

# One-hot encode

In [11]:
from sklearn.preprocessing import OneHotEncoder
def get_one_hot(y):
    encoder = OneHotEncoder(sparse=False)
    y_onehot = encoder.fit_transform(y)
    return y_onehot

In [12]:
y_onehot_train = get_one_hot(y_train)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [13]:
def get_processed_X(x):
    X = x.drop(columns=['class_label','author'])
    X = X.values
    ls = []
    for i in range(X.shape[0]):
        features = []
        for j in range(X.shape[1]):
            for k in X[i][j]:
                features.append(k)
        ls.append(features)
    res = np.array(ls)
    return preprocessing.scale(res)

In [14]:
ls_train = get_processed_X(data)

 # ნეირონული ქსელის კოდი

In [15]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [16]:
import math

def softmax(z):
    z_exp = [math.exp(i) for i in z]
    sum_z_exp = sum(z_exp)
    return [i / sum_z_exp for i in z_exp]

In [17]:
def get_accuracy(h, y):
    sm = 0
    for a,i in zip(h, y):
        sm+=a[(i[0]-1)]
    sm/= y.shape[0]
    return sm

In [18]:
def forward_propagate(X, theta1, theta2):
    m = X.shape[0]
    
    a1 = np.insert(X, 0, values=np.ones(m), axis=1)
    z2 = a1 * theta1.T
    a2 = np.insert(sigmoid(z2), 0, values=np.ones(m), axis=1)
    z3 = a2 * theta2.T
    h = []
    for i in range(z3.shape[0]):
        z = [z3[i, j] for j in range(z3.shape[1])]
        h.append(softmax(z))
    h = np.array(h)
#     h = sigmoid(z3)
    return a1, z2, a2, z3, h

In [19]:
def cost(params, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)

    J = 0
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    
    J = J / m
    
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)))
    
    return J

In [20]:
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))

In [21]:
def backprop(params, input_size, hidden_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    
    theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    
    J = 0
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    
    J = cost(params,input_size, hidden_size, num_labels, X, y, learning_rate)

    for t in range(m):
        a1t = a1[t,:]
        z2t = z2[t,:]
        a2t = a2[t,:]
        ht = h[t,:]
        yt = y[t,:]
        
        d3t = ht - yt
        
        z2t = np.insert(z2t, 0, values=np.ones(1))
        d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t))
        
        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + d3t.T * a2t
        
    delta1 = delta1 / m
    delta2 = delta2 / m
    
    delta1[:,1:] = delta1[:,1:] + (theta1[:,1:] * learning_rate) / m
    delta2[:,1:] = delta2[:,1:] + (theta2[:,1:] * learning_rate) / m
    
    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))
    
    return J, grad

In [22]:
input_size = ls_train.shape[1]
hidden_size = 50
num_labels = y_onehot_train.shape[1]
learning_rate = 0.4

params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.25

ls_train = np.matrix(ls_train)

theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

theta1.shape, theta2.shape 

((50, 138), (5, 51))

In [23]:
a1, z2, a2, z3, h = forward_propagate(ls_train, theta1, theta2)
a1.shape, z2.shape, a2.shape, z3.shape, h.shape

((933, 138), (933, 50), (933, 51), (933, 5), (933, 5))

In [24]:
J, grad = backprop(params, input_size, hidden_size, num_labels, ls_train, y_onehot_train, learning_rate)
J, grad.shape

(2.534818977904408, (7155,))

In [25]:
from scipy.optimize import minimize

fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size, num_labels, ls_train, y_onehot_train, learning_rate), 
                method='TNC', jac=True, options={'maxiter': 250})
fmin

     fun: 0.19461255209618933
     jac: array([ 3.80758125e-05, -3.68646529e-05, -4.42895552e-05, ...,
        9.85170963e-05, -1.41269404e-05,  9.54710006e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 250
     nit: 22
  status: 3
 success: False
       x: array([-0.06883123, -0.10900391,  0.07862007, ..., -0.30491671,
       -1.68441175,  1.80372857])

In [26]:
ls_train = np.matrix(ls_train)
theta1 = np.matrix(np.reshape(fmin.x[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(fmin.x[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

a1, z2, a2, z3, h = forward_propagate(ls_train, theta1, theta2)
accuracy = get_accuracy(h, y_train)
print(accuracy)

In [27]:
np.save('weights', [theta1, theta2])