<a href="https://colab.research.google.com/github/tamikaivi/-course-angular-upb/blob/master/AutoTag1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [83]:
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import csv
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
#Keras
import keras
from keras import models
from keras import layers


# generating a dataset
header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'
for i in range(1, 21):
    header += f' mfcc{i}'
header += ' label'
header = header.split()

file = open('data.csv', 'w', newline='')
with file:
    writer = csv.writer(file)
    writer.writerow(header)
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
for g in genres:
    for filename in os.listdir(f'./genres/{g}'):
        songname = f'./genres/{g}/{filename}'
        y, sr = librosa.load(songname, mono=True, duration=30)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        rmse = librosa.feature.rmse(y=y)
        spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'    
        for e in mfcc:
            to_append += f' {np.mean(e)}'
        to_append += f' {g}'
        file = open('data.csv', 'a', newline='')
        with file:
            writer = csv.writer(file)
            writer.writerow(to_append.split())
            


In [70]:
# reading dataset from csv

data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,filename,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,mfcc8,mfcc9,mfcc10,mfcc11,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,blues.00098.wav,0.442972,0.125422,2310.280604,2206.710936,4829.320798,0.128426,-90.677485,100.093133,-23.916267,52.581125,-20.916392,31.494155,-11.203329,20.085382,-10.90922,17.556229,-14.092525,12.988067,-11.415392,2.906069,-12.012914,4.822585,-11.156674,1.15921,-3.913956,3.250427,blues
1,blues.00086.wav,0.365865,0.124991,1245.609395,1634.415029,2517.684776,0.048827,-212.790368,139.057366,9.459221,35.441294,12.717557,20.413845,-5.570223,10.885779,-8.735101,8.054606,0.166728,2.640718,-1.025016,-1.406932,-4.704802,1.138613,-9.472101,-2.187314,-4.100537,-3.352784,blues
2,blues.00053.wav,0.412501,0.168358,2171.221742,1954.383785,4237.132712,0.132802,-76.717959,105.75353,-43.181755,60.94585,-8.758631,21.227079,-12.864846,18.603844,-11.384939,14.541766,-6.04797,12.726959,-9.366809,1.350859,-3.609972,6.997609,-4.053008,-0.271427,-3.696092,-0.504041,blues
3,blues.00065.wav,0.29273,0.10036,2816.676938,2372.164643,5614.449384,0.146318,-118.333511,74.853852,-19.858369,46.941585,-32.549699,23.827168,-26.361836,17.685263,-13.811255,11.941498,-17.876516,7.54486,-22.888936,4.759724,-12.79163,5.608855,-14.72511,6.684452,-9.159456,-2.243776,blues
4,blues.00085.wav,0.315363,0.150218,1312.308199,1673.915613,2638.117381,0.059416,-180.123596,131.420259,0.566188,43.152929,14.135186,25.628555,-5.31308,12.04029,-13.536841,-1.090993,-7.605675,2.718054,0.772589,1.997588,-6.093858,3.484042,-8.341479,3.204648,-0.926944,-2.243686,blues


In [71]:
# Dropping unneccesary columns
data = data.drop(['filename'],axis=1)
data.head()

genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [72]:
# normalizing
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

# spliting of dataset into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [82]:
# creating a model
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))

model.add(layers.Dense(128, activation='relu'))

model.add(layers.Dense(64, activation='relu'))

model.add(layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
              
history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=128)
                    
# calculate accuracy
test_loss, test_acc = model.evaluate(X_test,y_test)
print('test_acc: ',test_acc)

# predictions
predictions = model.predict(X_test)
print(predictions)
np.argmax(predictions[0])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
test_acc:  0.9285714030265808
[[7.16175437e-01 2.54025429e-01 1.30556582e-03 1.54479756e-03
  5.28212637e-03 1.26278838e-02 1.28343189e-03 3.03874793e-03
  3.68110603e-03 1.03543012e-03]
 [5.80985378e-03 9.93604362e-01 4.81422758e-05 2.93052472e-05
  6.02143627e-05 2.78597552e-04 1.31196975e-05 8.87351198e-05
  4.71653548e-05 2.04395019e-05]
 [9.99326825e-01 4.30262968e-04 2.12574923e-06 5.81799395e-06
  3.32008531e-05 1.53463130e-04 4.85149667e-06 2.10016733e-05
  1.84260189e-05 3.97574286e-06]
 [9.83056486e-01 1.43613685e-02 5.22446971e-05 3.69779918e-05
  5.74360427e-04 1.50877691e-03 5.10294849e-05 1.14417264e-04
  2.15511347e-04 2.87980529e-05]
 [9.99811471e-01 1.49807747e-04 1.96632840e-07 3.69532813e-07
  3.88968056e-06 2.92394525e-05 2.97215536e-07 2.

0

In [81]:
from __future__ import print_function
import os.path
import numpy as np
import librosa
import pickle as cP

fftsize = 1024
window = 1024
hop = 512
melBin = 128

label_path = './GTZAN/'
audiolist = './train_filtered.txt'
with open(audiolist) as f:
    all_list = f.read().splitlines()
print(len(all_list))

load_path = 'genres/'
save_path = 'GTZANSAVE/'

for iter in range(0,len(all_list)):

	try:
		file_name = load_path + all_list[iter]
		save_name = save_path + all_list[iter].replace('.wav','.npy')

		if not os.path.exists(os.path.dirname(save_name)):
			os.makedirs(os.path.dirname(save_name))

		if os.path.isfile(save_name) == 1:
			print(iter, save_name + '_file_already_extracted!')
			continue

		y,sr = librosa.load(file_name,sr=22050)
		S = librosa.core.stft(y,n_fft=fftsize,hop_length=hop,win_length=window)
		X = np.abs(S)

		mel_basis = librosa.filters.mel(sr,n_fft=fftsize,n_mels=melBin)
	
		mel_S = np.dot(mel_basis,X)

		mel_S = np.log10(1+10*mel_S)
		mel_S = mel_S.astype(np.float32)

		mel_S = mel_S[:,:1291]

		print(iter,mel_S.shape,save_name)
		np.save(save_name,mel_S)

	except Exception:
		continue



443
0 GTZANSAVE/blues/blues.00029.npy_file_already_extracted!
1 GTZANSAVE/blues/blues.00030.npy_file_already_extracted!
2 GTZANSAVE/blues/blues.00031.npy_file_already_extracted!
3 GTZANSAVE/blues/blues.00032.npy_file_already_extracted!
4 GTZANSAVE/blues/blues.00033.npy_file_already_extracted!
5 GTZANSAVE/blues/blues.00034.npy_file_already_extracted!
6 GTZANSAVE/blues/blues.00035.npy_file_already_extracted!
7 GTZANSAVE/blues/blues.00036.npy_file_already_extracted!
8 GTZANSAVE/blues/blues.00037.npy_file_already_extracted!
9 GTZANSAVE/blues/blues.00038.npy_file_already_extracted!
10 GTZANSAVE/blues/blues.00039.npy_file_already_extracted!
11 GTZANSAVE/blues/blues.00040.npy_file_already_extracted!
12 GTZANSAVE/blues/blues.00041.npy_file_already_extracted!
13 GTZANSAVE/blues/blues.00042.npy_file_already_extracted!
14 GTZANSAVE/blues/blues.00043.npy_file_already_extracted!
15 GTZANSAVE/blues/blues.00044.npy_file_already_extracted!
16 GTZANSAVE/blues/blues.00045.npy_file_already_extracted!
17 

In [6]:
from __future__ import print_function
import os
import numpy as np
import time
import pickle as cP
import argparse

from keras.optimizers import SGD
from keras import backend as K
from keras.regularizers import l2

from keras.layers import Conv1D, MaxPool1D, BatchNormalization, GlobalAvgPool1D, Dense, Dropout, Activation, Reshape, \
    Input, Concatenate, dot, Add, Flatten, concatenate
from keras.models import Model

num_frames_per_song = 1291
img_cols = 128
num_frame_input = 129
num_segment = int(num_frames_per_song / num_frame_input)
print('Number of segments per song: ' + str(num_segment))

audiolist = './train_filtered.txt'
audio_path = './GTZANSAVE/'
save_path = './features/%s/' % 'basic'

# load data
with open(audiolist) as f:
    all_list = f.read().splitlines()
print(len(all_list))

# path generate
if not os.path.exists(os.path.dirname(save_path)):
    os.makedirs(os.path.dirname(save_path))


def load_melspec(file_name_from, num_segment, num_frame_input):
    file_name = audio_path + file_name_from.replace('.wav', '.npy')
    tmp = np.load(file_name)
    tmp = tmp.T

    mel_feat = np.zeros((num_segment, num_frame_input, 128))
    for iter2 in range(0, num_segment):
        mel_feat[iter2] = tmp[iter2 * num_frame_input:(iter2 + 1) * num_frame_input, :]

    return mel_feat

# load model
model_input = Input(shape=(num_frame_input, 128))

conv1 = Conv1D(128, 4, padding='same', use_bias=True, kernel_regularizer=l2(1e-5), kernel_initializer='he_uniform')
bn1 = BatchNormalization()
activ1 = Activation('relu')
MP1 = MaxPool1D(pool_size=4)
conv2 = Conv1D(128, 4, padding='same', use_bias=True, kernel_regularizer=l2(1e-5), kernel_initializer='he_uniform')
bn2 = BatchNormalization()
activ2 = Activation('relu')
MP2 = MaxPool1D(pool_size=4)
conv3 = Conv1D(128, 4, padding='same', use_bias=True, kernel_regularizer=l2(1e-5), kernel_initializer='he_uniform')
bn3 = BatchNormalization()
activ3 = Activation('relu')
MP3 = MaxPool1D(pool_size=4)
conv4 = Conv1D(128, 2, padding='same', use_bias=True, kernel_regularizer=l2(1e-5), kernel_initializer='he_uniform')
bn4 = BatchNormalization()
activ4 = Activation('relu')
MP4 = MaxPool1D(pool_size=2)
conv5 = Conv1D(256, 1, padding='same', use_bias=True, kernel_regularizer=l2(1e-5), kernel_initializer='he_uniform')
bn5 = BatchNormalization()
activ5 = Activation('relu')
drop1 = Dropout(0.5)

item_sem = GlobalAvgPool1D()

model_conv1 = conv1(model_input)
model_bn1 = bn1(model_conv1)
model_activ1 = activ1(model_bn1)
model_MP1 = MP1(model_activ1)
model_conv2 = conv2(model_MP1)
model_bn2 = bn2(model_conv2)
model_activ2 = activ2(model_bn2)
model_MP2 = MP2(model_activ2)
model_conv3 = conv3(model_MP2)
model_bn3 = bn3(model_conv3)
model_activ3 = activ3(model_bn3)
model_MP3 = MP3(model_activ3)
model_conv4 = conv4(model_MP3)
model_bn4 = bn4(model_conv4)
model_activ4 = activ4(model_bn4)
model_MP4 = MP4(model_activ4)
model_conv5 = conv5(model_MP4)
model_bn5 = bn5(model_conv5)
model_activ5 = activ5(model_bn5)
model_drop1 = drop1(model_activ5)
model_item_sem = item_sem(model_drop1)

output = Dense(10000, activation='softmax')(model_item_sem)
model = Model(inputs=model_input, outputs=output)

model.load_weights('./weights.889-6.28.h5')
print('model loaded!!!')



Number of segments per song: 10
443
model loaded!!!


In [8]:

# compile & optimizer
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

# print model summary
model.summary()

# mean / std
mel_mean = 0.22620339
mel_std = 0.25794547

# define activation layer
layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])
activation_layer = 'activation_9'
layer_output = layer_dict[activation_layer].output
get_last_hidden_output = K.function([model.layers[0].input], [layer_output])

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 129, 128)]        0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 129, 128)          65664     
_________________________________________________________________
batch_normalization_5 (Batch (None, 129, 128)          512       
_________________________________________________________________
activation_5 (Activation)    (None, 129, 128)          0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 32, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 32, 128)           65664     
_________________________________________________________________
batch_normalization_6 (Batch (None, 32, 128)          

In [9]:

all_size = len(all_list)
for iter2 in range(0,len(all_list)):
	# check existence
	save_name = save_path + all_list[iter2].replace('.wav','.npy')
	
	if not os.path.exists(os.path.dirname(save_name)):
		os.makedirs(os.path.dirname(save_name))
	
	if os.path.isfile(save_name) == 1:
		print(iter2, save_name + '_file_exist')

	# load melgram
	x_mel_tmp = load_melspec(all_list[iter2],num_segment,num_frame_input)

	# normalization
	x_mel_tmp -= mel_mean
	x_mel_tmp /= mel_std

	# prediction
	weight = get_last_hidden_output([x_mel_tmp,0])[0] # testing phase 0
	print(weight.shape) # 10,1,256

	maxpooled = np.amax(weight,axis=1)
	averagepooled = np.average(maxpooled,axis=0)
	print(averagepooled.shape,iter2)
	
	np.save(save_name,averagepooled)

(10, 1, 256)
(256,) 0
(10, 1, 256)
(256,) 1
(10, 1, 256)
(256,) 2
(10, 1, 256)
(256,) 3
(10, 1, 256)
(256,) 4
(10, 1, 256)
(256,) 5
(10, 1, 256)
(256,) 6
(10, 1, 256)
(256,) 7
(10, 1, 256)
(256,) 8
(10, 1, 256)
(256,) 9
(10, 1, 256)
(256,) 10
(10, 1, 256)
(256,) 11
(10, 1, 256)
(256,) 12
(10, 1, 256)
(256,) 13
(10, 1, 256)
(256,) 14
(10, 1, 256)
(256,) 15
(10, 1, 256)
(256,) 16
(10, 1, 256)
(256,) 17
(10, 1, 256)
(256,) 18
(10, 1, 256)
(256,) 19
(10, 1, 256)
(256,) 20
(10, 1, 256)
(256,) 21
(10, 1, 256)
(256,) 22
(10, 1, 256)
(256,) 23
(10, 1, 256)
(256,) 24
(10, 1, 256)
(256,) 25
(10, 1, 256)
(256,) 26
(10, 1, 256)
(256,) 27
(10, 1, 256)
(256,) 28
(10, 1, 256)
(256,) 29
(10, 1, 256)
(256,) 30
(10, 1, 256)
(256,) 31
(10, 1, 256)
(256,) 32
(10, 1, 256)
(256,) 33
(10, 1, 256)
(256,) 34
(10, 1, 256)
(256,) 35
(10, 1, 256)
(256,) 36
(10, 1, 256)
(256,) 37
(10, 1, 256)
(256,) 38
(10, 1, 256)
(256,) 39
(10, 1, 256)
(256,) 40
(10, 1, 256)
(256,) 41
(10, 1, 256)
(256,) 42
(10, 1, 256)
(256,) 4

In [62]:
import tensorflow as tf
import librosa


just_path = "genres/rock/"
song_path = "genres/rock/rock.00003.wav"
song_name = "2"
##########################################################################

#Constants which depend on the model. If you train the model with different values,
#need to change those values here too
num_mfcc = 13
n_fft=2048
hop_length = 512
sample_rate = 22050
samples_per_track = sample_rate * 30
num_segment = 10
############################################################################

if __name__=="__main__":

    model = tf.keras.models.load_model("model_RNN_LSTM.h5")
    model.summary()

    classes = ["Blues","Classical","Country","Disco","Hiphop",
                "Jazz","Metal","Pop","Reggae","Rock"]

    class_predictions = []

    samples_per_segment = int(samples_per_track / num_segment)


    if song_path.endswith('.mp3'):
        print('mp3')
    else:
        pass

    #load the song
    x, sr = librosa.load(song_path, sr = sample_rate)
    song_length = int(librosa.get_duration(filename=song_path))

    prediction_per_part = []

    flag = 0
    print("Song is greater than 30 seconds")
    samples_per_track_30 = sample_rate * song_length
    parts = int(song_length/30)
    samples_per_segment_30 = int(samples_per_track_30 / (parts))
    flag = 1
    print("Song sliced into "+str(parts)+" parts")

    for i in range(0,parts):
        print("Song snippet ",i+1)
        start30 = samples_per_segment_30 * i
        finish30 = start30 + samples_per_segment_30
        y = x[start30:finish30]
        #print(len(y))
        for n in range(num_segment):
            start = samples_per_segment * n
            finish = start + samples_per_segment
            #print(len(y[start:finish]))
            mfcc = librosa.feature.mfcc(y[start:finish], sample_rate, n_mfcc = num_mfcc, n_fft = n_fft, hop_length = hop_length)
            mfcc = mfcc.T
            #print(mfcc.shape)
            mfcc = mfcc.reshape(1, mfcc.shape[0], mfcc.shape[1])
            #print(mfcc.shape)
            array = model.predict(mfcc)*100
            array = array.tolist()

            #find maximum percentage class predicted
            class_predictions.append(array[0].index(max(array[0])))

        occurence_dict = {}
        for i in class_predictions:
            if i not in occurence_dict:
                occurence_dict[i] = 1
            else:
                occurence_dict[i] +=1

        max_key = max(occurence_dict, key=occurence_dict.get)
        prediction_per_part.append(classes[max_key])

    #print(prediction_per_part)
    prediction = max(set(prediction_per_part), key = prediction_per_part.count)
    print(prediction)





Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 130, 64)           19968     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                650       
Total params: 57,802
Trainable params: 57,802
Non-trainable params: 0
_________________________________________________________________
Song is greater than 30 seconds
Song sliced into 1 parts
Song snippet  1
Rock


In [67]:
variable = np.load("./features/basic/classical/classical.00030.npy")
print(variable)

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.57662046 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.44123173 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.05260346 0.17178601 0.         0.         0.         0.
 0.         0.         0.         0.         0.03358665 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.44445592 0.         0.         0.         