In [1]:
import os
from scipy.io import wavfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.layers import Conv2D, MaxPool2D, Flatten, LSTM
from keras.layers import Dropout, Dense, TimeDistributed
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.utils import compute_class_weight
from tqdm import tqdm
from python_speech_features import mfcc
import math
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import pickle
from keras.callbacks import ModelCheckpoint
from cfg import Config

In [2]:
# Load the per-track genre table; the file uses ';' as its field delimiter.
audio_genres = pd.read_csv("clean_audio_genres.csv", delimiter=";")
# Preview the first rows to sanity-check the parse.
audio_genres.head()

Unnamed: 0.1,Unnamed: 0,Audio_Filenames,track_genre_1,track_genre_2,length,multi_genres
0,0,4em001_01_Mideast-Darkness_703611.wav,Atmosphere,World Music,4.631.510.204.081.630,Atmosphere World Music
1,1,4em001_02_Desert-Search_703612.wav,Atmosphere,World Music,96.6,Atmosphere World Music
2,2,4em001_03_Qasidah_703613.wav,Atmosphere,World Music,8.347.827.664.399.090,Atmosphere World Music
3,3,4em001_04_Desert-War_703614.wav,Atmosphere,World Music,10.132.979.591.836.700,Atmosphere World Music
4,4,4em001_05_Desert-War---Lite_703615.wav,Atmosphere,World Music,9.382.977.324.263.030,Atmosphere World Music


In [4]:
# Index the table by file name so rows can be addressed with .at[filename, column].
# Guarded and non-in-place so the cell is safe to re-run: the original in-place
# set_index raised KeyError on a second execution because 'Audio_Filenames' had
# already been moved out of the columns and into the index.
if audio_genres.index.name != 'Audio_Filenames':
    audio_genres = audio_genres.set_index('Audio_Filenames')

In [5]:
# Give every track the same clip length of 40.0 in a single vectorized assignment
# (replaces a per-row .at loop that wrote the identical constant to each row).
# NOTE(review): 40.0 is presumably seconds, matching the '40secwav/' directory the
# clips are read from later — confirm.
audio_genres['clean_length'] = 40.0

In [6]:
# Number of tracks per genre in the track_genre_1 column.
newoccur_col1 = audio_genres.loc[:, "track_genre_1"].value_counts()
newoccur_col1

Atmosphere            3899
Filmscore             3214
Rock                  3172
Electronica           2023
Country, Folk         1920
Hip Hop, Rap          1875
Others                1661
Classical Music       1442
Easy Listening        1369
Dance                 1286
Ambient, Chill        1144
Indie, Alternative    1141
Pop                   1088
Funk, Soul             941
World Music            927
Jazz                   863
House                  739
Latin                  446
Blues                  309
60ies                  303
RnB                    162
Swing                  124
Acoustic                99
Orchestral              95
Kids                    77
Christmas               72
Hard, Heavy             70
Drone                   43
Trailer                 29
Sound Design            10
Name: track_genre_1, dtype: int64

In [7]:
# Number of tracks per genre in the track_genre_2 column.
newoccur_col2 = audio_genres.loc[:, "track_genre_2"].value_counts()
newoccur_col2

Pop                   3472
Electronica           2674
Others                2481
Ambient, Chill        2365
Filmscore             1657
Classical Music       1263
World Music           1263
Indie, Alternative     613
Orchestral             593
Acoustic               558
Kids                   531
Sound Design           520
Trailer                484
Hard, Heavy            471
Jazz                   461
Drone                  369
Christmas              360
Blues                  332
Swing                  322
Dance                  313
RnB                    266
Easy Listening         243
Atmosphere             220
Funk, Soul             217
60ies                  141
House                  127
Country, Folk           95
Hip Hop, Rap            66
Rock                    57
Latin                   36
Name: track_genre_2, dtype: int64

In [8]:
# Total occurrences of each genre across both genre columns.
# Series.add with fill_value=0 is used instead of `+`: plain addition aligns on the
# index and yields NaN for any genre that appears in only one of the two columns,
# which would then propagate into prob_dist and break the weighted sampling.
class_dist = newoccur_col1.add(newoccur_col2, fill_value=0)
class_dist.sort_values(ascending=False)

Filmscore             4871
Electronica           4697
Pop                   4560
Others                4142
Atmosphere            4119
Ambient, Chill        3509
Rock                  3229
Classical Music       2705
World Music           2190
Country, Folk         2015
Hip Hop, Rap          1941
Indie, Alternative    1754
Easy Listening        1612
Dance                 1599
Jazz                  1324
Funk, Soul            1158
House                  866
Orchestral             688
Acoustic               657
Blues                  641
Kids                   608
Hard, Heavy            541
Sound Design           530
Trailer                513
Latin                  482
Swing                  446
60ies                  444
Christmas              432
RnB                    428
Drone                  412
dtype: int64

In [10]:
# Genre labels used as classification targets, one entry per line for readability.
genres = [
    'Drone',
    'RnB',
    'Christmas',
    '60ies',
    'Swing',
    'Latin',
    'Trailer',
    'Sound Design',
    'Hard, Heavy',
    'Kids',
    'Blues',
    'Acoustic',
    'Orchestral',
    'House',
    'Funk, Soul',
    'Jazz',
    'Dance',
    'Easy Listening',
    'Indie, Alternative',
    'Country, Folk',
    'Hip Hop, Rap',
    'World Music',
    'Classical Music',
    'Rock',
    'Ambient, Chill',
    'Others',
    'Pop',
    'Atmosphere',
    'Electronica',
    'Filmscore',
]

In [11]:
# Sorted, de-duplicated genre labels; the position in this list is the class index
# used when labels are one-hot encoded.
classes = [label for label in np.unique(genres)]

In [13]:
# Number of random windows to draw: the summed clean_length over all tracks, truncated to int.
n_samples = int(audio_genres.loc[:, 'clean_length'].sum())

In [14]:
# Normalize the genre counts so they sum to 1 and can serve as sampling probabilities.
prob_dist = class_dist.div(class_dist.sum())

In [16]:
# Display the per-genre sampling probabilities, most likely genre first.
prob_dist.sort_values(ascending=False)

Filmscore             0.091710
Electronica           0.088434
Pop                   0.085855
Others                0.077985
Atmosphere            0.077552
Ambient, Chill        0.066067
Rock                  0.060795
Classical Music       0.050929
World Music           0.041233
Country, Folk         0.037938
Hip Hop, Rap          0.036545
Indie, Alternative    0.033024
Easy Listening        0.030350
Dance                 0.030106
Jazz                  0.024928
Funk, Soul            0.021803
House                 0.016305
Orchestral            0.012954
Acoustic              0.012370
Blues                 0.012069
Kids                  0.011447
Hard, Heavy           0.010186
Sound Design          0.009979
Trailer               0.009659
Latin                 0.009075
Swing                 0.008397
60ies                 0.008360
Christmas             0.008134
RnB                   0.008058
Drone                 0.007757
dtype: float64

In [17]:
# Draw a single genre label at random, weighted by prob_dist.
# NOTE(review): `choices` is not referenced again in the visible cells and the draw is
# unseeded, so this looks like a leftover check of the sampling call — confirm before removing.
choices = np.random.choice(class_dist.index, p=prob_dist)

In [18]:
def check_data():
    """Return the object pickled at ``config.p_path``, or ``None`` when no such file exists.

    Reads the module-level ``config`` for the cache location (``p_path``) and for the
    mode name (``mode``) that is printed when a cache file is found.
    """
    if not os.path.isfile(config.p_path):
        return None

    print('Loading existing data for {} model'.format(config.mode))
    # pickle can execute arbitrary code while loading; only point p_path at trusted files.
    with open(config.p_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [19]:
def build_rand_feat():
    """Build the MFCC training set from random audio windows, or load it from the cache.

    If ``check_data()`` finds a pickled config, its stored ``data`` pair is returned
    directly. Otherwise ``n_samples`` windows are drawn: a genre is sampled according
    to ``prob_dist``, a file tagged with that genre is picked at random, a window of
    ``config.step`` samples is cut at a random offset and converted to MFCC features.
    Features are min-max scaled with the global minimum/maximum over all windows, the
    labels are one-hot encoded, and the populated ``config`` (including the data) is
    pickled to ``config.p_path``.

    Reads the module-level ``config``, ``n_samples``, ``class_dist``, ``prob_dist``,
    ``audio_genres`` and ``classes``.

    Returns:
        tuple: ``(X, y)`` — the scaled feature array and the encoded label tensor.
    """
    tmp = check_data()
    # Explicit None test: truthiness of an arbitrary unpickled object is not a reliable
    # "cache hit" signal.
    if tmp is not None:
        return tmp.data[0], tmp.data[1]
    X = []
    y = []
    _min, _max = float('inf'), -float('inf')
    # Candidate files per genre, filled on first use. The original re-ran the string
    # match over the whole table on every iteration although the result only depends
    # on the sampled genre.
    files_by_class = {}
    for _ in tqdm(range(n_samples)):
        rand_class = np.random.choice(class_dist.index, p=prob_dist)
        if rand_class not in files_by_class:
            # regex=False: genre names are literal text, not patterns.
            # na=False: rows without a multi_genres value never match.
            has_class = audio_genres.multi_genres.str.contains(
                rand_class, regex=False, na=False)
            files_by_class[rand_class] = audio_genres.index[has_class.to_numpy()]
        f = np.random.choice(files_by_class[rand_class])
        rate, wav = wavfile.read('40secwav/'+f)
        if wav.shape[0] <= config.step:
            # Clip too short to cut a window from; np.random.randint would raise.
            # Treated like any other faulty file: report and skip.
            print(f)
            continue
        rand_index = np.random.randint(0, wav.shape[0]-config.step)
        sample = wav[rand_index:rand_index+config.step]
        X_sample = mfcc(sample, rate,
                        numcep=config.nfeat, nfilt=config.nfilt, nfft=config.nfft)
        if X_sample.shape != (19,13): #avoid faulty file
            print(f)
            continue
        _min = min(np.amin(X_sample), _min)
        _max = max(np.amax(X_sample), _max)
        X.append(X_sample)
        y.append(
                (audio_genres.at[f, 'track_genre_1'], audio_genres.at[f, 'track_genre_2'])
        )
    # Keep the scaling bounds on the config so they are pickled together with the data.
    config.min = _min
    config.max = _max
    X, y = np.array(X), np.array(y)
    X = (X - _min) / (_max - _min)
    if config.mode == 'conv':
        # Trailing axis of size 1 for the Conv2D input.
        X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
    elif config.mode == 'time':
        X = X.reshape(X.shape[0], X.shape[1], X.shape[2])
    y = one_hot_encode(y, classes)
    config.data = (X, y)

    with open(config.p_path, 'wb') as handle:
        pickle.dump(config, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return X, y

def one_hot_encode(y, genres):
    """Turn pairs of genre labels into a multi-hot float32 tensor.

    Args:
        y: iterable of ``(label_1, label_2)`` pairs.
        genres: list of known labels; a label's position is its output column.

    Returns:
        Tensor with one row per pair and ``len(genres)`` columns, holding 1 in the
        column of each label of the pair and 0 elsewhere. A label equal to ``'nan'``
        sets no column.

    Raises:
        ValueError: if a label is neither in ``genres`` nor ``'nan'``.
    """
    # 'nan' gets a throw-away last column that is cut off before returning.
    labels = genres + ['nan']
    encoded = np.zeros((len(y), len(labels)))

    for row, (first, second) in enumerate(y):
        first_col = labels.index(first)
        second_col = labels.index(second)
        encoded[row, [first_col, second_col]] = 1

    return tf.convert_to_tensor(encoded[:, :-1], dtype='float32')

In [20]:
def get_conv_model():
    """Build, print and compile the convolutional multi-label genre classifier.

    The network stacks five 'same'-padded Conv2D layers; each of the first four is
    followed by 2x2 max pooling. The flattened features pass through dropout and a
    64-unit dense layer into 30 sigmoid outputs. The input shape is read from the
    module-level ``input_shape``.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam optimizer,
        binary accuracy metric).
    """
    # (filters, kernel size, activation) for each convolution, in order.
    conv_specs = [
        (8, (7, 7), 'tanh'),
        (16, (5, 5), 'relu'),
        (16, (3, 3), 'relu'),
        (32, (3, 3), 'relu'),
        (32, (3, 3), 'relu'),
    ]
    final_conv = len(conv_specs) - 1

    model = Sequential()
    for position, (filters, kernel, activation) in enumerate(conv_specs):
        # Only the first layer declares the input shape.
        first_layer_args = {'input_shape': input_shape} if position == 0 else {}
        model.add(Conv2D(filters, kernel, activation=activation, strides=(1, 1),
                         padding='same', **first_layer_args))
        # No pooling after the last convolution.
        if position != final_conv:
            model.add(MaxPool2D((2, 2), padding='same'))

    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu', activity_regularizer=l2(0.001)))
    model.add(Dense(30, activation='sigmoid'))
    model.summary()

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )
    return model

In [21]:
# Run configuration shared by the cells in this notebook. Config comes from the
# project-local cfg module; the visible cells read config.mode, config.p_path,
# config.step, config.nfeat, config.nfilt, config.nfft and config.model_path, and
# write config.min, config.max and config.data.
# mode='conv' selects the convolutional branch of the training cell.
config = Config(mode='conv')

In [22]:
def y_flat(y, thresh=0.):
    """Collect, for each row of ``y``, the column indices whose value exceeds ``thresh``.

    Returns an object array with one entry per row. Rows that have no value above
    ``thresh`` keep the initial ``None`` entry.
    """
    mask = y > thresh
    r, c = np.where(mask)
    out = np.empty(len(y), dtype=object)
    # np.where returns row indices in ascending order, so a group ends wherever the
    # row index changes.
    grp_idx = np.r_[0, np.flatnonzero(r[:-1] != r[1:])+1, len(r)]
    valid_rows = r[np.r_[True, r[:-1] != r[1:]]]
    for (row, i, j) in zip(valid_rows, grp_idx[:-1], grp_idx[1:]):
        out[row] = c[i:j]
    return out


# Fail before the expensive feature build; the original only failed afterwards with a
# NameError on dict_weights when the mode matched neither branch.
if config.mode not in ('conv', 'time'):
    raise ValueError("Unsupported config.mode: {!r} (expected 'conv' or 'time')".format(config.mode))

X, y = build_rand_feat()

# Per-label weights, inversely proportional to how often each label occurs.
# This was duplicated verbatim in both mode branches; it does not depend on the mode.
# Rows without any positive label are skipped: their None entry cannot be concatenated.
y_flat2 = np.concatenate([cols for cols in y_flat(y, thresh=0.) if cols is not None])
unique, counts = np.unique(y_flat2, return_counts=True)
total_samples = np.sum(counts)
dist_labels = counts/total_samples
# 30 matches the number of entries in `genres` and the size of the model's output layer.
average_weights = 1/30
label_weights = average_weights/dist_labels
dict_weights = dict(zip(unique, label_weights))

if config.mode == 'conv':
    input_shape = (X.shape[1], X.shape[2], 1)
    model = get_conv_model()
else:
    input_shape = (X.shape[1], X.shape[2])
    # NOTE(review): get_recurrent_model is not defined in the visible cells of this
    # notebook — confirm it exists before running with mode='time'.
    model = get_recurrent_model()

weights = dict_weights

# Disabled checkpoint callback. The original comment combined monitor='val_loss' with
# mode='max', which would keep the worst model; a loss must be minimized.
#checkpoint = ModelCheckpoint(config.model_path, monitor='val_loss', verbose=1, mode='min',
                            #save_best_only=True, save_weights_only=False, save_freq='epoch')

# NOTE(review): y is multi-hot (up to two labels per row); confirm how this Keras
# version applies class_weight to multi-label targets.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False,
    update_freq='epoch', profile_batch=2, embeddings_freq=0,
    embeddings_metadata=None
)

model.fit(X, y, epochs=50, validation_split=0.15, batch_size=96,
          shuffle=True,
          class_weight=weights,
          callbacks=[tensorboard_callback])

model.save(config.model_path)

Loading existing data for conv model
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 19, 13, 8)         400       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 10, 7, 8)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 10, 7, 16)         3216      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 4, 16)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 5, 4, 16)          2320      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 3, 2, 16)          0         
_________________________________________________________________
conv2d_3 (Conv2D)  