# Genre classification using 30 second audio features

In [35]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## Data preprocessing

In [36]:
df = pd.read_csv('../Data/features_30_sec.csv')
df = df.drop(columns=['filename', 'length'])
df.describe()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.378682,0.08634,0.13093,0.003051,2201.780898,469691.6,2242.54107,137079.155165,4571.549304,1844345.0,...,1.148144,60.730958,-3.966028,62.633624,0.507696,63.712586,-2.328761,66.23193,-1.095348,70.126096
std,0.081705,0.007735,0.065683,0.003634,715.9606,400899.5,526.316473,96455.666326,1574.791602,1425085.0,...,4.578948,33.781951,4.549697,33.479172,3.869105,34.401977,3.755957,37.174631,3.837007,45.228512
min,0.171939,0.044555,0.005276,4e-06,570.040355,7911.251,898.066208,10787.185064,749.140636,14686.12,...,-15.693844,9.169314,-17.234728,13.931521,-11.963694,15.420555,-18.501955,13.487622,-19.929634,7.956583
25%,0.319562,0.082298,0.086657,0.000942,1627.697311,184350.5,1907.240605,67376.554428,3380.069642,772731.2,...,-1.86328,40.376442,-7.207225,40.830875,-2.007015,41.88424,-4.662925,41.710184,-3.368996,42.372865
50%,0.383148,0.086615,0.122443,0.001816,2209.26309,338486.2,2221.392843,111977.548036,4658.524473,1476115.0,...,1.212809,52.325077,-4.065605,54.717674,0.669643,54.80489,-2.393862,57.423059,-1.166289,59.186117
75%,0.435942,0.091256,0.175682,0.003577,2691.294667,612147.9,2578.469836,182371.576801,5533.81046,2555262.0,...,4.359662,71.691755,-0.838737,75.040838,3.119212,75.385832,0.150573,78.626444,1.312615,85.375374
max,0.663685,0.108111,0.397973,0.027679,4435.243901,3036843.0,3509.646417,694784.811549,8677.672688,8660900.0,...,13.45715,392.932373,11.482946,406.058868,15.38839,332.905426,14.694924,393.161987,15.369627,506.065155


In [37]:
df.shape

(1000, 58)

In [38]:
X = df.drop(columns=['label'])
y = np.array(df['label'].tolist()).reshape(-1, 1)

### One-hot encoding of labels

In [39]:
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y)
y = pd.DataFrame(y, columns=encoder.categories_)
y.head()

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train-test split

In [40]:
from sklearn.model_selection import train_test_split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
X_train.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
29,0.280357,0.105621,0.124736,0.004812,1648.835169,712342.593798,2189.985865,185296.921351,3759.892114,4161772.0,...,-8.310135,84.208382,-11.384393,98.334,-6.723499,96.627716,-10.013582,74.162971,-14.026128,77.931458
535,0.2646,0.089984,0.091492,0.001231,844.084418,51342.065119,1104.420736,26308.920034,1608.486974,428586.7,...,1.125303,43.192551,-1.974342,35.8419,-5.219293,65.051285,-1.86661,42.781399,-3.503479,53.901234
695,0.529182,0.068875,0.185447,0.00378,2446.267671,257141.784822,2331.010128,55816.09545,5192.807708,836453.6,...,9.323952,32.028889,-6.608163,33.616463,3.498461,48.407642,-3.286584,37.829609,3.655154,31.723753
557,0.234168,0.092644,0.078136,0.002283,1378.524274,168329.629531,1773.676404,86900.504631,2584.908654,889619.1,...,-4.048587,49.173058,-1.498452,73.097794,-3.520071,94.470222,-6.187496,113.834229,-7.340963,90.99482
836,0.434649,0.093606,0.079984,0.003172,1810.952863,654461.45888,2166.280664,182317.147014,4118.229261,2871598.0,...,5.381378,69.86142,0.286342,82.497223,5.01483,57.18903,-0.334739,74.218369,-0.855825,63.519684


In [43]:
X_test.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
521,0.432707,0.087017,0.047977,0.000411,2915.017702,465988.571671,2420.421212,55397.978934,5864.933419,1067012.0,...,9.531877,52.428093,-6.002379,44.046604,2.347543,57.23119,-3.352306,68.406425,2.508936,58.562813
737,0.451825,0.085256,0.240985,0.008035,3238.215472,635263.200965,2954.126414,167148.2374,6701.364299,2241874.0,...,-0.603374,47.259617,-2.17777,38.882034,-1.60056,51.832035,-0.616253,57.315872,-4.048588,53.083454
740,0.292264,0.085571,0.199328,0.005632,1962.036801,261307.494531,2325.013206,179382.875864,3820.419899,1860261.0,...,3.414072,72.924728,-0.548955,78.90583,-0.568484,87.446632,0.643941,66.528664,-1.999727,107.257652
660,0.540349,0.059563,0.125761,0.000635,2315.317248,115250.776839,2066.651952,37095.019096,4615.982658,420717.7,...,2.051834,21.919737,-6.81086,24.91996,4.214549,15.420555,-3.401198,20.339085,1.762156,17.411707
411,0.444012,0.085203,0.203773,0.006599,2095.420824,430313.487168,2241.060906,110796.012282,4581.850948,1717499.0,...,4.736111,76.89418,-2.812528,101.614967,6.798242,107.964317,-5.803802,103.876694,-3.767237,92.247749


In [44]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Normalization

In [45]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Principal component analysis

In [46]:
pca = PCA(n_components=20)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

## Training model

In [47]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

In [48]:
model = Sequential()

model.add(Input(shape=(X_train.shape[1],)))

model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

model.add(Dense(len(encoder.categories_[0]), activation='softmax'))

In [49]:
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.summary()

In [50]:
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.2)

Epoch 1/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.1821 - loss: 2.2255 - val_accuracy: 0.3375 - val_loss: 1.9155
Epoch 2/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3875 - loss: 1.8182 - val_accuracy: 0.4313 - val_loss: 1.5760
Epoch 3/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5007 - loss: 1.4596 - val_accuracy: 0.5500 - val_loss: 1.3383
Epoch 4/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6192 - loss: 1.2108 - val_accuracy: 0.6000 - val_loss: 1.1774
Epoch 5/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6516 - loss: 1.0708 - val_accuracy: 0.6375 - val_loss: 1.0911
Epoch 6/15
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6661 - loss: 0.9478 - val_accuracy: 0.6438 - val_loss: 1.0422
Epoch 7/15
[1m20/20[0m [32m━━━━━━━━━

## Evaluation

In [51]:
loss, accuracy = model.evaluate(X_test, y_test)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6467 - loss: 1.1599 
