In [1]:
import librosa
import numpy as np
import pandas as pd
import os

In [2]:
# Root folder of the GTZAN audio files; the dataset-building cells list one
# sub-folder per genre under it. The trailing slash lets this string be used
# directly as a path prefix.
dataset_path = "GTZAN_dataset/"

In [11]:
def extract_features_audio(audio_data, sr):
    """Summarise one audio clip as a flat list of 57 features.

    Parameters
    ----------
    audio_data : array-like
        Audio samples. The callers in this notebook pass the signal returned
        by ``librosa.load`` (or a slice of that signal).
    sr : number
        Sampling rate of ``audio_data``, as returned by ``librosa.load``.

    Returns
    -------
    list
        Feature values in the same order as the notebook's ``columns`` list
        (minus the trailing ``label``):

        * mean and variance (rounded to 6 decimals) of chroma, RMS, spectral
          centroid, spectral bandwidth, spectral roll-off, zero-crossing
          rate, the harmonic component and the percussive component
          (16 values);
        * the estimated tempo, rounded to 6 decimals (1 value);
        * mean and variance of each of the 20 MFCCs, interleaved as
          ``mean, var`` per coefficient and left unrounded (40 values).
    """

    def rounded_mean_var(values):
        # Mean and variance over every element of ``values``, 6 decimals each.
        return [round(np.mean(values), 6), round(np.var(values), 6)]

    test_data = []

    # chroma_stft, computed from a power spectrogram with a 4096-sample FFT
    S = np.abs(librosa.stft(audio_data, n_fft=4096)) ** 2
    test_data.extend(rounded_mean_var(librosa.feature.chroma_stft(S=S, sr=sr)))

    # rms
    test_data.extend(rounded_mean_var(librosa.feature.rms(y=audio_data)))

    # spectral_centroid
    test_data.extend(
        rounded_mean_var(librosa.feature.spectral_centroid(y=audio_data, sr=sr))
    )

    # spectral_bandwidth
    test_data.extend(
        rounded_mean_var(librosa.feature.spectral_bandwidth(y=audio_data, sr=sr))
    )

    # rolloff
    test_data.extend(
        rounded_mean_var(librosa.feature.spectral_rolloff(y=audio_data, sr=sr))
    )

    # zero_crossing_rate
    test_data.extend(rounded_mean_var(librosa.feature.zero_crossing_rate(audio_data)))

    # harmony and perceptr (harmonic / percussive components of the signal)
    y_harm, y_perc = librosa.effects.hpss(audio_data)
    test_data.extend(rounded_mean_var(y_harm))
    test_data.extend(rounded_mean_var(y_perc))

    # tempo
    hop_length = 512
    oenv = librosa.onset.onset_strength(y=audio_data, sr=sr, hop_length=hop_length)
    # librosa 0.10 moved the tempo estimator to librosa.feature.tempo and
    # deprecated librosa.beat.tempo; prefer the new location and fall back to
    # the old one so the function works on both older and newer releases.
    tempo_fn = getattr(librosa.feature, "tempo", None)
    if tempo_fn is None:
        tempo_fn = librosa.beat.tempo
    tempo = tempo_fn(onset_envelope=oenv, sr=sr, hop_length=hop_length)[0]
    test_data.append(round(tempo, 6))

    # mfcc1..mfcc20: per-coefficient mean and variance over time (unrounded)
    mfcc = librosa.feature.mfcc(y=np.array(audio_data).flatten(), sr=sr, n_mfcc=20)
    mfcc_means = mfcc.mean(axis=1).tolist()
    mfcc_vars = mfcc.var(axis=1).tolist()
    for coeff_mean, coeff_var in zip(mfcc_means, mfcc_vars):
        test_data.append(coeff_mean)
        test_data.append(coeff_var)

    return test_data

In [12]:
# Column names for the feature tables, in the exact order in which
# extract_features_audio appends its values, followed by the genre label.
columns = (
    # mean/var pair for each frame-level (or signal-level) descriptor
    [
        f"{feature}_{stat}"
        for feature in (
            "chroma_stft",
            "rms",
            "spectral_centroid",
            "spectral_bandwidth",
            "rolloff",
            "zero_crossing_rate",
            "harmony",
            "perceptr",
        )
        for stat in ("mean", "var")
    ]
    + ["tempo"]
    # mfcc1..mfcc20, each as a mean/var pair
    + [f"mfcc{k}_{stat}" for k in range(1, 21) for stat in ("mean", "var")]
    + ["label"]
)

In [13]:
# Sanity check: 57 feature columns plus the trailing 'label' column.
len(columns)

58

Generate Dataset with 30-second clips

In [None]:
# Build one feature row per track from the first 30 seconds of each file.
dflist = []
# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the
# row order reproducible between runs and machines.
for genre in sorted(os.listdir(dataset_path)):
    genre_dir = os.path.join(dataset_path, genre)
    # Skip hidden entries (e.g. .ipynb_checkpoints, .DS_Store) and stray files
    # sitting next to the genre folders; only real genre folders are scanned.
    if genre.startswith(".") or not os.path.isdir(genre_dir):
        continue
    print("Processing: " + genre)
    for file in sorted(os.listdir(genre_dir)):
        if file.startswith("."):
            continue
        audio_data, sr = librosa.load(os.path.join(genre_dir, file), offset=0, duration=30)
        features = extract_features_audio(audio_data, sr)
        # The genre folder name is the label (last entry of `columns`).
        features.append(genre)
        dflist.append(features)
df30s = pd.DataFrame(dflist, columns=columns)

In [7]:
# Display the 30-second feature table built above.
df30s

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.341922,0.086494,0.130192,0.002830,1784.416546,129739.837401,2002.657106,85829.345908,3806.418650,9.013297e+05,...,52.464138,-1.688358,36.564606,-0.409329,41.643627,-2.298869,55.076675,1.219947,46.975636,blues
1,0.312170,0.094144,0.095892,0.002374,1529.871314,376011.650101,2038.612143,213889.873843,3548.986873,2.976792e+06,...,55.364296,-0.717028,60.113293,0.292333,48.137741,-0.285717,51.131210,0.532144,45.823235,blues
2,0.348819,0.083174,0.175494,0.002753,1552.637786,156538.294709,1747.382028,76150.688778,3041.089944,7.829465e+05,...,40.620197,-7.721793,47.700275,-1.816497,52.418175,-3.434354,46.630062,-2.231391,30.626228,blues
3,0.325007,0.095534,0.141139,0.006340,1070.110059,184506.655421,1596.244204,166637.568438,2185.061787,1.494194e+06,...,44.456993,-3.320055,50.236969,0.637263,37.351917,-0.617507,37.291164,-3.406940,31.988441,blues
4,0.310032,0.086206,0.091540,0.002305,1835.507009,343334.237041,1748.367477,88460.061228,3581.003346,1.572133e+06,...,86.012779,-5.453736,75.328178,-0.924468,53.577564,-4.408076,62.928513,-11.701833,55.215115,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,0.306411,0.080754,0.079441,0.000346,2008.537045,282144.491236,2106.349206,88618.861488,4254.124276,1.222833e+06,...,45.016090,-13.300427,41.649334,2.473230,36.610485,-6.719625,54.840275,-1.199268,50.000675,rock
995,0.349015,0.076799,0.076425,0.000590,2006.009248,181437.000824,2068.224879,81920.961356,4147.166589,1.041906e+06,...,33.877956,-10.854527,39.240723,1.872500,32.014000,-7.467166,39.204880,-2.797982,31.698040,rock
996,0.407769,0.077484,0.081583,0.000323,2077.166788,231713.157178,1926.895810,74675.603778,4030.750627,8.043587e+05,...,33.546669,-12.854931,36.345619,3.451785,35.959087,-12.594253,42.538330,-2.104668,29.896814,rock
997,0.290691,0.086668,0.083834,0.001206,1398.581575,240591.986495,1817.813570,109079.454152,3014.673437,1.332874e+06,...,46.330338,-4.421223,43.654388,1.560510,34.356209,-5.046413,47.208336,-3.582565,41.294521,rock


In [8]:
# Save the 30-second feature table; index=False leaves the row index out of the CSV.
df30s.to_csv("music_genre_dataset_30s.csv", index=False)

Generate Dataset with 3-second clips

In [None]:
# Build ten feature rows per track: the first 30 seconds of each file are cut
# into 10 consecutive segments and each segment is featurised on its own.
dflist = []
# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the
# row order reproducible between runs and machines.
for genre in sorted(os.listdir(dataset_path)):
    genre_dir = os.path.join(dataset_path, genre)
    # Skip hidden entries (e.g. .ipynb_checkpoints, .DS_Store) and stray files
    # sitting next to the genre folders; only real genre folders are scanned.
    if genre.startswith(".") or not os.path.isdir(genre_dir):
        continue
    for file in sorted(os.listdir(genre_dir)):
        if file.startswith("."):
            continue
        audio_data, sr = librosa.load(os.path.join(genre_dir, file), offset=0, duration=30)
        # np.array_split tolerates lengths that are not a multiple of 10, so
        # the segments may differ in length by one sample.
        splits = np.array_split(audio_data, 10)
        for split in splits:
            features = extract_features_audio(split, sr)
            # The genre folder name is the label (last entry of `columns`).
            features.append(genre)
            dflist.append(features)
df3s = pd.DataFrame(dflist, columns=columns)

In [10]:
# Display the 3-second feature table built above.
df3s

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.330147,0.089642,0.130189,0.003559,1773.358004,169450.829520,1972.334258,117272.640189,3714.063439,1.083179e+06,...,39.547070,-3.230046,36.606857,0.696385,37.766136,-5.035945,33.668549,-0.239585,43.818882,blues
1,0.340871,0.083571,0.112119,0.001491,1817.244034,90766.297254,2010.751494,65940.666243,3870.510442,6.721332e+05,...,64.819786,-6.025472,40.548809,0.127131,51.048935,-2.808956,97.221489,5.771882,60.360344,blues
2,0.309781,0.087560,0.130895,0.004552,1790.722357,110071.206973,2088.184750,73391.498001,4000.206581,7.859502e+05,...,68.306793,-1.714475,28.136944,2.329553,47.211426,-1.925621,52.922428,2.466996,33.164001,blues
3,0.355867,0.083294,0.131349,0.002338,1660.545231,109496.936296,1967.920582,79805.901351,3579.149639,9.032748e+05,...,48.543198,-3.786987,28.419546,1.153315,35.682701,-3.501979,50.610344,3.580636,32.325871,blues
4,0.319974,0.084974,0.142370,0.001734,1634.465077,77425.419232,1954.633566,57359.695604,3480.096905,5.889230e+05,...,30.829544,0.635797,44.645561,1.591108,51.415867,-3.364909,26.421085,0.501505,29.109531,blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9985,0.309662,0.080016,0.049680,0.000098,1503.238681,163419.049525,1725.514991,85476.555705,3024.586839,8.471816e+05,...,44.396152,-9.084438,38.929996,-4.216517,31.358318,-5.681930,47.543434,1.725083,38.689384,rock
9986,0.320678,0.082181,0.057615,0.000096,1852.706840,277236.230048,1914.481257,97378.034054,3754.728065,1.159307e+06,...,32.134663,-12.323830,68.098274,-3.130633,53.029491,-11.916322,63.618256,0.384525,18.866629,rock
9987,0.292206,0.088421,0.051906,0.000674,1348.383673,662359.245401,1566.961070,137898.245964,2445.509315,2.599228e+06,...,79.231087,-2.683274,22.830883,4.947066,25.907824,1.718840,47.727337,-0.214869,41.377605,rock
9988,0.346380,0.087723,0.065967,0.000312,2086.306423,200732.442407,2021.144027,20772.677177,4317.655687,4.819789e+05,...,27.924114,-5.355849,17.127592,6.417248,21.935261,2.347247,24.930906,0.666209,12.577224,rock


In [11]:
# df3s.to_csv("music_genre_dataset_3s.csv", index=False)