In [2]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install librosa
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install pydub
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install ipywidgets
!{sys.executable} -m pip install tqdm
!{sys.executable} -m pip install seaborn







In [3]:
# Import libraries 
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')
import os
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
from pydub import AudioSegment
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.decomposition import PCA


plt.rcParams['figure.figsize'] = [12, 6]

In [4]:
"""
    - Main Functionalities for Analysing audio files 
"""

def load_file(audio_path, sr=None):
    """ It Loads a audio file 
        returns a signal x and the sample rate sr."""
    x, sr = librosa.load(audio_path, sr=sr)
    return x, sr


def compute_mfccs(x, sr, n_mfcc=20):
    """It computes the Mel Cepstral Coefficients"""
    mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=n_mfcc)
    return mfccs


def compute_zero_crossing_rate(x, frame_length=2048, hop_length=512):
    """It computes the Zero-Crossing Rate feature"""
    zcrs = librosa.feature.zero_crossing_rate(x, frame_length=frame_length,
                                              hop_length=hop_length)
    return zcrs


def compute_chromagram(x, sr, n_chroma=12, n_octaves=7, hop_length=512):
    """It computes the Chromagram representation."""
    chromagram = librosa.feature.chroma_cqt(x, sr=sr, n_chroma=n_chroma, 
                                            n_octaves=n_octaves, hop_length=hop_length)
    return chromagram


def get_pitches():
    pitches = ["A", "A#", "B", "C", "C#", "D", "D#", "E","F", "F#" ,"G", "G#"]
    return pitches


def get_dictionary_from_numpy(x, N, name="chroma"):
    """It generates a dictionary of features given a signal x,
    N chunks and the name of the feature."""
    
    stats_names = ["_mean", "_std", "_median", "_min", "_max"]
    if name == "chroma":
        pitches = get_pitches()
        feat_keys = [name + "_" + pitches[j] + z + "_" + str(i+1)  for i in range(N) for j in range(x.shape[1]) for z in stats_names]
    else:
        feat_keys = [name + "_" + str(j+1) + z + "_" + str(i+1)  for i in range(N) for j in range(x.shape[1]) for z in stats_names]
    
    # Flatten 
    x_flatten = x.flatten().tolist()
    feature_dictionary = dict(zip(feat_keys, x_flatten))
    return feature_dictionary    


def compute_temporal_stat(x, N, axis=1, stat=np.mean):
    """Given a signal x, a number of chunks N and a statistical function, 
    it splits the array into N chunks and compute the statistics regarding
    stat parameter"""
    x_split = np.array_split(x, N, axis=axis)
    x_temp_stat = [stat(x_s, axis=axis) for x_s in x_split]
    x_temp_stat = np.array(x_temp_stat)
    x_temp_stat = x_temp_stat[:,:,np.newaxis]
    return x_temp_stat


def get_feature_matrix(x, N, axis=1):
    """Automatically compute all the statistics for the N chunks of the signal x."""
    stats = [np.mean, np.std, np.median, np.min, np.max]
    x_stat_n = np.array([])
    for ii, stat in enumerate(stats):
        x_stat_temp = compute_temporal_stat(x,N, axis, stat) # N x m
        
        if ii == 0:
            x_stat_n = x_stat_temp
        else:
            x_stat_n = np.concatenate([x_stat_n, x_stat_temp], axis=2)
        
    return x_stat_n


def compute_audio_features(x_chroma=None, x_mfcc=None, x_zcrs=None, N=3):
    if x_chroma is not None:
        # Chromagram feature matrix
        chrom_feature_matrix = get_feature_matrix(x_chroma, N)
    else:
        chrom_feature_matrix = None
    
    if x_mfcc is not None:
        # MFCC feature matrix
        mfcc_feature_matrix = get_feature_matrix(x_mfcc, N)
    else:
        mfcc_feature_matrix = None
    
    if x_zcrs is not None:
        # Zero-Crossing Rate
        zcrs_feature_matrix = get_feature_matrix(x_zcrs, N)
    else:
        zcrs_feature_matrix = None
    return (chrom_feature_matrix, mfcc_feature_matrix, zcrs_feature_matrix)


def get_audio_feature_dict(chrom_feature_matrix=None, mfcc_feature_matrix=None,
                           zcrs_feature_matrix=None, N=3):
    """
        Given a set of features, it generates a dictionary where all the features are included.
    """
    feature_dictionary = {}
    if chrom_feature_matrix is not None:
        feature_dictionary.update(get_dictionary_from_numpy(chrom_feature_matrix, N, name="chroma"))
    if mfcc_feature_matrix is not None:
        feature_dictionary.update(get_dictionary_from_numpy(mfcc_feature_matrix, N, name="mfcc"))
    if zcrs_feature_matrix is not None:
        feature_dictionary.update(get_dictionary_from_numpy(zcrs_feature_matrix, N, name="zcrs"))
    return feature_dictionary


def process_audio_file_batch(parent_dir, audio_files, sr=None, n_fft=2048, hop_length=512,
                             n_mfcc=20, n_chroma=12, n_octaves=7, chunks=3, chroma=True,
                             mfcc=True, zcrs=True):
    """
        It receives a list of audio files and for each audio it computs the different audio features. 
        It returns a list of features corresponding to each audio file.
    """
    data_features = []
    
    with tqdm(total=len(audio_files)) as pbar:
        for i, audio in enumerate(audio_files):
            pbar.update(1)
            # Generate full path
            audio_path = os.path.join(parent_dir, audio)

            # Load file
            x, sr = load_file(audio_path, sr=sr)
            
            if mfcc:
                # Compute MFCC
                x_mfcc = compute_mfccs(x, sr=sr, n_mfcc=n_mfcc)
            else:
                x_mfcc = None
            
            if chroma:
                # Compute Chromagram
                x_chroma = compute_chromagram(x, sr=sr, n_chroma=n_chroma, n_octaves=n_octaves,
                                              hop_length=hop_length)
            else:
                x_chroma = None
            
            if zcrs:
                # Compute Zero-Crossing Rate
                x_zcrs = compute_zero_crossing_rate(x, frame_length=n_fft, hop_length=hop_length)
            else:
                x_zcrs = None
            
            
            # Compute audio features
            chrom_feature_matrix, mfcc_feature_matrix, zcrs_feature_matrix = compute_audio_features(x_chroma,
                                                                                                    x_mfcc,
                                                                                                    x_zcrs,
                                                                                                    N=chunks)

            # Generate feature dictionary
            feature_dictionary = get_audio_feature_dict(chrom_feature_matrix, mfcc_feature_matrix,
                                                        zcrs_feature_matrix, N=chunks)

            # Concatenate data
            data_features.append(feature_dictionary)
    return data_features


def generate_data(parent_dir, folders, sr, n_fft, hop_length, n_mfcc, n_chroma, n_octaves, chunks,
                 apply_chroma=True, apply_mfcc=True, apply_zcrs=True):
    """
        Given a set of folders where the audio files are included, it processes them and generates 
        a solid feature vector.
    """
    labels = []
    audio_full_paths = []
    # Go Through the folders
    for folder in os.listdir(parent_dir):
        if (folder in folders) and (folder!='hip-hop'):                
            print("Retrieving data from folder {}".format(folder))
            # Get audio files
            new_files = os.listdir(os.path.join(parent_dir, folder))
            audio_full_paths += [os.path.join(os.path.join(parent_dir, folder), jj) for jj in new_files]
            labels += [folder for i in range(len(new_files))]
    print(parent_dir)
    print(audio_full_paths[:2])
    print(len(audio_full_paths))
    
    # Process data
    data_features = process_audio_file_batch('', audio_full_paths, sr, n_fft,
                                             hop_length, n_mfcc, n_chroma, n_octaves, chunks,
                                             chroma=apply_chroma, mfcc=apply_mfcc, zcrs=apply_zcrs)
    return data_features, labels, audio_full_paths


def get_dataframe_from_data(data_features, audio_full_paths, labels, save_locally=False, filepath=None):
    """Given a dictionary of features, it generates a Pandas DataFrame."""
    df = pd.DataFrame(data_features)
    df["genre"] = labels 
    df["filepath"] = audio_full_paths
    if save_locally:
        df.to_csv(filepath)
    return df

In [5]:
data_path1="music_dataset_part1\\audio_files"
data_path2="music_dataset_part2\\audio_files"
folder1='edm'

In [6]:
chunks=3 # Number of chunks of the audio file
sr=44100 # Sample rate
n_mfcc=20 # Number of Mel cepstral coefficients
n_fft=2048 # Number of FFT points
hop_length=int(n_fft/4) # Number of Overlapped samples
n_chroma=12 # Number of pitches
n_octaves=7 #Number of octaves
save_locally=True # save data locally
apply_chroma=False # apply chromagram representation
apply_mfcc=True # apply MFCC representation
apply_zcrs=True # apply zcrs representation

In [7]:
data_features1, labels1, audio_full_paths1 = generate_data(parent_dir=data_path1, folders=['edm', 'hip-hop','jazz','rock'], sr=sr, n_fft=n_fft, hop_length=int(n_fft/4), n_mfcc=n_mfcc, n_chroma=n_chroma, n_octaves=n_octaves, chunks=chunks,
                 apply_chroma=apply_chroma, apply_mfcc=apply_mfcc, apply_zcrs=apply_zcrs)

Retrieving data from folder edm
Retrieving data from folder jazz
Retrieving data from folder rock
music_dataset_part1\audio_files
['music_dataset_part1\\audio_files\\edm\\.ipynb_checkpoints', 'music_dataset_part1\\audio_files\\edm\\03nmxeZEIZuKvgnXdz88mT.mp3']
200


  0%|          | 0/200 [00:00<?, ?it/s]

PermissionError: [Errno 13] Permission denied: 'music_dataset_part1\\audio_files\\edm\\.ipynb_checkpoints'

In [40]:
#let's make it a dataset now
df_audios = pd.DataFrame(data=data_features1)

In [43]:
df_audios['label'] = labels1
df_audios['path'] = audio_full_paths1
df_audios.head()

Unnamed: 0,mfcc_1_mean_1,mfcc_1_std_1,mfcc_1_median_1,mfcc_1_min_1,mfcc_1_max_1,mfcc_2_mean_1,mfcc_2_std_1,mfcc_2_median_1,mfcc_2_min_1,mfcc_2_max_1,...,zcrs_1_median_2,zcrs_1_min_2,zcrs_1_max_2,zcrs_1_mean_3,zcrs_1_std_3,zcrs_1_median_3,zcrs_1_min_3,zcrs_1_max_3,label,path
0,-51.72504,36.968002,-51.594624,-159.173386,30.148195,117.064514,25.855074,111.552765,69.914009,191.043732,...,0.028809,0.001465,0.197754,0.024488,0.024057,0.015137,0.001465,0.09668,edm,music_dataset_part1\audio_files\edm\02RDuI6zo3...
1,-65.397697,35.161774,-66.009445,-165.860092,32.887779,127.540009,21.831797,126.908455,75.1483,210.059143,...,0.043457,0.005859,0.178711,0.069806,0.034919,0.063477,0.011719,0.209473,edm,music_dataset_part1\audio_files\edm\03nmxeZEIZ...
2,-58.372334,27.371698,-57.319206,-210.204498,21.280075,128.934586,13.924896,126.979568,87.889809,177.037872,...,0.069336,0.013184,0.143555,0.054915,0.034011,0.044434,0.006348,0.160645,edm,music_dataset_part1\audio_files\edm\05DQBenN58...
3,-84.450386,45.985455,-79.961288,-230.65152,38.036293,138.263046,23.019567,138.813553,61.430801,192.606842,...,0.049805,0.002441,0.191895,0.042981,0.025546,0.036133,0.001953,0.152344,edm,music_dataset_part1\audio_files\edm\0757375sae...
4,-74.631905,56.134987,-70.735184,-308.824493,31.577541,138.344818,22.805267,138.826492,62.536049,192.78891,...,0.063477,0.00293,0.194824,0.076392,0.04656,0.067383,0.009766,0.318359,edm,music_dataset_part1\audio_files\edm\07uAmmU5pe...


Now, we want to classify some musics. SO let's use for exemple a machine learning algorithm such as a random forest and see if ti works well

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [55]:
X = df_audios.drop(["label", "path"], axis=1)
y = df_audios["label"]

#split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

rf = RandomForestClassifier(n_estimators=250, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

#some accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8


With this algorithm, I obtain a maximum accuracy of about 80%, no matter how many trees I use for the classification. Let's make a bigger dataset to see if I have better results.

In [73]:
#do the same for the second folder
#here I had an issue with the folders given so I created new ones
data_features2, labels2, audio_full_paths2 = generate_data(parent_dir=data_path2, folders=['edm', 'hip-hop','jazz','rock'], sr=sr, n_fft=n_fft, hop_length=int(n_fft/4), n_mfcc=n_mfcc, n_chroma=n_chroma, n_octaves=n_octaves, chunks=chunks,
                 apply_chroma=apply_chroma, apply_mfcc=apply_mfcc, apply_zcrs=apply_zcrs)

Retrieving data from folder edm
Retrieving data from folder jazz
Retrieving data from folder rock
music_dataset_part2\audio_files
['music_dataset_part2\\audio_files\\edm\\02RDuI6zo3Y0dFhxA0iKIx.mp3', 'music_dataset_part2\\audio_files\\jazz\\082zyi189u5lRp2gPkE8Lb.mp3']
129


  0%|          | 0/129 [00:00<?, ?it/s]

music_dataset_part2\audio_files\edm\02RDuI6zo3Y0dFhxA0iKIx.mp3
music_dataset_part2\audio_files\jazz\082zyi189u5lRp2gPkE8Lb.mp3
music_dataset_part2\audio_files\jazz\0MOXs5rKfhXY7TooT6iiiP.mp3
music_dataset_part2\audio_files\jazz\0sGFsWPYyhN4X9LKT1EiOv.mp3
music_dataset_part2\audio_files\jazz\0T38cI9uIhptfZPlf1El09.mp3
music_dataset_part2\audio_files\jazz\0Ya7lMXJGmETT1G3bW0uX2.mp3
music_dataset_part2\audio_files\jazz\0zIqs8nFS5eHLgL0ZprFuP.mp3
music_dataset_part2\audio_files\jazz\14aFuIi1l2arBrZfBv9fBP.mp3
music_dataset_part2\audio_files\jazz\1axzRYUUASTfN4OOtrjfQd.mp3
music_dataset_part2\audio_files\jazz\1qKVWhOUXHlm8baMCOsANy.mp3
music_dataset_part2\audio_files\jazz\1XrEY8nz2P4L9rVoq9MdXs.mp3
music_dataset_part2\audio_files\jazz\1y0FhVFVZ2eG82sn0M7H7v.mp3
music_dataset_part2\audio_files\jazz\21anQnN5XQjhsVoyHkwoVo.mp3
music_dataset_part2\audio_files\jazz\25cJePcRAgKUKtjH8Wm1MQ.mp3
music_dataset_part2\audio_files\jazz\29dFStdDNJHdgGPhwyKdbB.mp3
music_dataset_part2\audio_files\jazz\2BjX

In [83]:
df_audios2 = pd.DataFrame(data=data_features2)
df_audios2['label'] = labels2
df_audios2['path'] = audio_full_paths2
df_audios2.head()

Unnamed: 0,mfcc_1_mean_1,mfcc_1_std_1,mfcc_1_median_1,mfcc_1_min_1,mfcc_1_max_1,mfcc_2_mean_1,mfcc_2_std_1,mfcc_2_median_1,mfcc_2_min_1,mfcc_2_max_1,...,zcrs_1_median_2,zcrs_1_min_2,zcrs_1_max_2,zcrs_1_mean_3,zcrs_1_std_3,zcrs_1_median_3,zcrs_1_min_3,zcrs_1_max_3,label,path
0,-51.72504,36.968002,-51.594624,-159.173386,30.148195,117.064514,25.855074,111.552765,69.914009,191.043732,...,0.028809,0.001465,0.197754,0.024488,0.024057,0.015137,0.001465,0.09668,edm,music_dataset_part2\audio_files\edm\02RDuI6zo3...
1,-427.757599,41.782753,-435.005585,-538.387329,-271.668457,181.456757,23.384613,182.835114,96.523819,252.730118,...,0.019531,0.00293,0.090332,0.019532,0.00993,0.017578,0.004883,0.106934,jazz,music_dataset_part2\audio_files\jazz\082zyi189...
2,-100.432243,29.886513,-97.466415,-173.598251,-21.343803,154.977127,12.58707,155.194702,87.48378,192.715637,...,0.044922,0.011719,0.216797,0.045955,0.026346,0.041016,0.006348,0.161133,jazz,music_dataset_part2\audio_files\jazz\0MOXs5rKf...
3,-81.861382,33.203468,-82.98616,-170.002472,-5.552082,160.10321,23.01273,162.865295,98.37159,211.797333,...,0.04541,0.008301,0.137207,0.042587,0.019654,0.040527,0.003418,0.116211,jazz,music_dataset_part2\audio_files\jazz\0sGFsWPYy...
4,-243.069611,47.051453,-238.296524,-348.078827,-135.520462,172.24231,21.763165,176.134064,91.21151,213.23468,...,0.038086,0.013672,0.130859,0.038776,0.01915,0.033203,0.013184,0.124023,jazz,music_dataset_part2\audio_files\jazz\0T38cI9uI...


In [1]:
df = pd.concat([df_audios, df_audios2])
print(len(df))
df.head()

NameError: name 'pd' is not defined

In [87]:
#NOW WE TRY AGAIN WITH THE ADDITIONAL DATA
X = df_audios.drop(["label", "path"], axis=1)
y = df_audios["label"]

#split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

rf = RandomForestClassifier(n_estimators=250, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

#some accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8


but we still have similar results ...

I forgot to use some preprocessing, maybe this will help :

In [110]:
def get_numpy_db(df, drop_cols):
    """It removes useless cols regarding parameter drop_cols and it converts df into a numpy array """
    # Preprocess data
    df_x = df.drop(drop_cols, axis=1)
    print("Original DataFrame shape: {}".format(df_x.shape))
    X = df_x.to_numpy()
    return X


def preprocess_db(df, drop_cols, apply_pca=True, explained_var=.9):
    """It applies different preprocessing procedures to the whole dataset in order to 
    scale and transform the original data.
        It returns both the whole dataset and the new query sample after being preprocessed. 
    """
    X = get_numpy_db(df, drop_cols)
    df_dropped = df[drop_cols].reset_index()
    # Scale features
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X)
    
    # PCA Transformation
    if apply_pca:
        pca = PCA(explained_var)
        X_tr = pca.fit_transform(X_tr)
    #get it back to a dataframe
    df_tr = pd.DataFrame(data=X_tr).reset_index()
    df_tr = pd.concat([df_tr, df_dropped], axis=1)
    return df_tr

In [96]:
# Parameters
apply_pca=True
explained_var=.90
drop_cols=["label", "path"]

In [115]:
new_df = preprocess_db(df, drop_cols, apply_pca)
new_df.head()

Original DataFrame shape: (389, 315)


Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,37,38,39,40,41,42,43,index.1,label,path
0,0,-3.998302,-6.257196,-9.786042,-5.339698,2.306622,7.754866,-0.918783,-3.110031,1.778595,...,-0.027456,-1.203986,0.395735,0.531426,0.529817,0.786214,-0.64042,0,edm,music_dataset_part1\audio_files\edm\02RDuI6zo3...
1,1,-10.040385,-1.848138,-5.418832,1.591806,-0.977965,1.333289,-2.323295,0.397375,0.13391,...,-0.435909,0.915469,0.584292,1.887929,0.025197,-0.601872,-0.161961,1,edm,music_dataset_part1\audio_files\edm\03nmxeZEIZ...
2,2,-4.757672,-3.746397,-6.012074,2.678091,2.93831,-2.135856,3.238497,-3.540915,-2.028268,...,-0.084792,1.534239,0.217041,-1.677691,0.343925,0.071504,-0.899259,2,edm,music_dataset_part1\audio_files\edm\05DQBenN58...
3,3,-2.181224,-5.53744,-8.100303,3.419478,1.319312,-1.849413,-3.448708,-2.956501,2.550411,...,1.240903,-0.36671,-0.015507,0.360024,0.565218,1.486136,-0.19659,3,edm,music_dataset_part1\audio_files\edm\0757375sae...
4,4,-2.421254,-7.373567,-0.6234,2.868409,-0.946563,-2.452132,-2.088908,-0.798194,5.556367,...,-0.110242,-0.477823,-0.70768,1.783109,1.112692,-0.310625,-0.966609,4,edm,music_dataset_part1\audio_files\edm\07uAmmU5pe...


In [125]:
#remove these weird columns 'index' used for the concatenation
final_df = new_df.drop(columns=[new_df.columns[0], new_df.columns[45]])
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,label,path
0,-3.998302,-6.257196,-9.786042,-5.339698,2.306622,7.754866,-0.918783,-3.110031,1.778595,-4.058142,...,0.362764,-0.027456,-1.203986,0.395735,0.531426,0.529817,0.786214,-0.64042,edm,music_dataset_part1\audio_files\edm\02RDuI6zo3...
1,-10.040385,-1.848138,-5.418832,1.591806,-0.977965,1.333289,-2.323295,0.397375,0.13391,-0.644628,...,0.729871,-0.435909,0.915469,0.584292,1.887929,0.025197,-0.601872,-0.161961,edm,music_dataset_part1\audio_files\edm\03nmxeZEIZ...
2,-4.757672,-3.746397,-6.012074,2.678091,2.93831,-2.135856,3.238497,-3.540915,-2.028268,-1.568277,...,0.340017,-0.084792,1.534239,0.217041,-1.677691,0.343925,0.071504,-0.899259,edm,music_dataset_part1\audio_files\edm\05DQBenN58...
3,-2.181224,-5.53744,-8.100303,3.419478,1.319312,-1.849413,-3.448708,-2.956501,2.550411,2.316367,...,-0.086373,1.240903,-0.36671,-0.015507,0.360024,0.565218,1.486136,-0.19659,edm,music_dataset_part1\audio_files\edm\0757375sae...
4,-2.421254,-7.373567,-0.6234,2.868409,-0.946563,-2.452132,-2.088908,-0.798194,5.556367,-0.347246,...,0.69599,-0.110242,-0.477823,-0.70768,1.783109,1.112692,-0.310625,-0.966609,edm,music_dataset_part1\audio_files\edm\07uAmmU5pe...


In [133]:
#NOW WE TRY AGAIN ON THE PREPROCESSED DATA ...
X = final_df.drop(["label", "path"], axis=1)
y = final_df["label"]

#split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

#some accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8673469387755102


We obtain slightly better results, up to 87% accuracy on the prediction