In [2]:


import os
import librosa
import numpy as np
import pandas as pd

from glob import glob
from joblib import Parallel, delayed

# Code for segmenting 1 audio file in a folder into 3 sec segments + librosa feature extraction 

In [18]:
%%time

#Define the extract features function that will also work on segments of a song
def extract_features_from_segment(segment, sr, file_name, segment_index):
    # Length of the segment (in samples)
    length = segment.shape[0]
    
    # Zero Crossing Rate
    zero_crossings = librosa.zero_crossings(segment, pad=False)
    zero_crossings_rate_mean = np.mean(zero_crossings)
    zero_crossings_rate_var = np.var(zero_crossings)
    
    # Harmonics & Percussive Components (HPSS)
    y_harm, y_perc = librosa.effects.hpss(segment)
    harmony_mean = np.mean(y_harm)
    harmony_var = np.var(y_harm)
    perceptr_mean = np.mean(y_perc)
    perceptr_var = np.var(y_perc)
    
    # Tempo
    tempo_value, _ = librosa.beat.beat_track(y=segment, sr=sr)
    tempo = tempo_value.item()
    
    # Spectral Centroid
    spectral_centroids = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
    spectral_centroid_mean = np.mean(spectral_centroids)
    spectral_centroid_var = np.var(spectral_centroids)
    
    # Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=segment, sr=sr)[0]
    rolloff_mean = np.mean(spectral_rolloff)
    rolloff_var = np.var(spectral_rolloff)
    
    # Spectral Bandwidth
    bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr)
    spectral_bandwidth_mean = np.mean(bandwidth)
    spectral_bandwidth_var = np.var(bandwidth)
    
    # Chroma Frequencies
    hop_length = 5000  # Adjust as needed
    chromagram = librosa.feature.chroma_stft(y=segment, sr=sr, hop_length=hop_length)
    chroma_mean = np.mean(chromagram)
    chroma_var = np.var(chromagram)
    
    # RMS Energy
    rms_values = librosa.feature.rms(y=segment)
    rms_mean = np.mean(rms_values)
    rms_var = np.var(rms_values)
    
    # MFCCs (default 20 coefficients)
    mfccs = librosa.feature.mfcc(y=segment, sr=sr)
    mfcc_means = np.mean(mfccs, axis=1)
    mfcc_vars = np.var(mfccs, axis=1)
    
    # Build feature dictionary
    features = {
        'file_name': file_name,
        'segment_index': segment_index,
        'length_samples': length,
        'zero_crossings_rate_mean': zero_crossings_rate_mean,
        'zero_crossings_rate_var': zero_crossings_rate_var,
        'harmony_mean': harmony_mean,
        'harmony_var': harmony_var,
        'perceptr_mean': perceptr_mean,
        'perceptr_var': perceptr_var,
        'tempo': tempo,
        'spectral_centroid_mean': spectral_centroid_mean,
        'spectral_centroid_var': spectral_centroid_var,
        'rolloff_mean': rolloff_mean,
        'rolloff_var': rolloff_var,
        'spectral_bandwidth_mean': spectral_bandwidth_mean,
        'spectral_bandwidth_var': spectral_bandwidth_var,
        'chroma_mean': chroma_mean,
        'chroma_var': chroma_var,
        'rms_mean': rms_mean,
        'rms_var': rms_var
    }
    
    # Add MFCC features as separate columns
    for i in range(len(mfcc_means)):
        features[f'mfcc_mean_{i+1}'] = mfcc_means[i]
        features[f'mfcc_var_{i+1}'] = mfcc_vars[i]
    
    return features

# Main file path
file_path = '../../raw_data/Data/test_mp3/wav_no-copyright-music-happy-306601.wav'
file_name = os.path.basename(file_path)

# /home/tford/code/zmokhtari89/k_means_klang/Notebooks/librosa

# Load and trim the audio file
y, sr = librosa.load(file_path)
audio, _ = librosa.effects.trim(y)

# Define the segment duration in seconds and calculate the number of samples per segment
segment_duration = 3  # seconds
segment_length = int(segment_duration * sr)

# Determine the total number of full segments in the audio file
num_segments = len(audio) // segment_length

# #list comprehension option to save time 
# segments = [(i, audio[i * segment_length : (i+1) * segment_length]) for i in range(num_segments)]

# Create an empty list to store segments and their indices
# Loop over the number of segments and extract each segment
segments = []

for i in range(num_segments):
    start = i * segment_length
    end = start + segment_length
    segment = audio[start:end]
    segments.append((i, segment))

# Process segments in parallel
data_list = Parallel(n_jobs=-1)(
    delayed(extract_features_from_segment)(segment, sr, file_name, idx) for idx, segment in segments
)

# Create DataFrame from the list of feature dictionaries
df = pd.DataFrame(data_list)


CPU times: user 903 ms, sys: 107 ms, total: 1.01 s
Wall time: 5.25 s


In [19]:
df.head()

Unnamed: 0,file_name,segment_index,length_samples,zero_crossings_rate_mean,zero_crossings_rate_var,harmony_mean,harmony_var,perceptr_mean,perceptr_var,tempo,...,mfcc_mean_16,mfcc_var_16,mfcc_mean_17,mfcc_var_17,mfcc_mean_18,mfcc_var_18,mfcc_mean_19,mfcc_var_19,mfcc_mean_20,mfcc_var_20
0,wav_no-copyright-music-happy-306601.wav,0,66150,0.083311,0.07637,-1.230882e-07,0.000158,-6.02958e-07,4e-06,89.102909,...,-3.713993,54.536076,1.013263,26.288309,6.811085,74.159538,3.596346,138.407455,4.543487,221.356873
1,wav_no-copyright-music-happy-306601.wav,1,66150,0.063114,0.059131,1.918705e-06,0.001442,2.783503e-07,4.6e-05,89.102909,...,-3.103975,29.858402,1.79589,104.68071,12.200061,106.684135,-0.3014,313.787872,-13.128267,186.983749
2,wav_no-copyright-music-happy-306601.wav,2,66150,0.064021,0.059922,-2.47456e-06,0.001318,-3.958111e-06,1.9e-05,172.265625,...,-15.222053,501.565338,-2.408067,83.611954,8.072178,43.650497,10.95409,213.87674,13.717305,282.303864
3,wav_no-copyright-music-happy-306601.wav,3,66150,0.05167,0.049001,-2.277924e-05,0.014683,2.382913e-05,0.004272,92.285156,...,-6.450809,53.364441,1.392189,69.755234,11.07711,149.064377,-0.363129,51.13205,-2.685,165.237473
4,wav_no-copyright-music-happy-306601.wav,4,66150,0.06384,0.059764,-2.664415e-05,0.00995,6.376861e-05,0.009708,89.102909,...,-0.117921,74.189682,2.838147,62.029839,10.862969,139.634537,-0.844028,108.815834,-3.754081,158.434875


In [17]:
df.shape


(53, 60)

# Code on converting all audio files from multiple folders into 3 sec segments + librosa feature extraction 


**Note**
- This segmenting will not be need to provide a prediction
- However, this could be useful to gather data to enrich our data set, and 'improve' our model over time. 1 example could be to add other labeled genres to the dataset,

In [24]:
%%time

import os
import librosa
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler



#Define the extract features function that will also work on segments of a song
def extract_features_from_segment(segment, sr, file_name, segment_index):
    # Length of the segment (in samples)
    length = segment.shape[0]
    
    # Zero Crossing Rate
    zero_crossings = librosa.zero_crossings(segment, pad=False)
    zero_crossings_rate_mean = np.mean(zero_crossings)
    zero_crossings_rate_var = np.var(zero_crossings)
    
    # Harmonics & Percussive Components (HPSS)
    y_harm, y_perc = librosa.effects.hpss(segment)
    harmony_mean = np.mean(y_harm)
    harmony_var = np.var(y_harm)
    perceptr_mean = np.mean(y_perc)
    perceptr_var = np.var(y_perc)
    
    # Tempo
    tempo_value, _ = librosa.beat.beat_track(y=segment, sr=sr)
    tempo = tempo_value.item()
    
    # Spectral Centroid
    spectral_centroids = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
    spectral_centroid_mean = np.mean(spectral_centroids)
    spectral_centroid_var = np.var(spectral_centroids)
    
    # Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=segment, sr=sr)[0]
    rolloff_mean = np.mean(spectral_rolloff)
    rolloff_var = np.var(spectral_rolloff)
    
    # Spectral Bandwidth
    bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr)
    spectral_bandwidth_mean = np.mean(bandwidth)
    spectral_bandwidth_var = np.var(bandwidth)
    
    # Chroma Frequencies
    hop_length = 5000  # Adjust as needed
    chromagram = librosa.feature.chroma_stft(y=segment, sr=sr, hop_length=hop_length)
    chroma_mean = np.mean(chromagram)
    chroma_var = np.var(chromagram)
    
    # RMS Energy
    rms_values = librosa.feature.rms(y=segment)
    rms_mean = np.mean(rms_values)
    rms_var = np.var(rms_values)
    
    # MFCCs (default 20 coefficients)
    mfccs = librosa.feature.mfcc(y=segment, sr=sr)
    mfcc_means = np.mean(mfccs, axis=1)
    mfcc_vars = np.var(mfccs, axis=1)
    
    # Build feature dictionary
    features = {
        'file_name': file_name,
        'segment_index': segment_index,
        'length_samples': length,
        'zero_crossings_rate_mean': zero_crossings_rate_mean,
        'zero_crossings_rate_var': zero_crossings_rate_var,
        'harmony_mean': harmony_mean,
        'harmony_var': harmony_var,
        'perceptr_mean': perceptr_mean,
        'perceptr_var': perceptr_var,
        'tempo': tempo,
        'spectral_centroid_mean': spectral_centroid_mean,
        'spectral_centroid_var': spectral_centroid_var,
        'rolloff_mean': rolloff_mean,
        'rolloff_var': rolloff_var,
        'spectral_bandwidth_mean': spectral_bandwidth_mean,
        'spectral_bandwidth_var': spectral_bandwidth_var,
        'chroma_mean': chroma_mean,
        'chroma_var': chroma_var,
        'rms_mean': rms_mean,
        'rms_var': rms_var
    }
    
    # Add MFCC features as separate columns
    for i in range(len(mfcc_means)):
        features[f'mfcc_mean_{i+1}'] = mfcc_means[i]
        features[f'mfcc_var_{i+1}'] = mfcc_vars[i]
    
    return features

#define a function for processing the files i.e. splitting into segments, and then applying he extract_features_form_segment function from above
def process_file(file_path, segment_duration=3):
    # Extract file name
    file_name = os.path.basename(file_path)
    
    # Load and trim the audio file
    y, sr = librosa.load(file_path)
    audio, _ = librosa.effects.trim(y)
    
    # Calculate segment length in samples
    segment_length = int(segment_duration * sr)
    
    # Calculate number of full segments
    num_segments = len(audio) // segment_length
    
    # Create list to store features from all segments of the file
    file_features = []
    
    # Loop over segments (using a simple loop for clarity)
    for i in range(num_segments):
        start = i * segment_length
        end = start + segment_length
        segment = audio[start:end]
        
        # Extract features from the segment
        features = extract_features_from_segment(segment, sr, file_name, i)
        file_features.append(features)
    
    return file_features

# Define main folder path containing subfolders with audio files
main_folder_path = '../../raw_data/Data/genres_original'

# Collect file paths recursively using os.walk
file_paths = []
for root, dirs, files in os.walk(main_folder_path):
    for filename in files:
        if filename.lower().endswith(('.wav', '.mp3', '.flac')):
            file_paths.append(os.path.join(root, filename))

# Process files in parallel. This parallelizes at the file level.
all_features = Parallel(n_jobs=-1)(
    delayed(process_file)(fp) for fp in file_paths
)

# Flatten the list (each file returns a list of segments)
data_list = [segment for file_features in all_features for segment in file_features]

# Create a DataFrame from the list of feature dictionaries
df = pd.DataFrame(data_list)

#Apply Standard Scaling to all numeric columns (but will nede to remove segment index)
scaler = MinMaxScaler()
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns]= scaler.fit_transform(df[numeric_columns])

print(df.head())


  return pitch_tuning(


CPU times: user 15 s, sys: 3.28 s, total: 18.2 s
Wall time: 18min 29s


In [25]:
df.head()

Unnamed: 0,file_name,segment_index,length_samples,zero_crossings_rate_mean,zero_crossings_rate_var,harmony_mean,harmony_var,perceptr_mean,perceptr_var,tempo,...,mfcc_mean_16,mfcc_var_16,mfcc_mean_17,mfcc_var_17,mfcc_mean_18,mfcc_var_18,mfcc_mean_19,mfcc_var_19,mfcc_mean_20,mfcc_var_20
0,hiphop.00023.wav,0,66150,0.109055,0.097162,-6.1e-05,0.020119,-0.000401,0.014121,184.570312,...,6.574983,40.556477,-0.938712,47.518406,10.512637,40.463409,-0.174228,39.308609,0.947258,38.224289
1,hiphop.00023.wav,1,66150,0.102389,0.091905,6.6e-05,0.016613,-0.000223,0.013312,184.570312,...,5.47307,41.973972,-3.854284,25.16847,6.942848,32.675552,1.945795,25.792435,2.531149,26.519554
2,hiphop.00023.wav,2,66150,0.111036,0.098707,-4.5e-05,0.018252,-0.000588,0.010422,184.570312,...,3.757442,33.603489,-1.962889,40.849285,9.944297,35.131123,5.055728,35.489174,5.089661,34.549763
3,hiphop.00023.wav,3,66150,0.113348,0.100501,1.9e-05,0.017,-0.000309,0.010818,95.703125,...,2.491102,36.853142,-2.474415,55.91703,9.717512,45.788292,6.312342,39.237965,6.673666,44.209713
4,hiphop.00023.wav,4,66150,0.107831,0.096203,-8e-06,0.017416,-0.000523,0.012154,184.570312,...,2.641358,41.961292,-3.557897,33.492554,8.407369,27.455303,3.760981,32.807541,2.335147,55.256927


In [27]:
# sort values alphabetically 
df.sort_values('file_name', ascending=True, inplace=True)

In [28]:
df.head()

Unnamed: 0,file_name,segment_index,length_samples,zero_crossings_rate_mean,zero_crossings_rate_var,harmony_mean,harmony_var,perceptr_mean,perceptr_var,tempo,...,mfcc_mean_16,mfcc_var_16,mfcc_mean_17,mfcc_var_17,mfcc_mean_18,mfcc_var_18,mfcc_mean_19,mfcc_var_19,mfcc_mean_20,mfcc_var_20
2747,blues.00000.wav,2,66150,0.0722,0.066987,-7e-06,0.012531,8.8e-05,0.004318,123.046875,...,4.730752,68.306793,-1.714476,28.136944,2.329553,47.211426,-1.925621,52.922432,2.466996,33.164005
2746,blues.00000.wav,1,66150,0.088209,0.080428,-9.8e-05,0.004937,-4.2e-05,0.00488,123.046875,...,4.050664,64.819786,-6.025473,40.548809,0.127131,51.048943,-2.808956,97.221497,5.771881,60.360348
2748,blues.00000.wav,3,66150,0.070204,0.065275,-3.6e-05,0.008459,3.8e-05,0.00593,123.046875,...,-1.45431,48.543198,-3.786987,28.419546,1.153315,35.682697,-3.501979,50.610344,3.580636,32.325874
2749,blues.00000.wav,4,66150,0.070899,0.065873,-1.5e-05,0.009666,-0.000108,0.005828,123.046875,...,2.053744,30.829544,0.635798,44.645554,1.591107,51.415867,-3.364908,26.421085,0.501504,29.109529
2750,blues.00000.wav,5,66150,0.093394,0.084671,2.1e-05,0.008253,-0.000126,0.005501,129.199219,...,2.091273,30.950649,-3.461423,34.216366,-0.450124,39.934879,-3.643696,32.52187,3.015992,28.928495


In [29]:
df.shape

(9980, 60)

In [31]:
#Apply Standard Scaling to all numeric columns (but will nede to remove segment index)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns]= scaler.fit_transform(df[numeric_columns])

df.head()

Unnamed: 0,file_name,segment_index,length_samples,zero_crossings_rate_mean,zero_crossings_rate_var,harmony_mean,harmony_var,perceptr_mean,perceptr_var,tempo,...,mfcc_mean_16,mfcc_var_16,mfcc_mean_17,mfcc_var_17,mfcc_mean_18,mfcc_var_18,mfcc_mean_19,mfcc_var_19,mfcc_mean_20,mfcc_var_20
2747,blues.00000.wav,0.222222,0.0,0.17319,0.249441,0.629681,0.098633,0.568722,0.072573,0.375552,...,0.477834,0.098241,0.424224,0.04652,0.400177,0.063937,0.433082,0.038922,0.54601,0.035985
2746,blues.00000.wav,0.111111,0.0,0.220557,0.3121,0.627548,0.038855,0.560429,0.082019,0.375552,...,0.467516,0.093125,0.354468,0.070428,0.361988,0.070113,0.418041,0.077818,0.593395,0.065722
2748,blues.00000.wav,0.333333,0.0,0.167285,0.241463,0.629015,0.066585,0.565544,0.099662,0.375552,...,0.383995,0.069246,0.390689,0.047064,0.379782,0.04538,0.40624,0.036891,0.561977,0.035068
2749,blues.00000.wav,0.444444,0.0,0.169343,0.244247,0.629487,0.076083,0.556191,0.097937,0.375552,...,0.437219,0.043259,0.462253,0.078319,0.387373,0.070704,0.408574,0.015652,0.517829,0.031552
2750,blues.00000.wav,0.555556,0.0,0.235899,0.331883,0.630341,0.06496,0.555087,0.092447,0.398969,...,0.437788,0.043437,0.395957,0.05823,0.351979,0.052224,0.403827,0.021009,0.553882,0.031354
