# Testing mp3 file and wav file for the same 'song'


In [6]:
import os
# Print the kernel's current working directory; the relative data paths used
# in this notebook are resolved against it.
print(os.getcwd())


/home/tford/code/zmokhtari89/k_means_klang/notebooks/librosa


In [1]:
%%time

#need to scale this

import os
import librosa
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler

#defining a function to extract features 
def extract_features(file_path, chroma_hop_length=5000):
    """Extract a flat dictionary of summary audio features from one file.

    The file is loaded with ``librosa.load`` (default arguments) and passed
    through ``librosa.effects.trim``. Each feature is then reduced to its mean
    and variance so that one file yields one row of scalars.

    Parameters
    ----------
    file_path : str or path-like
        Path of the audio file to analyse.
    chroma_hop_length : int, optional
        ``hop_length`` passed to ``librosa.feature.chroma_stft``. Defaults to
        5000, the value previously hard-coded in this function.

    Returns
    -------
    dict
        Keys, in insertion order: ``filename`` (base name of ``file_path``),
        ``length`` (number of samples after trimming), then ``<name>_mean`` /
        ``<name>_var`` for chroma_stft, rms, spectral_centroid,
        spectral_bandwidth, rolloff, zero_crossing_rate, harmony and perceptr,
        then ``tempo``, then ``mfcc<i>_mean`` / ``mfcc<i>_var`` for every MFCC
        row returned by librosa (numbered from 1).
    """
    # Load and trim the audio file
    y, sr = librosa.load(file_path)
    audio, _ = librosa.effects.trim(y)

    # Boolean array marking zero crossings; its mean is the crossing rate
    zero_crossings = librosa.zero_crossings(audio, pad=False)

    # Harmonic / percussive source separation (HPSS)
    harmonic, percussive = librosa.effects.hpss(audio)

    # Tempo: depending on the librosa version this comes back as a scalar or
    # as a one-element array, so normalise it to a plain Python float instead
    # of relying on the value having an ``.item()`` method.
    tempo_value, _ = librosa.beat.beat_track(y=audio, sr=sr)
    tempo = float(np.asarray(tempo_value).reshape(-1)[0])

    # Frame-wise spectral descriptors
    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)
    chromagram = librosa.feature.chroma_stft(
        y=audio, sr=sr, hop_length=chroma_hop_length
    )
    rms = librosa.feature.rms(y=audio)

    # MFCCs: one mean / variance per coefficient (row)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr)
    mfcc_means = np.mean(mfccs, axis=1)
    mfcc_vars = np.var(mfccs, axis=1)

    # (output prefix, values) pairs; the order here fixes the column order of
    # the resulting table, so keep it stable.
    summarised = (
        ('chroma_stft', chromagram),
        ('rms', rms),
        ('spectral_centroid', centroid),
        ('spectral_bandwidth', bandwidth),
        ('rolloff', rolloff),
        ('zero_crossing_rate', zero_crossings),
        ('harmony', harmonic),
        ('perceptr', percussive),
    )

    features = {
        'filename': os.path.basename(file_path),
        'length': audio.shape[0],  # in samples, after trimming
    }
    for name, values in summarised:
        features[f'{name}_mean'] = np.mean(values)
        features[f'{name}_var'] = np.var(values)

    features['tempo'] = tempo

    for idx, (coef_mean, coef_var) in enumerate(zip(mfcc_means, mfcc_vars), start=1):
        features[f'mfcc{idx}_mean'] = coef_mean
        features[f'mfcc{idx}_var'] = coef_var

    return features

# Folder (searched recursively) containing the audio files to analyse
main_folder_path = '../../raw_data/Data/test_mp3'

# File extensions to pick up, compared case-insensitively. Only WAV files are
# processed here; add '.mp3' and/or '.flac' to include those formats as well.
audio_extensions = ('.wav',)

# Collect matching file paths from all subfolders. The list is sorted because
# os.walk gives no ordering guarantee, and the row order of the DataFrame
# below should be reproducible between runs and machines.
file_paths = sorted(
    os.path.join(root, filename)
    for root, _dirs, files in os.walk(main_folder_path)
    for filename in files
    if filename.lower().endswith(audio_extensions)
)

# Fail here with a clear message rather than later with a KeyError on an
# empty DataFrame (os.walk silently yields nothing for a missing folder).
if not file_paths:
    raise FileNotFoundError(
        f"No audio files with extensions {audio_extensions} found under "
        f"{main_folder_path!r} (cwd: {os.getcwd()!r})"
    )

# Use joblib to process files in parallel (one task per file, all cores)
data_list = Parallel(n_jobs=-1)(delayed(extract_features)(fp) for fp in file_paths)

# Create a DataFrame from the list of feature dictionaries (one row per file)
df = pd.DataFrame(data_list)

# Note: features are left unscaled here; scaling is done in a later cell.





CPU times: user 1.73 s, sys: 189 ms, total: 1.92 s
Wall time: 17.4 s


In [2]:
# Preview the first rows of the extracted feature table
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,wav_no-copyright-music-happy-306601.wav,3518976,0.332647,0.097487,0.143359,0.006432,1270.949652,578311.66602,1787.417101,467784.988855,...,-0.834998,95.083801,1.528376,83.621841,7.481932,104.85273,1.367486,126.39212,0.426527,158.375793


In [9]:
# Optional: write the unscaled features to a CSV file
# df.to_csv('not_scaled_no-copyright-music-happy-306601.csv', index=False) 

In [3]:
# Remove the 'filename' and 'length' columns, keeping the remaining feature
# columns for scaling
X_features = df.drop(columns=["filename", "length"])
X_features


Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.332647,0.097487,0.143359,0.006432,1270.949652,578311.66602,1787.417101,467784.988855,2455.749984,4157951.0,...,-0.834998,95.083801,1.528376,83.621841,7.481932,104.85273,1.367486,126.39212,0.426527,158.375793


In [4]:
import warnings

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Display floats with 20 decimals (this is a global pandas display option and
# affects every later cell) -- presumably to check whether the scaled values
# are exactly zero rather than merely rounded.
pd.set_option('display.float_format', '{:.20f}'.format)

# StandardScaler subtracts each column's mean. With a single row, every value
# equals its own column mean, so the scaled table is all zeros and carries no
# information. Warn instead of silently producing that result.
if len(X_features) < 2:
    warnings.warn(
        f"StandardScaler is being fitted on {len(X_features)} row(s); "
        "the scaled features will all be 0. Fit the scaler on a dataset "
        "with several files to get meaningful values."
    )

scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_features),
    columns=X_features.columns,
    index=X_features.index,  # keep row labels aligned with X_features / df
)
X_scaled



Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Write the scaled feature table (X_scaled), not the raw `df`, so that the
# file contents match the 'scaled_' prefix in its name.
X_scaled.to_csv('scaled_no-copyright-music-happy-306601.csv', index=False)

## Reasons for slight differences in features
**There are a few reasons why you might see slight differences in the feature extraction results when comparing a .wav file to an .mp3 file of the same audio:**

- Lossy Compression: MP3 is a lossy compression format, meaning that some of the original audio information is discarded to reduce file size. This can affect the fine details in the spectral content, which in turn can lead to differences in features like MFCCs.
- Decoding Differences: Librosa uses different backends to decode audio files (e.g., PySoundFile for WAV and audioread for MP3). These decoders might introduce subtle differences in how the audio data is read and processed.
- Bitrate and Quality Variations: MP3 files can have different bitrates. A lower bitrate MP3 will lose more detail than a high-quality WAV file, affecting the extracted features.
- Sampling and Preprocessing: Even if both files have the same nominal sample rate, the preprocessing (trimming, resampling, etc.) might behave slightly differently due to compression artifacts or metadata differences.

Taken together, these factors can cause slight differences in features such as MFCCs between the two versions of the same audio file.