In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Read the training and validation tables from CSV files in the working directory.
df_train = pd.read_csv(filepath_or_buffer='training_data_final.csv')
df_val = pd.read_csv(filepath_or_buffer='validation_data_final.csv')

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training frame.
df_train.describe()

Unnamed: 0.1,Unnamed: 0,song_id,0,1,2,3,4,5,6,7,...,14,15,16,17,18,19,label,avg,first_der,Spectral_Bandwidth
count,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,...,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571688.0,2571686.0,2571688.0
mean,262033.7,7.732505,-184.0672,160.3907,-40.29035,28.12147,-0.6071339,5.265247,-4.475622,1.530516,...,-0.7745443,-3.74503,-0.005219728,-2.782827,-0.1662403,-1.902725,3.318604,-3.146505,-6.970197e-05,-6.632178
std,160280.6,6.753933,94.46983,44.54711,38.59133,34.30488,23.70341,20.6464,16.27559,15.30715,...,10.39811,9.794225,10.80832,9.979153,10.14578,10.14417,1.787124,6.218659,1.194906,25.64719
min,0.0,1.0,-616.5092,-118.9609,-161.561,-110.1264,-98.80458,-74.87053,-86.93895,-79.69452,...,-52.90346,-59.96405,-52.87412,-54.33993,-48.33425,-51.87897,0.0,-30.82546,-19.3172,-154.1273
25%,122928.0,1.0,-244.9527,133.3805,-68.2879,5.249542,-17.53157,-9.742997,-15.15791,-8.800862,...,-7.642842,-10.2172,-7.213601,-9.262317,-6.885947,-8.560415,2.0,-7.001982,-0.6254794,-17.60693
50%,256869.0,6.0,-172.5987,164.5124,-42.66308,33.91434,-0.09043165,4.512479,-4.130829,1.065307,...,-0.9121526,-4.229023,-0.5039742,-3.357756,-0.6998017,-2.67389,3.0,-2.62502,0.04624171,-3.497698
75%,391138.0,14.0,-115.6562,192.842,-14.45634,52.86643,16.18911,20.0459,6.461272,11.63819,...,5.605886,1.947588,6.439716,2.801736,5.790803,3.614494,5.0,1.15724,0.7127167,10.09997
max,616432.0,23.0,93.27194,276.2949,116.428,154.4931,96.54688,100.815,72.74289,78.44859,...,75.15225,63.6604,69.02464,65.72739,68.63934,63.87963,6.0,14.27362,25.16984,57.84696


In [None]:
import numpy as np
import pandas as pd
import librosa

# The 20 MFCC coefficients are read from the columns labelled with the strings '0' .. '19'.
mfcc_columns = list(map(str, range(20)))
# Materialise those columns of the training frame as a NumPy array (one row per observation).
mfcc_data = np.array(df_train[mfcc_columns])

# 1. Autocorrelation Features (calculated per row)
def autocorrelation_features(mfcc_data):
    """Compute autocorrelation summary features for each row of coefficients.

    Every row is treated as a short sequence of ``n_coeffs`` values and its
    unnormalised autocorrelation ``r[k] = sum_i row[i] * row[i + k]`` is
    evaluated for the lags ``k = 0 .. n_coeffs - 1``.

    The autocorrelation has to be taken across the coefficients of a row:
    autocorrelating a single coefficient as a one-element signal only ever
    yields the lag-0 term, which leaves no non-zero lag to search and makes
    all three features constant zeros.

    Parameters
    ----------
    mfcc_data : array-like of shape (n_rows, n_coeffs)
        One observation per row.

    Returns
    -------
    peak_values : list of float
        Largest autocorrelation value over the non-zero lags, per row.
    peak_lags : list of int
        Lag (>= 1) at which that maximum occurs, per row (first one on ties).
    sum_autocorrs : list of float
        Sum of the autocorrelation over all lags including lag 0, per row.

    Rows with fewer than two coefficients have no non-zero lag; all three
    features are reported as 0 for them. Empty input yields three empty lists.

    Raises
    ------
    ValueError
        If ``mfcc_data`` is non-empty and not two-dimensional.
    """
    data = np.asarray(mfcc_data, dtype=float)
    if data.ndim != 2:
        if data.size == 0:
            return [], [], []
        raise ValueError("mfcc_data must be 2-D with shape (n_rows, n_coeffs)")

    n_rows, n_coeffs = data.shape
    if n_coeffs < 2:
        return [0.0] * n_rows, [0] * n_rows, [0.0] * n_rows

    # One vectorised pass per lag (n_coeffs passes in total) instead of a
    # Python-level loop over every row and coefficient.
    autocorr = np.empty((n_rows, n_coeffs), dtype=float)
    for lag in range(n_coeffs):
        autocorr[:, lag] = np.sum(data[:, : n_coeffs - lag] * data[:, lag:], axis=1)

    # Lag 0 is the row's energy and would always win the peak search, so the
    # peak is looked for among lags 1 .. n_coeffs - 1 only.
    nonzero_lags = autocorr[:, 1:]
    peak_values = nonzero_lags.max(axis=1)
    peak_lags = nonzero_lags.argmax(axis=1) + 1
    sum_autocorrs = autocorr.sum(axis=1)

    return peak_values.tolist(), peak_lags.tolist(), sum_autocorrs.tolist()

# 2. Spectral Centroid and Bandwidth
# Example centre frequencies: 20 evenly spaced values from 200 to 8000 inclusive,
# one per MFCC coefficient column.
freq_centers = np.linspace(start=200, stop=8000, num=20)

def spectral_features(mfcc_data, freq_centers):
    """Compute a weighted centroid and bandwidth of ``freq_centers`` per row.

    For every row the weights are the absolute coefficient values ``|c_i|``:

        centroid  = sum(f_i * |c_i|) / sum(|c_i|)
        bandwidth = sqrt(sum((f_i - centroid)^2 * |c_i|) / sum(|c_i|))

    Magnitudes are used because the coefficients can be negative. With signed
    weights the denominator can be negative or cancel to (nearly) zero, and
    the quantity under the square root can be negative, which produces NaN or
    arbitrarily large values.

    NOTE(review): ``freq_centers`` assigns one frequency to each coefficient
    column; whether that mapping is physically meaningful for MFCCs should be
    confirmed by whoever owns the feature definition.

    Parameters
    ----------
    mfcc_data : array-like of shape (n_rows, n_coeffs)
        One observation per row.
    freq_centers : array-like of shape (n_coeffs,)
        Frequency associated with each coefficient column.

    Returns
    -------
    centroids : list of float
    bandwidths : list of float
        Both have one entry per row. Rows whose coefficients are all zero get
        a centroid and bandwidth of 0. Empty input yields two empty lists.

    Raises
    ------
    ValueError
        If ``mfcc_data`` is non-empty and not two-dimensional.
    """
    data = np.asarray(mfcc_data, dtype=float)
    freqs = np.asarray(freq_centers, dtype=float)
    if data.ndim != 2:
        if data.size == 0:
            return [], []
        raise ValueError("mfcc_data must be 2-D with shape (n_rows, n_coeffs)")

    weights = np.abs(data)
    totals = weights.sum(axis=1)
    # All-zero rows have zero numerators as well, so dividing them by 1 gives
    # the intended 0 without a division-by-zero warning.
    safe_totals = np.where(totals > 0, totals, 1.0)

    centroids = (weights @ freqs) / safe_totals
    squared_dev = (freqs[np.newaxis, :] - centroids[:, np.newaxis]) ** 2
    bandwidths = np.sqrt((squared_dev * weights).sum(axis=1) / safe_totals)

    return centroids.tolist(), bandwidths.tolist()

# 3. Energy and Power Calculation (per row)
def compute_energy_and_power(mfcc_data):
    """Return the per-row energy and power of a coefficient matrix.

    Energy is the sum of the squared values along axis 1 and power is the
    mean of the squared values along axis 1.

    Returns
    -------
    (energies, powers)
        Two values with one entry per row of ``mfcc_data``.
    """
    squared = mfcc_data ** 2
    return np.sum(squared, axis=1), np.mean(squared, axis=1)

# 4. Zero-Crossing Rate Calculation (per row) with smaller frame_length
def zero_crossing_rate(mfcc_data, frame_length=20):
    """Return one averaged zero-crossing rate per row of ``mfcc_data``.

    Each row is reshaped to ``(1, n)`` and handed to
    ``librosa.feature.zero_crossing_rate`` with the given ``frame_length``, a
    hop of ``frame_length // 2`` and ``center=False``; the values librosa
    returns for that row are then averaged into a single number.

    Parameters
    ----------
    mfcc_data : iterable of 1-D arrays
        One observation per row; each row must support ``reshape``.
    frame_length : int, default 20
        Frame length forwarded to librosa.

    Returns
    -------
    list
        The averaged rate for every row, in input order.
    """
    hop = frame_length // 2
    return [
        np.mean(
            librosa.feature.zero_crossing_rate(
                coeffs.reshape(1, -1),
                frame_length=frame_length,
                hop_length=hop,
                center=False,
            )
        )
        for coeffs in mfcc_data
    ]

# Derive every engineered feature from the training MFCC matrix.
peak_values, peak_lags, sum_autocorrs = autocorrelation_features(mfcc_data)
centroids, bandwidths = spectral_features(mfcc_data, freq_centers)
energies, powers = compute_energy_and_power(mfcc_data)
zcrs = zero_crossing_rate(mfcc_data)

# Attach the new features to df_train in place, one column per feature.
for feature_name, feature_values in (
    ('autocorr_peak_values', peak_values),
    ('autocorr_peak_lags', peak_lags),
    ('autocorr_sum', sum_autocorrs),
    ('spectral_centroid', centroids),
    ('spectral_bandwidth', bandwidths),
    ('energy', energies),
    ('power', powers),
    ('zero_crossing_rate', zcrs),
):
    df_train[feature_name] = feature_values

# df_train now holds its original columns plus the eight engineered feature columns.

In [None]:
import numpy as np
import pandas as pd
import librosa

# The 20 MFCC coefficients are read from the columns labelled with the strings '0' .. '19'.
mfcc_columns = list(map(str, range(20)))
# Materialise those columns of the validation frame as a NumPy array (one row per observation).
mfcc_data = np.array(df_val[mfcc_columns])

# 1. Autocorrelation Features (calculated per row)
def autocorrelation_features(mfcc_data):
    """Compute autocorrelation summary features for each row of coefficients.

    Every row is treated as a short sequence of ``n_coeffs`` values and its
    unnormalised autocorrelation ``r[k] = sum_i row[i] * row[i + k]`` is
    evaluated for the lags ``k = 0 .. n_coeffs - 1``.

    The autocorrelation has to be taken across the coefficients of a row:
    autocorrelating a single coefficient as a one-element signal only ever
    yields the lag-0 term, which leaves no non-zero lag to search and makes
    all three features constant zeros.

    Parameters
    ----------
    mfcc_data : array-like of shape (n_rows, n_coeffs)
        One observation per row.

    Returns
    -------
    peak_values : list of float
        Largest autocorrelation value over the non-zero lags, per row.
    peak_lags : list of int
        Lag (>= 1) at which that maximum occurs, per row (first one on ties).
    sum_autocorrs : list of float
        Sum of the autocorrelation over all lags including lag 0, per row.

    Rows with fewer than two coefficients have no non-zero lag; all three
    features are reported as 0 for them. Empty input yields three empty lists.

    Raises
    ------
    ValueError
        If ``mfcc_data`` is non-empty and not two-dimensional.
    """
    data = np.asarray(mfcc_data, dtype=float)
    if data.ndim != 2:
        if data.size == 0:
            return [], [], []
        raise ValueError("mfcc_data must be 2-D with shape (n_rows, n_coeffs)")

    n_rows, n_coeffs = data.shape
    if n_coeffs < 2:
        return [0.0] * n_rows, [0] * n_rows, [0.0] * n_rows

    # One vectorised pass per lag (n_coeffs passes in total) instead of a
    # Python-level loop over every row and coefficient.
    autocorr = np.empty((n_rows, n_coeffs), dtype=float)
    for lag in range(n_coeffs):
        autocorr[:, lag] = np.sum(data[:, : n_coeffs - lag] * data[:, lag:], axis=1)

    # Lag 0 is the row's energy and would always win the peak search, so the
    # peak is looked for among lags 1 .. n_coeffs - 1 only.
    nonzero_lags = autocorr[:, 1:]
    peak_values = nonzero_lags.max(axis=1)
    peak_lags = nonzero_lags.argmax(axis=1) + 1
    sum_autocorrs = autocorr.sum(axis=1)

    return peak_values.tolist(), peak_lags.tolist(), sum_autocorrs.tolist()

# 2. Spectral Centroid and Bandwidth
# Example centre frequencies: 20 evenly spaced values from 200 to 8000 inclusive,
# one per MFCC coefficient column.
freq_centers = np.linspace(start=200, stop=8000, num=20)

def spectral_features(mfcc_data, freq_centers):
    """Compute a weighted centroid and bandwidth of ``freq_centers`` per row.

    For every row the weights are the absolute coefficient values ``|c_i|``:

        centroid  = sum(f_i * |c_i|) / sum(|c_i|)
        bandwidth = sqrt(sum((f_i - centroid)^2 * |c_i|) / sum(|c_i|))

    Magnitudes are used because the coefficients can be negative. With signed
    weights the denominator can be negative or cancel to (nearly) zero, and
    the quantity under the square root can be negative, which produces NaN or
    arbitrarily large values.

    NOTE(review): ``freq_centers`` assigns one frequency to each coefficient
    column; whether that mapping is physically meaningful for MFCCs should be
    confirmed by whoever owns the feature definition.

    Parameters
    ----------
    mfcc_data : array-like of shape (n_rows, n_coeffs)
        One observation per row.
    freq_centers : array-like of shape (n_coeffs,)
        Frequency associated with each coefficient column.

    Returns
    -------
    centroids : list of float
    bandwidths : list of float
        Both have one entry per row. Rows whose coefficients are all zero get
        a centroid and bandwidth of 0. Empty input yields two empty lists.

    Raises
    ------
    ValueError
        If ``mfcc_data`` is non-empty and not two-dimensional.
    """
    data = np.asarray(mfcc_data, dtype=float)
    freqs = np.asarray(freq_centers, dtype=float)
    if data.ndim != 2:
        if data.size == 0:
            return [], []
        raise ValueError("mfcc_data must be 2-D with shape (n_rows, n_coeffs)")

    weights = np.abs(data)
    totals = weights.sum(axis=1)
    # All-zero rows have zero numerators as well, so dividing them by 1 gives
    # the intended 0 without a division-by-zero warning.
    safe_totals = np.where(totals > 0, totals, 1.0)

    centroids = (weights @ freqs) / safe_totals
    squared_dev = (freqs[np.newaxis, :] - centroids[:, np.newaxis]) ** 2
    bandwidths = np.sqrt((squared_dev * weights).sum(axis=1) / safe_totals)

    return centroids.tolist(), bandwidths.tolist()

# 3. Energy and Power Calculation (per row)
def compute_energy_and_power(mfcc_data):
    """Return the per-row energy and power of a coefficient matrix.

    Energy is the sum of the squared values along axis 1 and power is the
    mean of the squared values along axis 1.

    Returns
    -------
    (energies, powers)
        Two values with one entry per row of ``mfcc_data``.
    """
    squared = mfcc_data ** 2
    return np.sum(squared, axis=1), np.mean(squared, axis=1)

# 4. Zero-Crossing Rate Calculation (per row) with smaller frame_length
def zero_crossing_rate(mfcc_data, frame_length=20):
    """Return one averaged zero-crossing rate per row of ``mfcc_data``.

    Each row is reshaped to ``(1, n)`` and handed to
    ``librosa.feature.zero_crossing_rate`` with the given ``frame_length``, a
    hop of ``frame_length // 2`` and ``center=False``; the values librosa
    returns for that row are then averaged into a single number.

    Parameters
    ----------
    mfcc_data : iterable of 1-D arrays
        One observation per row; each row must support ``reshape``.
    frame_length : int, default 20
        Frame length forwarded to librosa.

    Returns
    -------
    list
        The averaged rate for every row, in input order.
    """
    hop = frame_length // 2
    return [
        np.mean(
            librosa.feature.zero_crossing_rate(
                coeffs.reshape(1, -1),
                frame_length=frame_length,
                hop_length=hop,
                center=False,
            )
        )
        for coeffs in mfcc_data
    ]

# Derive every engineered feature from the validation MFCC matrix.
peak_values, peak_lags, sum_autocorrs = autocorrelation_features(mfcc_data)
centroids, bandwidths = spectral_features(mfcc_data, freq_centers)
energies, powers = compute_energy_and_power(mfcc_data)
zcrs = zero_crossing_rate(mfcc_data)

# Attach the new features to df_val in place, one column per feature.
for feature_name, feature_values in (
    ('autocorr_peak_values', peak_values),
    ('autocorr_peak_lags', peak_lags),
    ('autocorr_sum', sum_autocorrs),
    ('spectral_centroid', centroids),
    ('spectral_bandwidth', bandwidths),
    ('energy', energies),
    ('power', powers),
    ('zero_crossing_rate', zcrs),
):
    df_val[feature_name] = feature_values

# df_val now holds its original columns plus the eight engineered feature columns.

In [None]:
# Binds a second name to the same DataFrame object; no copy is made, so an
# in-place change made through either name is visible through both.
df_train_new = df_train

In [None]:
# DataFrame.drop returns a new frame and leaves its input untouched, so the
# result has to be bound to a name for the columns to actually disappear.
# Deriving it from df_train (rather than from df_train_new itself) keeps this
# cell re-runnable: a second execution does not fail on already-missing columns.
columns_to_drop = [
    'autocorr_peak_values',
    'autocorr_peak_lags',
    'autocorr_sum',
    'avg',
    'first_der',
]
df_train_new = df_train.drop(columns=columns_to_drop)

# Show only the first rows rather than rendering the whole frame.
df_train_new.head()

In [None]:
# NOTE(review): no visible cell defines `df` (only df_train, df_val and
# df_train_new are assigned), so this raises NameError on a fresh kernel unless
# it is defined elsewhere — confirm which frame was meant to be displayed.
df