In [None]:
# Mount Google Drive at /content/drive; later cells read the audio file and
# the trained model from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import soundfile as sf
import audioread
import random

# Define the file path of the single audio file
file_path = "/content/drive/MyDrive/Chammak Challo - Ra-One 320 Kbps.wav"  # Change to your file path

# One row of features per processed file (a single row in this notebook).
data = []

try:
    # Load the audio at its native sampling rate (sr=None disables resampling).
    # The earlier audioread.audio_open() wrapper opened a handle that was never
    # read, so it has been dropped.
    y, sr = librosa.load(file_path, sr=None)

    # 1. Temporal Features
    # Beat tracking is expensive: run it once and reuse both outputs.
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # beat_track can return the tempo as a one-element array, which would be
    # written to the CSV as the string "[132.5...]". Store a plain float instead.
    tempo = float(np.atleast_1d(tempo)[0])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    rmse = np.mean(librosa.feature.rms(y=y))

    # 2. Spectral Features (each reduced to its mean over all frames)
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=y))

    # 3. Rhythmic Features
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    onset_strength = np.mean(onset_env)
    beats_per_segment = len(beat_frames)
    # NOTE(review): this is the mean sample value of the harmonic component,
    # not a true harmonic-to-noise ratio. Left unchanged so the value stays
    # comparable with whatever the downstream model was trained on -- confirm.
    hnr = np.mean(librosa.effects.harmonic(y))

    # 4. Tonal Features
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = chroma.mean(axis=1)
    key = np.argmax(chroma_mean)  # Approximate key: strongest pitch class on average
    # Heuristic only: flags whether the dominant pitch class index is one of
    # 0, 2, 4, 5, 7, 9, 11. It does not analyse the actual scale of the track.
    mode = 1 if key in [0, 2, 4, 5, 7, 9, 11] else 0  # Major = 1, Minor = 0
    tonnetz = np.mean(librosa.feature.tonnetz(y=y, sr=sr), axis=1)  # Tonal centroid features

    # 5. High-Level Features (rough proxies, not standardised measures)
    timbre = np.mean(librosa.feature.tempogram(y=y, sr=sr))
    onset_std = np.std(onset_env)
    danceability = onset_strength / onset_std if onset_std != 0 else 0
    # NOTE(review): spectral_centroid is already a scalar here, so mean/max of
    # it is x / x and this evaluates to 1.0 whenever the centroid is non-zero.
    # Left unchanged to avoid skewing inputs to the trained model -- confirm
    # against the training pipeline before changing.
    valence = np.mean(spectral_centroid) / np.max(spectral_centroid) if np.max(spectral_centroid) != 0 else 0
    instrumentalness = 1 - np.mean(chroma)  # Approximate instrumental measure
    acousticness = np.mean(1 - spectral_flatness)  # Higher = more acoustic

    # MFCCs (13 coefficients) and spectral contrast, averaged over frames
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)

    # Assemble the row: scalar features first, then the vector-valued groups.
    features = [file_path, tempo, zcr, rmse, spectral_centroid, spectral_bandwidth, spectral_rolloff, spectral_flatness,
                onset_strength, beats_per_segment, hnr, key, mode, timbre, danceability, valence, instrumentalness, acousticness] \
                + list(mfccs) + list(chroma_mean) + list(spectral_contrast) + list(tonnetz)

    data.append(features)

except Exception as e:
    print(f"Error processing {file_path}: {e}")
    # With a single input file there is nothing useful to save after a failure.
    # Re-raise so the cell stops instead of writing an empty CSV and reporting
    # success.
    raise

# Define column names
columns = ['File', 'Tempo', 'Zero_Crossing_Rate', 'RMSE', 'Spectral_Centroid', 'Spectral_Bandwidth', 'Spectral_Rolloff', 'Spectral_Flatness',
           'Onset_Strength', 'Beats_Per_Segment', 'Harmonic-to-Noise', 'Key', 'Mode', 'Timbre', 'Danceability', 'Valence', 'Instrumentalness', 'Acousticness']

# Add column names for MFCCs, Chroma, Spectral Contrast, and Tonnetz.
# Sizes come from the extracted vectors so names and values cannot drift apart.
columns += [f'MFCC_{i+1}' for i in range(len(mfccs))]
columns += [f'Chroma_{i+1}' for i in range(len(chroma_mean))]
columns += [f'Spectral_Contrast_{i+1}' for i in range(len(spectral_contrast))]
columns += [f'Tonnetz_{i+1}' for i in range(len(tonnetz))]

# Convert to DataFrame
df = pd.DataFrame(data, columns=columns)

# Save as CSV
df.to_csv("audio_features_single.csv", index=False)

print("Feature extraction completed and saved to audio_features_single.csv")

Feature extraction completed and saved to audio_features_single.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV file
def load_and_analyze_missing_values(file_path):
    """Read a CSV file and report which columns contain missing values.

    When at least one value is missing, prints a per-column summary (count and
    percentage, sorted by count descending) and shows a heatmap of the null
    mask. Otherwise prints a single confirmation line.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(df, has_missing)`` tuple: the loaded DataFrame and a bool that is
        True when any column contains a missing value.
    """
    df = pd.read_csv(file_path)
    null_mask = df.isnull()

    # Per-column count of nulls and the same figure as a share of all rows.
    per_column = null_mask.sum()
    report = pd.DataFrame({
        'Missing Values': per_column,
        'Percentage (%)': (per_column / len(df)) * 100,
    })

    # Keep only the affected columns, worst offenders first.
    affected = report.loc[report['Missing Values'] > 0]
    affected = affected.sort_values('Missing Values', ascending=False)

    # Guard clause: nothing to summarise or plot.
    if affected.empty:
        print("No missing values found in the dataset!")
        return df, False

    print("Missing Values Summary:")
    print(affected)

    # Heatmap of the null mask: one row per record, one column per feature.
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, cbar=False, yticklabels=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.tight_layout()
    plt.show()

    return df, True

def _rf_design_matrices(df, col, missing_mask):
    """Build aligned, fully numeric predictor matrices for imputing ``col``.

    Args:
        df: The full DataFrame, including ``col``.
        col: Name of the column being imputed; excluded from the predictors.
        missing_mask: Boolean Series, True where ``df[col]`` is missing.

    Returns:
        ``(X_train, X_missing)``: predictors for rows where ``col`` is known
        and for rows where it is missing. Both share identical columns.
    """
    # One-hot encode once over the whole frame so the training and prediction
    # matrices are guaranteed to have the same columns in the same order.
    predictors = pd.get_dummies(df.drop(columns=[col]))
    # Keep only what a forest can consume: numbers and dummy indicators.
    predictors = predictors.select_dtypes(include=['number', 'bool']).astype(float)

    X_train = predictors[~missing_mask]
    X_missing = predictors[missing_mask]

    # Fill gaps in the predictors with training-row means. A predictor that is
    # entirely missing in the training rows has no mean, so fall back to 0.
    fill_values = X_train.mean().fillna(0)
    return X_train.fillna(fill_values), X_missing.fillna(fill_values)


def impute_missing_values_with_rf(df):
    """Impute missing values column by column with Random Forest models.

    For every column that contains missing values, a model is trained on the
    rows where the column is known, using all other columns as predictors, and
    then used to predict the missing entries. Numeric columns use a
    ``RandomForestRegressor``; all other columns use a
    ``RandomForestClassifier``.

    A column is skipped (and left with its missing values) when it is entirely
    missing, has fewer than 10 known values, or has no usable predictors.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A copy of ``df`` with imputed values filled in where possible.
    """
    df_imputed = df.copy()

    cols_with_missing = df.columns[df.isnull().any()].tolist()

    for col in cols_with_missing:
        missing_mask = df[col].isnull()

        # Nothing to learn from if the whole column is empty.
        if missing_mask.all():
            print(f"Column {col} has all missing values. Cannot impute.")
            continue

        # Too few training rows would give an unreliable model.
        if int((~missing_mask).sum()) < 10:
            print(f"Too few known values in column {col}. Cannot impute reliably.")
            continue

        X_train, X_missing = _rf_design_matrices(df, col, missing_mask)
        if X_train.shape[1] == 0:
            print(f"No usable predictor columns for column {col}. Cannot impute.")
            continue
        y_train = df.loc[~missing_mask, col]

        # Use the dtype API rather than comparing against 'int64'/'float64',
        # so float32/int32 columns are regressed instead of being classified.
        target_is_numeric = (
            pd.api.types.is_numeric_dtype(df[col])
            and not pd.api.types.is_bool_dtype(df[col])
        )
        if target_is_numeric:
            rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
            model_name = "Random Forest Regressor"
        else:
            rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
            model_name = "Random Forest Classifier"

        rf_model.fit(X_train, y_train)
        predicted_values = rf_model.predict(X_missing)

        # Write predictions into the copy; predictors always come from the
        # original frame so earlier imputations do not feed later ones.
        df_imputed.loc[missing_mask, col] = predicted_values

        print(f"Imputed {missing_mask.sum()} missing values in column '{col}' using {model_name}")

    # Verify imputation
    remaining_missing = df_imputed.isnull().sum().sum()
    if remaining_missing > 0:
        print(f"\nWarning: There are still {remaining_missing} missing values in the dataset.")
        print("Columns with remaining missing values:")
        print(df_imputed.columns[df_imputed.isnull().any()].tolist())
    else:
        print("\nAll missing values have been successfully imputed!")

    return df_imputed

def process_and_impute_csv(file_path, output_path=None):
    """Load a CSV, impute its missing values if it has any, and optionally save it.

    Args:
        file_path: Path of the CSV file to process.
        output_path: Where to write the resulting CSV. Nothing is written when
            this is None or empty.

    Returns:
        The imputed DataFrame, or a copy of the loaded DataFrame when no
        missing values were found.
    """
    print(f"Processing file: {file_path}")

    # Load the data and find out whether any imputation is required.
    loaded, needs_imputation = load_and_analyze_missing_values(file_path)

    if not needs_imputation:
        print("\nNo missing values detected. Saving original data as imputed.")
        result = loaded.copy()
    else:
        print("\nStarting imputation process with Random Forest...")
        result = impute_missing_values_with_rf(loaded)

    # Persist the result only when a destination was supplied.
    if output_path:
        result.to_csv(output_path, index=False)
        print(f"\nImputed (or original) dataset saved to: {output_path}")

    return result

# Example usage
# Runs the load -> (optional) impute -> save pipeline on the features CSV
# written by the extraction cell. `imputed_df` is reused by the cells below.
if __name__ == "__main__":
    # Replace with your file path
    file_path = "audio_features_single.csv"
    output_path = "audio_features_single_imputed.csv"

    # Process the file
    imputed_df = process_and_impute_csv(file_path, output_path)

    # Display a sample of the imputed data
    print("\nSample of the imputed dataset:")
    print(imputed_df.head())

Processing file: audio_features_single.csv
No missing values found in the dataset!

No missing values detected. Saving original data as imputed.

Imputed (or original) dataset saved to: audio_features_single_imputed.csv

Sample of the imputed dataset:
                                                File           Tempo  \
0  /content/drive/MyDrive/Chammak Challo - Ra-One...  [132.51201923]   

   Zero_Crossing_Rate      RMSE  Spectral_Centroid  Spectral_Bandwidth  \
0            0.103053  0.240854        4198.570834          4266.65838   

   Spectral_Rolloff  Spectral_Flatness  Onset_Strength  Beats_Per_Segment  \
0       9378.833533           0.002825        1.264563                500   

   ...  Spectral_Contrast_4  Spectral_Contrast_5  Spectral_Contrast_6  \
0  ...            17.651343            17.095897            17.362567   

   Spectral_Contrast_7  Tonnetz_1  Tonnetz_2  Tonnetz_3  Tonnetz_4  Tonnetz_5  \
0            45.700582  -0.007839   0.078576  -0.005035     0.0137   0.

In [None]:
# Preview the first rows of the imputed feature table.
imputed_df.head()

Unnamed: 0,File,Tempo,Zero_Crossing_Rate,RMSE,Spectral_Centroid,Spectral_Bandwidth,Spectral_Rolloff,Spectral_Flatness,Onset_Strength,Beats_Per_Segment,...,Spectral_Contrast_4,Spectral_Contrast_5,Spectral_Contrast_6,Spectral_Contrast_7,Tonnetz_1,Tonnetz_2,Tonnetz_3,Tonnetz_4,Tonnetz_5,Tonnetz_6
0,/content/drive/MyDrive/Chammak Challo - Ra-One...,[132.51201923],0.103053,0.240854,4198.570834,4266.65838,9378.833533,0.002825,1.264563,500,...,17.651343,17.095897,17.362567,45.700582,-0.007839,0.078576,-0.005035,0.0137,0.007464,0.022107


In [None]:
# Drop the file-path column so only the feature columns remain.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second run no longer raises KeyError for the missing column.
imputed_df = imputed_df.drop(columns=['File'], errors='ignore')

In [None]:
# Preview again to confirm the 'File' column is no longer present.
imputed_df.head()

Unnamed: 0,Tempo,Zero_Crossing_Rate,RMSE,Spectral_Centroid,Spectral_Bandwidth,Spectral_Rolloff,Spectral_Flatness,Onset_Strength,Beats_Per_Segment,Harmonic-to-Noise,...,Spectral_Contrast_4,Spectral_Contrast_5,Spectral_Contrast_6,Spectral_Contrast_7,Tonnetz_1,Tonnetz_2,Tonnetz_3,Tonnetz_4,Tonnetz_5,Tonnetz_6
0,[132.51201923],0.103053,0.240854,4198.570834,4266.65838,9378.833533,0.002825,1.264563,500,-1.7e-05,...,17.651343,17.095897,17.362567,45.700582,-0.007839,0.078576,-0.005035,0.0137,0.007464,0.022107


In [None]:
# Handle values that are strings representing lists
def convert_to_float(x):
    """Coerce a scalar, a sequence, or a bracketed string to a float.

    Args:
        x: One of
            * a number or numeric string (``3``, ``"2.5"``),
            * a string that looks like a printed list or array
              (``"[132.5]"``, ``"[1.5, 2.5]"``, ``" [1.5 2.5] "``),
            * a list, tuple or NumPy array.

    Returns:
        The value as a float. For multi-valued inputs the first value is used.

    Raises:
        ValueError: If a string holds no parsable number (e.g. ``"[]"``).
        IndexError: If a list, tuple or array is empty.
        TypeError: If ``x`` is of a type ``float()`` cannot convert.
    """
    if isinstance(x, str):
        # Tolerate padding around the value, e.g. " [132.5] ".
        text = x.strip()
        if text.startswith('[') and text.endswith(']'):
            # Printed arrays separate values with spaces, printed lists with
            # commas; normalise both and take the first value, as for a list.
            tokens = text.strip('[]').replace(',', ' ').split()
            if not tokens:
                raise ValueError(f"no numeric value found in {x!r}")
            return float(tokens[0])
        return float(text)

    if isinstance(x, np.ndarray):
        # float() on an array with ndim > 0 is deprecated in recent NumPy, so
        # pick the first element explicitly.
        return float(x.ravel()[0])

    if isinstance(x, (list, tuple)):
        # Recurse so nested or string elements get the same treatment.
        return convert_to_float(x[0])

    # Plain numbers and NumPy scalars.
    return float(x)

# Apply the conversion function
# 'Tempo' was read back from the CSV as a bracketed string such as
# "[132.51201923]"; convert each entry to a plain float.
imputed_df['Tempo'] = imputed_df['Tempo'].apply(convert_to_float)

# Verify the conversion worked
# Print the first values and the resulting column dtype.
print(imputed_df['Tempo'].head(10))
print("Data type:", imputed_df['Tempo'].dtype)

0    132.512019
Name: Tempo, dtype: float64
Data type: float64


In [None]:
# Get column data types
# Listing them shows whether any column is still non-numeric.
column_types = imputed_df.dtypes

# Print all column types
print(column_types)

Tempo                  float64
Zero_Crossing_Rate     float64
RMSE                   float64
Spectral_Centroid      float64
Spectral_Bandwidth     float64
Spectral_Rolloff       float64
Spectral_Flatness      float64
Onset_Strength         float64
Beats_Per_Segment        int64
Harmonic-to-Noise      float64
Key                      int64
Mode                     int64
Timbre                 float64
Danceability           float64
Valence                float64
Instrumentalness       float64
Acousticness           float64
MFCC_1                 float64
MFCC_2                 float64
MFCC_3                 float64
MFCC_4                 float64
MFCC_5                 float64
MFCC_6                 float64
MFCC_7                 float64
MFCC_8                 float64
MFCC_9                 float64
MFCC_10                float64
MFCC_11                float64
MFCC_12                float64
MFCC_13                float64
Chroma_1               float64
Chroma_2               float64
Chroma_3

In [None]:
# Save the preprocessed features (no 'File' column, numeric 'Tempo');
# the prediction cell reads this file back.
imputed_df.to_csv('audio_features_single_imputed_preprocess.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model

# 1. Load the preprocessed test dataset (only one song)
test_df = pd.read_csv("audio_features_single_imputed_preprocess.csv")

# 2. Load the trained model
model = load_model("/content/drive/MyDrive/spotify_skip_prediction_model.h5")

# 3. Define listener category labels
# Order matters: label i is paired with the i-th model output below.
listener_categories = [
    'casual_listener',
    'party_listener',
    'focus_listener',
    'adventurous_listener'
]

# 4. Prepare input features
# Assumes the CSV has only feature columns (no label columns). Casting here
# makes a stray non-numeric column fail with a clear error before it reaches
# the model.
X_test = test_df.to_numpy(dtype=np.float32)

# 5. Predict on the single song
raw_predictions = model.predict(X_test)

# 6. Handle output format
# A multi-output model returns a list of arrays (one per head); join them
# side by side so each row holds every output for one song.
if isinstance(raw_predictions, list):
    predictions = np.hstack(raw_predictions)
else:
    predictions = raw_predictions

# 7. Binarize and convert to 'skip' / 'not skip'
binary_prediction = (np.ravel(predictions[0]) > 0.5).astype(int)

# zip() would silently drop unmatched entries, so fail loudly if the model's
# output count does not match the category labels.
if len(binary_prediction) != len(listener_categories):
    raise ValueError(
        f"Model produced {len(binary_prediction)} outputs but "
        f"{len(listener_categories)} listener categories are defined"
    )

# NOTE(review): an output above 0.5 is reported as 'not skip'. Confirm this
# matches the label encoding used when the model was trained.
verdicts = ['not skip' if val == 1 else 'skip' for val in binary_prediction]

# 8. Print result in human-friendly format
print("\nThe song's predictions are:\n")
for category, verdict in zip(listener_categories, verdicts):
    print(f"- {category}: {verdict}")

print("\nPrediction printed successfully!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step

The song's predictions are:

- casual_listener: not skip
- party_listener: not skip
- focus_listener: skip
- adventurous_listener: not skip

Prediction printed successfully!
