# Audio Classification - Beginner Project

This is beginner friendly notebook for audio classification on mini dataset.

We’ll use subset of dataset i.e 4-5 classes only.

| Class       | Example sounds                 |
| ----------- | ------------------------------ |
| 🐶 Dog bark | dog_bark_1.wav, dog_bark_2.wav |
| 🚗 Car horn | car_horn_1.wav, car_horn_2.wav |
| 🌧 Rain     | rain_1.wav, rain_2.wav         |
| 🔔 Bell     | bell_1.wav, bell_2.wav         |


In [3]:
# import necessary libraries

import os, zipfile, requests
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

Download small sample dataset

In [6]:
# Create data directory if it doesn't exist
data_dir = "../datasets/audio/esc50_light_classes"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"📁 Created directory: {data_dir}")

url = "https://github.com/karoldvl/ESC-50/archive/refs/heads/master.zip"
dataset_path = os.path.join(data_dir, "ESC-50-master")
zip_path = os.path.join(data_dir, "esc50.zip")

if not os.path.exists(dataset_path):
    print("Downloading sample dataset (~100MB)...")
    r = requests.get(url, stream=True)
    with open(zip_path, "wb") as f:
        for chunk in tqdm(r.iter_content(chunk_size=1024*1024)):
            if chunk:
                f.write(chunk)
    
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)
    
    # Clean up zip file after extraction
    os.remove(zip_path)
    print("✅ Dataset extracted to:", dataset_path)


# -----------------------------
# 2️⃣ Use only 4 light classes from ESC-50
# -----------------------------
import pandas as pd
meta = pd.read_csv("../datasets/audio/esc50_light_classes/ESC-50-master/meta/esc50.csv")
classes = ['dog', 'car_horn', 'rain', 'church_bells']
meta_small = meta[meta['category'].isin(classes)].sample(40, random_state=42)

Feature extraction

In [8]:
def extract_features(file):
    y, sr = librosa.load(file, duration=3, offset=0.5)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    return np.hstack([mfcc, zcr, centroid, rolloff])

X, y = [], []
for _, row in tqdm(meta_small.iterrows(), total=len(meta_small)):
    path = os.path.join("../datasets/audio/esc50_light_classes/ESC-50-master/audio", row['filename'])
    X.append(extract_features(path))
    y.append(row['category'])

X = np.array(X)
y = np.array(y)
print("✅ Features extracted:", X.shape)

100%|██████████| 40/40 [00:02<00:00, 16.86it/s]

✅ Features extracted: (40, 16)





Data Preprocessing

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

train/test split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

Training a Model : RandomForest

In [11]:
model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


✅ Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

    car_horn       1.00      1.00      1.00         1
church_bells       0.60      1.00      0.75         3
         dog       0.00      0.00      0.00         2
        rain       1.00      1.00      1.00         2

    accuracy                           0.75         8
   macro avg       0.65      0.75      0.69         8
weighted avg       0.60      0.75      0.66         8



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Model Inference

In [None]:
def predict_sound(file_path):
    feat = extract_features(file_path)
    feat_scaled = scaler.transform([feat])
    pred = model.predict(feat_scaled)
    return le.inverse_transform(pred)[0]

# new file for prediction
test_file = "../datasets/audio/esc50_light_classes/ESC-50-master/audio/1-30344-A-14.wav"  # Example sound file
print("\n🎧 Predicted Class:", predict_sound(test_file))



  y, sr = librosa.load(file, duration=3, offset=0.5)


FileNotFoundError: [Errno 2] No such file or directory: '../datasets/audio/esc50_light_classes/ESC-50-master/audio/1-30344-A-14.wav'