# Audio Classification - Beginner Project

This is a beginner-friendly notebook for audio classification on a mini dataset.

We’ll use only a small subset of the ESC-50 dataset, i.e. the 4 classes listed below.

| Class       | Example sounds                 |
| ----------- | ------------------------------ |
| 🐶 Dog bark | dog_bark_1.wav, dog_bark_2.wav |
| 🚗 Car horn | car_horn_1.wav, car_horn_2.wav |
| 🌧 Rain     | rain_1.wav, rain_2.wav         |
| 🔔 Bell     | bell_1.wav, bell_2.wav         |


In [None]:
# import necessary libraries

import os, zipfile, requests
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

Download a small sample dataset

In [None]:
# Fetch the ESC-50 archive once. The extracted "ESC-50-master" folder doubles as
# the "already downloaded" marker, so re-running this cell is cheap.
url = "https://github.com/karoldvl/ESC-50/archive/refs/heads/master.zip"
if not os.path.exists("ESC-50-master"):
    print("Downloading sample dataset (~600MB)...")
    # The timeout keeps a stalled connection from hanging the cell forever, and
    # the context manager releases the connection even if the download fails.
    with requests.get(url, stream=True, timeout=60) as r:
        # Stop on an HTTP error instead of saving an error page as esc50.zip,
        # which would only surface later as a confusing BadZipFile.
        r.raise_for_status()
        with open("esc50.zip", "wb") as f:
            # Stream in 1 MiB chunks so the archive never sits fully in memory.
            for chunk in tqdm(r.iter_content(chunk_size=1024*1024)):
                if chunk:
                    f.write(chunk)
    with zipfile.ZipFile("esc50.zip", "r") as zip_ref:
        zip_ref.extractall(".")
    print("✅ Dataset extracted!")


# -----------------------------
# 2️⃣ Use only 4 light classes from ESC-50
# -----------------------------
# esc50.csv holds one row per clip; this notebook uses its 'filename' and
# 'category' columns.
meta = pd.read_csv("ESC-50-master/meta/esc50.csv")
classes = ['dog', 'car_horn', 'rain', 'church_bells']

# Draw the same number of clips from every class (4 x 10 = 40 clips in total).
# A plain .sample(40) over the pooled rows gives uneven class counts, which
# matters a lot when the whole dataset is only 40 clips.
meta_small = (
    meta[meta['category'].isin(classes)]
    .groupby('category')
    .sample(n=10, random_state=42)
)

Feature extraction

In [None]:
def extract_features(file):
    """Summarise one audio file as a single 1-D feature vector.

    Loads up to 3 seconds of audio starting 0.5 seconds into the file, then
    averages each feature over time.

    Parameters
    ----------
    file : path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The 13 mean MFCC coefficients followed by the mean zero-crossing
        rate, mean spectral centroid and mean spectral roll-off.
    """
    signal, sample_rate = librosa.load(file, duration=3, offset=0.5)

    # MFCCs come back as (n_mfcc, frames); transpose so the mean runs over frames.
    mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)
    mfcc_mean = np.mean(mfcc_frames.T, axis=0)

    # Each remaining descriptor is collapsed to one scalar.
    zcr_mean = np.mean(librosa.feature.zero_crossing_rate(signal))
    centroid_mean = np.mean(
        librosa.feature.spectral_centroid(y=signal, sr=sample_rate)
    )
    rolloff_mean = np.mean(
        librosa.feature.spectral_rolloff(y=signal, sr=sample_rate)
    )

    return np.hstack((mfcc_mean, zcr_mean, centroid_mean, rolloff_mean))

# Build the feature matrix X (one row per clip) and the label vector y,
# both in the row order of meta_small.
X = np.array([
    extract_features(os.path.join("ESC-50-master/audio", filename))
    for filename in tqdm(meta_small['filename'], total=len(meta_small))
])
# .tolist() first so y becomes a plain string array, not an object array.
y = np.array(meta_small['category'].tolist())
print("✅ Features extracted:", X.shape)

Data Preprocessing

In [None]:
# Turn the string class names into integer ids; `le` is kept so predictions
# can be mapped back to names later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all clips before the train/test split,
# so test-set statistics influence the scaling.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

Train/test split

In [None]:
# Hold out 20% of the clips for testing. With so few samples an unstratified
# split can leave a class out of the test set entirely; stratify keeps the
# class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

Training a Model: RandomForest

In [None]:
# Fit a random forest on the training split; random_state makes the run repeatable.
model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out clips.
y_pred = model.predict(X_test)
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))

# Pass the full label set explicitly. Otherwise classification_report raises a
# ValueError whenever the small test split lacks one of the classes, because
# target_names would no longer line up with the labels it finds.
# LabelEncoder numbers the classes 0..n-1 in the order of le.classes_.
# zero_division=0 reports 0 for classes with no predicted samples instead of
# emitting a warning.
report = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)
print("\nClassification Report:\n", report)

Model Inference

In [None]:
def predict_sound(file_path):
    """Predict the class name of a single audio file.

    Runs the file through ``extract_features``, applies the fitted ``scaler``,
    classifies with the trained ``model`` and maps the integer prediction back
    to its class name with ``le``.

    Parameters
    ----------
    file_path : path of the audio file to classify.

    Returns
    -------
    The predicted class name.
    """
    # The scaler and model expect a 2-D (n_samples, n_features) input.
    sample = extract_features(file_path).reshape(1, -1)
    encoded = model.predict(scaler.transform(sample))
    (label,) = le.inverse_transform(encoded)
    return label

# Pick the demo clip from the classes the model was trained on, excluding every
# clip already sampled into meta_small, so the model has never seen it.
# The previous hard-coded example (target id 14 in its file name) is not one of
# the four selected classes, so its prediction could never be correct.
unseen = meta[meta['category'].isin(classes)].drop(meta_small.index)
example = unseen.sample(1, random_state=42).iloc[0]

test_file = os.path.join("ESC-50-master/audio", example['filename'])  # Example sound file
print("\n🎧 Actual Class:", example['category'])
print("\n🎧 Predicted Class:", predict_sound(test_file))

