<a href="https://colab.research.google.com/github/surbhi498/Technical_Challenge/blob/main/Technical_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip install sounddevice



In [38]:
!apt-get update
!apt-get install -y libportaudio2

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 14.2 kB/129                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (185.125.190.81)] [1 InRelease 70.6 kB/129                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [1 InRelease 70.6 kB/129 kB 55%] [Waiting for headers]0% [Waiting for headers] [Waiting for headers] [Connected to ppa.launchpadconte                                                                               Hit:5 http://archive.ub

In [39]:
import io
import joblib
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import sounddevice as sd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ---------- FEATURE EXTRACTION ----------
def extract_logmel(y, sr, n_mels=40, hop_length=160, win_length=400, fmin=50, fmax=8000):
    """Extract compact log-mel features + delta for spoken digits.

    A mel power spectrogram is converted to dB, first-order deltas are taken
    along the time axis, and everything is summarised over time so that every
    clip maps to a fixed-length vector regardless of its duration.

    Args:
        y: Waveform samples (mono, as returned by ``decode_wav_bytes``).
        sr: Sample rate of ``y`` in Hz.
        n_mels: Number of mel bands.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
        fmin: Lower edge of the mel filterbank in Hz.
        fmax: Upper edge of the mel filterbank in Hz; clamped to ``sr / 2``.

    Returns:
        ``float32`` vector of length ``3 * n_mels``: per-band mean of the
        log-mel spectrogram, per-band std of the log-mel spectrogram, and
        per-band mean of the delta.

    Raises:
        ValueError: If ``y`` contains no samples (e.g. the whole clip was
            trimmed away as silence).
    """
    if np.size(y) == 0:
        raise ValueError("extract_logmel received an empty waveform; nothing to analyse.")

    M = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, hop_length=hop_length, win_length=win_length,
        fmin=fmin, fmax=min(fmax, sr/2), power=2.0
    )
    logM = librosa.power_to_db(M + 1e-10)

    # librosa's default delta (width=9, mode="interp") refuses inputs with fewer
    # frames than the filter width. Very short clips can end up that short after
    # silence trimming, so fall back to edge-replicating padding for them only;
    # clips with >= 9 frames are computed exactly as before.
    delta_width = 9
    delta_mode = "interp" if logM.shape[-1] >= delta_width else "nearest"
    d = librosa.feature.delta(logM, width=delta_width, mode=delta_mode)

    feat = np.concatenate([logM.mean(axis=1), logM.std(axis=1), d.mean(axis=1)], axis=0)
    return feat.astype(np.float32)


def decode_wav_bytes(b):
    """Turn raw WAV file bytes into a silence-trimmed mono waveform at 16 kHz.

    Args:
        b: Bytes of an audio file readable by ``soundfile``.

    Returns:
        Tuple ``(samples, sample_rate)`` where ``samples`` is a float32 array
        and ``sample_rate`` is 16000.
    """
    target_rate = 16000

    samples, rate = sf.read(io.BytesIO(b), dtype='float32', always_2d=False)

    # Multi-channel audio is averaged down to a single channel.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Bring every clip to the common sample rate.
    if rate != target_rate:
        samples = librosa.resample(samples, orig_sr=rate, target_sr=target_rate)
        rate = target_rate

    # Drop leading/trailing audio more than 30 dB below the peak.
    trimmed, _ = librosa.effects.trim(samples, top_db=30)
    return trimmed, rate


def build_feature_matrix(df):
    """Build the (features, labels) pair for a dataframe of labelled audio clips.

    Each row is expected to expose ``row['audio']['bytes']`` (encoded audio)
    and ``row['label']`` (convertible to ``int``).

    Returns:
        Tuple ``(X, y)``: ``X`` stacks one ``extract_logmel`` vector per row,
        ``y`` is the array of integer labels in the same order.
    """
    feature_rows = []
    labels = []
    for _, record in df.iterrows():
        payload, label = record['audio']['bytes'], int(record['label'])
        waveform, rate = decode_wav_bytes(payload)
        feature_rows.append(extract_logmel(waveform, rate))
        labels.append(label)
    return np.vstack(feature_rows), np.array(labels)


# ---------- TRAINING ----------
def train_model(df):
    """Train the digit classifier, print cross-validated metrics, and save artifacts.

    Steps:
      1. Extract features for every row of ``df`` via ``build_feature_matrix``.
      2. Estimate generalisation with 5-fold stratified cross-validation. The
         scaler is part of the cross-validated pipeline, so it is re-fit on the
         training portion of each fold and never sees the held-out samples.
      3. Fit the final scaler and classifier on all rows and persist them.

    Args:
        df: DataFrame with ``audio`` (dict holding ``bytes``) and ``label`` columns.

    Side effects:
        Prints the feature-matrix shape, CV accuracy, classification report and
        confusion matrix. Writes ``scaler.joblib``, ``digit_lr.joblib``,
        ``X_features.npy`` and ``y_labels.npy`` to the working directory.
    """
    # Local import keeps this function self-contained; scikit-learn is already
    # a dependency of the notebook.
    from sklearn.pipeline import make_pipeline

    X, y = build_feature_matrix(df)
    print("Feature matrix:", X.shape)

    clf = LogisticRegression(C=1.0, max_iter=1000, n_jobs=-1, random_state=42)

    # Cross-validation: scale inside each fold. Fitting the scaler on the full
    # dataset beforehand would leak held-out statistics into training and bias
    # the reported accuracy. cross_val_predict clones the pipeline per fold, so
    # `clf` itself remains unfitted here.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_model = make_pipeline(StandardScaler(), clf)
    y_pred = cross_val_predict(cv_model, X, y, cv=skf, n_jobs=-1)

    acc = accuracy_score(y, y_pred)
    print(f"CV Accuracy: {acc:.4f}")
    print("Classification report:\n", classification_report(y, y_pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y, y_pred))

    # Fit final model on every sample and persist scaler + classifier separately,
    # which is the format predict_digit and the evaluation cell load.
    scaler = StandardScaler()
    Xz = scaler.fit_transform(X)
    clf.fit(Xz, y)
    joblib.dump(scaler, "scaler.joblib")
    joblib.dump(clf, "digit_lr.joblib")
    np.save("X_features.npy", X)
    np.save("y_labels.npy", y)


# ---------- PREDICTION ----------
def predict_digit(wav_bytes, scaler_path="scaler.joblib", model_path="digit_lr.joblib"):
    """Predict the spoken digit contained in a WAV byte string.

    Args:
        wav_bytes: Encoded audio bytes, decoded with ``decode_wav_bytes``.
        scaler_path: Path to the joblib-saved feature scaler.
        model_path: Path to the joblib-saved classifier.

    Returns:
        Tuple ``(pred, proba)``: ``pred`` is the predicted label as ``int`` and
        ``proba`` is the classifier's probability vector, ordered like
        ``model.classes_``.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only point these paths at
    # artifacts you produced yourself.
    sc = joblib.load(scaler_path)
    model = joblib.load(model_path)

    y_wav, sr = decode_wav_bytes(wav_bytes)
    feat = extract_logmel(y_wav, sr)[None, :]  # add the batch axis the scaler expects
    feat = sc.transform(feat)
    proba = model.predict_proba(feat)[0]

    # predict_proba columns follow model.classes_, so translate the winning
    # column back to its label instead of assuming labels are exactly 0..N-1.
    pred = int(model.classes_[np.argmax(proba)])
    return pred, proba


# ---------- USAGE EXAMPLES ----------
# 1. TRAIN MODEL: fetch the training split from the Hugging Face hub and fit on it.
TRAIN_PARQUET_URI = "hf://datasets/mteb/free-spoken-digit-dataset/data/train-00000-of-00001.parquet"

df = pd.read_parquet(TRAIN_PARQUET_URI)
train_model(df)




Feature matrix: (2700, 120)
CV Accuracy: 0.9678
Classification report:
               precision    recall  f1-score   support

           0      0.978     0.970     0.974       270
           1      0.977     0.959     0.968       270
           2      0.967     0.963     0.965       270
           3      0.928     0.956     0.942       270
           4      0.993     0.989     0.991       270
           5      0.989     0.989     0.989       270
           6      0.947     0.919     0.932       270
           7      0.981     0.974     0.978       270
           8      0.949     0.970     0.960       270
           9      0.971     0.989     0.980       270

    accuracy                          0.968      2700
   macro avg      0.968     0.968     0.968      2700
weighted avg      0.968     0.968     0.968      2700

Confusion matrix:
 [[262   1   1   3   0   0   1   1   0   1]
 [  0 259   1   0   0   3   0   0   1   6]
 [  1   1 260   6   1   0   1   0   0   0]
 [  1   0   5 258   0

In [40]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Held-out test split of the free-spoken-digit dataset
df_test = pd.read_parquet("hf://datasets/mteb/free-spoken-digit-dataset/data/test-00000-of-00001.parquet")

# Decode + featurise with the exact pipeline used for training
X_test, y_test = build_feature_matrix(df_test)

# Restore the artifacts written by train_model
scaler = joblib.load("scaler.joblib")
clf = joblib.load("digit_lr.joblib")

# Apply the training-time scaling, then classify
X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)

# Report held-out performance
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Test Accuracy: 0.9733
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      0.97      0.98        30
           2       1.00      1.00      1.00        30
           3       0.96      0.90      0.93        30
           4       0.97      1.00      0.98        30
           5       1.00      1.00      1.00        30
           6       0.90      0.90      0.90        30
           7       1.00      1.00      1.00        30
           8       0.97      0.97      0.97        30
           9       0.97      1.00      0.98        30

    accuracy                           0.97       300
   macro avg       0.97      0.97      0.97       300
weighted avg       0.97      0.97      0.97       300

Confusion Matrix:
 [[30  0  0  0  0  0  0  0  0  0]
 [ 0 29  0  0  0  0  0  0  0  1]
 [ 0  0 30  0  0  0  0  0  0  0]
 [ 1  0  0 27  0  0  2  0  0  0]
 [ 0  0  0  0 30  0  0  0  0  0]
 [ 0  0  0 

In [36]:
import io
import numpy as np
import joblib
import soundfile as sf
from base64 import b64decode
from google.colab import output
from IPython.display import Javascript, HTML
from pydub import AudioSegment
from io import BytesIO

# MIC RECORDING TEST (BONUS CHALLENGE)

def predict_digit(wav_bytes, scaler_path="scaler.joblib", model_path="digit_lr.joblib"):
    """Predict the spoken digit contained in a WAV byte string.

    Decoding (mono mix-down, resampling to 16 kHz, silence trimming) is
    delegated to ``decode_wav_bytes`` so microphone recordings go through
    exactly the same preprocessing as the training data, instead of a
    hand-maintained copy of it.

    Args:
        wav_bytes: Encoded WAV audio bytes.
        scaler_path: Path to the joblib-saved feature scaler.
        model_path: Path to the joblib-saved classifier.

    Returns:
        Tuple ``(pred, proba)``: ``pred`` is the predicted label as ``int`` and
        ``proba`` is the classifier's probability vector, ordered like
        ``model.classes_``.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only point these paths at
    # artifacts you produced yourself.
    sc = joblib.load(scaler_path)
    model = joblib.load(model_path)

    y_wav, sr = decode_wav_bytes(wav_bytes)

    feat = extract_logmel(y_wav, sr)[None, :]  # add the batch axis the scaler expects
    feat = sc.transform(feat)
    proba = model.predict_proba(feat)[0]

    # predict_proba columns follow model.classes_, so translate the winning
    # column back to its label instead of assuming labels are exactly 0..N-1.
    pred = int(model.classes_[np.argmax(proba)])
    return pred, proba

def webm_to_wav(webm_bytes):
    """Transcode WebM-encoded audio bytes to WAV bytes entirely in memory.

    Args:
        webm_bytes: Raw bytes of a WebM audio recording.

    Returns:
        The same audio re-encoded as WAV, as ``bytes``.
    """
    segment = AudioSegment.from_file(BytesIO(webm_bytes), format="webm")
    with BytesIO() as buffer:
        segment.export(buffer, format="wav")
        return buffer.getvalue()

def record_and_predict_colab(filename='recorded.wav', duration=2):
    """Render a Record button in Colab that captures audio and predicts the digit.

    Clicking the button asks the browser for microphone access, records for
    ``duration`` seconds as WebM/Opus, and sends the base64-encoded recording to
    the kernel. There it is converted to WAV, written to ``filename`` and passed
    to ``predict_digit``; the prediction and probabilities are printed.

    Args:
        filename: Path the converted WAV recording is written to (overwritten
            on every recording).
        duration: Recording length in seconds.
    """
    def save_audio(b64data):
        """Kernel-side callback: decode, convert, save and classify one recording."""
        try:
            webm_bytes = b64decode(b64data)
            if len(webm_bytes) < 1000:
                print("❌ Recording too short or empty.")
                return
            wav_bytes = webm_to_wav(webm_bytes)
            with open(filename, 'wb') as f:
                f.write(wav_bytes)
            print(f"✅ Saved WAV: {filename} ({len(wav_bytes)} bytes)")
            pred, proba = predict_digit(wav_bytes)
            print(f"🎯 Predicted digit: {pred}")
            print(f"📊 Probabilities: {np.round(proba,3)}")
        except Exception as e:
            # Runs inside a browser-triggered callback, so report the failure
            # instead of raising into the Colab messaging layer.
            print(f"❌ Prediction failed: {e}")

    output.register_callback('notebook.save_audio', save_audio)

    display(HTML('<button id="recordBtn" style="font-size:20px;padding:10px;">🎙 Record</button>'))

    # Braces are doubled because this is an f-string; only {duration*1000}
    # (seconds -> milliseconds for setTimeout) is interpolated.
    js_code = f"""
    const recordButton = document.getElementById("recordBtn");
    recordButton.onclick = async () => {{
        try {{
            const stream = await navigator.mediaDevices.getUserMedia({{ audio: true }});
            const mediaRecorder = new MediaRecorder(stream, {{ mimeType: 'audio/webm;codecs=opus' }});
            let chunks = [];
            mediaRecorder.ondataavailable = e => {{
                if (e.data.size > 0) chunks.push(e.data);
            }};
            mediaRecorder.onstop = async () => {{
                // Release the microphone; otherwise the capture stays active
                // (and the browser keeps showing its recording indicator).
                stream.getTracks().forEach(track => track.stop());
                if (chunks.length === 0) {{
                    console.error("No audio chunks recorded.");
                    return;
                }}
                const blob = new Blob(chunks, {{ type: 'audio/webm;codecs=opus' }});
                const reader = new FileReader();
                reader.onloadend = () => {{
                    const base64data = reader.result.split(',')[1];
                    google.colab.kernel.invokeFunction('notebook.save_audio', [base64data], {{}});
                }};
                reader.readAsDataURL(blob);
            }};
            mediaRecorder.start();
            console.log("🎙 Recording...");
            setTimeout(() => mediaRecorder.stop(), {duration*1000});
        }} catch (err) {{
            console.error("Mic error:", err);
            alert("Microphone access denied or unavailable.");
        }}
    }};
    """
    display(Javascript(js_code))

# Run in Colab: renders the Record button; each click records for 2 seconds,
# saves the clip as recorded.wav and prints the predicted digit.
record_and_predict_colab(duration=2)

<IPython.core.display.Javascript object>

✅ Saved WAV: recorded.wav (380204 bytes)
🎯 Predicted digit: 6
📊 Probabilities: [0.001 0.038 0.    0.    0.001 0.    0.637 0.323 0.    0.   ]
✅ Saved WAV: recorded.wav (380204 bytes)
🎯 Predicted digit: 1
📊 Probabilities: [0.    0.994 0.    0.    0.    0.    0.002 0.004 0.    0.   ]
✅ Saved WAV: recorded.wav (380204 bytes)
🎯 Predicted digit: 1
📊 Probabilities: [0.    0.963 0.017 0.    0.    0.    0.01  0.009 0.    0.   ]
