In [1]:
import json
import os
import tempfile
import time

import numpy as np
import sounddevice as sd
import soundfile as sf
import torch
import torchaudio
from scipy.spatial.distance import cosine
from speechbrain.pretrained import EncoderClassifier


  from .autonotebook import tqdm as notebook_tqdm
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
torchvision is not available - cannot save figures
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


In [None]:

# -----------------------------
# CONFIG
# -----------------------------
# Enrollment store: a JSON object mapping each user name to a list of embeddings.
JSON_FILE = "enrolled_users.json"

# Recording limits, both expressed in seconds.
MIN_DURATION = 10     # a recording is never stopped before this much time
SILENCE_DURATION = 2  # length of trailing quiet that ends a recording

# RMS amplitude below which a recorded chunk is treated as silence.
SILENCE_THRESHOLD = 0.1

# Pretrained speaker-embedding model; loaded a single time and reused by every call.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

# -----------------------------
# Functions
# -----------------------------

def record_audio(min_duration=MIN_DURATION):
    """Record mono microphone audio until the speaker falls silent.

    Audio is captured at 16 kHz in 0.2-second chunks. Recording ends once at
    least ``min_duration`` seconds of audio have been captured AND the most
    recent ``SILENCE_DURATION`` seconds of chunks all had an RMS amplitude
    below ``SILENCE_THRESHOLD``.

    Args:
        min_duration: Minimum amount of audio to capture, in seconds.

    Returns:
        tuple: ``(audio, samplerate)`` where ``audio`` is a 1-D float32 NumPy
        array holding the recorded samples and ``samplerate`` is 16000.
    """
    print(f"🎙 Speak now (at least {min_duration} seconds). Recording will stop when you're silent...")

    samplerate = 16000
    channels = 1
    chunk_frames = int(samplerate * 0.2)  # 0.2 sec chunks

    # Progress is tracked in whole frames instead of by repeatedly adding 0.2:
    # ten float additions of 0.2 yield 1.9999999999999998, so a
    # `silence_time >= SILENCE_DURATION` test only fires one chunk too late.
    min_frames = min_duration * samplerate
    silence_frames_needed = SILENCE_DURATION * samplerate

    recording = []
    total_frames = 0
    silent_frames = 0
    overflow_count = 0

    # The context manager starts the stream on entry and stops/closes it on
    # exit, including when start-up or a read raises.
    with sd.InputStream(samplerate=samplerate, channels=channels, dtype='float32') as stream:
        while True:
            data, overflowed = stream.read(chunk_frames)
            if overflowed:
                overflow_count += 1

            chunk = np.copy(data[:, 0])  # keep the single channel as a 1-D array
            recording.append(chunk)
            total_frames += len(chunk)

            # RMS to detect silence; any loud chunk resets the quiet streak.
            rms = np.sqrt(np.mean(chunk ** 2))
            if rms < SILENCE_THRESHOLD:
                silent_frames += len(chunk)
            else:
                silent_frames = 0

            if total_frames >= min_frames and silent_frames >= silence_frames_needed:
                break

    if overflow_count:
        print(f"⚠️ Input buffer overflowed {overflow_count} time(s); some audio may have been dropped.")

    audio = np.concatenate(recording, axis=0)
    print(f"✅ Recorded {len(audio)/samplerate:.2f} seconds of audio.")
    return audio, samplerate

def extract_embedding(audio, samplerate):
    """Extract speaker embedding from audio array.

    The samples are written to a scratch WAV file, read back with torchaudio,
    down-mixed to mono, resampled to 16 kHz when necessary and passed to the
    module-level ``classifier``.

    Args:
        audio: Audio samples in a form accepted by ``soundfile.write``.
        samplerate: Sampling rate of ``audio`` in Hz.

    Returns:
        list: The output of ``classifier.encode_batch`` with its leading batch
        dimension removed, converted to nested Python lists.

    Raises:
        ValueError: If ``audio`` contains no samples.
    """
    if len(audio) == 0:
        raise ValueError("Cannot extract an embedding from empty audio.")

    # Per its model card, spkrec-ecapa-voxceleb is trained on 16 kHz
    # single-channel audio, so that is the rate the encoder must be fed.
    model_rate = 16000

    # A private scratch directory gives every call its own file (no clashes
    # between overlapping calls) and is removed even if writing/loading fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_path = os.path.join(scratch_dir, "temp_audio.wav")
        sf.write(temp_path, audio, samplerate)
        signal, fs = torchaudio.load(temp_path)  # [channels, time]

    # Convert to mono if stereo; the result stays [1, time], i.e. batch size 1
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    # Resample to the model's rate (the previous check compared the file's
    # rate with the rate it had just been written at, so it never triggered).
    if fs != model_rate:
        transform = torchaudio.transforms.Resample(orig_freq=fs, new_freq=model_rate)
        signal = transform(signal)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embedding_tensor = classifier.encode_batch(signal)
    embedding = embedding_tensor.squeeze(0).detach().cpu().numpy()

    return embedding.tolist()

def load_enrolled_data():
    """Return the parsed contents of ``JSON_FILE``, or ``{}`` if it does not exist."""
    if not os.path.exists(JSON_FILE):
        return {}

    with open(JSON_FILE, "r") as handle:
        return json.load(handle)

def save_enrolled_data(data):
    """Persist ``data`` to ``JSON_FILE`` as indented JSON without risking the old file.

    The JSON is first written to a sibling ``<JSON_FILE>.tmp`` file and then
    moved over ``JSON_FILE`` with ``os.replace``. Opening ``JSON_FILE``
    directly in ``"w"`` mode truncates it immediately, so a serialization
    error or interruption part-way through would destroy every stored
    enrollment.

    Args:
        data: JSON-serializable object to store.

    Raises:
        TypeError: If ``data`` contains values ``json.dump`` cannot serialize.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{JSON_FILE}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_path, JSON_FILE)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # the partial file left by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def enroll_user(name):
    """Record a voice sample and store its embedding under ``name``.

    A new embedding is appended to the user's existing list, or starts a new
    list when the name has not been seen before.

    Returns:
        tuple: ``(True, message)`` on success, ``(False, message)`` if any
        step raised an exception.
    """
    try:
        samples, rate = record_audio(min_duration=MIN_DURATION)
        voiceprint = extract_embedding(samples, rate)

        users = load_enrolled_data()
        # Create the per-user list on first enrollment, then add to it.
        users.setdefault(name, []).append(voiceprint)
        save_enrolled_data(users)

        return True, f"User '{name}' enrolled successfully!"
    except Exception as err:
        return False, f"Error during enrollment: {str(err)}"

def check_user(threshold=0.3):
    """Record a speaker and find the closest enrolled voice.

    The recording's embedding is compared with every stored embedding using
    cosine distance; the smallest distance wins.

    Args:
        threshold: Maximum cosine distance that still counts as a match.

    Returns:
        tuple: ``(matched, message, best_name, best_score)``. ``matched`` is
        True only when the smallest distance is below ``threshold``.
        ``best_name`` and ``best_score`` are ``None`` when there is nothing to
        compare against or when an exception occurred.
    """
    try:
        enrolled_data = load_enrolled_data()
        # Return before recording when no user has any stored embedding;
        # otherwise the caller would sit through a full recording only to be
        # told "Closest guess: None (distance=inf)".
        if not enrolled_data or not any(enrolled_data.values()):
            return False, "No users enrolled yet.", None, None

        audio, sr = record_audio(min_duration=MIN_DURATION)
        test_emb = np.array(extract_embedding(audio, sr)).flatten()

        best_name = None
        best_score = float('inf')

        for name, emb_list in enrolled_data.items():
            for emb in emb_list:
                emb_vec = np.array(emb).flatten()
                score = cosine(test_emb, emb_vec)
                if score < best_score:
                    best_score = score
                    best_name = name

        # No distance compared below infinity (for instance, every score was
        # NaN): report that instead of returning an infinite "distance".
        if best_name is None:
            return False, "No valid enrolled embeddings to compare against.", None, None

        if best_score < threshold:
            return True, f"Match: {best_name} (distance={best_score:.3f})", best_name, best_score
        else:
            return False, f"No match under threshold. Closest guess: {best_name} (distance={best_score:.3f})", best_name, best_score
    except Exception as e:
        return False, f"Error during verification: {str(e)}", None, None

def get_enrolled_users():
    """Return the names of all enrolled users as a list."""
    return [user for user in load_enrolled_data().keys()]

In [1]:
!pip freeze

asttokens==3.0.0
blinker==1.9.0
certifi==2025.8.3
cffi==2.0.0
charset-normalizer==3.4.3
click==8.3.0
colorama==0.4.6
comm==0.2.3
debugpy==1.8.17
decorator==5.2.1
exceptiongroup==1.3.0
executing==2.2.1
filelock==3.19.1
Flask==2.3.3
fsspec==2025.9.0
huggingface-hub==0.35.0
HyperPyYAML==1.2.2
idna==3.10
ipykernel==6.30.1
ipython==8.37.0
itsdangerous==2.2.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.2
jupyter_client==8.6.3
jupyter_core==5.8.1
MarkupSafe==3.0.2
matplotlib-inline==0.1.7
mpmath==1.3.0
nest-asyncio==1.6.0
networkx==3.4.2
numpy==1.24.3
packaging==25.0
parso==0.8.5
platformdirs==4.4.0
prompt_toolkit==3.0.52
psutil==7.1.0
pure_eval==0.2.3
pycparser==2.23
Pygments==2.19.2
python-dateutil==2.9.0.post0
pywin32==311
PyYAML==6.0.2
pyzmq==27.1.0
requests==2.32.5
ruamel.yaml==0.18.15
ruamel.yaml.clib==0.2.12
scipy==1.11.3
sentencepiece==0.2.1
six==1.17.0
sounddevice==0.4.6
soundfile==0.12.1
speechbrain==0.5.15
stack-data==0.6.3
sympy==1.14.0
torch==2.0.1
torchaudio==2.0.2
tornado==6.5.2
tqd

In [2]:
!pip freeze > requirements.txt