In [24]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import numpy as np
import sounddevice as sd
from scipy.spatial.distance import cosine
from sklearn.preprocessing import StandardScaler

# Recording configuration
SAMPLE_RATE = 16_000  # samples per second; also handed to the processor as sampling_rate
DURATION = 10  # seconds captured by each call to record_audio

# One checkpoint name drives both the feature processor and the CTC model,
# so the two always come from the same pre-trained release.
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

def record_audio(duration, sample_rate):
    """Capture a mono float32 recording from the default input device.

    Args:
        duration: Length of the recording in seconds.
        sample_rate: Samples per second requested from the device.

    Returns:
        The recording flattened to a one-dimensional array.
    """
    print(f"Recording for {duration} seconds...")
    frame_count = int(duration * sample_rate)
    recording = sd.rec(
        frame_count,
        samplerate=sample_rate,
        channels=1,
        dtype='float32',
    )
    # Wait for the capture to finish before the buffer is read.
    sd.wait()
    print("Recording complete.")
    # A single channel is recorded; flatten drops the channel axis.
    return recording.flatten()

def process_audio(audio):
    """Run a raw waveform through the processor and return its model inputs.

    Args:
        audio: Waveform samples, interpreted at SAMPLE_RATE.

    Returns:
        The ``input_values`` PyTorch tensor produced by the processor.
    """
    encoded = processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    return encoded.input_values

def extract_features(audio, use_hidden_states=False):
    """Return a fixed-length, time-averaged feature vector for one recording.

    By default the vector is the mean over the time axis of the CTC logits,
    i.e. the same per-frame vocabulary scores that ``speech_to_text`` decodes
    into characters.

    NOTE(review): those logits are trained to predict characters, so the
    resulting vector likely tracks *what was said* more than *who said it*.
    Pass ``use_hidden_states=True`` to average the final encoder hidden
    states instead, which is a richer representation for speaker comparison.
    Enrollment and verification must use the same setting, because the two
    modes produce vectors of different length.

    Args:
        audio: Waveform samples accepted by ``process_audio``.
        use_hidden_states: When True, pool the last encoder hidden states
            instead of the vocabulary logits. Defaults to False, which
            preserves the original behaviour.

    Returns:
        A NumPy array holding the time-averaged features of the recording.
    """
    input_values = process_audio(audio)
    with torch.no_grad():
        # None leaves the model's configured default untouched on the
        # original (logit) path; True asks for every layer's hidden states.
        outputs = model(
            input_values,
            output_hidden_states=True if use_hidden_states else None,
        )
    frames = outputs.hidden_states[-1] if use_hidden_states else outputs.logits
    # Average over the time axis, then drop the singleton batch axis.
    return frames.mean(dim=1).squeeze().numpy()

def speech_to_text(audio):
    """Transcribe a recording with greedy decoding of the CTC model output.

    Args:
        audio: Waveform samples accepted by ``process_audio``.

    Returns:
        The decoded transcription of the first (and only) batch item.
    """
    with torch.no_grad():
        frame_scores = model(process_audio(audio)).logits
    # Greedy choice: highest-scoring vocabulary entry for every frame.
    best_token_ids = frame_scores.argmax(dim=-1)
    decoded_batch = processor.batch_decode(best_token_ids)
    return decoded_batch[0]

def train_voice_model(target_voice_samples, other_voice_samples):
    """Fit a feature scaler and compute the target speaker's centroid.

    The scaler is fitted on the target and non-target feature vectors
    together; the centroid is the mean of the *target* vectors only, taken
    after scaling.

    Args:
        target_voice_samples: Feature vectors of the target speaker.
        other_voice_samples: Feature vectors of other speakers.

    Returns:
        A ``(target_centroid, scaler)`` tuple.
    """
    target_matrix = np.array(target_voice_samples)
    other_matrix = np.array(other_voice_samples)

    # fit() returns the scaler itself, so construction and fitting chain.
    scaler = StandardScaler().fit(np.vstack([target_matrix, other_matrix]))

    target_centroid = scaler.transform(target_matrix).mean(axis=0)
    return target_centroid, scaler

def recognize_voice(features, target_centroid, scaler, threshold=0.5):
    """Decide whether a feature vector is close enough to the target centroid.

    Args:
        features: One-dimensional feature array for the recording under test.
        target_centroid: Centroid of the target speaker in scaled space.
        scaler: Fitted object exposing ``transform`` for 2-D input.
        threshold: Cosine distance below which the voice is accepted.

    Returns:
        A ``(is_match, distance)`` tuple, where ``distance`` is the cosine
        distance between the scaled features and the centroid.
    """
    # The scaler works on 2-D input, so present the vector as a single row.
    as_row = features.reshape(1, -1)
    scaled_vector = scaler.transform(as_row).squeeze()
    distance = cosine(scaled_vector, target_centroid)
    is_match = distance < threshold
    return is_match, distance

def sentence_match(transcribed_text, target_sentence, threshold=0.7):
    """Score how many of the target sentence's words appear in a transcription.

    Both texts are lower-cased and split on whitespace, and punctuation
    surrounding each word is stripped, so a target such as "Hello, world!"
    can match a punctuation-free transcription. The score is the fraction
    of distinct target words found in the transcription.

    Args:
        transcribed_text: Text produced by the speech recogniser.
        target_sentence: Sentence the speaker was asked to say.
        threshold: Minimum fraction of target words required for a match.

    Returns:
        A ``(is_match, similarity)`` tuple. When the target sentence has no
        words the result is ``(False, 0.0)`` rather than a division error.
    """
    # Punctuation to peel off word edges; apostrophes are left alone so
    # contractions such as "don't" stay intact.
    edge_punctuation = ".,!?;:\"()[]{}"

    def _word_set(text):
        stripped = (token.strip(edge_punctuation) for token in text.lower().split())
        return {token for token in stripped if token}

    target_words = _word_set(target_sentence)
    if not target_words:
        # Nothing to compare against; the ratio below would divide by zero.
        return False, 0.0

    transcribed_words = _word_set(transcribed_text)
    similarity = len(transcribed_words & target_words) / len(target_words)
    return similarity >= threshold, similarity

def recognize_voice_and_sentence(audio, target_centroid, scaler, target_sentence):
    """Check that a recording is both the target voice and the target sentence.

    Prints the voice distance, the transcription and the sentence similarity
    as it goes.

    Args:
        audio: Waveform samples of the recording under test.
        target_centroid: Centroid returned by ``train_voice_model``.
        scaler: Scaler returned by ``train_voice_model``.
        target_sentence: Sentence the speaker was asked to say.

    Returns:
        Truthy only when both the voice check and the sentence check pass.
    """
    # Speaker check.
    voice_ok, voice_distance = recognize_voice(
        extract_features(audio), target_centroid, scaler
    )
    print(f"Voice recognition distance: {voice_distance:.4f}")

    # Content check.
    transcribed_text = speech_to_text(audio)
    print(f"Transcribed text: {transcribed_text}")

    sentence_ok, sentence_similarity = sentence_match(transcribed_text, target_sentence)
    print(f"Sentence similarity: {sentence_similarity:.4f}")

    return voice_ok and sentence_ok

if __name__ == "__main__":
    # Phrase the speaker is asked to say during the verification step.
    target_sentence = "Hello world"

    # --- Enrollment: five recordings of the target, then five of others ---
    print("Collecting training data...")
    print("Please record 5 samples of the target voice saying anything.")
    target_voice_samples = [
        extract_features(record_audio(duration=DURATION, sample_rate=SAMPLE_RATE))
        for _ in range(5)
    ]

    print("Now, please record 5 samples of other voices saying anything.")
    other_voice_samples = [
        extract_features(record_audio(duration=DURATION, sample_rate=SAMPLE_RATE))
        for _ in range(5)
    ]

    # --- Training: scaler over all samples, centroid over the target only ---
    print("Training the voice recognition model...")
    target_centroid, scaler = train_voice_model(
        target_voice_samples=target_voice_samples,
        other_voice_samples=other_voice_samples,
    )

    # --- Verification: one fresh recording checked for voice and sentence ---
    print(f"\nNow, let's test the system. Please say: '{target_sentence}'")
    test_audio = record_audio(duration=DURATION, sample_rate=SAMPLE_RATE)

    result = recognize_voice_and_sentence(
        audio=test_audio,
        target_centroid=target_centroid,
        scaler=scaler,
        target_sentence=target_sentence,
    )
    print(f"Recognition result: {result}")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Collecting training data...
Please record 5 samples of the target voice saying anything.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Now, please record 5 samples of other voices saying anything.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Recording for 10 seconds...
Recording complete.
Training the voice recognition model...

Now, let's test the system. Please say: 'Hello world'
Recording for 10 seconds...
Recording complete.
Voice recognition distance: 0.5636
Transcribed text: OW WE EXPRESS OURSELF CAN MAKE IT DIFFICULT FOR US NORFO E MUST MEAN TO UNDERSTAND US PROWL FRECTLY BUT WI ON GOING ROM AND THESE FELLERS HARAD WE ON
Sentence similarity: 0.

In [26]:
    # Re-run of the verification step only. Depends on target_sentence,
    # target_centroid and scaler still being in the kernel from the cell above.
    print(f"\nNow, let's test the system. Please say: '{target_sentence}'")
    test_audio = record_audio(duration=DURATION, sample_rate=SAMPLE_RATE)

    result = recognize_voice_and_sentence(
        audio=test_audio,
        target_centroid=target_centroid,
        scaler=scaler,
        target_sentence=target_sentence,
    )
    print(f"Recognition result: {result}")


Now, let's test the system. Please say: 'Hello world'
Recording for 10 seconds...
Recording complete.
Voice recognition distance: 1.0674
Transcribed text: SO THIS IS HALLO WALD HALLO WARL HALLO WARLD HALLO WARLD HALLO WARLD HALLO WARD S
Sentence similarity: 0.0000
Recognition result: False
