In [1]:
#Getting our audio
import sounddevice as sd
import numpy as np
import wave

def record_audio(filename, duration=5, fs=44100):
    """Records audio from the microphone and saves it to a WAV file.

    Args:
        filename (str): The name of the file to save the audio to.
        duration (int): The duration of the recording in seconds.
        fs (int): The sampling rate (samples per second).
    """
    print("Recording...")
    audio_data = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype=np.int16)
    sd.wait()
    print("Finished recording.")

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes because of np.int16
        wf.setframerate(fs)
        wf.writeframes(audio_data.tobytes())

In [2]:
#test display our audio
import librosa
import IPython.display as ipd

y, sr = librosa.load("output.wav")
ipd.display(ipd.Audio(y, rate=sr))

In [3]:
from phoneme_extractor import PhonemeExtractor
extractor = PhonemeExtractor()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from grapheme_to_phoneme import grapheme_to_phoneme
ground_truth_phonemes = grapheme_to_phoneme("toblerone")

In [5]:
from process_audio import process_audio_array

output = process_audio_array(ground_truth_phonemes=ground_truth_phonemes, audio_array=y, sampling_rate=16000, extraction_model=extractor)
print(output)
print(ground_truth_phonemes)

(['DH-AH0 K-W-IH1-K P-R-AW1-N F-A1-K-S JH-AH1-M-P-T OW1-V-ER0 DH-AH0 L-EY1-Z-IY0 D-AO1-G'], 4.0, ['DH', 'AH', 'K', 'W', 'IH', 'K', 'P', 'R', 'AW', 'N', 'F', 'A', 'K', 'S', 'JH', 'AH', 'M', 'P', 'T', 'OW', 'V', 'ER', 'DH', 'AH', 'L', 'EY', 'Z', 'IY', 'D', 'AO', 'G'])
['T', 'OW', 'B', 'L', 'ER', 'OW', 'N']


In [6]:
#Combining everything into a single function
def record_and_process_pronunciation(text, extraction_model):
    record_audio("output.wav")
    ground_truth_phonemes = grapheme_to_phoneme(text)
    y, sr = librosa.load("output.wav")
    output = process_audio_array(ground_truth_phonemes=ground_truth_phonemes, audio_array=y, sampling_rate=16000, extraction_model=extraction_model)
    return output, ground_truth_phonemes

In [14]:
scentence = "The dog is happy"
print(f"Say: {scentence}")
output, ground_truth_phonemes = record_and_process_pronunciation(scentence, extractor)
print(f"Output: {output}")
print(f"Ground truth: {ground_truth_phonemes}")


#GPT stuff
print(f'Attempted scentence: {scentence}')
print(f'Ground truth phonemes: {ground_truth_phonemes}')
print(f'Student phonemes: {output[2]}')

Say: The dog is happy
Recording...
Finished recording.
Output: ([''], 1.0, [''])
Ground truth: ['DH', 'AH', 'D', 'AO', 'G', 'IH', 'Z', 'HH', 'AE', 'P', 'IY']
Attempted scentence: The dog is happy
Ground truth phonemes: ['DH', 'AH', 'D', 'AO', 'G', 'IH', 'Z', 'HH', 'AE', 'P', 'IY']
Student phonemes: ['']
