In [8]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
# Choose the compute device at runtime instead of hard-coding one platform:
# CUDA when present, otherwise Apple's MPS backend, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Half precision on an accelerator; full precision when falling back to the CPU.
torch_dtype = torch.float32 if device == "cpu" else torch.float16

model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text model weights in the selected dtype and move them
# onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor that the
# pipeline below is built from.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Smoke test: transcribe a sample file (path is relative to the working directory).
sample = "harvard.wav"

result = pipe(sample)
print(result["text"])


In [None]:
#LLM interaction
# https://ollama.com/library/llama3.2
import ollama

from ollama import chat
from ollama import ChatResponse

# Send a single user message to the llama3.2 model via ollama's chat().
response: ChatResponse = chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
)

# The reply text is printed twice to show both access styles:
# subscripting first, then attribute access on the response object.
print(response['message']['content'])
print(response.message.content)


In [None]:
def get_response(prompt, model='llama3.2'):
    """Send ``prompt`` as a single user message to an Ollama chat model.

    Args:
        prompt: Text placed in the ``content`` field of the user message.
        model: Name of the Ollama model to query. Defaults to ``'llama3.2'``,
            so existing one-argument calls behave as before.

    Returns:
        The value found at ``response['message']['content']`` in the object
        returned by ``ollama.chat``.
    """
    messages = [{'role': 'user', 'content': prompt}]
    response = ollama.chat(model=model, messages=messages)
    return response['message']['content']

# Quick manual check of the helper defined above.
get_response(prompt="How are you doing today")

In [2]:
import pyaudio
import wave

def record_audio(filename="prompt.mp3", duration=3, sample_rate=44100, channels=1, chunk=1024):
    """Record audio from a PyAudio input stream and save it as a WAV file.

    The recording is always written as WAV. If ``filename`` ends with
    ``.mp3`` that trailing extension is swapped for ``.wav``; any other
    filename is used unchanged. Only the trailing extension is rewritten, so
    directory names that happen to contain ``.mp3`` are left alone.

    Args:
        filename: Requested output path (see the extension rule above).
        duration: Length of the recording in seconds.
        sample_rate: Sampling rate passed to the stream and the WAV header.
        channels: Number of channels passed to the stream and the WAV header.
        chunk: Frames requested per ``stream.read`` call.

    Returns:
        The path of the WAV file that was written.
    """
    if filename.endswith('.mp3'):
        wav_path = filename[:-len('.mp3')] + '.wav'
    else:
        wav_path = filename

    # Work in whole frames so the trailing partial chunk is not dropped
    # (and so durations shorter than one chunk still record something).
    total_frames = max(0, int(sample_rate * duration))
    full_chunks, remainder = divmod(total_frames, chunk)

    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(pyaudio.paInt16)

        stream = p.open(format=pyaudio.paInt16,
                        channels=channels,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk)
        try:
            print("Recording...")

            frames = [stream.read(chunk) for _ in range(full_chunks)]
            if remainder:
                frames.append(stream.read(remainder))

            print("Recording finished.")
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Save the recorded data as a WAV file; the context manager closes the
    # file even if writing raises.
    with wave.open(wav_path, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

    print(f"Audio saved as {wav_path}")
    return wav_path

# Demo: record one clip using the function's default arguments.
record_audio();


In [5]:
def transcribe_audio(audio_file):
    """Pass ``audio_file`` to the module-level ``pipe`` and return the ``"text"`` entry of its result."""
    return pipe(audio_file)["text"]


In [None]:
# Voice round trip: record a clip, transcribe it, then send the text to the LLM.
record_audio()  # the default "prompt.mp3" filename is saved as prompt.wav

# Transcribe once and reuse the result; a second call would repeat the
# inference only to discard its output.
prompt = transcribe_audio("prompt.wav")
print(prompt)

response = get_response(prompt)
print(response)
