# Installs & Imports

In [1]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q sounddevice wavio
!pip install -q ipywebrtc notebook

!apt install -q ffmpeg
!apt-get install -q libportaudio2

!pip install -q langchain langchain_huggingface transformers sentence_transformers
!pip install -q ipywidgets

!jupyter nbextension enable --py widgetsnbextension

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hReading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists...
Building dependency tree...
Reading state information...
The following NEW packages will be installed:
  libportaudio2
0 upgraded, 1 newly installed,

In [2]:
import os
import time
import numpy as np

try:
    import tensorflow
except ImportError:
    pass

import torch
import whisper

from IPython.display import Audio, display, clear_output
import ipywidgets as widgets
from ipywebrtc import AudioRecorder, CameraStream

from langchain import PromptTemplate, LLMChain
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from google.colab import output
output.enable_custom_widget_manager()

# Recording Audio

In [3]:
def record_audio_widget():
    """
    Displays an audio recorder widget and saves the recording as a WebM file.

    Returns:
        str: The path to the saved WebM file.
    """
    # Set up the camera stream and recorder
    camera = CameraStream(constraints={'audio': True, 'video': False})
    recorder = AudioRecorder(stream=camera)
    display(recorder)

    print("Please use the recorder widget above to record your audio.")
    print("Click 'Record', speak into your microphone, and then click 'Stop'.")

    # Wait until the user has recorded audio
    def _wait_for_audio():
        import time
        while recorder.audio.value is None:
            time.sleep(0.1)
    _wait_for_audio()

    # Save the recording as a WebM file
    webm_filename = 'recording.webm'
    with open(webm_filename, 'wb') as f:
        f.write(recorder.audio.value)

    print(f"Audio recording saved as {webm_filename}")
    return webm_filename, recorder

In [4]:
# Record audio
print("### Record Audio ###")
webm_file, recorder = record_audio_widget()

### Record Audio ###


AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

Please use the recorder widget above to record your audio.
Click 'Record', speak into your microphone, and then click 'Stop'.
Audio recording saved as recording.webm


In [5]:
# Convert the WebM file to WAV
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
!ffmpeg -i recording.webm -ac 1 -f wav my_recording.wav -y -hide_banner -loglevel panic
wav_file = 'my_recording.wav'

# Check if WAV file exists
if wav_file and os.path.exists(wav_file):
    file_size = os.path.getsize(wav_file)
    print(f"'{wav_file}' exists with size {file_size} bytes.")
else:
    print(f"'{wav_file}' does not exist.")

'my_recording.wav' exists with size 414798 bytes.


# Transcription

In [6]:
def transcribe_audio(audio_filename, model_size='base'):
    """
    Transcribes the given audio file using OpenAI's Whisper model.

    Args:
        audio_filename (str): The path to the audio file to transcribe.
        model_size (str): The size of the Whisper model to use.

    Returns:
        str: The transcribed text.
    """
    # Load the Whisper model
    print(f"Loading Whisper model '{model_size}'...")
    model = whisper.load_model(model_size)

    # Transcribe the audio file
    print("Transcribing audio...")
    result = model.transcribe(audio_filename)

    transcribed_text = result['text']
    print("Transcription complete.")
    return transcribed_text

In [7]:
# Transcribe audio
print("### Transcribe Audio ###")
transcribed_text = transcribe_audio(audio_filename=wav_file, model_size='base')

# Display the transcription
print("\nTranscribed Text:")
print(transcribed_text)

### Transcribe Audio ###
Loading Whisper model 'base'...


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 134MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcribing audio...
Transcription complete.

Transcribed Text:
 What is the capital of France?


# Loading the Model

In [12]:
# Function to load HuggingFace model with forced GPU loading
def load_huggingface_model(model_name="EleutherAI/gpt-neo-2.7B"):
    """
    Loads a HuggingFace model and wraps it in a LangChain HuggingFacePipeline.

    Args:
        model_name (str): The HuggingFace model name.

    Returns:
        HuggingFacePipeline: The LangChain pipeline wrapping the model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model and move it to GPU
    print(f"Loading model '{model_name}' onto GPU...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Use half-precision to save memory
        low_cpu_mem_usage=True
    ).to('cuda')

    # Create a text generation pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0,  # Specify GPU device
        max_new_tokens=150,
        temperature=0.01,
        top_p=0.95,
        repetition_penalty=10.0,
        pad_token_id=tokenizer.eos_token_id
    )

    return HuggingFacePipeline(pipeline=pipe)

In [13]:
# Load the HuggingFace model
print("### Loading HuggingFace Model ###")
llm = load_huggingface_model(model_name="EleutherAI/gpt-neo-2.7B")

### Loading HuggingFace Model ###




Loading model 'EleutherAI/gpt-neo-2.7B' onto GPU...


# Question & Answer

In [22]:
def answer_question(question, llm):
    """
    Answers a question using a language model, utilizing LangChain.

    Args:
        question (str): The question to answer.
        llm (HuggingFacePipeline): The language model pipeline.

    Returns:
        str: The answer to the question.
    """
    # Create a PromptTemplate
    template = """Answer the following question accurately and concisely.
    Question: {question}"""

    prompt = PromptTemplate(template=template, input_variables=["question"])

    # Create the chain
    chain = prompt | llm

    # Generate the answer
    answer = chain.invoke({"question": question})

    return answer

In [23]:
# Use the transcribed text as the question
print("### Generating Answer ###")
answer = answer_question(question=transcribed_text, llm=llm)

print("\nAnswer:")
print(answer)

### Generating Answer ###
Generating answer...

Answer:
Answer the following question accurately and concisely.
    Question:  What is the capital of France?
    Answer:  Paris, with a population of over 1 million people.
                                                                                                                                   


In [26]:
# Play back the recorded audio
print("### Play Back Recorded Audio ###")
display(Audio(wav_file))

### Play Back Recorded Audio ###
