In [25]:
# With this function, you can record some audio but other audio files can also be provided for testing the code.

import wave
import sys
import pyaudio

# Number of frames requested from the input stream per `stream.read` call.
CHUNK = 1024
# Sample format handed to PyAudio (16-bit integer samples).
FORMAT = pyaudio.paInt16
# Record one channel on macOS ("darwin") and two channels on every other platform.
CHANNELS = 1 if sys.platform == "darwin" else 2
# Sampling rate in frames per second; used for both the stream and the WAV header.
RATE = 44100

def record_audio(seconds: int, output_path: str = "output.wav") -> str:
    """Record audio from a PyAudio input stream and save it as a WAV file.

    The recording is read in blocks of ``CHUNK`` frames. Because the number of
    blocks per second is ``RATE // CHUNK`` (rounded down), the saved file is
    marginally shorter than ``seconds``.

    Args:
        seconds: Length of the recording in whole seconds.
        output_path: Where to write the WAV file. Defaults to "output.wav".

    Returns:
        The path of the written WAV file.
    """
    chunks_per_second = RATE // CHUNK

    # Acquire the audio device before touching the output file, so a missing
    # or busy input device does not leave an empty WAV file behind.
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)
        try:
            with wave.open(output_path, "wb") as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(p.get_sample_size(FORMAT))
                wf.setframerate(RATE)

                print("Recording...")
                for index in range(chunks_per_second * seconds):
                    # Print a progress line once per recorded second.
                    if index % chunks_per_second == 0:
                        print(f"{index // chunks_per_second} / {seconds}s")
                    wf.writeframes(stream.read(CHUNK))
                print("Done")
        finally:
            # Release the stream even when reading or writing fails midway.
            stream.close()
    finally:
        p.terminate()

    print(f"File saved at {output_path}")
    return output_path

In [2]:
record_audio(10)

Recording...
0 / 10s
1 / 10s
2 / 10s
3 / 10s
4 / 10s
5 / 10s
6 / 10s
7 / 10s
8 / 10s
9 / 10s
Done
File saved at output.wav


'output.wav'

In [8]:
# Open the sample recording in binary mode. The handle is left open because the
# transcription cell further down passes `audio_file` to the API.
audio_file = open("woman_speaking.wav", "rb")

In [5]:
import openai
from dotenv import dotenv_values

# Load the settings stored in the local .env file.
config = dotenv_values(".env")
# Set the API key on the `openai` module from the OPENAI_API_KEY entry.
openai.api_key = config["OPENAI_API_KEY"]

In [6]:
from openai import OpenAI

# `OpenAI()` without arguments only looks at the OPENAI_API_KEY environment
# variable; it does not use the module-level `openai.api_key`, and the .env
# values loaded into `config` are not exported to the environment.
# Pass the loaded key explicitly. `.get` yields None when the entry is absent,
# in which case the client still falls back to the environment variable.
client = OpenAI(api_key=config.get("OPENAI_API_KEY"))

In [39]:
# This API call transcribes an English speaker's audio recording into text. Interestingly, even though the end of the
# speech has been cut off, the model completes it by itself, even though we didn't provide any instructions for it to do so.
# `audio_file` is the handle opened in an earlier cell.

transcription = client.audio.transcriptions.create(
  model="whisper-1", 
  file=audio_file
)
print(transcription.text)

I woke up this morning and the weather was so nice outside and the sun was shining and half an hour later it's already cloudy but I mean this is England what can you do?


In [16]:
# I'm talking about DALL-E in this audio file. Without context, Whisper misspells DALL-E.
# The file handle is kept open here because the next cell reuses `audio_file_dalle`.

audio_file_dalle = open("dalle.wav", "rb")

transcription_dalle = client.audio.transcriptions.create(
  model="whisper-1", 
  file=audio_file_dalle
)
print(transcription_dalle.text)

Hi! I'm here to tell you about Dolly. Dolly is an image generating model. Dolly takes in a text and spits out an image.


In [17]:
# With an additional prompt, we can provide context that helps with the spelling of "DALL-E".
# This cell reuses the `audio_file_dalle` handle opened in the previous cell.

transcription_dalle_with_context = client.audio.transcriptions.create(
    model="whisper-1", 
    file=audio_file_dalle,
    prompt="Woman talks about OpenAI's DALL-E model."
)
print(transcription_dalle_with_context.text)

Hi, I'm here to tell you about DALL-E. DALL-E is an image generating model. DALL-E takes in a text and spits out an image.


In [21]:
# The following API call transcribes a Hungarian audio track.
# Apart from two minor inaccuracies and one "misheard" word, the written text is mostly accurate.

# Open the recording in a `with` block so the file handle is closed once the request has finished.
with open("wiki-Hungary.m4a", "rb") as audio_file_hungarian:
    hungarian_text = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file_hungarian,
    )
print(hungarian_text.text)

A következő szöveg forrása Wikipedia. 2010 óta a demokrácia szintjén jelentős hanyadlás következett be Magyarországon. 2020-ban olyan országokkal kapott hasonló besorolás, mint Brazília, Indonézia, vagy Európában Albánia. A 2020-as brüsszeli jogállamisági jelentés egy sor problémát állt fel. 2022-ben az Európai Parlament nagy többséggel elfogadott egy állásfoglalást, amelyben kimondják, hogy Magyarország már nem teljes értekű demokrácia, hanem választási autokrácia, egy hibrid rezsim.


In [20]:
# The following API call translates a Hungarian audio recording into English text.
# I can confirm that the translation is accurate, despite the inaccuracies in the Hungarian transcription.

# Open the recording in a `with` block so the file handle is closed once the request has finished.
with open("wiki-Hungary.m4a", "rb") as audio_file_hungarian:
    hungarian_text_translation = client.audio.translations.create(
        model="whisper-1",
        file=audio_file_hungarian,
    )
print(hungarian_text_translation.text)

The source of the following text is Wikipedia. Since 2010, there has been a significant decrease in the level of democracy in Hungary. In 2020, similar rankings were obtained with countries such as Brazil, Indonesia, or Albania in Europe. The 2020 Brussels Statistical Report raised a series of problems. In 2022, the European Parliament mostly adopted a position in which it is said that Hungary is no longer a full-fledged democracy, but an election autocracy, a hybrid regime.


In [43]:
# Text-to-speech with the Audio API.

def text_to_speech(prompt, speech_file_path, voice="nova", model="tts-1"):
    """Synthesize speech for `prompt` and save the audio to `speech_file_path`.

    Args:
        prompt: Text to be spoken.
        speech_file_path: Destination path for the generated audio file.
        voice: Voice name sent to the API. Defaults to "nova".
        model: Text-to-speech model name sent to the API. Defaults to "tts-1".
    """
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=prompt,
    )
    response.write_to_file(speech_file_path)


In [34]:
text_to_speech("Today is a wonderful day to build something people love!", "english_speech.mp3")

In [44]:
text_to_speech("Ezen a szép napon alkossunk valami szépet, ami az emberek örömére szolgál!", "hungarian_speech.mp3")