In [2]:
import os
from dotenv import load_dotenv

load_dotenv()
from openai import OpenAI

# Read the API key that load_dotenv() made available from the .env file.
openai_api_key = os.getenv("OPENAI_API_KEY")
# Pass the key explicitly so the lookup above is actually used. When the value
# is None the SDK falls back to the OPENAI_API_KEY environment variable on its
# own, so this behaves the same as a bare OpenAI() call.
client = OpenAI(api_key=openai_api_key)


## ASR (Automatic Speech Recognition)

In [3]:
def speech_to_text_conversion(file_path):
    """Transcribe an audio file to text using OpenAI's Whisper model.

    Args:
        file_path: Path of the audio file to transcribe. It is opened in
            binary read mode and handed to the transcription endpoint.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    # The context manager guarantees the file handle is closed once the
    # request finishes, even if the API call raises.
    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",  # Model to use for transcription
            file=audio_file,  # Audio file to transcribe
        )
    return transcription.text

In [15]:
# Run the transcription helper on a local audio file.
# NOTE(review): this is an absolute, machine-specific path — point it at an
# audio file that exists on your machine before re-running this cell.
speech_to_text_conversion("C:/Code/Voice_Based_Chatbot_OpenAI/20241128015307_speech.webm")


'The Atlanta Braves won the World Series in 2021, defeating the Houston Astros four games to two.'

## Text Chat


In [7]:
def text_chat(text, model="o1-mini"):
    """Send a user message to an OpenAI chat model and return the reply text.

    Every request starts with a fixed example exchange about the 2020 World
    Series, followed by ``text`` as the newest user turn.

    Args:
        text: Content of the final user message.
        model: Name of the chat-completions model to call. Defaults to
            ``"o1-mini"``.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    # No system message is sent.
    # NOTE(review): presumably omitted because the default model did not
    # accept the "system" role — confirm before adding one.
    messages = [
        {"role": "user", "content": "Who won the world series in 2020?"},
        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
        {"role": "user", "content": text},
    ]
    # Generate response using OpenAI
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

In [8]:
# Follow-up question: text_chat always sends a fixed 2020 World Series
# exchange first, so "2021" is asked in the context of that conversation.
text_chat("How about 2021?")

'The Atlanta Braves won the World Series in 2021, defeating the Houston Astros four games to two.'

## TTS (Text-to-Speech)

In [13]:
import tempfile
import datetime
def text_to_speech_conversion(text):
    """Convert text to spoken audio using OpenAI's text-to-speech model ``tts-1``.

    Args:
        text: Text to synthesize.

    Returns:
        The synthesized audio as ``bytes``, or ``None`` when ``text`` is
        empty (falsy).
    """
    if not text:  # Nothing to synthesize
        return None

    # Stream into a private temporary directory instead of the working
    # directory: the name cannot collide with a concurrent call, and the
    # directory (with the file in it) is removed even when streaming or
    # reading raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        speech_file_path = os.path.join(scratch_dir, "speech_audio")

        with client.audio.speech.with_streaming_response.create(
            model="tts-1",  # Model to use for text-to-speech conversion
            voice="fable",  # Voice to use for speech synthesis
            input=text,  # Text to convert to speech
        ) as response:
            response.stream_to_file(speech_file_path)  # Streaming synthesized speech to file

        # Read the audio back as binary data before the directory is cleaned up
        with open(speech_file_path, "rb") as audio_file:
            audio_data = audio_file.read()

    return audio_data

In [14]:
# Synthesize a sample sentence; the cell output is the raw audio bytes
# returned by the helper.
text_to_speech_conversion('The Atlanta Braves won the World Series in 2021, defeating the Houston Astros four games to two.')

b'\xff\xf3\xe4\xc4\x00^\x9c:\x08\x0c\xe6\xb0\x04\x10\xb6|\x87q\x96Z&\x9f*\x9c\x9a\xe7-\x99\xad^j\xd0\x99\xf4&t\xa9\xfa\xc7I\x1c\x18jA\x98\xc6c\x19\x08d \x08\x86P\x80\x86Y2\xc9\x96L\xb3e\xa7-9x\xcbh^\x04\x02 \x11\x00\x88\x04@"\x01\x11A\x07\xd0\x0e\x8a\xe8\xae\x91i\x8e\x98\xe9\x8e\xa0\xe9\x10\xa0\x8a\x91R*EH\xa9\x15"\xc4PuN\xa9\xd7z\xef]\xeb\xbdw\xae\xf5\x88\xbb\x17b\xec]\x8b\xb1v.\xc6 \xbb\xd7{\x13gl\xed\x9d\xb3\xb6v\xd7\xd5"\xa4]\x8b\xb1v.\xc5\xd8\xa9\x18\x83\x13gl\xed\x87\xb16v\xce\xdc\xb7-\xc8r\x1a\xe3\x90\xe49\rq\xc8\x7f\x1a\xfb\x96\xce\xda\xfb\x96\xe5\xb9n[\xbe\xe49\x0eC\x90\xe49\x0eC\x90\xee9n[\xfe\xe5\xbb\xef\xfb\xfe\xe5\xbb\xef\xe3\xf8\xfe?\x8c\xe1\x9c5\xc6\xb0\xee;\xee\xdb\x96\xe5\xb9n[\xbe\xff\xbb\xeeC\xf8\xfe9\x0eC\xf8\xfe9\x0e\xe3\x96\xe5\xb9n[\x96\xef\xbbn[\xbe\xe49\x0eC\x90\xe49\x0eC\x90\xee5\xf7-\xcbr\xdc\xb7-\xcbr\xdd\xf7q\xfcr\x1c\x87!\xc8w\x1f\xc6p\xbb\xd8\x9b_gl\xed\x9d\xb0\xf6v\xd7\xdc\x87!\xc8k\x0c\xe1\xc8g\rq\xdcr\xdf\xf7-\xaf\xb9n[\x96\xe5\xbb\xeeCXk\x8eCXk\x8eB\x