Get your openai api key from settings: https://platform.openai.com/account/api-keys

In [22]:
%pip install openai --upgrade
%pip install pydub simpleaudio
%pip install nltk
%pip install pyaudio numpy
%pip install sounddevice scipy
%pip install -U openai-whisper

Collecting openai
  Downloading openai-1.54.2-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.54.2-py3-none-any.whl (389 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.1
    Uninstalling openai-1.54.1:
      Successfully uninstalled openai-1.54.1
Successfully installed openai-1.54.2
Note: you may need to restart the kernel to use updated packages.









Note: you may need to restart the kernel to use updated packages.




Note: you may need to restart the kernel to use updated packages.




Note: you may need to restart the kernel to use updated packages.




Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
     ---------------------------------------- 0.0/800.5 kB ? eta -:--:--
     ------------- -------------------------- 262.1/800.5 kB ? eta -:--:--
     -------------------------------------- 800.5/800.5 kB 2.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting more-itertools (from openai-whisper)
  Downloading more_itertools-10.5.0-py3-none-any.whl.metadata (36 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Downloading more_itertools-10.5.0-py3-none-any.whl (60 kB)
Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl (884 kB)
   -----



In [None]:
import getpass
from openai import OpenAI
import os



In [None]:
MP3_FOLDER = "audio_chunks"  # Directory where audio chunks are stored

# MP3 Recording Parameters
SAMPLE_RATE = 44100  # Sample rate in Hz
DURATION_THRESHOLD = 1.5  # Duration in seconds to consider silence
SILENCE_THRESHOLD = 0.01  # Amplitude threshold for silence
MAX_CHUNK_SIZE = 25 * 1024 * 1024  # Maximum chunk size in bytes (25 MB)

### Record User Audio

In [None]:
# Records user audio and detects pauses (silence) to segment the recording into separate chunks.
# When silence is detected for a specified duration, the audio is saved as an MP3 file. This allows for efficient 
# segmentation of audio data based on user pauses. The program also ensures that audio chunks do not exceed a set size.

import sounddevice as sd
import numpy as np
import os
import time
from pydub import AudioSegment
from scipy.io.wavfile import write


# Create directory for audio chunks if it doesn't exist
if not os.path.exists(MP3_FOLDER):
    os.makedirs(MP3_FOLDER)

def is_silent(data):
    """Check if the audio data is silent by evaluating if its mean amplitude is below the silence threshold."""
    return np.abs(data).mean() < SILENCE_THRESHOLD

def record_audio():
    """Continuously record audio, saving each chunk when silence is detected for the threshold duration."""
    print("Recording... Speak now.")

    # Initialize the current chunk and silence detection timing
    current_chunk = []
    silence_start_time = None

    # Start the audio input stream
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype='float32') as stream:
        while True:
            # Read one second of audio data from the stream
            data = stream.read(SAMPLE_RATE)[0]
            current_chunk.extend(data.flatten())  # Add the new data to the current chunk

            # Check if the audio data is silent
            if is_silent(data):
                # If silence starts, begin a timer
                if silence_start_time is None:
                    silence_start_time = time.time()
            else:
                # Reset the timer if sound is detected
                silence_start_time = None

            # Stop recording if silence has lasted beyond the duration threshold
            if silence_start_time and (time.time() - silence_start_time) > DURATION_THRESHOLD:
                print("Silence detected. Stopping recording.")
                break  # Exit the loop to stop recording

    # Save the final chunk after breaking out of the loop
    if current_chunk:
        save_audio_chunk(current_chunk)
        print("Final chunk saved.")

def save_temp_wav(file_path, chunk):
    """Save the recorded audio chunk to a WAV file temporarily for further processing or conversion."""
    if len(chunk) == 0:
        return  # Don't save empty chunks
    
    # Convert to WAV and save
    audio_data = np.array(chunk, dtype=np.float32)
    write(file_path, SAMPLE_RATE, audio_data)

def convert_to_mp3(wav_file, mp3_file):
    """Convert a temporary WAV file to MP3 format using the Pydub library."""
    try:
        audio = AudioSegment.from_wav(wav_file)
        audio = audio.set_channels(1)  # Ensure mono channel
        audio = audio.set_frame_rate(16000)  # Set sample rate to 16 kHz (Whisper's recommended rate)
        audio.export(mp3_file, format="mp3", bitrate="128k")  # Set bitrate to 128 kbps
    except Exception as e:
        print(f"Error converting to MP3: {e}")


def save_audio_chunk(chunk):
    """Save the recorded audio chunk to an MP3 file, creating a unique file name."""
    if len(chunk) == 0:
        return  # Don't save empty chunks

    # Create a unique file name for the MP3 file
    chunk_index = len(os.listdir(MP3_FOLDER)) + 1
    mp3_file_path = os.path.join(MP3_FOLDER, f"chunk{chunk_index}.mp3")

    # Convert the chunk to WAV temporarily for saving
    temp_wav_path = os.path.join(MP3_FOLDER, "temp.wav")
    save_temp_wav(temp_wav_path, chunk)

    # Convert to MP3
    convert_to_mp3(temp_wav_path, mp3_file_path)

    print(f"Saved chunk to {mp3_file_path}")



In [None]:
# Trim Audio (dead air)
import os
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

# Default configuration

def get_newest_file(directory):
    """Return the newest file in the specified directory."""
    files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
    if not files:
        return None
    files.sort(key=lambda f: os.path.getmtime(os.path.join(directory, f)), reverse=True)
    return os.path.join(directory, files[0])

def trim_silence(input_file=None, folder=MP3_FOLDER, silence_thresh=-40, min_silence_len=300, buffer_ms=200):
    """
    Trim dead air (silence) from the beginning and end of an MP3 file, leaving a small buffer,
    and save it with '_trim' appended to the filename.

    :param input_file: Path to the input MP3 file. Defaults to the newest file in folder if not specified.
    :param folder: Folder to search for the newest file if input_file is not specified.
    :param silence_thresh: Silence threshold in dB (default is -40 dB).
    :param min_silence_len: Minimum length of silence to consider (in milliseconds).
    :param buffer_ms: Extra milliseconds to leave at the beginning and end of the trimmed audio.
    """
    # Use the newest file in the folder if input_file is not specified
    file_path = input_file if input_file else get_newest_file(folder)
    
    if file_path and os.path.exists(file_path):
        # Load the audio file
        audio = AudioSegment.from_mp3(file_path)
        
        # Detect non-silent portions of the audio
        non_silent_ranges = detect_nonsilent(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
        
        if non_silent_ranges:
            # Get the start and end of the non-silent range and add buffer
            start_trim = max(0, non_silent_ranges[0][0] - buffer_ms)
            end_trim = min(len(audio), non_silent_ranges[-1][1] + buffer_ms)
            
            # Trim the audio with the buffer
            trimmed_audio = audio[start_trim:end_trim]
            
            # Create output filename by appending '_trim' before the file extension
            base, ext = os.path.splitext(file_path)
            output_file = f"{base}_trim{ext}"
            
            # Export the trimmed audio to an MP3 file
            trimmed_audio.export(output_file, format="mp3")
            print(f"Trimmed audio saved as {output_file}")
        else:
            print("No non-silent segments detected. File not saved.")
    else:
        print("No valid audio file found. Please check the directory or specify a file.")

# Example usage
# trim_silence()  # Uses the newest file in the default "audio_chunks" folder
# Or specify a file directly
# trim_silence(input_file="specific_audio.mp3")


Trimmed audio saved as audio_chunks\chunk20_trim.mp3


In [100]:
# Transcibe Speech to Text

def transcribe_audio(mp3_folder=MP3_FOLDER, specified_file=None, use_newest=True):
    """
    Transcribes an audio file using the Whisper model. By default, uses the newest file in the directory,
    but can also use a specified file if provided.

    :param mp3_folder: Directory to search for audio files (default is "audio_chunks")
    :param specified_file: File path to a specific file to transcribe; overrides use_newest if provided
    :param use_newest: Boolean flag to determine whether to use the newest file in mp3_folder (default is True)
    :return: Transcription text if successful, or an error message
    """
      # Determine file path based on user input or defaults
    file_path = specified_file if specified_file else (get_newest_file(mp3_folder) if use_newest else None)

    if file_path and os.path.exists(file_path):
        with open(file_path, "rb") as audio_file:
            # Perform transcription using Whisper model
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )
            print(transcription.text)
            return transcription.text
    else:
        return "No valid audio file found. Please check the directory or specify a valid file."
    

# Example usage
# Or specify a file directly
# print(transcribe_audio(specified_file="specific_audio.mp3", use_newest=False))


In [None]:
# Prompt definitions (pre-written)
# MyPrompt1 find one error and give the complete correction
myPrompt1 = """
You will receive a paragraph written in Spanish. Your task is to identify and correct major grammatical errors while adhering to the following instructions:

1. Ignore gender errors.
2. If a mistake is made but then corrected within the same paragraph, do not include that correction (e.g., "yo comí" followed by "yo comer").
3. Ignore typing mistakes or missing accents; these are not considered errors.
4. Ignore minor clarity issues.
5. If there are no errors worth mentioning, output "No Errors."

Your output should consist only of the correct Spanish for each identified error, separated by periods. Do not provide any additional commentary or explanations. Aim for responses of 4-5 words unless more is needed.

Example:

Input: "Nosotros va a la playa mañana. Ellos trae sus juguetes."

Output: "Nosotros vamos. Ellos traen."
"""

# Archived prompts

myText = "Yo ir al tienda para comprar manzanas y platanos. Cuando llegué, vi muchas frutas y verduras. El cajero me dijo que el precio son muy alto, pero no me importa. Después, yo regresar a casa y preparé un plato grande para mi familia. Todos nos gusta comer juntos."

# MyPrompt2 is show all the errors in a list
myPrompt2 = """ 
You will receive a paragraph written in Spanish. Your task is to identify and correct one major grammatical error while adhering to the following instructions:
Ignore gender errors.
If a mistake is made but then corrected within the same paragraph, do not include that correction (e.g., "yo comí" followed by "yo comer").
Ignore typing mistakes or missing accents; these are not considered errors.
Ignore minor clarity issues.
If there are no significant errors, output "No errors."
Your output should consist of three sentences:

The original incorrect sentence, abreviated to show what was said, but not include extraneous detail before and after in the sentence.
The corrected version with the corrected word in all caps.
If no significant error is found, just write "No errors.". 
DO NOT SHOW AN ERROR IF THERE IS NOT ONE.
Example 1:

Input: Ayer, nosotros vamos a la playa mañana para solear.
Output: No errors.
Example 2:

Input: Ayer, nosotros va a la playa mañana para solear. Ellos trae sus juguetes.
Output: Nosotros va a la playa mañana. Nosotros VAMOS a la playa mañana.

"""
confirmPrompt = """ The following input is supposed to be a sentence in spanish that includes an error, and then the correction of that error. If the second sentence does not actually correct an error, then simply respond with 'No errors.' Otherwise repeat back exactly the input given to you without edits. 
Input:
"""

In [None]:
# Get the openai secret key
secret_key = getpass.getpass("Please enter your openai key: ")
os.environ["OPENAI_API_KEY"]= secret_key

In [40]:
# OpenAI set up:
client = OpenAI()
client.api_key = secret_key
# Set up OpenAI client with the provided API key

def complete(prompt, stop=None):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
             {
                "role": "system",
                "content": "You are a real-time voice chat assistant for Spanish language learning, specializing in Mexican Spanish with a natural Mexican accent."
            },
            {
            "role": "user",
            "content": prompt
            }
        ],
        stop=stop
    )
    return response.choices[0].message.content

complete("is this working?")

'¡Hola! Sí, esto está funcionando. ¿En qué puedo ayudarte hoy con tu aprendizaje de español?'

In [None]:

MP3_FOLDER = "audio_chunks"  # Directory where audio chunks are stored



play_audio(intro.mp3)

record_audio()

trim_silence()  # Uses the newest file in the default "audio_chunks" folder

myTransciption = transcribe_audio()  # Uses the newest file in the default "audio_chunks" folder

# Check transcription for grammatical errors
foundError = complete(f"{myPrompt2}\n\n{myTransciption}")
print(foundError)
# and then confirm it actually did it's job
foundError = complete(f"{confirmPrompt}\n\n{foundError}")
print(foundError)


#convert errors into audio and plays them
if foundError == "No errors.":
    errorsLeft = False
else:
    errorsLeft = True
    speech_file = "speech_file.mp3"
    speech_file_path = Path.cwd() / speech_file
    #generate the TTS Audio
    response = client.audio.speech.create(
        model="tts-1",
        voice="onyx",
        input=foundError
    )
    response.stream_to_file(speech_file_path)

    
    # play audio back
    play_audio(speech_file_path)

    # Record the user response
    # transcribe
    # Check if they got it right using a gpt call, if they did they change errorLeft
    # also check against other cases (as described below)
 
#No more errors, continue conversation, play 'cuentame.mp3' 
play_audio("cuentame.mp3")

Es interesante porque yo puedo hablar entre este sistema y si vas a hacer errores, debes repararlos.
Es interesante porque yo puedo hablar entre este sistema y si vas a hacer errores, DEBES repararlos.  
Es interesante porque yo puedo hablar entre este sistema y si vas a hacer errores, debes REPARARLOS.
Es interesante porque yo puedo hablar entre este sistema y si vas a hacer errores, DEBES repararlos.  
Es interesante porque yo puedo hablar entre este sistema y si vas a hacer errores, debes REPARARLOS.


  response.stream_to_file(speech_file_path)


Finished playing c:\Users\mattj\OneDrive\Documents\GitHub\learning\speech_file.mp3
Finished playing cuentame.mp3


In [87]:
def play_audio(mp3_filename):
    """
    Plays the specified MP3 file using pydub and simpleaudio.
    
    :param mp3_filename: Name of the MP3 file to play (should be located in the current working directory)
    """
    try:
        # Load audio file
        audio = AudioSegment.from_mp3(Path.cwd() / mp3_filename)
        
        # Play the audio file
        play_obj = sa.play_buffer(
            audio.raw_data,
            num_channels=audio.channels,
            bytes_per_sample=audio.sample_width,
            sample_rate=audio.frame_rate
        )
        play_obj.wait_done()  # Wait until playback is finished
        print(f"Finished playing {mp3_filename}")
    except Exception as e:
        print(f"Error playing audio: {e}")

In [75]:

''' 
Listen to audio

if the last words said were  
nevermind - just move one. this is tough because i might just completely move on, so you need logic to notice this has happened and ignore checking against old errors
repite - auto replay everything that was just said and ignore 
repite (algo) figure out what exactly i want repeated and repeat that audio slowly (consider slowing down the playback)

otherwise confirm that i repeated back the correction properly. 
'''

' \nif the last words said were  \nnevermind - just move one\nrepite - auto replay everything that was just said and ignore \nrepite (algo) figure out what exactly i want repeated and repeat that audio slowly (consider slowing down the playback)\n\nListen to audio\nI respond normally - one by one, go through the things that were said to me and check if i said them correctly back. \n    if yes, then move to the next item,\n    if no, add that item back to the audio output\n    if all yes, then continue conversation\n    otherwise, concat audio again and play new audio\n\n'