In [31]:
import whisper
import whisper_timestamped
import numpy as np
import IPython.display as ipd
import torch

model = whisper.load_model("medium.en")

# Check for GPU availability and move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-23): 24 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (key): Linear(in_features=1024, out_features=1024, bias=False)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (out): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((

In [107]:
def split_phrases_on_pause(audio_path, silence_threshold=0.6):
    # Load and process the audio
    audio = whisper.load_audio(audio_path)

    # Transcribe the audio in full precision
    result = whisper_timestamped.transcribe(model, audio, language="en", fp16=False)

    phrase_start = 0  # Track the start time of the current phrase
    phrase_text = ""  # Accumulate text for the current phrase

    for segment in result["segments"]:
        words = segment["words"]  # Get the words in the segment

        for i, word in enumerate(words):

            time_start = word["start"]
            time_end = word["end"]
            text = word["text"]

            # Add new word to phrase_text
            phrase_text = phrase_text + text+ " "

            buffer=0.1

            # Check for a significant pause between this word and the previous word
            if time_end - time_start > silence_threshold:
                # If there's a significant pause, split here and start a new phrase
                # Display and play the previous phrase
                print(f"Phrase: {time_start:.2f}s - {time_end:.2f}s")
                print(f"Text: \"{phrase_text.strip()}\"")  # Print the accumulated phrase text
                start_idx = int(phrase_start * whisper.audio.SAMPLE_RATE)
                end_idx = int((time_end+buffer) * whisper.audio.SAMPLE_RATE)
                phrase_audio = audio[start_idx:end_idx]
                ipd.display(ipd.Audio(phrase_audio, rate=whisper.audio.SAMPLE_RATE))

                #Reset phrase_start and phrase_text for the new phrase
                phrase_start = time_end
                phrase_text = ""

            # For the last word in the segment, display and play the phrase
            elif i == len(words) - 1:
                print(f"Phrase: {phrase_start:.2f}s - {time_end:.2f}s")
                print(f"Text: \"{phrase_text.strip()}\"")  # Print the accumulated phrase text for the last phrase
                start_idx = int(phrase_start * whisper.audio.SAMPLE_RATE)
                end_idx = int((time_end+buffer) * whisper.audio.SAMPLE_RATE)
                phrase_audio = audio[start_idx:end_idx]
                ipd.display(ipd.Audio(phrase_audio, rate=whisper.audio.SAMPLE_RATE))

                #Reset phrase_start and phrase_text for the new phrase
                phrase_start = time_end
                phrase_text = ""

In [108]:
session_name="lab_session_1"
split_phrases_on_pause("./sessions/" + session_name + "/cut_audio.wav")

100%|██████████| 3750/3750 [00:16<00:00, 234.26frames/s]

Phrase: 0.00s - 0.66s
Text: "One"





Phrase: 0.66s - 5.46s
Text: "two three go all right. I want you to move towards me in this direction"


Phrase: 7.46s - 8.20s
Text: "Keep moving forward backwards."


Phrase: 9.52s - 10.26s
Text: "It's good. That's good now"


Phrase: 10.26s - 15.36s
Text: "I want you to move towards your side in this direction to keep moving towards you"


Phrase: 19.58s - 20.28s
Text: "Very good keep moving keep moving very good now"


Phrase: 21.90s - 22.56s
Text: "I will come back again in"


Phrase: 23.24s - 23.86s
Text: "my direction towards"


Phrase: 24.18s - 25.04s
Text: "me come"


Phrase: 25.38s - 26.26s
Text: "back keep"


Phrase: 26.26s - 27.70s
Text: "coming back keep coming back"


Phrase: 27.70s - 31.48s
Text: "That's very good, and now I will guide you towards you"


Phrase: 31.48s - 35.38s
Text: "So I want you to keep moving your hand in your direction"


Phrase: 35.38s - 37.18s
Text: "Good, okay"
