## Setup
Install required packages

In [None]:
!pip install yt-dlp
!pip install -q git+https://github.com/openai/whisper.git
!pip install ffmpeg-python

**PS**: Don't brew install jupyter, else it'll use the system python (and associated packages) instead of the local env. 

The recommended way to run Jupyter in a local env is:
```
    pip install jupyter ipykernel
    python -m ipykernel install --user --name=video-analysis-poc --display-name "Python (video-analysis-poc)"
    jupyter lab
```

## Transcribe video from URL

### Unable to extract filler words
* Using whisper doesn't return the filler words in the transcription
* Passing in an [initial prompt](https://github.com/openai/whisper/discussions/1174#discussioncomment-5490351), as suggested in the linked discussion, doesn't help either, even when using a tiny model.

In [None]:
import whisper

# Load the tiny English-only checkpoint; other sizes such as "small" or
# "medium" can be substituted here.
model = whisper.load_model("tiny.en")

# Seed the decoder with a filler-heavy prompt in the hope that the model keeps
# filler words in its output.
filler_prompt = "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking."
result = model.transcribe(
    audio="test-with-fillers.m4a",
    initial_prompt=filler_prompt,
)

# Show the transcribed text.
print(result["text"])

* [Prompting](https://platform.openai.com/docs/guides/speech-to-text/prompting#prompting) doesn't help with the API either.

In [None]:
!pip install openai
from openai import OpenAI

# No credentials are passed explicitly, so the client has to find them on its
# own (per the OpenAI SDK docs, from the OPENAI_API_KEY environment variable).
client = OpenAI()

# Open the audio inside a context manager so the file handle is closed as soon
# as the upload finishes, even if the request raises.
with open("test-with-fillers.m4a", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        # Filler-heavy prompt, intended to nudge the model into keeping fillers.
        prompt="Umm, let me think like, hmm... Okay, here's what I'm, like, thinking."
    )

print(transcription.text)

* I may be able to use CrisperWhisper for testing, but it's really slow, and it can't go to production.

In [None]:
!pip install git+https://github.com/nyrahealth/transformers.git@crisper_whisper
!pip install datasets transformers accelerate
import os
import sys
import torch

from datasets import load_dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

def adjust_pauses_for_hf_pipeline_output(pipeline_output, split_threshold=0.12):
    """
    Adjust pause timings by distributing pauses up to the threshold evenly between adjacent words.

    For every pair of neighbouring chunks with a positive gap between the end
    of the first and the start of the second, the end of the first chunk is
    pushed later and the start of the second chunk is pulled earlier by the
    same amount: half of the gap, capped at half of ``split_threshold``.

    Args:
        pipeline_output: Mapping with a ``"chunks"`` list. Each chunk is a dict
            whose ``"timestamp"`` entry is a ``(start, end)`` pair.
        split_threshold: Largest portion of a pause that is shared out between
            the two adjacent chunks; the remainder of a longer pause is kept.

    Returns:
        The same ``pipeline_output`` object. The chunk dicts are updated in
        place, so the caller's input is modified.
    """
    chunks = pipeline_output["chunks"]

    # Walk adjacent pairs. The "next" chunk of one pair is the "current" chunk
    # of the following pair, so it is read back with its already adjusted start.
    for current_chunk, next_chunk in zip(chunks, chunks[1:]):
        current_start, current_end = current_chunk["timestamp"]
        next_start, next_end = next_chunk["timestamp"]

        # A missing boundary cannot be measured; skip the pair instead of
        # failing with a TypeError on the subtraction below.
        if current_end is None or next_start is None:
            continue

        pause_duration = next_start - current_end
        if pause_duration <= 0:
            # Touching or overlapping chunks have no pause to distribute.
            continue

        distribute = min(pause_duration, split_threshold) / 2

        # Adjust current chunk end time
        current_chunk["timestamp"] = (current_start, current_end + distribute)

        # Adjust next chunk start time
        next_chunk["timestamp"] = (next_start - distribute, next_end)

    return pipeline_output


# Prefer the first CUDA GPU in half precision; otherwise fall back to the CPU
# in full precision.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
torch_dtype = torch.float16 if cuda_available else torch.float32

model_id = "nyrahealth/CrisperWhisper"

# Load the checkpoint, move it to the chosen device, and fetch its processor.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline that processes audio in 30-second chunks and
# returns word-level timestamps.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    batch_size=16,
    chunk_length_s=30,
    return_timestamps='word',
)

# Alternative input, kept for reference: a sample from a public dataset.
#dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
#sample = dataset[0]["audio"]

# Hand the raw bytes of the local test recording to the pipeline.
with open("test-with-fillers.mp3", "rb") as audio_fh:
    sample = audio_fh.read()

hf_pipeline_output = pipe(sample)

# Share short pauses out between neighbouring words before printing.
crisper_whisper_result = adjust_pauses_for_hf_pipeline_output(hf_pipeline_output)
print(crisper_whisper_result)


It's faster to use an API than to run inference locally on this 1.6B-parameter model. Let's use Deepgram instead. They give $500 of free credits and have a low cost per minute, and I was able to transcribe my test file and see the transcript, so I know it works.

In [None]:
!pip install deepgram-sdk python-dotenv

In [None]:
import os
from dotenv import load_dotenv


from deepgram import (
    DeepgramClient,
    PrerecordedOptions,
    FileSource,
)

# Path to the audio file
AUDIO_FILE = "test-with-fillers.mp3"

try:
    # STEP 1: Create a Deepgram client using the API key.
    # load_dotenv() runs first so that os.getenv can see a key kept in a
    # local .env file.
    load_dotenv()
    deepgram = DeepgramClient(api_key=os.getenv("DEEPGRAM_API_KEY"))

    # Read the whole recording into memory; it is sent as an in-memory buffer.
    with open(AUDIO_FILE, "rb") as file:
        buffer_data = file.read()

    payload: FileSource = {
        "buffer": buffer_data,
    }

    # STEP 2: Configure Deepgram options for audio analysis.
    # filler_words=True asks for fillers ("uh", "um", ...) to be kept in the
    # transcript, which the filler-word metrics depend on.
    options = PrerecordedOptions(
        model="nova-2",
        #smart_format=True,
        filler_words=True
    )

    # STEP 3: Call the transcribe_file method with the audio payload and options
    response = deepgram.listen.rest.v("1").transcribe_file(payload, options)

    # STEP 4: Extract the transcription from the response
    # (first alternative of the first channel).
    transcription = response.results.channels[0].alternatives[0]

except Exception as e:
    print(f"Exception: {e}")
    # Re-raise after reporting. Swallowing the error would leave
    # `transcription` undefined, or still bound to the result of an earlier
    # cell, and the analysis that follows would then fail in a misleading way.
    raise


## Analyzing the quality of a presentation
1. Words per minute: 
  - Slow: < 120 wpm
  - Best: 120-160 wpm
  - Fast: > 160 wpm
2. Average number of filler words per minute: Deepgram is capable of transcribing the following [filler words](https://developers.deepgram.com/docs/filler-words) -
    * uh
    * um
    * mhmm
    * mm-mm
    * uh-uh
    * uh-huh
    * nuh-uh
3. Repetition: Count or identify instances of repetition of words or phrases.
4. Clarity: Analyze sentence structure and word choice for complexity and potential ambiguity. You can use libraries like NLTK and spaCy.
5. Structure: Identify transitions between topics/slides. Check the use of introductory phrases and conclusion statements

In [None]:
# Compute simple delivery metrics from the transcription result: speaking
# rate, filler-word rate, and immediate word repetitions.
transcript = transcription.transcript
words = transcription.words
total_words = len(words)

# Speaking time runs from the start of the first word to the end of the last
# one. An empty word list yields a zero duration instead of an IndexError.
total_duration = words[-1].end - words[0].start if words else 0.0
duration_minutes = total_duration / 60
print(f"Transcript: {transcript}\n")


def _per_minute(count):
    """Return ``count`` per minute of speech, or 0.0 when no duration was measured."""
    return count / duration_minutes if duration_minutes > 0 else 0.0


# Calculate words per minute
wpm = _per_minute(total_words)
print(f"Words per minute: {wpm:.2f}")

# Calculate filler words per minute and repetitions
filler_words = {"uh", "um", "mhmm", "mm-mm", "uh-uh", "uh-huh", "nuh-uh"}
total_filler_words = 0
repetition_count = 0
last_word = None
for raw_word in transcript.split():
    # Normalise case and drop surrounding punctuation so that "Um," and "um"
    # are counted alike. Inner hyphens (e.g. "uh-huh") are left untouched.
    word = raw_word.strip(".,!?;:\"'()").lower()
    if not word:
        continue

    if word in filler_words:
        total_filler_words += 1

    # A repetition is the same word spoken twice in a row.
    if word == last_word:
        repetition_count += 1
    last_word = word

repetitions_per_minute = _per_minute(repetition_count)
print(f"Total repetitions: {repetition_count}. Repetitions per minute: {repetitions_per_minute:.2f}")
filler_wpm = _per_minute(total_filler_words)
print(f"Total filler words: {total_filler_words}. Filler words per minute: {filler_wpm:.2f}")
