<a href="https://colab.research.google.com/github/tractorjuice/MLOpsAIKB/blob/main/Building_MLOps_AI_Body_of_Knowledge_Part_2_Speech_to_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLOps Body of Knowledge Using Langchain & OpenAI
## Part 2, transcribe the audio files

This example shows how to create and query an internal knowledge base using ChatGPT.

This requires a GPU/TPU runtime.

## Runtime Checks

Check that we are running on a GPU and check the available memory.

In [None]:
try:
  gpu_info = !nvidia-smi
except:
  print('No GPU')
else:
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to tell a high-RAM runtime from a standard one.
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

## Set Up


### Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive so the folders used below live on Drive.
from google.colab import drive
drive.mount('/content/gdrive')

Define the Google Drive folder layout for the knowledge base and create any folders that do not exist yet.

In [None]:
import os

# Folder layout of the knowledge base on the mounted Google Drive.
KB_FOLDER = "/content/gdrive/MyDrive/MLOpsKB"  # Google Drive folder to save the knowledge base
YT_DATASTORE = os.path.join(KB_FOLDER, "youtube/datastore")  # Sub-directory for YouTube FAISS datastore files
YT_AUDIO_FOLDER = os.path.join(KB_FOLDER, "youtube/audio")  # Sub-directory for audio files
TRANSCRIPTS_FOLDER = os.path.join(YT_AUDIO_FOLDER, "transcripts")  # Sub-directory for transcripts of audio files
TRANSCRIPTS_TEXT_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "text")  # Sub-directory for text of audio files
TRANSCRIPTS_WHISPER_FOLDER = os.path.join(TRANSCRIPTS_FOLDER, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files

# Create every folder that is missing. os.makedirs also creates missing parent
# folders, and exist_ok=True makes the call a no-op for folders that already
# exist, so re-running this cell is safe and there is no check-then-create gap.
for folder in (
    KB_FOLDER,
    YT_DATASTORE,
    YT_AUDIO_FOLDER,
    TRANSCRIPTS_FOLDER,
    TRANSCRIPTS_TEXT_FOLDER,
    TRANSCRIPTS_WHISPER_FOLDER,
):
    os.makedirs(folder, exist_ok=True)

# Part 2 - Transcribe the audio files
*(If the data store has already been saved to files, skip to the next section and load it from there to avoid the cost of re-creating the embeddings.)*

## Use OpenAI Whisper to convert the audio to text

In [None]:
import jax
# List the devices visible to JAX; as the cell's last expression the result is displayed.
jax.devices()

In [None]:
!pip install jax[tpu] -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
!pip install --quiet git+https://github.com/sanchit-gandhi/whisper-jax.git datasets soundfile librosa
!pip install --quiet cached_property

In [None]:
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp

# Build the Whisper large-v2 pipeline that the transcription cells call as `pipeline`.
# dtype: use jnp.float16 on most GPUs; use jnp.bfloat16 on A100 GPUs or TPUs (as set here).
# NOTE: `FlaxWhisperPipline` is the exact name imported from whisper_jax above; keep the spelling in sync with that import.
pipeline = FlaxWhisperPipline("openai/whisper-large-v2", dtype=jnp.bfloat16, batch_size=16)

In [None]:
from jax.experimental.compilation_cache import compilation_cache as cc

# Initialise JAX's compilation cache in ./jax_cache (relative to the current working directory).
# NOTE(review): presumably this lets compiled programs be reused instead of recompiled — confirm,
# and confirm that `initialize_cache` is still provided by the installed JAX version.
cc.initialize_cache("./jax_cache")

In [None]:
# Load the video ids to process from videos.txt, one id per line.
unique_video_ids = []

with open(f'{YT_AUDIO_FOLDER}/videos.txt', 'r') as file:
    for line in file:
        # Strip the line break (and any stray whitespace) instead of blindly
        # cutting the last character: the final line may have no trailing
        # newline, in which case slicing would chop the id itself.
        video_id = line.strip()
        # Skip blank lines so they do not turn into empty ids.
        if video_id:
            unique_video_ids.append(video_id)

## Transcribe the audio files and save the text to Google Drive

In [None]:
import re, json, os

def transcribe_file(filename):
    """Transcribe one audio file with the module-level ``pipeline``.

    The file name is printed before the pipeline runs and the raw result is
    printed afterwards.

    Args:
        filename: Path of the audio file handed to ``pipeline``.

    Returns:
        Whatever ``pipeline(filename, return_timestamps=True)`` returns.
    """
    print (f"Transcribing New file: {filename}")
    result = pipeline(filename, return_timestamps=True)
    print(result)
    return result

# Transcribe each listed video at most once. A transcript file that already
# exists is only read back and printed; a missing one is produced from the
# audio clip and saved as JSON. `transcriptions` collects only the results
# produced during this run.
transcriptions = []
total_videos = len(unique_video_ids)

for counter, video in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{TRANSCRIPTS_WHISPER_FOLDER}/{video}_large.txt'
    audio_filename = f'{YT_AUDIO_FOLDER}/clips/{video}.webm'
    print(f"{counter} of {total_videos}")

    if os.path.isfile(transcript_filename):
        # Existing transcript: read-only. Re-opening it in 'w' mode to write back
        # the same content would truncate a finished transcript and could lose it
        # if the run were interrupted mid-write.
        print(f"Existing File: {transcript_filename}")
        with open(transcript_filename, 'r') as f:
            transcription = json.load(f)
        print(transcription)
        continue

    if not os.path.isfile(audio_filename):
        print (f"File does not exist: {audio_filename}")
        continue

    transcription = transcribe_file(audio_filename)
    transcriptions.append(transcription)

    # Write to a temporary file and rename it into place, so an interrupted run
    # never leaves a truncated transcript that a later run would treat as done.
    partial_filename = f'{transcript_filename}.tmp'
    with open(partial_filename, 'w') as f:
        json.dump(transcription, f)
    os.replace(partial_filename, transcript_filename)
