<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_2b_Speech_to_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wardley Mapping Body of Knowledge Using Langchain & OpenAI
## Part 2, transcribe the audio files

This example shows how to create and query an internal knowledge base using ChatGPT.

This requires a GPU runtime.

## Runtime Checks

Check we are running on a GPU and check the available memory

In [None]:
try:
  gpu_info = !nvidia-smi
except:
  print('No GPU')
else:
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  print(gpu_info)

In [None]:
from psutil import virtual_memory
# Total memory of the runtime (not the currently free amount), converted
# from bytes to gigabytes using 1 GB = 1e9 bytes.
total_bytes = virtual_memory().total
ram_gb = total_bytes / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# A runtime with at least 20 GB is reported as "high-RAM".
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

## Set Up


Mount Google Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive; every path
# used by this notebook lives underneath that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Define the knowledge base folder layout on Google Drive and create any missing directories

In [None]:
import os

# Folder layout of the knowledge base on the mounted Google Drive.
KB_FOLDER = "/content/gdrive/MyDrive/WardleyKB"  # Google Drive folder to save the knowledge base
YT = os.path.join(KB_FOLDER, "youtube")  # Sub-directory for everything derived from YouTube
YT_DATASTORE = os.path.join(YT, "datastore")  # Sub-directory for the YouTube datastore files
YT_AUDIO = os.path.join(YT, "audio")  # Sub-directory for audio files
YT_TRANSCRIPTS = os.path.join(YT_AUDIO, "transcripts")  # Sub-directory for transcripts of audio files
YT_TRANSCRIPTS_TEXT = os.path.join(YT_TRANSCRIPTS, "full_text")  # Sub-directory for the full text of audio files
YT_TRANSCRIPTS_WHISPER = os.path.join(YT_TRANSCRIPTS, "whisper_chunks")  # Sub-directory for Whisper chunks of audio files
YT_TRANSCRIPTS_WHISPER_DISTIL = os.path.join(YT_TRANSCRIPTS, "distil_whisper_chunks")  # Sub-directory for Distil-Whisper chunks of audio files
YT_TRANSCRIPTS_DATASTORE = os.path.join(YT_TRANSCRIPTS, "datastore")  # Sub-directory for the transcripts datastore files

# Create the directories this notebook relies on. exist_ok=True makes the cell
# safe to re-run and avoids the gap between "check" and "create".
# The two datastore directories are intentionally not created here, matching
# the original cell.
for directory in (
    KB_FOLDER,
    YT_AUDIO,
    YT_TRANSCRIPTS,
    YT_TRANSCRIPTS_TEXT,
    YT_TRANSCRIPTS_WHISPER,
    YT_TRANSCRIPTS_WHISPER_DISTIL,
):
    os.makedirs(directory, exist_ok=True)

# Part 2 - Transcribe the audio files


# Setup Distil Whisper

## Transcribe the audio files and save the text to the Google drive

In [None]:
# Install the Python packages needed to load and run the speech-to-text model.
# pip itself is upgraded first, then transformers and accelerate are brought up to date.
!pip install -q --upgrade pip
!pip install -q --upgrade transformers accelerate
# flash-attn backs the Flash Attention 2 option requested when the model is loaded below.
!pip install -q flash-attn --no-build-isolation # Remove if you are using a CPU

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Derive device, dtype and attention implementation from the hardware that is
# actually present, so the three settings can never disagree with each other.
use_gpu = torch.cuda.is_available()
device = "cuda:0" if use_gpu else "cpu"
torch_dtype = torch.float16 if use_gpu else torch.float32

# Flash Attention 2 needs an Ampere-or-newer GPU (compute capability >= 8.0,
# e.g. an A100); on anything else fall back to the default attention.
use_flash_attention = use_gpu and torch.cuda.get_device_capability(0)[0] >= 8

model_id = "distil-whisper/distil-large-v2"

model_kwargs = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": True,
    "use_safetensors": True,
}
if use_flash_attention:
    # `attn_implementation` replaces the deprecated `use_flash_attention_2=True` flag.
    model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **model_kwargs)
model.to(device)

# The processor bundles the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
# Import the factory under an alias inside this cell. The result is still
# bound to the name `pipeline` (later cells call it), but because the factory
# is re-imported here the cell can be re-run without failing on an
# already-overwritten `pipeline` name.
from transformers import pipeline as build_pipeline

pipeline = build_pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,  # audio is processed in 15-second chunks
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)

In [None]:
# Load the video IDs to transcribe: one ID per line of videos.txt.
unique_video_ids = []

with open(f'{YT_AUDIO}/videos.txt', 'r') as file:
    for line in file:
        # strip() removes the line ending without eating the final character
        # of a last line that has no trailing newline.
        video_id = line.strip()
        # Skip blank lines so they do not become empty IDs.
        if video_id:
            unique_video_ids.append(video_id)
print(unique_video_ids)

Cycle through each audio file, create its transcript, and save it to Google Drive.

In [None]:
import re, json, os, time, math

def replace_wordly_with_wardley(transcription):
    """Correct mis-transcriptions of "Wardley" in a transcription result.

    Every case-insensitive occurrence of "wordly" or "worldly" is replaced
    with "Wardley", both in the top-level text and in each chunk's text.

    Args:
        transcription: dict with a 'text' string and a 'chunks' list of
            dicts, each of which has its own 'text' string.

    Returns:
        The same dict object, modified in place.
    """
    patterns = [re.compile(word, re.IGNORECASE) for word in ("wordly", "worldly")]

    def _fix(text):
        # Apply the patterns one after the other, in the listed order.
        for pattern in patterns:
            text = pattern.sub("Wardley", text)
        return text

    transcription['text'] = _fix(transcription['text'])
    for chunk in transcription['chunks']:
        chunk['text'] = _fix(chunk['text'])
    return transcription

def transcribe_file(filename):
    """Transcribe one audio file and return the spelling-corrected result.

    Args:
        filename: path of the audio file, passed straight to the
            speech-recognition pipeline.

    Returns:
        The pipeline output (requested with timestamps) after it has been
        passed through replace_wordly_with_wardley.
    """
    print(f"Transcribing New file: {filename}")
    raw_transcription = pipeline(filename, return_timestamps=True)
    return replace_wordly_with_wardley(raw_transcription)

transcriptions = []
total_videos = len(unique_video_ids)

# Transcribe every listed video once. An existing transcript file acts as the
# cache, so the cell can be re-run after an interruption and resume.
for counter, video in enumerate(unique_video_ids, start=1):
    transcript_filename = f'{YT_TRANSCRIPTS_WHISPER_DISTIL}/{video}_large.txt'
    audio_filename = f'{YT_AUDIO}/clips/{video}.webm'
    print(f"{counter} of {total_videos}")

    # Already transcribed: load the saved JSON and show a short preview.
    if os.path.isfile(transcript_filename):
        print(f"Existing File: {transcript_filename}")
        with open(transcript_filename, 'r') as f:
            transcription = json.load(f)
        print(transcription['text'][:100])
        continue

    # Nothing to transcribe if the audio clip is missing.
    if not os.path.isfile(audio_filename):
        print (f"File does not exist: {audio_filename}")
        continue

    start = time.time()
    transcription = transcribe_file(audio_filename)
    runtime = time.time() - start
    rounded_runtime = math.ceil(runtime)  # Round up to the nearest second
    print("Runtime: ", rounded_runtime, " seconds")
    print(transcription['text'][:100])

    # Write to a temporary file and rename it into place, so an interrupted
    # run never leaves a truncated transcript that later runs would treat as
    # complete.
    partial_filename = transcript_filename + '.partial'
    with open(partial_filename, 'w') as f:
        json.dump(transcription, f)
    os.replace(partial_filename, transcript_filename)
