<a href="https://colab.research.google.com/github/vsmolii/vsmolii.github.io/blob/master/Voice_Recording_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Configuration

Optional: connect Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive under /content/drive so that Drive files appear at
# /content/drive/MyDrive/..., which is the layout the example file_path values
# in the configuration cell expect. Mounting directly on /content would target
# the notebook's working directory instead of a dedicated mount point.
drive.mount('/content/drive')

Mounted at /content/drive


Configure the variables

In [4]:
# Whisper checkpoint to download and load for transcription.
whisper_model_size = 'medium'
# whisper_model_size = 'large-v2'


# Language code passed to Whisper when transcribing. Keep exactly one line uncommented.
language = 'uk' # Ukrainian
# language = 'en' # English
# language = 'be' # Belarusian
# language = 'bg' # Bulgarian
# language = 'hr' # Croatian
# language = 'cs' # Czech
# language = 'da' # Danish
# language = 'nl' # Dutch
# language = 'et' # Estonian
# language = 'fi' # Finnish
# language = 'fr' # French
# language = 'de' # German
# language = 'el' # Greek
# language = 'hu' # Hungarian
# language = 'is' # Icelandic
# language = 'it' # Italian
# language = 'lv' # Latvian
# language = 'lt' # Lithuanian
# language = 'no' # Norwegian
# language = 'pl' # Polish
# language = 'pt' # Portuguese
# language = 'ro' # Romanian
# language = 'ru' # Russian
# language = 'sr' # Serbian
# language = 'sk' # Slovak
# language = 'sl' # Slovenian
# language = 'es' # Spanish
# language = 'sv' # Swedish
# language = 'tr' # Turkish

# Where the recording comes from. Only "youtube" triggers a download in the
# transcription cell; for any other value, file_path below must be defined.
content_source = "youtube"
# content_source = "file"
# content_source = "google_drive"

## YouTube URL, used only when content_source is "youtube"
# youtube_url = "https://www.youtube.com/watch?v=qowrj7JbeZs"
youtube_url = "https://www.youtube.com/watch?v=JpFQkCMRQYc"

## Path of a local or Google Drive file, required when content_source is not "youtube".
## A path ending in .txt is read as a ready-made transcript; a path ending in .mp3 is
## transcribed directly; anything else is converted to .mp3 first.
# file_path = "/content/audio_transcript.txt"
# file_path = "/content/drive/MyDrive/Meet Recordings/orq-twiv-trk (2023-08-30 15:39 GMT+1)" # 56 min mp4 video, 463 MB
# file_path = "/content/drive/MyDrive/Meet Recordings/wic-idnb-rfp (2023-08-31 13:12 GMT+1)" # 20 min mp4 video, 34 MB
# file_path = "/content/orq-twiv-trk (2023-08-30 15:39 GMT+1).mp3" # 56 min mp3 audio

## Kind of recording; this text is inserted into the summarizing prompts:
# recording_type = "meeting"
# recording_type = "user interview"
recording_type = "youtube video"

## summary_structure is the output outline inserted verbatim into the summarizing
## prompts. Keep exactly one of the three variants below uncommented.

## Meeting summary:
# summary_structure = """
  # ## Summary
  # 2-3 sentences

  # ## Topics
  # - one
  # - two
  # - ..

  # ## Decisions
  # - one
  # - two
  # - ..

  # ## Action items
  # - one
  # - two
  # - ..
# """

## Generic summary (active):
summary_structure = """
  ## Summary
  2-3 sentences

  ## Highlights
  - one
  - two
  - ..

"""

## User interview summary:
# summary_structure = """
#   ## Summary
#   Main highlights here. 3-7 paragraphs.

#   ## Jobs
#   - one
#   - two
#   - ..

#   ## Pains
#   - one
#   - two
#   - ..

#   ## Gains
#   - one
#   - two
#   - ..
# """


## 2. Installing the dependencies

In [None]:
!pip install git+https://github.com/openai/whisper.git  -q
!pip install langchain moviepy openai tiktoken pytube

Enter your OpenAI API key

In [None]:
import openai
from getpass import getpass

# getpass() takes the *prompt text* as its argument and returns what the user
# types, without echoing it. The argument must therefore be a neutral prompt.
# SECURITY: never place a real key in the notebook source. Any key that has been
# committed or displayed must be treated as leaked and revoked in the OpenAI dashboard.
openai.api_key = getpass('Enter your OpenAI API key: ')

Optional: check if the API key is valid by listing models

In [None]:
# Sanity check for the API key: request the model list from OpenAI and
# print the "root" field of every entry that comes back.
models = openai.Model.list()
for model in models["data"]:
    print(model["root"])

Download the Whisper model (only once)

In [None]:
%%time

import pathlib
import whisper

# Location where the checkpoint for the configured model size is expected.
model_path = pathlib.Path("/content/whisper/" + whisper_model_size + ".pt")

# Fetch the checkpoint only when it is not on disk yet.
if not model_path.exists():
    print("Starting download of Whisper Model")
    whisper._download(whisper._MODELS[whisper_model_size], '/content/whisper/', False)
else:
    print("Model has been downloaded, no re-download necessary")

## 3. Transcribe the recording
- A video file is automatically converted to audio before transcription.
- A text file is used directly as the transcript.

In [None]:
from moviepy.editor import *
import os
from pytube import YouTube
# Path of the transcript text file; stays False until the code further down in this cell sets it.
text_trascript_file_path = False

def download_audio_from_youtube(youtube_url):
  """Download the audio track of a YouTube video and return its absolute path.

  The first audio-only mp4 stream is saved under /content/youtube_audio/ and
  the file is then renamed to carry an .mp3 extension. The data is only
  renamed, not re-encoded; the .mp3 name is what lets transcribe_recording
  skip its conversion step.

  Args:
    youtube_url: URL of the video to download.

  Returns:
    Absolute path of the renamed audio file.

  Raises:
    ValueError: if the video offers no audio-only mp4 stream.
  """
  print ("Ok, a youtube link. Converting to audio file...")
  output_path='/content/youtube_audio/'
  yt = YouTube(youtube_url)
  audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').first()
  if audio_stream is None:
    # first() yields None when the filter matches nothing; fail with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError("No audio-only mp4 stream is available for " + youtube_url)
  download_location = audio_stream.download(output_path)
  # Swap only the final extension; a plain str.replace would also rewrite any
  # '.mp4' that happens to occur elsewhere in the path or video title.
  new_location = os.path.splitext(download_location)[0] + '.mp3'
  os.rename(download_location, new_location)
  file_path = os.path.abspath(new_location)
  print ("All done. Saved to "+file_path)
  return file_path

def transcribe_recording(whisper_model_size, file_path):
  """Transcribe a media file with Whisper and return the recognized text.

  A file whose name does not end in .mp3 (case-insensitive) is opened with
  VideoFileClip and its audio track is written to /content/<name>.mp3, which
  is then transcribed. The transcription language is read from the
  notebook-level `language` variable.

  Args:
    whisper_model_size: name of the Whisper model to load (e.g. 'medium').
    file_path: path of the audio or video file to transcribe.

  Returns:
    The 'text' entry of Whisper's transcription result.

  Raises:
    ValueError: if the file had to be converted but has no audio track.
  """
  # if the file is a video, convert it to the .mp3 audio
  if not file_path.lower().endswith('.mp3'):
    print ("File is not an mp3. Converting to audio...")
    video = VideoFileClip(file_path)
    try:
      if video.audio is None:
        raise ValueError("No audio track found in " + file_path)
      base_name = os.path.basename(file_path)
      output_name = os.path.splitext(base_name)[0] + ".mp3"
      output_path = os.path.join("/content/", output_name)
      video.audio.write_audiofile(output_path)
    finally:
      # Release the reader held by the clip even when the conversion fails.
      video.close()
    file_path = output_path
  # load the whisper model
  print ("Starting the audio transcription...")
  # No explicit device: Whisper then picks CUDA when it is available and falls
  # back to the CPU otherwise, so the notebook also runs without a GPU runtime.
  whisper_model = whisper.load_model(whisper_model_size, download_root='/content/whisper/')
  print ("Loaded the '"+whisper_model_size+"' Whisper model...")
  result = whisper_model.transcribe(file_path, language=language)
  return result['text']


# For YouTube sources the audio is downloaded first; for any other source,
# file_path must already have been set in the configuration cell.
if content_source == "youtube":
  file_path = download_audio_from_youtube(youtube_url)

# Derived before branching so it exists for text inputs too: the summary cells
# build their output file name from base_name.
base_name = os.path.basename(file_path)

if file_path.endswith('.txt'):
  # A text file is taken as a ready-made transcript.
  print ("The submitted file is a text file.")
  text_trascript_file_path = file_path
  with open(text_trascript_file_path, "r", encoding="utf-8") as file:
    interview_transcript = file.read()
else:
  print ("File is not a text. Parsing the media...")
  interview_transcript = transcribe_recording(whisper_model_size, file_path)
  text_output_name = "transcript__"+os.path.splitext(base_name)[0] + ".txt"
  print ("The transcript is ready. Saving as a "+text_output_name+" ...")
  # Write to the same /content/ path that is recorded, rather than relying on
  # the working directory; UTF-8 is explicit because transcripts may be non-ASCII.
  text_trascript_file_path = os.path.join("/content/", text_output_name)
  with open(text_trascript_file_path, "w", encoding="utf-8") as file:
    file.write(interview_transcript)

# Report the transcript size in tokens; the summarization cell picks its
# strategy based on this count.
import tiktoken
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-16k")
print ("Number of tokens:", len(enc.encode(interview_transcript)))

In [None]:
# Spot-check the transcription: display the first 1000 characters of the transcript.
interview_transcript[:1000]

## 4. Summarize the transcription

The script automatically chooses the summarization method depending on the size of the transcript:
- gpt-4 for fewer than 8000 tokens
- gpt-3.5-turbo-16k for fewer than 16000 tokens
- the LangChain map-reduce method for 16000 tokens or more

In [None]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

import tiktoken

def summarize_big_text(interview_transcript, max_tokens, model, recording_type, summary_structure):
  """Summarize a long transcript with a LangChain map-reduce chain.

  The transcript is split into chunks, each chunk is summarized with the map
  prompt, and the partial summaries are merged with the reduce prompt.

  Args:
    interview_transcript: full transcript text.
    max_tokens: used as the splitter's chunk_size and as the reduce step's
      token_max. NOTE(review): the splitter measures chunks with len(), i.e.
      in characters, not tokens - confirm that this is intended.
    model: OpenAI chat model name passed to ChatOpenAI.
    recording_type: kind of recording, inserted into both prompts.
    summary_structure: output outline, inserted into both prompts.

  Returns:
    The final summary produced by the chain.
  """

  def escape_braces(text):
    # PromptTemplate.from_template treats "{name}" as an input variable.
    # Doubling the braces keeps user-supplied text literal in the prompt.
    return text.replace("{", "{{").replace("}", "}}")

  recording_label = escape_braces(recording_type)
  structure_text = escape_braces(summary_structure)

  map_template = """The following is a set of """+recording_label+""" recordings of a single """+recording_label+""".
    {docs}
    Based on this list of recordings, please identify the main themes.

    Structure the summary as following:

    """+structure_text+"""

    Helpful Answer:"""

  reduce_template = """The following is set of summaries of parts of a """+recording_label+""":
    {doc_summaries}
    Take these and distill it into a final, consolidated summary.

    Structure the summary as following:

    """+structure_text+"""

    Helpful Answer:"""

  def break_into_chunks(text, max_tokens):
    # length_function=len means chunk_size is counted in characters.
    return RecursiveCharacterTextSplitter(
        chunk_size = max_tokens,
        chunk_overlap  = 0,
        length_function = len,
        add_start_index = True,
    ).create_documents([text])

  docs = break_into_chunks(interview_transcript, max_tokens)

  # Look the encoder up once instead of once per chunk.
  token_encoder = tiktoken.encoding_for_model('gpt-3.5-turbo-16k')
  for idx, chunk in enumerate(docs, 1):
      print(f"Chunk {idx} has {len(token_encoder.encode(chunk.page_content))} tokens")

  llm = ChatOpenAI(temperature=0, openai_api_key=openai.api_key, model_name=model)

  # Map: summarize every chunk on its own
  map_prompt = PromptTemplate.from_template(map_template)
  map_chain = LLMChain(llm=llm, prompt=map_prompt)

  # Reduce: merge the partial summaries
  reduce_prompt = PromptTemplate.from_template(reduce_template)
  reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

  # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
  combine_documents_chain = StuffDocumentsChain(
      llm_chain=reduce_chain, document_variable_name="doc_summaries"
  )

  # Combines and iteratively reduces the mapped documents
  reduce_documents_chain = ReduceDocumentsChain(
      # This is final chain that is called.
      combine_documents_chain=combine_documents_chain,
      # If documents exceed context for `StuffDocumentsChain`
      collapse_documents_chain=combine_documents_chain,
      # The maximum number of tokens to group documents into.
      token_max=max_tokens,
  )

  # Combining documents by mapping a chain over them, then combining results
  map_reduce_chain = MapReduceDocumentsChain(
      # Map chain
      llm_chain=map_chain,
      # Reduce chain
      reduce_documents_chain=reduce_documents_chain,
      # The variable name in the llm_chain to put the documents in
      document_variable_name="docs",
      # Return the results of the map steps in the output
      return_intermediate_steps=False,
  )

  # The chunks are passed through a second, token-based splitter before the run.
  # NOTE(review): chunk_size=1000 here is independent of max_tokens - confirm
  # that this second split is intended.
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
      chunk_size=1000, chunk_overlap=0
  )
  split_docs = text_splitter.split_documents(docs)
  return map_reduce_chain.run(split_docs)

def summarize_single_shot(interview_transcript, model, recording_type, summary_structure):
  """Summarize the whole transcript with a single chat request.

  The instructions, the requested summary outline and the transcript are
  concatenated into one human message, which is sent to the given model.

  Args:
    interview_transcript: full transcript text.
    model: OpenAI chat model name passed to ChatOpenAI.
    recording_type: kind of recording, mentioned twice in the instructions.
    summary_structure: output outline the model is asked to follow.

  Returns:
    The text content of the model's reply.
  """
  # Pieces of the request in order; joined without separators.
  request_parts = [
      "\n  Instructions:\n  Summarize the following ",
      recording_type,
      " text into the ",
      recording_type,
      " notes. Respond with markdown.\n\n  Structure the summary as following:\n\n  ",
      summary_structure,
      "\n\n  Text:\n  ",
      interview_transcript,
  ]
  request = "".join(request_parts)

  chat_model = ChatOpenAI(temperature=0, openai_api_key=openai.api_key, model_name=model)
  reply = chat_model([HumanMessage(content=request)])
  return reply.content

# Default model; also used to count the transcript's tokens.
target_model = "gpt-3.5-turbo-16k"
# Passed to summarize_big_text as its max_tokens argument.
chunk_max_tokens = 16000

actual_tokens = len(tiktoken.encoding_for_model(target_model).encode(interview_transcript))
print ("Number of tokens in input prompt ", actual_tokens)

# Pick the summarization strategy from the transcript size.
# NOTE(review): the thresholds compare the transcript alone; the instruction text
# and the model's reply also consume context - confirm that there is enough
# headroom for transcripts close to a threshold.
if actual_tokens < 8000:
  print ("Number of tokens is less than 8000, using gpt-4...")
  target_model = "gpt-4"
  summary_result = summarize_single_shot(interview_transcript, target_model, recording_type, summary_structure )
elif actual_tokens < 16000:
  print ("Number of tokens is less than 16000, using gpt-3.5-16k...")
  summary_result = summarize_single_shot(interview_transcript, target_model, recording_type, summary_structure )
else:
  print ("Number of tokens is more than 16000, breaking into chunks...")
  summary_result = summarize_big_text(interview_transcript, chunk_max_tokens, target_model, recording_type, summary_structure)

print(summary_result)

# Name the summary after the input file. The stem is derived from file_path
# directly because base_name is only assigned when a media file was transcribed.
summary_stem = os.path.splitext(os.path.basename(file_path))[0]
summary_result_output_name = "summary__" + summary_stem + ".txt"
# UTF-8 is explicit because summaries may contain non-ASCII text.
with open(summary_result_output_name, "w", encoding="utf-8") as file:
    file.write(summary_result)



Optional: write results to a .txt file

In [None]:
# Save the summary next to the notebook, named after the input file. The stem is
# derived from file_path directly because base_name is only assigned when a media
# file was transcribed.
summary_result_output_name = "summary__" + os.path.splitext(os.path.basename(file_path))[0] + ".txt"
# UTF-8 is explicit because summaries may contain non-ASCII text.
with open(summary_result_output_name, "w", encoding="utf-8") as file:
    file.write(summary_result)