In [73]:

def group_text_by_time_window(result: dict, duration: int, time_window_size: int = 30):
    """
    Group text output from OpenAI whisper into fixed-size time windows.

    :param result: dict, output from OpenAI whisper model; only
        ``result["segments"]`` is read, and every segment must provide a
        ``"start"`` (seconds) and a ``"text"`` entry
    :param duration: int, total length of the audio in seconds; used as the
        end time of the last window
    :param time_window_size: int, size of time window in seconds

    :return: list of dicts ``{"start": ..., "end": ..., "text": ...}``, one per
        non-empty window, ordered by start time. The ``end`` of a window is the
        start of the next non-empty window, or ``duration`` for the last one.
        An empty segment list yields an empty list.
    """
    # Bucket segment texts by window index. Segments are visited in start-time
    # order (sorted() is stable) so the start/end pairing below stays correct
    # and the joined text reads chronologically even for out-of-order input.
    time_windows = {}
    for segment in sorted(result["segments"], key=lambda seg: seg["start"]):
        window_index = int(segment["start"] // time_window_size)
        time_windows.setdefault(window_index, []).append(segment["text"])

    # Window indices in ascending order -> window start times in seconds.
    window_indices = sorted(time_windows)
    starts = [index * time_window_size for index in window_indices]
    # Each window ends where the next non-empty one starts; the last ends at `duration`.
    ends = starts[1:] + [duration]

    return [
        {"start": start, "end": end, "text": "".join(time_windows[index])}
        for index, start, end in zip(window_indices, starts, ends)
    ]


In [74]:
import requests
import os
filename = "audio.mp3"

if not os.path.exists(filename):
    # NOTE(review): several query parameters below are separated by the HTML
    # entity "&amp;" instead of "&" (looks copied from an RSS/XML feed). The URL
    # is left untouched here; confirm whether it should be unescaped.
    url = "https://nyt.simplecastaudio.com/3026b665-46df-4d18-98e9-d1ce16bbb1df/episodes/dd430a54-e475-46bd-b9fb-97a359c4161c/audio/128/default.mp3/default.mp3_ywr3ahjkcgo_63f2a6a9bc78a0a3100fbc9a815bf42d_62565420.mp3?aid=rss_feed&amp;awCollectionId=3026b665-46df-4d18-98e9-d1ce16bbb1df&amp;awEpisodeId=dd430a54-e475-46bd-b9fb-97a359c4161c&amp;feed=82FI35Px&hash_redirect=1&x-total-bytes=62565420&x-ais-classified=unclassified&listeningSessionID=0CD_382_16__be48a1539bfd29c334b850a7a8cf37e16ec362de"

    # Download into a temporary name and rename only on success, so that an
    # interrupted download cannot leave a partial file that the existence
    # check above would accept as complete on the next run.
    partial_filename = filename + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as audio.
        response.raise_for_status()
        with open(partial_filename, "wb") as file:
            # Stream in 1 MiB chunks rather than holding the whole file in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    os.replace(partial_filename, filename)
    print("File downloaded successfully.")
else:
    print(f"{filename} already exists; skipping download.")


File downloaded successfully.


In [75]:
import pickle
import os

# Cache file for the Whisper transcription result
pickle_filename = "transcription_obj.pickle"

if not os.path.exists(pickle_filename):
    # No cache yet: run Whisper (imported lazily, only when it is needed)
    # and store the result for later runs.
    import whisper
    model = whisper.load_model("base")
    transcription_obj = model.transcribe("audio.mp3", language='en', without_timestamps=True)

    with open(pickle_filename, "wb") as cache_file:
        pickle.dump(transcription_obj, cache_file)
else:
    # Reuse the cached transcription.
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # produced by this notebook.
    with open(pickle_filename, "rb") as cache_file:
        transcription_obj = pickle.load(cache_file)



In [76]:
# Save the full transcript text to disk.
with open('transcript.txt', 'w') as transcript_file:
    transcript_file.write(transcription_obj['text'])


In [77]:
# # import
# from langchain.text_splitter import CharacterTextSplitter
# from langchain_community.document_loaders import TextLoader
# from langchain_community.embeddings.sentence_transformer import (
#     SentenceTransformerEmbeddings,
# )
# from langchain_community.vectorstores import Chroma

# # load the document and split it into chunks
# loader = TextLoader("transcript.txt")
# documents = loader.load()


# # split it into chunks
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# docs = text_splitter.split_documents(documents)

# # print(len(docs[0].page_content))
# print(docs)



# # # create the open-source embedding function
# # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# # # load it into Chroma
# # db = Chroma.from_documents(docs, embedding_function)

# # # query it
# # query = "What books were recommended in this podcast?"
# # docs = db.similarity_search(query)

# # print results
# # print(docs[0].page_content)

In [78]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.document_loaders import TextLoader
from typing import List
from langchain.schema import Document
import os
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser


# Never hardcode the key in the notebook, and never clobber a key that is
# already present in the environment: only ask for it when it is missing/empty.
if not os.environ.get('OPENAI_API_KEY'):
    import getpass

    os.environ['OPENAI_API_KEY'] = getpass.getpass("OpenAI API key: ")

class Genie:
    """Question answering over a single text file using retrieval.

    Construction loads the file, splits it into overlapping chunks, embeds the
    chunks into a Chroma vector store with OpenAI embeddings, and builds a
    ``RetrievalQA`` chain on top of an ``OpenAI`` LLM. All of that happens in
    ``__init__``, so creating an instance reads the file and calls the
    embedding backend.
    """

    def __init__(self, file_path: str):
        """
        :param file_path: path of the text file to index and query
        """
        self.file_path = file_path
        self.loader = TextLoader(self.file_path)
        self.documents = self.loader.load()
        self.texts = self.text_split(self.documents)
        self.vectordb = self.embeddings(self.texts)
        retriever = VectorStoreRetriever(vectorstore=self.vectordb)
        self.genie = RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever)

    @staticmethod
    def text_split(documents: List[Document]) -> List[Document]:
        """Split loaded documents into chunks of 1000 characters with 200 overlap.

        :param documents: documents as returned by ``TextLoader.load()``
        :return: the chunked documents
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)
        return texts

    @staticmethod
    def embeddings(texts: List[Document]):
        """Embed the chunks with ``OpenAIEmbeddings`` and index them in Chroma.

        :param texts: chunked documents to index
        :return: the Chroma vector store built from ``texts``
        """
        embeddings = OpenAIEmbeddings()
        vectordb = Chroma.from_documents(texts, embeddings)
        return vectordb

    def ask(self, query: str):
        """Run ``query`` through the retrieval QA chain and return its answer.

        :param query: natural-language question about the indexed file
        :return: whatever ``RetrievalQA.run`` returns for the query
        """
        return self.genie.run(query)


# Index the transcript, then ask the QA chain about book recommendations.
genie = Genie("transcript.txt")
books_answer = genie.ask("What books were recommended in this podcast?")
print(books_answer)

 The books recommended in this podcast were "Operation Pedestal" by Max Hastings, "Into the Heart of Romans" by N. T. Wright, and "Man Hunt: The 12-Day Chase for Lincoln's Killer" by James Swanson.


In [79]:
from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [80]:


file_path = "transcript.txt"
loader = TextLoader(file_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = loader.load()
texts = text_splitter.split_documents(documents)

vectorstore = FAISS.from_documents(texts, OpenAIEmbeddings())
# Expose the FAISS index as a retriever so the chain can actually use it.
faiss_retriever = vectorstore.as_retriever()

question = "What books were recommended in this podcast?"

# The prompt needs placeholders: without them neither the retrieved context
# nor the question passed to the chain would reach the model.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

chat = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()


def _join_page_content(docs):
    """Concatenate the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The chain input is the question string: it is sent to the retriever to fetch
# context and passed through unchanged as `question`; the prompt template is
# piped in directly (ChatPromptTemplate is a Runnable and has no `.run`).
chain = (
    {
        "context": faiss_retriever | RunnableLambda(_join_page_content),
        "question": RunnablePassthrough(),
    }
    | chat
    | model
    | StrOutputParser()
)
# Usage: chain.invoke(question)

In [81]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_community.chat_models import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain_community.utils.openai_functions import (
    convert_pydantic_to_openai_function,
)
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain.chains.openai_functions import create_structured_output_runnable


# Pydantic schema for a single book: a title and an author, both strings.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as schema text, so treat them as runtime strings
# rather than as free-form documentation — confirm before rewording them.
class Book(BaseModel):
    """A book."""
    title: str = Field(description="Title of the book")
    author: str = Field(description="Author of the book")

# Pydantic schema wrapping a list of Book entries under the `books` field.
# NOTE(review): as with any schema handed to the model, the docstring and
# Field description are presumably part of the schema text — confirm before
# rewording them.
class Books(BaseModel):
    """A list of books."""
    books: List[Book] = Field(description="A list of books")


# OpenAI function definition derived from the Book model.
# NOTE(review): `openai_functions` is not referenced again in the visible
# cells, and it is built from `Book` rather than `Books` — confirm whether it
# is still needed.
openai_functions = [convert_pydantic_to_openai_function(Book)]


def embeddings(texts: List[Document]):
    """Embed document chunks with OpenAI embeddings and index them in Chroma.

    :param texts: list of Document chunks to index
    :return: the Chroma vector store built from ``texts``
    """
    embedder = OpenAIEmbeddings()
    return Chroma.from_documents(texts, embedder)

# Retrieval-QA chat prompt from the LangChain hub, and the chat model.
prompt = hub.pull("langchain-ai/retrieval-qa-chat")
model = ChatOpenAI()

# Load the transcript, chunk it, and index the chunks in the vector store.
file_path = "transcript.txt"
loader = TextLoader(file_path)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

documents = loader.load()
texts = text_splitter.split_documents(documents)
vectordb = embeddings(texts)
retriever = VectorStoreRetriever(vectorstore=vectordb)

# First build the runnable that makes the model answer as a `Books` object,
# then wrap it so the retriever supplies the context for every question.
structured_chain = create_structured_output_runnable(Books, model, prompt)
chain = create_retrieval_chain(retriever, structured_chain)

question = "What books were recommended in this podcast?"
result = chain.invoke({"input": question})


In [82]:
# Display the raw chain output for inspection.
result

{'input': 'What books were recommended in this podcast?',
 'context': [Document(page_content="Hastings book. He's one of my favorite military historians. It's called Operation pedestal and for you, military history buffs who listen to Ezra really phenomenal storytelling about a pivotal convoy to save Malta in 1942 when it was under siege. Just incredible cast of characters and remarkable level of heroism. And it's a really tremendous book. The next one is a brand new book by N. T. Wright, who's a theologian. And it's called Into the Heart of Romans. And this is for your theology buffs who are Ezra who listened to your show Ezra. And it really is making a really interesting argument that the book of Romans, this pivotal book and the New Testament has been in some important ways misinterpreted. And that a more proper interpretation of Romans is one that actually has a more radical call to virtue. And then the next book is Back to Your History Buffs. And it's called Man Hunt, 12 day chase

In [58]:
# Bind the structured answer to its own name instead of overwriting `result`:
# rebinding `result` to `result['answer']` makes this cell fail when re-run and
# breaks any later cell that still indexes `result['answer']`.
books_answer = result['answer']

In [64]:
# One "<title> by <author>" line per book in the structured answer.
recommended_books = result['answer'].books
for entry in recommended_books:
    print(f"{entry.title} by {entry.author}")

Operation Pedestal by Hastings
Into the Heart of Romans by N. T. Wright
Man Hunt: 12 Day Chase for Lincoln's Killer by James Swanson
