In [1]:

def group_text_by_time_window(result: dict, duration: int, time_window_size: int = 30):
    """
    Group text output from OpenAI whisper into time windows
    :param result: dict, output from OpenAI whisper model
    :param time_window_size: int, size of time window in seconds

    :return: dict, start time in seconds as key, text as value
    """
    # get transcript segments and their start times
    seg_starts = [seg["start"] for seg in result["segments"]]
    seg_text = [seg["text"] for seg in result["segments"]]

    time_windows = {}

    # group text into buckets
    for time, text in zip(seg_starts, seg_text):
        time_window = int(time // time_window_size)

        if time_window not in time_windows:
            time_windows[time_window] = [text]
        else:
            time_windows[time_window].append(text)
    final_times = {}

    # create lists of start and ends times
    starts = [index * time_window_size for index in time_windows.keys()]
    ends = starts[1:] + [duration]

    final_result = []

    # create list of texts with startime and end timex
    for ((index, text), (start, end)) in zip(time_windows.items(), zip(starts, ends)):
        final_result.append({"start": start, "end": end, "text": "".join(text)})

    return final_result


In [2]:
import requests
import os
filename = "audio.mp3"

if not os.path.exists(filename):
    url = "https://nyt.simplecastaudio.com/3026b665-46df-4d18-98e9-d1ce16bbb1df/episodes/dd430a54-e475-46bd-b9fb-97a359c4161c/audio/128/default.mp3/default.mp3_ywr3ahjkcgo_63f2a6a9bc78a0a3100fbc9a815bf42d_62565420.mp3?aid=rss_feed&amp;awCollectionId=3026b665-46df-4d18-98e9-d1ce16bbb1df&amp;awEpisodeId=dd430a54-e475-46bd-b9fb-97a359c4161c&amp;feed=82FI35Px&hash_redirect=1&x-total-bytes=62565420&x-ais-classified=unclassified&listeningSessionID=0CD_382_16__be48a1539bfd29c334b850a7a8cf37e16ec362de"
    
    response = requests.get(url)
    with open(filename, "wb") as file:
        file.write(response.content)

print("File downloaded successfully.")


File downloaded successfully.


In [3]:
import pickle
import os

# Define the filename for pickling
pickle_filename = "transcription_obj.pickle"

# Check if the pickled file exists
if os.path.exists(pickle_filename):
    # Read the pickled file
    with open(pickle_filename, "rb") as file:
        transcription_obj = pickle.load(file)
else:

    import whisper
    model = whisper.load_model("base")
    transcription_obj = model.transcribe("audio.mp3", language='en', without_timestamps=True)
    
    with open(pickle_filename, "wb") as file:
        pickle.dump(transcription_obj, file)



In [4]:
with open('transcript.txt', 'w') as file:
    file.write(transcription_obj['text'])


In [5]:
# # import
# from langchain.text_splitter import CharacterTextSplitter
# from langchain_community.document_loaders import TextLoader
# from langchain_community.embeddings.sentence_transformer import (
#     SentenceTransformerEmbeddings,
# )
# from langchain_community.vectorstores import Chroma

# # load the document and split it into chunks
# loader = TextLoader("transcript.txt")
# documents = loader.load()


# # split it into chunks
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# docs = text_splitter.split_documents(documents)

# # print(len(docs[0].page_content))
# print(docs)



# # # create the open-source embedding function
# # embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# # # load it into Chroma
# # db = Chroma.from_documents(docs, embedding_function)

# # # query it
# # query = "What books were recommended in this podcast?"
# # docs = db.similarity_search(query)

# # print results
# # print(docs[0].page_content)

In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.document_loaders import TextLoader
from typing import List
from langchain.schema import Document
import os
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser


os.environ['OPENAI_API_KEY'] = ""

class Genie:

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.loader = TextLoader(self.file_path)
        self.documents = self.loader.load()
        self.texts = self.text_split(self.documents)
        self.vectordb = self.embeddings(self.texts)
        retriever = VectorStoreRetriever(vectorstore=self.vectordb)
        self.genie = RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever)

    @staticmethod
    def text_split(documents: TextLoader):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)
        return texts

    @staticmethod
    def embeddings(texts: List[Document]):
        embeddings = OpenAIEmbeddings()
        vectordb = Chroma.from_documents(texts, embeddings)
        return vectordb

    def ask(self, query: str):
        return self.genie.run(query)


genie = Genie("transcript.txt")
print(genie.ask("What books were recommended in this podcast?"))

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


 The recommended books were "Operation Pedestal" by Max Hastings, "Into the Heart of Romans" by N.T. Wright, and "Manhunt: The 12-Day Chase for Lincoln's Killer" by James Swanson.


In [11]:
from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI, OpenAIEmbeddings

ImportError: cannot import name 'OpenAIEmbeddings' from 'langchain_community.chat_models' (/usr/local/lib/python3.11/site-packages/langchain_community/chat_models/__init__.py)

In [80]:


file_path = "transcript.txt"
loader = TextLoader(file_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = loader.load()
texts = text_splitter.split_documents(documents)

vectorstore = FAISS.from_documents(texts, OpenAIEmbeddings())

template = """What books were recommended in this podcast?"""

chat = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()

chain = (
    RunnablePassthrough()
    | RunnableLambda(lambda x: chat.run(x))
    | model
    | StrOutputParser()
)

In [7]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_community.chat_models import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain import hub
from langchain_community.utils.openai_functions import (
    convert_pydantic_to_openai_function,
)
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain.chains.openai_functions import create_structured_output_runnable


class Book(BaseModel):
    """A book."""
    title: str = Field(description="Title of the book")
    author: str = Field(description="Author of the book")

class Books(BaseModel):
    """A list of books."""
    books: List[Book] = Field(description="A list of books")


openai_functions = [convert_pydantic_to_openai_function(Book)]


def embeddings(texts: List[Document]):
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma.from_documents(texts, embeddings)
    return vectordb

prompt = hub.pull("langchain-ai/retrieval-qa-chat")
model = ChatOpenAI()


file_path = "transcript.txt"
loader = TextLoader(file_path)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = loader.load()
texts = text_splitter.split_documents(documents)
vectordb = embeddings(texts)

retriever = VectorStoreRetriever(vectorstore=vectordb)
# chain = create_stuff_documents_chain(
#     model, prompt
# )
chain = create_structured_output_runnable(Books, model, prompt)
chain = create_retrieval_chain(retriever, chain)
# chain = create_retrieval_chain(retriever, combine_docs_chain)

result = chain.invoke({"input": "What books were recommended in this podcast?"})


  warn_deprecated(
  warn_deprecated(


In [8]:
result

{'input': 'What books were recommended in this podcast?',
 'context': [Document(page_content="It's a Max Hastings book. He's one of my favorite military historians. It's called Operation pedestal and for you, military history buffs who listened to Ezra really phenomenal storytelling about a pivotal convoy to save Malta in 1942 when it was under siege, just incredible cast of characters and remarkable level of heroism. And it's a really tremendous book. The next one is a brand new book by N. T. Wright, who's a theologian and it's called Into the Heart of Romans. And this is for your theology buffs who are Ezra who listened to your show Ezra. And it really is making a really interesting argument that the book of Romans, this pivotal book and the New Testament has been in some important ways misinterpreted and that a more proper interpretation of Romans is one that actually has a more radical call to virtue. And then the next book is Back to Your History Buffs. And it's called Man Hunt, 1

In [9]:
result = result['answer']

In [10]:
for book in result.books:
    print(f"{book.title} by {book.author}")

Operation Pedestal by Max Hastings
Into the Heart of Romans by N. T. Wright
Man Hunt: 12 Day Chase for Lincoln's Killer by James Swanson


In [32]:
import requests
from bs4 import BeautifulSoup
import re

# Define the base URL for Amazon searches
search_url = "https://www.amazon.com/s?k={}&i=stripbooks"

# Combine the book title and author into a search query
book_title = "Operation Pedestal"
author = "Max Hastings"
query = f"{book_title} {author}"

# add headers to make it look like a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-GPC": "1"
}

# Perform the search with headers
response = requests.get(search_url.format(query.replace(" ", "+")), headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Focus on links within the specified div class
result_list_div = soup.find('div', class_='s-main-slot s-result-list s-search-results sg-row')
if result_list_div:
#     # Extract all relevant links
    links = result_list_div.find_all('a', class_="a-size-base", href=True)
    links = [link for link in links if link.contents and link.contents[0] in ["Paperback", "Hardcover"]]

    if links:
        book_link = "https://www.amazon.com" + links[0]['href']

        # Follow the first book link (as an example)
        detail_response = requests.get(book_link, headers=headers)
        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

        # Find and extract book title
        title = detail_soup.find(id='productTitle').text.strip()

        # Find and extract book author
        author_link = detail_soup.find(id='bylineInfo')
        author = author_link.find('a').text.strip() if author_link else None

        isbn10 = None
        isbn13 = None
        for item in detail_soup.find_all('li'):
            if "ISBN-10" in item.text:
                isbn10 = re.sub("[^0-9-]", "", item.text.split(':')[-1].strip())
            elif "ISBN-13" in item.text:
                isbn13 = re.sub("[^0-9-]", "", item.text.split(':')[-1].strip())
                
        image_link = detail_soup.find('img', id="landingImage")['data-old-hires']

        print({
            'isbn10': isbn10,
            'isbn13': isbn13,
            'title': title,
            'author': author,
            'image_link': image_link
        })
    else:
        print("Book link not found.")
else:
    print("Result list div not found.")

{'isbn10': '0063341085', 'isbn13': '978-0063341081', 'title': "Operation Biting: The 1942 Parachute Assault to Capture Hitler's Radar", 'author': 'Max Hastings', 'image_link': 'https://m.media-amazon.com/images/I/81HIJ+6qHWL._SL1500_.jpg'}


In [30]:
import re
soup = detail_soup


# Find and extract book title
title = soup.find(id='productTitle').text.strip()

# Find and extract book author
author_link = soup.find(id='bylineInfo')
author = author_link.find('a').text.strip() if author_link else None

# Find and extract high-resolution book image
img_div = soup.find(id='imgBlkFront')
# high_res_image = img_div.get('data-a-dynamic-image')

{
    'isbn10': isbn10,
    'isbn13': isbn13,
    'title': title,
    'author': author,
    'high_res_image_link': high_res_image_link
}

{'isbn10': '0062980149',
 'isbn13': '978-0062980144',
 'title': 'Operation Pedestal: The Fleet That Battled to Malta, 1942',
 'author': 'Max Hastings'}

In [59]:
links = result_list_div.find_all('a', class_="a-link-normal s-underline-text", href=True)
links

[]

In [40]:
response.request.headers

{'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Connection': 'keep-alive', 'Accept-Language': 'en-US,en;q=0.5', 'DNT': '1', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'same-origin', 'Sec-GPC': '1'}

In [44]:
import requests
from bs4 import BeautifulSoup
import re

AMAZON_SEARCH_URL = "https://www.amazon.com/s?k={}&i=stripbooks"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-GPC": "1"
}

def perform_amazon_search(book_title, author):
    """Searches Amazon for a book and returns the first relevant link."""
    query = f"{book_title} {author}".replace(" ", "+")
    search_response = requests.get(AMAZON_SEARCH_URL.format(query), headers=HEADERS)
    search_soup = BeautifulSoup(search_response.text, 'html.parser')

    result_div = search_soup.find('div', class_='s-main-slot s-result-list s-search-results sg-row')
    return get_first_book_link(result_div)

def get_first_book_link(result_div):
    """Extracts the first relevant book link from Amazon search results."""
    if not result_div:
        return None

    links = result_div.find_all('a', class_="a-size-base", href=True)
    filtered_links = [link for link in links if link.contents and link.contents[0] in ["Paperback", "Hardcover"]]

    if filtered_links:
        return "https://www.amazon.com" + filtered_links[0]['href']
    else:
        return None

def extract_book_details(book_url):
    """Fetches book details from a given Amazon book page."""
    detail_response = requests.get(book_url, headers=HEADERS)
    detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

    return {
        'isbn10': extract_isbn(detail_soup, "ISBN-10"),
        'isbn13': extract_isbn(detail_soup, "ISBN-13"),
        'title': extract_title(detail_soup),
        'author': extract_author(detail_soup),
        'image_link': extract_image_link(detail_soup)
    }

def extract_isbn(soup, isbn_type):
    """Extracts either ISBN-10 or ISBN-13 from the detail page."""
    isbn = None
    for item in soup.find_all('li'):
        if isbn_type in item.text:
            isbn = re.sub("[^0-9-]", "", item.text.split(':')[-1].strip())
    return isbn

def extract_title(soup):
    title_element = soup.find(id='productTitle')
    return title_element.text.strip() if title_element else None

def extract_author(soup):
    author_link = soup.find(id='bylineInfo')
    return author_link.find('a').text.strip() if author_link else None

def extract_image_link(soup):
    image_element = soup.find('img', id="landingImage")
    return image_element['data-old-hires'] if image_element else None


# book_title = "Operation Pedestal"
# author = "Max Hastings"

# book_link = perform_amazon_search(book_title, author)
# if book_link:
#     book_details = extract_book_details(book_link)
#     print(book_details)
# else:
#     print("Book link not found.")

# write a function to clean the book_url by removing all query params
def clean_book_link(book_link):
    link_without_query = book_link.split('?')[0]
    clean_link = re.sub(r'/ref=[^/]+', '', link_without_query)
    return clean_link

def get_book_from_amazon(book_title, author):
    book_link = perform_amazon_search(book_title, author)
    book_link = clean_book_link(book_link)
    if book_link:
        return {'book_url': book_link, **extract_book_details(book_link)}
    else:
        return None


In [45]:
for book in result.books:
    print(f"{book.title} by {book.author}")
    print(get_book_from_amazon(book.title, book.author))
    break

Operation Pedestal by Max Hastings
{'book_url': 'https://www.amazon.com/Operation-Pedestal-Fleet-Battled-Malta/dp/0062980149', 'isbn10': '0062980149', 'isbn13': '978-0062980144', 'title': 'Operation Pedestal: The Fleet That Battled to Malta, 1942', 'author': 'Max Hastings', 'image_link': 'https://m.media-amazon.com/images/I/81PCJyPYeuL._SL1500_.jpg'}
