In [None]:
# Import necessary libraries
import os
import dotenv
import urllib.request
from langchain_openai import ChatOpenAI, OpenAI
from langchain_community.document_loaders import PyPDFLoader, JSONLoader
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders.generic import GenericLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.agents import AgentExecutor, create_react_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain import hub
from langchain.memory import ConversationSummaryBufferMemory
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain.tools.retriever import create_retriever_tool

# Read the variables defined in a .env file into the process environment
dotenv.load_dotenv()

# Two OpenAI handles, both at temperature 0.0:
#   chat_model - chat-completion model, passed to create_react_agent further down
#   llm_model  - completion model, passed to the conversation memory further down
chat_model = ChatOpenAI(
    temperature=0.0,
    model="gpt-3.5-turbo-0125",
)
llm_model = OpenAI(
    temperature=0.0,
)

In [None]:
# Load the dissertation PDF from disk
pdf_loader = PyPDFLoader("docs/dissertation.pdf")
pdf_pages = pdf_loader.load()

# Split the loaded pages into chunks of at most 1000 characters, overlapping by 100
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
pdf_splits = text_splitter.split_documents(pdf_pages)

# Embed the chunks and index them in a dedicated Chroma collection.
# An explicit collection_name is required here: without it the store goes into
# Chroma's default collection, and in-memory stores created in the same kernel
# with the default name can end up sharing that collection, so a retriever built
# on one source may return chunks from another.
# NOTE(review): re-running this cell in the same kernel presumably adds the
# chunks to the collection again - restart the kernel before re-indexing.
pdf_vector_store = Chroma.from_documents(
    documents=pdf_splits,
    embedding=OpenAIEmbeddings(),
    collection_name="dissertation",
)

# MMR retrieval: return 4 chunks chosen from 6 fetched candidates
pdf_retriever = pdf_vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 6},
)

# Expose the retriever to the agent as a tool named "search_dissertation"
pdf_retriever_tool = create_retriever_tool(
    pdf_retriever,
    "search_dissertation",
    "Searches and returns excerpts from the dissertation thesis on photoproduction measurements of ALICE detector at the LHC.",
)


In [None]:
# Fetch the video's audio into youtube_save_dir and transcribe it with the
# OpenAI Whisper parser. Only the save directory for the audio loader is given;
# the transcript itself is kept in memory (docs_youtube) and is not written to
# disk by this cell.
youtube_url = "https://www.youtube.com/watch?v=KwR3nxojS0g"
youtube_save_dir = "docs/youtube/"
youtube_loader = GenericLoader(
    YoutubeAudioLoader([youtube_url], youtube_save_dir),
    OpenAIWhisperParser()
)
docs_youtube = youtube_loader.load()

# Split the transcript into chunks of at most 1000 characters, overlapping by 100
transcript_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
youtube_splits = transcript_splitter.split_documents(docs_youtube)

# Embed the chunks and index them in a dedicated Chroma collection.
# An explicit collection_name is required here: without it the store goes into
# Chroma's default collection, and in-memory stores created in the same kernel
# with the default name can end up sharing that collection, so this retriever
# could return chunks that belong to a different source.
# NOTE(review): re-running this cell in the same kernel presumably adds the
# chunks to the collection again - restart the kernel before re-indexing.
youtube_vector_store = Chroma.from_documents(
    documents=youtube_splits,
    embedding=OpenAIEmbeddings(),
    collection_name="youtube_transcript",
)

# MMR retrieval: return 4 chunks chosen from 6 fetched candidates
youtube_retriever = youtube_vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 6},
)

# Expose the retriever to the agent as a tool named "search_YT_video"
youtube_retriever_tool = create_retriever_tool(
    youtube_retriever,
    "search_YT_video",
    "Searches and returns excerpts from the YouTube video 'How to Invest New Cash? Dollar Cost Averaging vs. Lump Sum Investing' by Ben Felix"
)


In [None]:
# Wikipedia lookup tool: the wrapper is configured for a single top result
# and a content cap of 100 characters, then handed to the query-run tool.
wiki_api_wrapper = WikipediaAPIWrapper(
    doc_content_chars_max=100,
    top_k_results=1,
)
wiki_tool = WikipediaQueryRun(
    api_wrapper=wiki_api_wrapper,
)


In [None]:
# Pull the agent prompt from the LangChain hub
prompt = hub.pull("tomas-herman/react-chat")

# Everything the agent is allowed to call: both retriever tools, the Wikipedia
# tool, and a Tavily web search limited to one result
tools = [
    youtube_retriever_tool,
    pdf_retriever_tool,
    wiki_tool,
    TavilySearchResults(max_results=1),
]

# Summarising conversation memory, stored under the "chat_history" key;
# llm_model is the model handed to it, with a 100-token limit
conversation_memory = ConversationSummaryBufferMemory(
    llm=llm_model,
    max_token_limit=100,
    return_messages=True,
    memory_key="chat_history",
)

# Build the ReAct agent and wrap it in an executor (nothing is run in this cell)
agent = create_react_agent(chat_model, tools, prompt)
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    memory=conversation_memory,
    handle_parsing_errors=True,
)

In [None]:
# Question about the dissertation: which measurement it reports
agent_executor.invoke(
    {"input": "Which measurement is reported in the dissertation thesis?"}
)

In [None]:
# Question about the dissertation: which theoretical models it discusses
agent_executor.invoke(
    {"input": "Which theoretical models are discussed in the dissertation thesis?"}
)

In [None]:
# Investing question (the topic of the indexed YouTube video)
agent_executor.invoke(
    {"input": "What is the best way to invest a large sum of money?"}
)

In [None]:
# Question about the dissertation: who its author is
agent_executor.invoke(
    {"input": "Who is the author of the dissertation thesis?"}
)