In [37]:
# %pip install llama-index
# %pip install python-dotenv
# %pip install chromadb
# %pip install openai
# %pip install langchain_chroma
# %pip install langchain
# %pip install langchain_openai
# %pip install langchain_community

In [1]:
from dotenv import load_dotenv

# Read key=value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the
# clients created further down comes from — confirm.
load_dotenv()

True

# 1. Download Data

In [2]:
# !mkdir -p 'data/paul_graham/'
# !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

--2024-09-18 10:45:10--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-09-18 10:45:11 (4.36 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



# 2. No framework

In [28]:
import chromadb
from openai import OpenAI

Load data

In [29]:
# Read the whole essay into memory as one string.
# The context manager closes the handle as soon as the text has been read
# (a bare open() would keep it open for the lifetime of the kernel), and the
# explicit encoding keeps the result independent of the platform default.
with open("./data/paul_graham/paul_graham_essay.txt", "r", encoding="utf-8") as file:
    doc = file.read()

Split document

In [30]:
# Cut the essay into back-to-back 5000-character slices (no overlap); the
# final slice simply holds whatever is left over.
splitted_docs = [
    doc[offset:offset + 5000]
    for offset in range(0, len(doc), 5000)
]

Retriever

In [31]:
class Retriever:
    """Minimal text retriever backed by a single Chroma collection.

    Typical use: construct it with a collection name, call
    ``add_documents`` once to bind (and, if needed, populate) the collection,
    then call ``query`` to fetch the stored texts closest to a question.
    """

    def __init__(self, collection_name: str) -> None:
        """Create the Chroma client and remember the collection name.

        Args:
            collection_name: Name of the collection that ``add_documents``
                creates or reuses.
        """
        self.client = chromadb.Client()
        self.collection_name = collection_name
        # Stays None until add_documents() has bound a collection.
        self.collection = None

    def add_documents(self, docs: list[str]) -> None:
        """Bind the collection and index ``docs`` into it if it is still empty.

        The collection is fetched or created in one call instead of relying
        on a bare ``except`` around ``create_collection``: that pattern also
        swallowed genuine failures of ``add`` (and KeyboardInterrupt) and
        reported them as "Collection already exists", leaving an empty
        collection behind.

        Args:
            docs: Text chunks to index. Each chunk gets its list position
                (as a string) as id. An empty list binds the collection
                without adding anything.
        """
        self.collection = self.client.get_or_create_collection(self.collection_name)

        # A populated collection means an earlier run already indexed the
        # chunks; skip re-adding so re-running the cell stays cheap.
        if self.collection.count() > 0:
            print("Collection already exists")
            return

        print("Creating collection")
        if docs:
            self.collection.add(
                documents=docs,
                ids=[str(i) for i in range(len(docs))],
            )

    def query(self, input: str, n_results: int = 2) -> list[str]:
        """Return the stored texts most similar to ``input``.

        Args:
            input: Query text. (The name shadows the ``input`` builtin but is
                kept so existing keyword callers keep working.)
            n_results: Maximum number of texts to return; defaults to 2.

        Returns:
            The matching document texts for the single query.

        Raises:
            RuntimeError: If ``add_documents`` has not been called yet, so no
                collection is bound.
        """
        if self.collection is None:
            raise RuntimeError(
                "No collection is bound yet; call add_documents() before query()."
            )

        results = self.collection.query(
            query_texts=[input],
            n_results=n_results,
        )
        # One query text was sent, so the hits are in the first (only) entry.
        return results["documents"][0]

In [32]:
# Build the no-framework retriever and hand it the essay chunks, stored under
# the "paul_graham" collection.
retriever = Retriever(collection_name="paul_graham")
retriever.add_documents(docs=splitted_docs)

Creating collection


LLM

In [33]:
# OpenAI SDK client used by noframework_chatbot for the chat completion call.
# NOTE(review): no key is passed here, so credentials presumably come from
# the environment populated by load_dotenv() — confirm.
client = OpenAI()

RAG

In [34]:
def noframework_chatbot(question: str) -> str:
    """Answer ``question`` with a hand-rolled retrieve-then-generate flow.

    The module-level ``retriever`` supplies the chunks closest to the
    question. They are concatenated, each followed by a newline, into the
    ``<context>`` section of a single user message, which is sent to
    ``gpt-4o-mini`` through the module-level OpenAI ``client``.

    Args:
        question: The question to answer.

    Returns:
        The message content of the first completion choice.
    """
    passages = retriever.query(question)

    # Every passage, including the last, is terminated by a newline.
    context = "".join(passage + "\n" for passage in passages)

    prompt = f"""
    <context>
    {context}
    </context>

    Based on the context provided, answer the following question:
    Question: {question}
    """

    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# 3. LlamaIndex

In [35]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI as OpenAILlama

Load data

In [36]:
# Point LlamaIndex's directory reader at the essay folder, then load its
# contents.
reader = SimpleDirectoryReader(input_dir="./data/paul_graham/")
data = reader.load_data()

Retriever

In [37]:
# Build a vector index over the loaded documents.
# NOTE(review): no embedding model or chunking is configured here, so
# LlamaIndex's defaults apply (presumably OpenAI embeddings) — confirm.
index = VectorStoreIndex.from_documents(data)

LLM

In [38]:
# LlamaIndex's OpenAI LLM wrapper, set to the same model name
# ("gpt-4o-mini") that the other two pipelines use.
llm = OpenAILlama(model="gpt-4o-mini")

RAG

In [39]:
# Chat engine over the index, in "context" chat mode, answering with `llm`.
# NOTE(review): "context" mode presumably retrieves from the index for each
# message and injects the hits into the prompt — confirm against the
# LlamaIndex chat engine documentation.
llamaindex_chatbot = index.as_chat_engine(chat_mode="context", llm=llm, verbose=True)

# 4. LangChain

In [40]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

Load data

In [41]:
# Load the same essay file through LangChain's TextLoader.
# NOTE(review): this rebinds `doc`, which section 2 used for the raw essay
# string; re-running the section 2 split cell after this one would slice the
# loader's result instead of that string.
loader = TextLoader("./data/paul_graham/paul_graham_essay.txt")
doc = loader.load()

Split document

In [42]:
# Split the loaded document(s) with a 5000-character chunk size and no
# overlap.
# NOTE(review): CharacterTextSplitter presumably splits on a separator rather
# than at fixed offsets, so these chunks need not line up with the fixed
# 5000-character slices from section 2 — confirm.
# NOTE(review): this rebinds `splitted_docs` from section 2 (a list of str).
text_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
splitted_docs = text_splitter.split_documents(doc)

Retriever

In [43]:
# Embedding model handed to the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Chroma-backed LangChain vector store for the essay chunks.
vectorstore = Chroma(
    collection_name="paul_graham_langchain", embedding_function=embeddings
)

vectorstore.add_documents(documents=splitted_docs)

# The result count has to be passed to as_retriever(): assigning
# `vectorstore.search_kwargs` only sets an attribute on the store that the
# retriever never reads, so the retriever would silently use its default k
# instead of the 2 chunks the no-framework pipeline retrieves.
retriever_lang = vectorstore.as_retriever(search_kwargs={"k": 2})

LLM

In [44]:
# LangChain chat model used as the generation step of rag_chain.
# NOTE(review): this rebinds `llm`, which section 3 bound to the LlamaIndex
# wrapper; the chat engine built there received the earlier object as an
# argument, but re-running that cell after this one would pass this one.
llm = ChatOpenAI(model="gpt-4o-mini")

RAG

In [45]:
# Fetch the "rlm/rag-prompt" template from the LangChain Hub; rag_chain fills
# it with the `context` and `question` keys.
# NOTE(review): this presumably needs network access at run time — confirm.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents in their original order, separated by one blank
        line; an empty string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Retrieval-augmented chain, composed with the `|` operator. The input
# question feeds two keys: "context" (retriever hits flattened to text by
# format_docs) and "question" (the input passed through unchanged). That
# mapping fills `prompt`, the prompt goes to `llm`, and StrOutputParser
# reduces the model reply to a plain string.
rag_chain = (
    {"context": retriever_lang | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



# 5. Experiment

In [46]:
# Single test question put to all three pipelines in the cells below.
question = "What did Paul Graham work on before college?"

No framework

In [47]:
# Run the hand-rolled pipeline on the test question.
# NOTE(review): the recorded answer describes grad school and Viaweb rather
# than pre-college work, which suggests the two retrieved chunks did not
# contain the relevant passage.
print(noframework_chatbot(question))

Before college, Paul Graham was involved in various activities related to computer science and programming. He was working on his dissertation in a PhD program, specifically dealing with Lisp hacking and a project titled "On Lisp." Additionally, he had experience as a consultant during his grad school years, which provided him with financial support and practical experience. He was also engaged in the development of ecommerce software with collaborators Robert Morris and Trevor Blackwell, leading to the creation of Viaweb, an early online store builder.


LlamaIndex

In [48]:
# Run the LlamaIndex chat engine on the test question.
# NOTE(review): a chat engine presumably keeps conversation history, so
# re-running this cell may not be strictly idempotent — confirm.
print(llamaindex_chatbot.chat(question))

Before college, Paul Graham primarily worked on writing and programming. He wrote short stories, which he described as being poorly constructed with hardly any plot, focusing instead on characters with strong feelings. In terms of programming, he began writing programs on the IBM 1401 used by his school district for data processing when he was around 13 or 14 years old. He later transitioned to working with microcomputers, starting with a TRS-80 that his father eventually bought for him. During this time, he wrote simple games, a program to predict the flight of model rockets, and a word processor that his father used to write a book.


LangChain

In [49]:
# Run the LangChain pipeline on the test question; the chain ends in
# StrOutputParser, so the result is printed as plain text.
print(rag_chain.invoke(question))

Before college, Paul Graham primarily worked on writing and programming. He wrote short stories and began programming on an IBM 1401 in 9th grade, later moving on to microcomputers like the TRS-80, where he created simple games and a word processor. His early experiences in both writing and programming laid the groundwork for his future endeavors in technology and online essays.
