# Langchain Multi-doc retriever with ChromaDB

In [1]:
from dotenv import load_dotenv
import os

# .env dosyasını yükle
load_dotenv()

# Ortam değişkenini al
openai_api_key = os.getenv('OPENAI_API_KEY')

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, DirectoryLoader

## Load some files for a source

In [3]:
# ! wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
# ! unzip -q new_articles.zip -d new_articles

# burdan bir dropbox kaynagondan kullanmak icin rastgele dosyalr indirdik .txt formatinda

# Load multiple and process documents

In [4]:
# Load and process the text files
# loader = TExtloader('single_text_file.txt')

loader = DirectoryLoader('./new_articles/', glob='./*txt', loader_cls=TextLoader) # pdf varsa PDFloader gibi biseyde var ama netten bak)

documents = loader.load()
documents[1]

# burda tüm textleri bir yere topluyor. 
# mesela documents[1] bunu yapinca new_articles dosysindan bir dosyanin tamamini getirdi

Document(page_content='The best way to avoid a down round is to found an AI startup\n\nAs we see unicorns slash staff and the prevalence of down rounds spike, it may seem that the startup ecosystem is chock-full of bad news and little else. That’s not precisely the case.\n\nWhile AI, and in particular the generative AI subcategory, are as hot as the sun, not all venture attention is going to the handful of names that you already know. Sure, OpenAI is able to land nine and 10-figure rounds from a murderer’s row of tech investors and mega-cap corporations. And rising companies like Hugging Face and Anthropic cannot stay out of the news, proving that smaller AI-focused startups are doing more than well.\n\nIn fact, new data from Carta, which provides cap table management and other services, indicates that AI-focused startups are outperforming their larger peer group at both the seed and Series A stage.\n\nThe dataset, which notes that AI-centered startups are raising more and at higher va

In [5]:
# splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents) 
# burda bütün metinlerin toplami var ama
# ama chunklara ayrilmis durumda. her 1000 karakter 1 chunk oluyor

len(texts) #yani toplamda 233 chunks oldu

233

# Create a ChromaDB

In [6]:
# ! pip install chromadb

In [7]:
# Embed and store the text
# Supplying a persist directory will store the embeddings on disk
persist_directory = 'db'

embedding = OpenAIEmbeddings()   
# OpenAI embeddings modelini kullaniyoruz. onun kendine ait embeddings kümesi var ve tokenizer
# bununla PDF ler ele aliniyor ve anlamlari ele aliniyor
 
vectordb = Chroma.from_documents(documents = texts, # bizim PDF lerin toplami
                                 embedding = embedding, 
                                 persist_directory = persist_directory)
                                 # owerwrite parametreside var. True ise m,evcut verilerin üstüne yazar, False ise yeni bir database olusturur

# burda texts embeddings ile vectörlere cevrilir ve Chroma db e kaydedilir
# ve calistirinca bir 'db' adinda bir database olusturulur ve icine kaydedilir
# tüm belgelerimizi kodladi yani Openai embeddinginden gecti.database'e kaydedilme burda yapiliyor

In [8]:
# persiste the db to disk
vectordb.persist()  # bu vector database diske kaydeder. yani datalari bir sürü diske kaydeder. database in boyutunu belirler ve icindekileri diske yazar
vectordb = None # yukarida diske kaydedildigi icin bu degiskenin artik kullanilmayacagini gösterir.

# vector database size'ini belirlemek icin vector database 'collection' ve documents özelliklerini kullanir
# documents bizim kullandigimiz degisken icinde tüm textin oldugu

# bunlari kaydedersekiyi olur. cünkü kaydetmezsek bu uygulamayi her actigimizda tekrardan Openai embeddingi yapmak gerekecek
# kaydedersek ordan direkt kullanilir. 
# aslinda bu kaydetme islemini bir kere yazariz ve owerwrite parametresini ona göre ayarlayabiliriz
# 

In [9]:
# now we can load the persisted database from disk and use it as normal
vectordb = Chroma(persist_directory = persist_directory,
                  embedding_function = embedding)

# daha önce diske kaydettigimiz Chromadb yi tekrardan kullanmak icin yapiyoruz

### make a retriever

In [None]:
# burda aslinda byte yapmak istedigi sey . ilgili sorunun dosyasini buluyor.
# hem cevabin oldugu metni veriyor ve en altta dosyanin adinida veriyor
# burda metinler chunks lar halinde geliyor. her 1000 charakter 1 chunks oluyor

In [10]:
retriever = vectordb.as_retriever()   # bu database'i bir arama motoruna cevirir. ve metin aramsi yapmak icin kullanilir
# bir üsttekiyle retriever baglantili hale getiriliyor. baglantili haliyle bir asagida yeni bir function icinde kullanilabiliyor
docs = retriever.get_relevant_documents('How much money did Pando raise?') # bu bir liste döndürür. burda bir degisken adi yazip, yularida bu degisken adinin oldugu yere soruyu yazabiliriz, daha okunakli olur
docs

# bu retriever asagida chain olusturulken kullanacagiz

[Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding.”', metadata={'source': 'new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investm

In [11]:
len(docs)
# burdanda 4 tane chunks oldugunu görebiliyorum

4

In [12]:
retriever = vectordb.as_retriever(search_kwargs = {'k' : 2})
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7fe2944fe2e0>, search_kwargs={'k': 2})

In [13]:
retriever.search_type  # burda benzerlik aramasi yaptigomo görebiliyorum

'similarity'

In [14]:
retriever.search_kwargs

{'k': 2}

### Make a chain

In [None]:
# yukarida retriever ile ilgili kisimlari yaptik simdi chain yapacagiz ama tam olarak ne yapacagimizi tam bilmiyorum

In [18]:
# creta the chain to answer questions
# burda bir tanimlama yaoacagiz ve asagida sorgu yaparken kullanacagiz

qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(),
                                        chain_type='stuff',
                                        retriever=retriever,
                                        return_source_documents = True)

# qa_chain = bunu yazdirirsak icinde Retrival ile bilgileri yazdiriyor
# openAi killandigimizi gösteriyor ve open_key gibi biligler veriyor

In [20]:
query = 'How much money did Pando raise?'

qa_chain(query)
# burda gelen cevabin icinde aslinda nelerin oldugunu görebiliyoruz
# query
# result
# source_documents

{'query': 'How much money did Pando raise?',
 'result': ' Pando raised $30 million in a Series B round.',
 'source_documents': [Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions

In [21]:
# ciktilari daha güzel görmek icin bir funktion yazacagiz

def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response['source_documents']:
        print(source.metadata['source'])

In [22]:
# example with function

query = 'How much money did Pando raise?'
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Pando raised $30 million in a Series B round, bringing its total raised to $45 million.


Sources:
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
