# Langchain Multi-doc retriever with ChromaDB

In [1]:
from dotenv import load_dotenv
import os

# Read the variables defined in the local .env file into the process environment.
load_dotenv()

# Look up the OpenAI API key; the result is None when the variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, DirectoryLoader

## Download some files to use as the source

In [3]:
# ! wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip
# ! unzip -q new_articles.zip -d new_articles

# The commands above download a set of sample articles in .txt format from Dropbox to use as the source documents.

# Load and process multiple documents

In [5]:
# Load and process the text files.
# To load a single file, use TextLoader directly instead:
# loader = TextLoader('single_text_file.txt')

# Read the .txt files in ./new_articles/ with TextLoader.
# The pattern is '*.txt' (with the dot) so that only files carrying the .txt
# extension match; a bare '*txt' would also match any name that merely ends in "txt".
# For other formats (e.g. PDF) pass a different loader class as loader_cls.
loader = DirectoryLoader('./new_articles/', glob='*.txt', loader_cls=TextLoader)

# All texts are gathered into one list; each entry holds the full content of one file.
documents = loader.load()

# Show the second loaded document as a sanity check (the whole article is displayed).
documents[1]

Document(page_content='The best way to avoid a down round is to found an AI startup\n\nAs we see unicorns slash staff and the prevalence of down rounds spike, it may seem that the startup ecosystem is chock-full of bad news and little else. That’s not precisely the case.\n\nWhile AI, and in particular the generative AI subcategory, are as hot as the sun, not all venture attention is going to the handful of names that you already know. Sure, OpenAI is able to land nine and 10-figure rounds from a murderer’s row of tech investors and mega-cap corporations. And rising companies like Hugging Face and Anthropic cannot stay out of the news, proving that smaller AI-focused startups are doing more than well.\n\nIn fact, new data from Carta, which provides cap table management and other services, indicates that AI-focused startups are outperforming their larger peer group at both the seed and Series A stage.\n\nThe dataset, which notes that AI-centered startups are raising more and at higher va

In [6]:
# Break the loaded documents into overlapping chunks:
# chunk_size=1000 and chunk_overlap=200, both measured in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# `texts` holds the chunks of all documents together in a single list.
texts = text_splitter.split_documents(documents)

# Total number of chunks (233 in the recorded run).
len(texts)

233

# Create a ChromaDB

In [None]:
# Uncomment to install the chromadb package if it is not already available in the kernel environment.
# ! pip install chromadb

In [10]:
# Embed the chunks and store them in a Chroma vector store.
# Supplying a persist directory keeps the embeddings on disk.
persist_directory = 'db'

# OpenAI embedding model used to turn each text chunk into a vector.
embedding = OpenAIEmbeddings()

# Chroma.from_documents adds to whatever collection already lives in
# persist_directory, so re-running this cell would embed every chunk again and
# store duplicates. Reuse an existing non-empty store and only build it when it
# is missing. Delete the 'db' directory to force a rebuild after the source
# documents change.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    # `texts` are the chunks produced from the loaded .txt articles; they are
    # converted to vectors with `embedding` and saved in the 'db' directory.
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)