In [1]:
import os

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment;
# override=True lets values from the file replace variables that are already set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

from langchain_openai import ChatOpenAI

# Chat model client. Both the API key and the endpoint URL are read from the
# environment; a variable that is unset is passed through as None.
chat = ChatOpenAI(
    openai_api_base=os.getenv('CHATGPT_API_ENDPOINT'),
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)



In [2]:
from langchain.document_loaders import ReadTheDocsLoader

In [3]:
# Build a loader over the "htmldocs" path and read every document it yields.
docs_dir = "htmldocs"
loader = ReadTheDocsLoader(docs_dir)
docs = loader.load()

In [5]:
# Preview the first 500 characters of the first loaded document.
first_page_preview = docs[0].page_content[:500]
print(first_page_preview)

langchain.indexes.vectorstore.VectorstoreIndexCreator¶
class langchain.indexes.vectorstore.VectorstoreIndexCreator[source]¶
Bases: BaseModel
Logic for creating indexes.
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError if the input data cannot be parsed to form a valid model.
param embedding: Embeddings [Optional]¶
param text_splitter: TextSplitter [Optional]¶
param vectorstore_cls: Type[VectorStore] = <class 'langchain_community.vectorstores.


In [6]:
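# gpt-3.5-turbo has a 4096-token context window, which must hold the input
# (instruction + query + retrieved context) plus the output.
# If about 2000 tokens are budgeted for retrieved context and 5 chunks are retrieved:
#     chunk size = 2000 / 5 = 400 tokens

# So chunk size <= 400 tokens.

# Chunks that are too small carry too little meaning.
# Chunks that are too big are inefficient.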

In [7]:
import tiktoken

In [8]:
# Look up the tiktoken encoding associated with the chat model; the bare name
# on the last line displays the encoding object as the cell output.
model_name = "gpt-3.5-turbo"
tokenizer = tiktoken.encoding_for_model(model_name)
tokenizer

<Encoding 'cl100k_base'>

In [10]:
def token_count(text):
    """Return how many tokens `text` encodes to with the notebook-level `tokenizer`.

    `disallowed_special=()` disables the encoder's special-token check, so text
    that happens to contain special-token strings is encoded as ordinary text
    instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [15]:
# Token count of every loaded document, in load order.
tokens = [token_count(text) for text in (d.page_content for d in docs)]
tokens

[1538, 1605]

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [17]:
# Token-aware splitter: chunk length is measured with token_count rather than
# characters, capped at 400 with an overlap of 20 between neighbouring chunks.
# Separators are listed from coarsest to finest: blank line, newline, space,
# then the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=token_count,
    chunk_size=400,
    chunk_overlap=20,
)

In [19]:
# Split only the first document's text and show how many chunks came out.
first_doc_text = docs[0].page_content
chunks = text_splitter.split_text(first_doc_text)
len(chunks)

5

In [21]:
# Token count of every chunk, as a tuple. Iterating over `chunks` instead of
# hard-coding indices 0-4 keeps the cell working (and complete) when the
# splitter yields fewer or more than five chunks.
tuple(token_count(chunk) for chunk in chunks)

(383, 373, 345, 376, 105)

In [22]:
# Display the full list of chunk strings produced by the splitter.
chunks

["langchain.indexes.vectorstore.VectorstoreIndexCreator¶\nclass langchain.indexes.vectorstore.VectorstoreIndexCreator[source]¶\nBases: BaseModel\nLogic for creating indexes.\nCreate a new model by parsing and validating input data from keyword arguments.\nRaises ValidationError if the input data cannot be parsed to form a valid model.\nparam embedding: Embeddings [Optional]¶\nparam text_splitter: TextSplitter [Optional]¶\nparam vectorstore_cls: Type[VectorStore] = <class 'langchain_community.vectorstores.inmemory.InMemoryVectorStore'>¶\nparam vectorstore_kwargs: dict [Optional]¶\nasync afrom_documents(documents: List[Document]) → VectorStoreIndexWrapper[source]¶\nCreate a vectorstore index from documents.\nParameters\ndocuments (List[Document]) – \nReturn type\nVectorStoreIndexWrapper\nasync afrom_loaders(loaders: List[BaseLoader]) → VectorStoreIndexWrapper[source]¶\nCreate a vectorstore index from loaders.\nParameters\nloaders (List[BaseLoader]) – \nReturn type\nVectorStoreIndexWrappe

In [23]:
from langchain_openai import OpenAIEmbeddings

In [28]:
# Embedding client; its base URL comes from the CHATGPT_API_ENDPOINT variable.
# NOTE(review): the triple-"d" spelling is kept because later cells use this name.
embedddings = OpenAIEmbeddings(base_url=os.getenv('CHATGPT_API_ENDPOINT'))

In [29]:
# Three short sample sentences used to compare embedding vectors.
sentences1, sentences2, sentences3 = (
    "Hello, my name is John.",
    "I am a software engineer.",
    "I love to code.",
)

In [35]:
# Embed each sample sentence with its own embed_query call, in order.
embeddding1, embeddding2, embeddding3 = (
    embedddings.embed_query(sentence)
    for sentence in (sentences1, sentences2, sentences3)
)

In [33]:
import numpy as np

In [36]:
# Dot product of the first and second sentence embeddings.
# NOTE(review): this equals cosine similarity only if the vectors are unit-length — confirm.
similarity_1_2 = np.dot(embeddding1, embeddding2)
similarity_1_2

np.float64(0.8136125713435753)

In [37]:
# Dot product of the first and third sentence embeddings.
# NOTE(review): this equals cosine similarity only if the vectors are unit-length — confirm.
similarity_1_3 = np.dot(embeddding1, embeddding3)
similarity_1_3

np.float64(0.7818426246123964)

In [55]:
import chromadb

# Clear chromadb's shared system-client cache before the vector store is built
# (presumably so a re-run does not reuse a stale client — confirm).
shared_client_cls = chromadb.api.client.SharedSystemClient
shared_client_cls.clear_system_cache()

In [56]:
from langchain.vectorstores import Chroma

In [57]:
# Directory passed to Chroma as the on-disk location of the vector store.
persist_directory: str = "./db"

In [58]:
# Remove any previously persisted store so the index is rebuilt from scratch.
# shutil.rmtree replaces the `!rm -rf` shell escape: it works on every platform
# and never passes the path through a shell. ignore_errors=True keeps the
# "no error if the directory is missing" behaviour of `rm -rf`.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [59]:
# Wrap the chunk strings in Document objects for the vector store.
# NOTE(review): create_documents runs the splitter over each string again; the
# chunks already fit the 400-token limit, so this presumably leaves them intact — confirm.
doc_chunks = text_splitter.create_documents(texts=chunks)

In [60]:
# Embed the chunk documents and store them in a Chroma collection that uses
# persist_directory as its storage location.
# Alternative that skips the Document wrapper and takes the raw strings:
#   Chroma.from_texts(texts=chunks, embedding=embedddings, persist_directory=persist_directory)
vectorDB = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedddings,
    documents=doc_chunks,
)

In [48]:
# Number of entries in the underlying Chroma collection (reads a private attribute).
stored_count = vectorDB._collection.count()
print(stored_count)

10
