In [None]:
import os
from dotenv import load_dotenv
from llms import get_model

# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials needed by the model
# and embedding clients created further down — confirm which variables are required.
load_dotenv()

## TOKENIZATION AND CHUNKING

In [None]:
def read_text_files(directory = "./data", encoding = "utf-8"):
    """Read every ``.txt`` file located directly inside ``directory``.

    Sub-directories are not traversed, and entries that are not regular files
    (for example a folder whose name ends in ``.txt``) are skipped.

    Args:
        directory: Path of the folder to scan. Defaults to ``"./data"``.
        encoding: Text encoding used to decode each file. Defaults to
            ``"utf-8"`` so results do not depend on the platform's locale.

    Returns:
        A list with the full text content of each matching file, ordered by
        file name so that repeated runs yield the same document order.

    Raises:
        OSError: If ``directory`` does not exist or cannot be listed.
        UnicodeDecodeError: If a file cannot be decoded with ``encoding``.
    """
    result = []

    # os.listdir() returns entries in arbitrary order; sort them so the
    # document order (and therefore the chunk order) is reproducible.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".txt") and os.path.isfile(file_path):
            # Explicit encoding: the built-in default is platform dependent.
            with open(file_path, "r", encoding=encoding) as file:
                result.append(file.read())

    return result

In [None]:
# Read every .txt file from the default "./data" directory (one string per file)
# and report how many were found.
documents_raw = read_text_files()
print(f"There are: {len(documents_raw)} documents")

In [None]:
import tiktoken

# Different types of encodings: https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L14
# Tokenizer used by token_len() below to measure text length in tokens.
# NOTE(review): the encoding is selected for 'text-davinci-003', while the embeddings
# further down use 'text-embedding-ada-002' and the LLM comes from get_model() —
# confirm this encoding matches the models whose token limits actually matter.
tokenizer = tiktoken.encoding_for_model('text-davinci-003')

# Function to count token length
def token_len(text) -> int:
    """Return how many tokens ``text`` encodes to with the module-level ``tokenizer``.

    An empty ``disallowed_special`` list is passed to ``encode`` so that no
    special-token marker is treated as disallowed input.
    """
    return len(tokenizer.encode(text, disallowed_special=[]))

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Token-aware splitter: lengths are measured with token_len, so chunk_size and
# chunk_overlap are expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    # Separators listed from coarsest (blank line) to finest (empty string)
    separators=["\n\n", "\n", " ", ""],
    length_function=token_len,
    # Size of each individual chunk
    chunk_size=400,
    # No overlap between consecutive chunks
    chunk_overlap=0,
)

In [None]:
# Split every raw text into chunks and wrap each chunk in a document object
documents = text_splitter.create_documents(documents_raw)
print(f"We have {len(documents)} chunks")
# Preview the first chunk; guarded because documents[0] raises IndexError
# when no .txt files were found and the chunk list is empty.
if documents:
    print(documents[0])

## VECTOR EMBEDDINGS AND VECTOR DATABASE

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used to vectorize the chunks.
model_name = 'text-embedding-ada-002'
# Embedding client handed to the vector store in the next cell.
# NOTE(review): presumably picks up its API key from the environment loaded by load_dotenv() — confirm.
embedding_model = OpenAIEmbeddings(model=model_name)

In [None]:
from langchain.vectorstores import Chroma

# Build a Chroma vector store from the chunked documents, embedding them with
# embedding_model (this is the step that calls the embeddings API).
# No persist_directory is passed here.
# NOTE(review): presumably the index then lives only for this kernel session — confirm.
docsearch = Chroma.from_documents(documents, embedding_model)

In [None]:
from langchain.chains import RetrievalQA

# Language model that answers the question from the retrieved chunks
llm = get_model()

# Retrieval QA chain over the vector store.
# Chain types: https://python.langchain.com/en/latest/modules/chains/index_examples/question_answering.html
qa = RetrievalQA.from_chain_type(
    llm=llm,
    # 5 is the max number of results from the retriever
    retriever=docsearch.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    # Needed so the result also carries the "source_documents" printed below
    return_source_documents=True,
)

# Ask one sample question, then show the answer followed by the supporting chunks
query = "I have a small budget, what hotels can I visit?"
result = qa({"query": query})
print(result["result"])
print(result["source_documents"])