In [21]:
import os
import sys
# Base directory used later to locate the Chroma store and the model file:
# the first entry of sys.path.
# NOTE(review): presumably this is the directory the kernel was launched from — confirm.
root_dir = sys.path[0]

## Prepare Documents
This section only needs to be run once per kernel session; the loaded documents and text chunks then stay in memory.

In [1]:
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load the files matching "*.txt" in the "texts" folder (with a progress bar),
# then show how many documents were loaded.
loader = DirectoryLoader(path="texts", glob="*.txt", show_progress=True)
docs = loader.load()
len(docs)

100%|██████████| 1/1 [00:02<00:00,  2.75s/it]


1

In [2]:
# Concatenate the text of every loaded document, skipping empty ones.
raw_text = "".join(doc.page_content for doc in docs if doc.page_content)

# Split the combined text with chunk_size=500 and chunk_overlap=100,
# measuring length with len (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.split_text(raw_text)
len(texts)

53

In [3]:
# Display the first chunk to spot-check the splitter output.
texts[0]

'A Appendix: Instructions This appendix provides the English translations of the instructions for the 4I treatment.'

## Prepare Database

In [32]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores.chroma import Chroma
import torch

create_db = False  # set True to create a new database
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

# Run the embedding model on the GPU when one is available and fall back to
# the CPU otherwise, so the notebook also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs,
)

In [33]:
# Location of the on-disk Chroma store.
db_dir = os.path.join(root_dir, "chroma_db")
if not create_db:
    # Open the Chroma store located at db_dir.
    db = Chroma(embedding_function=embedding_function, persist_directory=db_dir)
else:
    # Embed the text chunks into a new Chroma store and persist it to disk.
    db = Chroma.from_texts(
        texts,
        embedding_function,
        persist_directory=db_dir,
    )

In [34]:
# Example question; ask the vector store for the 5 chunks most similar to it.
query = "When can you accept trades?"

db.similarity_search(query=query, k=5)

[Document(page_content='Another trader can now accept the new Ask of 70 that you have just made by clicking on the ‘Immediately BUY’ button at the bottom of the screen. The transaction price will then appear in the middle column ‘Transaction Details’. Because you sold the asset the Buy/Sell column will indicate ‘Sell’ so that you can keep track of what you are buying and selling. As a result of the transaction, the Ask will not be available anymore in the ‘Outstanding Ask’ column.'),
 Document(page_content='Another trader can now accept the new Bid of 50 that you have just made by clicking on the ‘Immediately SELL’ button at the bottom of the screen. The transaction price will then appear in the middle column ‘Transaction Details’ (see image below). Because you bought the asset the Buy/Sell column will indicate ‘Buy’ so that you can keep track of what you are buying and selling.'),
 Document(page_content='Because the Ask of 70 has been accepted, you have sold one unit of Asset Y at 70 

In [30]:
# Create an MMR (maximal marginal relevance) retriever over the vector store.
# Search parameters are supplied through `search_kwargs`, which is where
# as_retriever reads them from: return 5 documents chosen out of 20 candidates.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 20},
)

# Show the second retrieved document for the example query.
retriever.get_relevant_documents(query)[1]

Document(page_content='4. You can submit Bids and Asks or accept available offers from other traders to trade units. 5. The amount units pay to their owners at the end of a market period is based on the state (A, B, or C) for Type I and Type II as follows. All States are equally likely.')

## Create Chat

In [42]:
def wrap_text(text, width=90):
    """Wrap ``text`` to at most ``width`` columns, preserving its newlines.

    The input is split on newline characters and every resulting line is
    wrapped on its own with ``textwrap.fill``, so existing line breaks
    (including blank lines) are kept.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill`` (default 90).

    Returns:
        The wrapped text as a single newline-joined string.
    """
    # Imported locally so the function works regardless of which cells ran
    # before it; no other visible cell imports textwrap.
    import textwrap

    # Wrap each original line individually, then stitch them back together.
    return '\n'.join(
        textwrap.fill(line, width=width) for line in text.split('\n')
    )

In [41]:
from langchain.llms import GPT4All
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser



# Prompt that restricts the model to the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# Path of the local GGUF model file, under <root_dir>/model.
model_path = os.path.join(root_dir,
                          "model",
                          "mistral-7b-instruct-v0.1.Q4_0.gguf")

model = GPT4All(
    model=model_path
)


def format_docs(retrieved_docs):
    """Join the page_content of the retrieved documents into one text block.

    Without this step the list of Document objects itself is substituted
    into the prompt's {context} slot instead of the passage text.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# The input question is sent to the retriever (whose documents are flattened
# to plain text) and also passed through unchanged as {question}.
chain = (
    {
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

In [43]:
# Run the chain on a sample question and print the answer wrapped for display.
text_reply = chain.invoke(
    "Tell me about auctions. When can you accept trades?"
)

print(wrap_text(text=text_reply))