# Wiki RAG

Load a FAISS vector database with the Miniscript wiki content.

In [1]:
# Install prerequisites.

%pip install boto3 --quiet
%pip install docx2txt --quiet
%pip install langchain --quiet
%pip install pydantic==1.10.13 --quiet
%pip install pypdf==3.8.1 faiss-cpu==1.7.4 --quiet
%pip install python-docx --quiet
%pip install sqlalchemy==2.0.21 --quiet
%pip install tiktoken==0.4.0 --quiet
#%pip install faiss-cpu==1.7.4 # For CPU Installation
%pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import all the things.

import docx
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import BedrockEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms.bedrock import Bedrock
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import numpy as np
import os
import shutil
import sys
import time
from utils.TokenCounterHandler import TokenCounterHandler
import zipfile
from IPython.display import display, FileLink, FileLinks, Markdown

In [2]:
# Load from this location.

data_path = './data'
aws_profile_name = 'sandbox'

# Setting this to true will force the vector database to be regenerated on each run.
# Setting it to false will try and load the database from a file.
rebuild_database = False

In [4]:
# Chunk the files.

files = []

# Recursively search for files in the data path.
for foldername, subfolders, filenames in os.walk(data_path):
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue

        path = os.path.join(foldername, filename)

        # Generate a metadata dictionary for the document.
        files.append({
            'path': path,
            'subject': filename.split('.')[0],
            'modified_date': time.ctime(os.path.getmtime(path)),
        })

# Chunk the documents so that each chunk has a max of 512 tokens; roughly 2000 characters.

documents = []

for idx, file in enumerate(files):
    loader = TextLoader(file['path'])
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = file
        
    print(f'{len(document)} {document}\n')
    documents += document

# - in our testing Character split works better with this PDF data set
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 2000,
    chunk_overlap  = 200,
)

docs = text_splitter.split_documents(documents)

avg_doc_length = lambda documents: sum([len(doc.page_content) for doc in documents])//len(documents)
print(f'Average length among {len(documents)} documents loaded is {avg_doc_length(documents)} characters.')
print(f'After the split we have {len(docs)} documents as opposed to the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_doc_length(docs)} characters.')

1 [Document(page_content='Normally in MiniScript, whenever a variable a used in an expression, if it refers to a function, that function is automatically invoked.  The <c>@</c> operator suppresses this invocation, allowing you to work with a function reference without invoking it.\n\n=== Example ===\n\nTry the following at any interactive MiniScript [[REPL]]:\n\n<ms>f = rnd    // makes f contain the result of calling [[rnd]]\nf          // prints that result\nf          // prints that SAME result again (f is just a number\nf = @rnd   // makes f refer to the [[rnd]] function\nf          // invokes f, and prints a random number\nf          // invokes f again, printing a DIFFERENT result\n</ms>', metadata={'path': './data/@ operator.txt', 'subject': '@ operator', 'modified_date': 'Fri Nov 10 10:27:35 2023'})]

1 [Document(page_content='The <msinline>abs</msinline> function returns the absolute value of the given number.\n\n=== Arguments ===\n\n{| class="wikitable"\n|-\n! Parameter Name !!

In [6]:
# Generate the database.  If necessary.
#os.environ['BEDROCK_ASSUME_ROLE'] = '<YOUR_VALUES>'
os.environ['AWS_PROFILE'] = aws_profile_name

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww

bedrock_client = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
    runtime=True # Default. Needed for invoke_model() from the data plane
)

token_counter = TokenCounterHandler()

# Create the Anthropic Model.
llm = Bedrock(model_id="anthropic.claude-v2", 
              client=bedrock_client, 
              model_kwargs={
                  'max_tokens_to_sample': 1000
              }, 
              callbacks=[token_counter])

# Create the Titan Embeddings Model.
bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1",
                                       client=bedrock_client)

sample_embedding = np.array(bedrock_embeddings.embed_query(docs[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

db_path = os.path.join(data_path, 'miniscript.vdb')
if rebuild_database or not os.path.exists(db_path):
    print("Generating the vector store.  This may take some time.")
    vectorstore_faiss = FAISS.from_documents(
        docs,
        bedrock_embeddings,
    )
    vectorstore_faiss.save_local(db_path)
else:
    vectorstore_faiss = FAISS.load_local(db_path, bedrock_embeddings)

wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)

def ask(query):
    prompt_template = """Human: Here is a set of context, contained in <context> tags:

<context>
{context}
</context>

Use the context to provide an answer to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Your answer must be nicely formatted using Markdown.

Provide code examples in Miniscript demonstrating how to perform the requested action.

{question}

Assistant:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    filter = {}

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore_faiss.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 50, "filter": filter},
            callbacks=[token_counter]
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
        callbacks=[token_counter]
    )

    result = qa({"query": query})

    # print(f'Query: {result["query"]}\n')
    # print(f'Result: {result["result"]}\n')
    # print(f'Context Documents: ')

    display(Markdown(result['result']))
    # display(Markdown('---\n# Sources'))
    # for i, srcdoc in enumerate(result["source_documents"]):
    #     # print(f'{srcdoc.metadata}\n')
    #     display(FileLink(path=srcdoc.metadata['path'], result_html_prefix=f'  {i+1}. ', result_html_suffix=''))
    
    # query = "Your answer should be nicely formatted using Markdown.\n" + query
    # answer = wrapper_store_faiss.query(question=query, llm=llm)
    # display(Markdown(answer))

    # query_embedding = vectorstore_faiss.embedding_function(query)
    # np.array(query_embedding)

    # relevant_documents = vectorstore_faiss.similarity_search_by_vector(query_embedding)
    # display(Markdown('---\n# Sources'))
    # for i, rel_doc in enumerate(relevant_documents):
    #     display(FileLink(path=rel_doc.metadata["path"], result_html_prefix=f'  {i+1}. ', result_html_suffix=''))

Create new client
  Using region: None
  Using profile: sandbox
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)
Sample embedding of a document chunk:  [ 0.37109375  0.04980469  0.04174805 ...  0.27539062 -0.95703125
  0.5078125 ]
Size of the embedding:  (1536,)


In [7]:
ask('Give me an example of how to use the TileDisplay.')


Token Counts:
Total: 4814
Embedding: N/A
Prompt: 4594
Generation:220



 Here is an example using TileDisplay in Mini Micro:

```minicode
display(5).mode = displayMode.tile
td = display(5)
td.tileSet = file.loadImage("/sys/pics/TileShapes.png")
td.tileSetTileSize = 64  
td.extent = [10, 8]

// Draw a border 
for x in range(0,10)
  td.setCell(x, 0, 1) 
  td.setCell(x, 7, 1)
end for

for y in range(0,8)
  td.setCell(0, y, 1)
  td.setCell(9, y, 1) 
end for

// Set the center tile
td.setCell(4, 3, 20) 
```

This configures display 5 as a TileDisplay, loads a tileset image, sets the extent to 10x8 tiles, draws a gray square border around the edges using tile index 1, and sets the center tile to index 20 (a blue circle).