# Miniscript RAG

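Load a FAISS vector database with the Miniscript wiki content, then use it as retrieval context for answering Miniscript questions with a Bedrock-hosted Claude model.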

In [None]:
# Install prerequisites.

%pip install boto3 --quiet
%pip install docx2txt --quiet
%pip install langchain --quiet
%pip install pydantic==1.10.13 --quiet
%pip install pypdf==3.8.1 faiss-cpu==1.7.4 --quiet
%pip install python-docx --quiet
%pip install sqlalchemy==2.0.21 --quiet
%pip install tiktoken==0.4.0 --quiet
#%pip install faiss-cpu==1.7.4 # For CPU Installation
%pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.

In [None]:
# Import all the things.

import docx
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import BedrockEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms.bedrock import Bedrock
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import numpy as np
import os
import shutil
import sys
import time
from utils.TokenCounterHandler import TokenCounterHandler
import zipfile
from IPython.display import display, FileLink, FileLinks, Markdown

In [44]:
# Notebook configuration.

# Root folder that is searched (recursively) for the source documents.
data_path = './data/docs'

# True  -> regenerate the vector database on every run.
# False -> try to load a previously saved database from a file first.
rebuild_database = False

# AWS profile name; a later cell exports it as AWS_PROFILE.
aws_profile_name = 'sandbox'

In [38]:
# Chunk the files.

# Only files with these extensions are indexed.  '.ms' files are labelled as
# source code examples; everything else is labelled as documentation.
supported_extensions = ('.txt', '.md', '.pdf', '.ms')

files = []

# Recursively search for files in the data path.
for foldername, _subfolders, filenames in os.walk(data_path):
    for filename in filenames:
        if not filename.endswith(supported_extensions):
            continue

        path = os.path.join(foldername, filename)

        # Strip only the final extension, so a dotted name such as
        # 'list.sort.md' keeps its full stem ('list.sort') in the subject.
        stem = os.path.splitext(filename)[0]

        if filename.endswith('.ms'):
            subject = f"Source code example: {stem}"
        else:
            subject = f"Documentation: {stem}"

        # Generate a metadata dictionary for the document.
        files.append({
            'path': path,
            'subject': subject,
            'modified_date': time.ctime(os.path.getmtime(path)),
        })

# Load every discovered file into LangChain documents.  They are chunked further
# down so that each chunk has a max of 512 tokens; roughly 2000 characters.

documents = []

for file in files:
    # PDFs go through the PDF loader; every other supported file is read as text.
    if file['path'].endswith('.pdf'):
        loader = PyPDFLoader(file['path'])
    else:
        loader = TextLoader(file['path'])
    document = loader.load()

    for document_fragment in document:
        # Merge rather than replace: this keeps whatever the loader recorded
        # (e.g. the PDF page number) and gives each fragment its own dict
        # instead of sharing one mutable dict across all fragments of a file.
        document_fragment.metadata = {**document_fragment.metadata, **file}

    documents += document

# Split the loaded documents into overlapping chunks.
# - in our testing Character split works better with this PDF data set
text_splitter = RecursiveCharacterTextSplitter(
    # Chunks of at most 2000 characters, with up to 200 characters of overlap
    # between neighbouring chunks.
    chunk_size=2000,
    chunk_overlap=200,
)

docs = text_splitter.split_documents(documents)


def avg_doc_length(documents):
    """Return the average `page_content` length in characters (floored).

    Args:
        documents: Sequence of objects exposing a `page_content` string.

    Returns:
        The integer average length, or 0 when `documents` is empty (so an
        empty data folder produces a readable summary instead of a
        ZeroDivisionError).
    """
    if not documents:
        return 0
    return sum(len(doc.page_content) for doc in documents) // len(documents)


print(f'Average length among {len(documents)} documents loaded is {avg_doc_length(documents)} characters.')
print(f'After the split we have {len(docs)} documents as opposed to the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_doc_length(docs)} characters.')

incorrect startxref pointer(1)


Average length among 741 documents loaded is 4790 characters.
After the split we have 2478 documents as opposed to the original 741.
Average length among 2478 documents (after split) is 1504 characters.


In [39]:
# Generate the database.  If necessary.

# Select the AWS profile before the Bedrock client is created.
#os.environ['BEDROCK_ASSUME_ROLE'] = '<YOUR_VALUES>'
os.environ['AWS_PROFILE'] = aws_profile_name

# Add the parent directory to the import path before importing the project's
# `utils` helpers.
module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww

bedrock_client = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
    runtime=True,  # Default. Needed for invoke_model() from the data plane
)

# Callback handler attached to the LLM below.
token_counter = TokenCounterHandler()

# Anthropic Claude model that writes the answers.
llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=bedrock_client,
    model_kwargs={'max_tokens_to_sample': 1000},
    callbacks=[token_counter],
)

# Titan embeddings model used for both indexing and querying.
bedrock_embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1",
    client=bedrock_client,
)

# Embed the first chunk and show the vector and its dimensionality.
sample_embedding = np.array(bedrock_embeddings.embed_query(docs[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

# Reuse a previously saved database when allowed and present; otherwise embed
# every chunk and save the result for the next run.
db_path = os.path.join(data_path, 'miniscript.vdb')
if not rebuild_database and os.path.exists(db_path):
    # NOTE(review): load_local appears to deserialize pickled data -- only load
    # databases that this notebook created itself.
    vectorstore_faiss = FAISS.load_local(db_path, bedrock_embeddings)
else:
    print("Generating the vector store.  This may take some time.")
    vectorstore_faiss = FAISS.from_documents(docs, bedrock_embeddings)
    vectorstore_faiss.save_local(db_path)

wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)

def ask(query, k=50):
    """Answer a Miniscript question with retrieval-augmented generation.

    Retrieves the `k` chunks most similar to `query` from `vectorstore_faiss`,
    stuffs them into a single prompt for `llm`, and renders the model's answer
    as Markdown in the notebook.

    Args:
        query: The question or instruction to send to the model.
        k: Number of chunks to retrieve as context.  Defaults to 50, the value
            this notebook has always used.

    Returns:
        None.  The answer is displayed rather than returned.
    """
    prompt_template = """Human: Here is a set of context, contained in <context> tags:

<context>
{context}
</context>

Use the context to provide an answer to the question at the end.
Forget anything you know about Python or Javascript.  Miniscript is the only language you know.
If you don't know the answer, use any other knowledge you found to figure out an answer that conforms to Miniscript syntax.
All code examples must use Miniscript syntax.
If the function you are needing is not found in in the user manual or wiki, you must define it completely in the source code you write.
Your answer must be nicely formatted using Markdown.

Provide code examples using Miniscript syntax demonstrating how to perform the requested action.

{question}

A:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    # Empty filter: search across every indexed chunk.  (Named so that it does
    # not shadow the `filter` builtin.)
    metadata_filter = {}

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        # "stuff" places all retrieved chunks into one prompt.
        chain_type="stuff",
        retriever=vectorstore_faiss.as_retriever(
            search_type="similarity",
            search_kwargs={"k": k, "filter": metadata_filter},
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
    )

    result = qa({"query": query})

    display(Markdown(result['result']))

Create new client
  Using region: None
  Using profile: sandbox
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)
Sample embedding of a document chunk:  [ 0.16768362  0.24264549 -0.09052806 ...  0.05101648 -0.57257926
  0.5195598 ]
Size of the embedding:  (1536,)
Generating the vector store.  This may take some time.


In [21]:
# Ask a broad question about the language syntax and render the model's answer.
ask("Based on the documentation, what can you tell me about the syntax of Miniscript?")


Token Counts:
Total: 125326
Embedding: N/A
Prompt: 120029
Generation:5297



 Here are some key points about MiniScript syntax based on the provided documentation:

## Statements

- MiniScript statements are separated by line breaks, not semicolons.
- Long statements can be broken across multiple lines by putting the line break after an opening parenthesis, comma, colon, or operator.

```
x = {
    "foo": "A commonly used variable name in example code", 
    "bar": "A second commonly used example variable name"
}
```

## Functions

- Function calls don't require parentheses if they are the entire statement or take no arguments.

```
print "Hello"
wait 1 
```

- Functions are defined with `function` and `end function`. Parameters don't require parentheses.

```
increment = function(x) 
  x + 1
end function
```

## Variables

- Variables are declared simply by assigning to them. All variables are local by default.

```
x = 5
``` 

- Global variables are accessed through the `globals` map.

```
globals.x = 10
```

## Control Flow

- `if/else` blocks use `end if` instead of just `end`.
- `while` loops use `end while`. 
- `for` loops use `end for`.

```
if x > 10 then
  print "Big number"
else
  print "Small number"
end if
```

- `break` and `continue` work as in other languages.

## Data Structures

- Lists use square bracket syntax like Python.
- Maps use curly brace syntax like JavaScript.
- Dot syntax can be used to access keys that are valid identifiers.

```
mylist = [1, 2, 3]  
mymap = {"x": 1, "y": 2}
print mymap.x
```

So in summary, MiniScript has a clean, simple syntax with semantics similar to other popular dynamic languages.

In [47]:
# Code-generation request: a Perlin noise function, with extra constraints
# (no imports, commented declarations, Miniscript syntax) included in the query.
ask("""
Write a function that will generate Perlin noise based on (x, y) parameters.

You may not import any libraries.
Function declarations should be commented with a function summary and a description of the input parameters and the output value.
Ensure your source code complies with Miniscript syntax.
""")


Token Counts:
Total: 56735
Embedding: N/A
Prompt: 54266
Generation:2469



 Here is a function to generate Perlin noise in Miniscript:

```miniscrypt
// perlinNoise(x, y)
// Generate a Perlin noise value based on the given x and y coordinates.
// x: The x coordinate
// y: The y coordinate  
// Returns: A Perlin noise value between 0 and 1
perlinNoise = function(x, y)
  // Define permutation table
  p = [151,160,137,91,90,15,131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10, 
        23,190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,88,237,149,56,87,
        174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,77,146,158,231,83,111,229,122,60,211,
        133,230,220,105,92,41,55,46,245,40,244,102,143,54, 65,25,63,161,1,216,80,73,209,76,132,187,208, 89,
        18,169,200,196,135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,5,202,
        38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,223,183,170,213,119,248,152, 
        2,44,154,163, 70,221,153,101,155,167, 43,172,9,129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 
        112,104,218,246,97,228,251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
        49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,138,236,205,93,222,114,
        67,29,24,72,243,141,128,195,78,66,215,61,156,180]
    
  // Calculate noise contributions from each corner
  dotGridGradient = function(ix, iy) {
    // Get relative x,y of point in relation to grid cell
    dx = x - ix
    dy = y - iy
    
    // Calculate dot product with gradient directions
    g00 = p[p[ix] + iy]
    g10 = p[p[ix + 1] + iy] 
    g01 = p[p[ix] + iy + 1]
    g11 = p[p[ix + 1] + iy + 1]

    // Dot product gives magnitude of gradient contribution
    dx1 = dx - 1
    dy1 = dy - 1 
    t = 0.5 - dx*dx - dy*dy
    if (t > 0) n0 = t * t * t * t * dot( [dx,dy], [g00&15, g00>>4] )
    t = 0.5 - dx1*dx1 - dy*dy
    if (t > 0) n1 = t * t * t * t * dot( [dx1,dy], [g10&15, g10>>4] )
    t = 0.5 - dx*dx - dy1*dy1 
    if (t > 0) n2 = t * t * t * t * dot( [dx,dy1], [g01&15, g01>>4] )
    t = 0.5 - dx1*dx1 - dy1*dy1
    if (t > 0) n3 = t * t * t * t * dot( [dx1,dy1], [g11&15, g11>>4] )

    // Sum contributions from each corner to get noise value
    return 70 * (n0 + n1 + n2 + n3)
  end function

In [53]:
# Code-generation request: render an XOR pattern on a pixel display, naming the
# specific functions (bitXor, color.fromList) the answer should use.
ask("""
Write a program that sets display 4 to a pixel display, then uses that display to render an XOR pattern that fills the display's complete width and height.
Use the bitXor function to combine the x and y values.
Use the `color.fromList` function to generate the color from the XOR-ed x and y values.

You may not import any libraries.
Function declarations should be commented with a function summary and a description of the input parameters and the output value.
Ensure your source code complies with Miniscript syntax.
""")


Token Counts:
Total: 119512
Embedding: N/A
Prompt: 115453
Generation:4059



 Here is a Miniscript program that renders an XOR pattern to display 4:

```miniscript
// Set display 4 to pixel display mode
display(4).mode = displayMode.pixel

// Get width and height of display 4
width = display(4).width
height = display(4).height

// Render XOR pattern
for y in range(height)
	for x in range(width)
		// XOR x and y coordinates
		xor = bitXor(x, y)
		
		// Generate color from XOR result
		c = color.fromList([xor, xor, xor])
		
		// Set pixel to generated color
		display(4).setPixel(x, y, c)
	end for
end for
```

To summarize:

- Set display 4 to pixel display mode
- Get width and height of display 4
- Render XOR pattern by:
  - XOR'ing x and y coordinates
  - Generating color from XOR result
  - Setting pixel to generated color

The bitXor function XORs the bits of two integers.
The color.fromList function generates a color from a list of R, G, B values.

This renders an XOR pattern that fills the entire display by combining the x and y coordinates with bitwise XOR to generate a color for each pixel.