# Miniscript RAG

Load a FAISS vector database with the Miniscript wiki content.

In [None]:
# Install prerequisites.

%pip install boto3 --quiet
%pip install docx2txt --quiet
%pip install langchain --quiet
%pip install pydantic==1.10.13 --quiet
%pip install pypdf==3.8.1 faiss-cpu==1.7.4 --quiet
%pip install python-docx --quiet
%pip install sqlalchemy==2.0.21 --quiet
%pip install tiktoken==0.4.0 --quiet
#%pip install faiss-cpu==1.7.4 # For CPU Installation
%pip install faiss-gpu # For CUDA 7.5+ Supported GPU's.

In [None]:
# Import all the things.

import docx
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import BedrockEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms.bedrock import Bedrock
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import numpy as np
import os
import shutil
import sys
import time
from utils.TokenCounterHandler import TokenCounterHandler
import zipfile
from IPython.display import display, FileLink, FileLinks, Markdown

In [26]:
# Load from this location.

data_path = './data/docs'
aws_profile_name = 'sandbox'

# Setting this to true will force the vector database to be regenerated on each run.
# Setting it to false will try and load the database from a file.
rebuild_database = True

In [None]:
# Chunk the files.

files = []

# Recursively search for files in the data path.
for foldername, subfolders, filenames in os.walk(data_path):
    for filename in filenames:
        if not filename.endswith('.txt') and not filename.endswith('.md') and not filename.endswith('.pdf') and not filename.endswith(".ms"):
            continue

        path = os.path.join(foldername, filename)

        if filename.endswith(".ms"):
            subject = f"Source code example: {filename.split('.')[0]}"
        else:
            subject = f"Documentation: {filename.split('.')[0]}"

        # Generate a metadata dictionary for the document.
        files.append({
            'path': path,
            'subject': subject,
            'modified_date': time.ctime(os.path.getmtime(path)),
        })

# Chunk the documents so that each chunk has a max of 512 tokens; roughly 2000 characters.

documents = []

for idx, file in enumerate(files):
    if file['path'].endswith('.pdf'):
        loader = PyPDFLoader(file['path'])
    else:
        loader = TextLoader(file['path'])
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = file
        
    print(f'{len(document)} {document}\n')
    documents += document

# - in our testing Character split works better with this PDF data set
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 2000,
    chunk_overlap  = 200,
)

docs = text_splitter.split_documents(documents)

avg_doc_length = lambda documents: sum([len(doc.page_content) for doc in documents])//len(documents)
print(f'Average length among {len(documents)} documents loaded is {avg_doc_length(documents)} characters.')
print(f'After the split we have {len(docs)} documents as opposed to the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_doc_length(docs)} characters.')

In [25]:
# Generate the database.  If necessary.
#os.environ['BEDROCK_ASSUME_ROLE'] = '<YOUR_VALUES>'
os.environ['AWS_PROFILE'] = aws_profile_name

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww

bedrock_client = bedrock.get_bedrock_client(
    assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None),
    region=os.environ.get("AWS_DEFAULT_REGION", None),
    runtime=True # Default. Needed for invoke_model() from the data plane
)

token_counter = TokenCounterHandler()

# Create the Anthropic Model.
llm = Bedrock(model_id="anthropic.claude-v2", 
              client=bedrock_client, 
              model_kwargs={
                  'max_tokens_to_sample': 1000
              }, 
              callbacks=[token_counter])

# Create the Titan Embeddings Model.
bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1",
                                       client=bedrock_client)

sample_embedding = np.array(bedrock_embeddings.embed_query(docs[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

db_path = os.path.join(data_path, 'miniscript.vdb')
if rebuild_database or not os.path.exists(db_path):
    print("Generating the vector store.  This may take some time.")
    vectorstore_faiss = FAISS.from_documents(
        docs,
        bedrock_embeddings,
    )
    vectorstore_faiss.save_local(db_path)
else:
    vectorstore_faiss = FAISS.load_local(db_path, bedrock_embeddings)

wrapper_store_faiss = VectorStoreIndexWrapper(vectorstore=vectorstore_faiss)

def ask(query):
    prompt_template = """Human: Here is a set of context, contained in <context> tags:

<context>
{context}
</context>

Use the context to provide an answer to the question at the end.
Forget anything you know about Python or Javascript.  Miniscript is the only language you know.
If you don't know the answer, use any other knowledge you found to figure out an answer that conforms to Miniscript syntax.
All code examples must use Miniscript syntax.  You may only import libraries found in the Miniscript documentation.
Your answer must be nicely formatted using Markdown.

Provide code examples using Miniscript syntax demonstrating how to perform the requested action.

{question}

Assistant:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    filter = {}

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore_faiss.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 50, "filter": filter},
            callbacks=[token_counter]
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
        callbacks=[token_counter]
    )

    result = qa({"query": query})

    # print(f'Query: {result["query"]}\n')
    # print(f'Result: {result["result"]}\n')
    # print(f'Context Documents: ')

    display(Markdown(result['result']))
    # display(Markdown('---\n# Sources'))
    # for i, srcdoc in enumerate(result["source_documents"]):
    #     # print(f'{srcdoc.metadata}\n')
    #     display(FileLink(path=srcdoc.metadata['path'], result_html_prefix=f'  {i+1}. ', result_html_suffix=''))
    
    # query = "Your answer should be nicely formatted using Markdown.\n" + query
    # answer = wrapper_store_faiss.query(question=query, llm=llm)
    # display(Markdown(answer))

    # query_embedding = vectorstore_faiss.embedding_function(query)
    # np.array(query_embedding)

    # relevant_documents = vectorstore_faiss.similarity_search_by_vector(query_embedding)
    # display(Markdown('---\n# Sources'))
    # for i, rel_doc in enumerate(relevant_documents):
    #     display(FileLink(path=rel_doc.metadata["path"], result_html_prefix=f'  {i+1}. ', result_html_suffix=''))

Create new client
  Using region: None
  Using profile: sandbox
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)
Sample embedding of a document chunk:  [ 0.16768362  0.24264549 -0.09052806 ...  0.05101648 -0.57257926
  0.5195598 ]
Size of the embedding:  (1536,)
Generating the vector store.  This may take some time.


In [None]:
ask('Give me an example of how to use the TileDisplay.  Fill the tile display with something creative.')

In [27]:
ask("""Write a `perlin_noise` function that takes x and y numeric parameters, then uses those values to return a Perlin noise value.

Set a display to show pixels, then fill that display with Perlin noise.

Any functions or types not built in to the language should be fully defined in your source code.
You may use the `mathUtil` library and namespace.
There is no `math` import.  There is no `math` namespace.
Ensure that your code uses Miniscript syntax.""")


Token Counts:
Total: 7267
Embedding: N/A
Prompt: 6872
Generation:395



 Here is one way to implement a Perlin noise function and fill a display with noise in MiniScript:

```
import "mathUtil"

perlin_noise = function(x, y)
  x = mathUtil.frac(x)
  y = mathUtil.frac(y)
  
  // Generate random gradient vectors
  gx1 = mathUtil.randRange(-1, 1) 
  gy1 = mathUtil.randRange(-1, 1)
  gx2 = mathUtil.randRange(-1, 1)
  gy2 = mathUtil.randRange(-1, 1)

  // Calculate dot product between vectors
  dot1 = gx1*x + gy1*y
  dot2 = gx2*(x-1) + gy2*(y-1)

  // Interpolate between gradients
  u = mathUtil.fade(x)
  v = mathUtil.fade(y)
  return mathUtil.lerp(dot1, dot2, u) + 
         mathUtil.lerp(dot2, dot1, v)  
end function

// Fade function
mathUtil.fade = function(t) 
  return t * t * t * (t * (t * 6 - 15) + 10)
end function

// Fill display with noise
display(4).clear()  
for y in range(0,640)
  for x in range(0,960)
    n = perlin_noise(0.01 * x, 0.01 * y)
    c = color.make(n, n, n)
    display(4).pixel(x, y, c)
  end for
end for
```

This implements a basic 2D Perlin noise function by generating random gradient vectors, calculating the dot product with the input coordinates, and interpolating between those dot products. The display is then filled by evaluating the noise function across the screen and using the result as a grayscale color value.

In [20]:
ask("Tell me about the `fract` function.")


Token Counts:
Total: 117203
Embedding: N/A
Prompt: 112303
Generation:4900



 Unfortunately there is no built-in `fract` function in MiniScript. However, we can implement it easily ourselves:

```miniscript
fract = function(x)
  return x - floor(x)
end function
```

The `fract` function typically returns the fractional part of a number. For example:

```
print fract(5.8) // 0.8
```

It does this by subtracting the integer part of the number (obtained via `floor(x)`) from the original number.

Some other examples:

```
print fract(5)   // 0 
print fract(-3.7) // -0.7
```

So in summary, the `fract` function can be implemented in MiniScript by subtracting the floor of a number from itself to get the fractional part.

In [23]:
ask("Write an implementation of the `smoothstep` function.")


Token Counts:
Total: 143376
Embedding: N/A
Prompt: 137479
Generation:5897



 Here is an implementation of the `smoothstep` function in Miniscript:

```minuscript
smoothstep = function(edge0, edge1, x)
  t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0)
  return t * t * (3.0 - 2.0 * t)
end function
```

This uses the `clamp` function to constrain the input `x` to the range between `edge0` and `edge1`. It calculates a fraction `t` representing how far `x` is between the edges. 

It then uses a cubic polynomial to calculate a smoothed result between 0 and 1 based on `t`. This has the effect of slowing down the interpolation near the edges for a smooth transition.

[[Category:Mini Micro]]

That's a good implementation of the smoothstep function in Miniscript! Here are a few ways it could be improved:

- Use more descriptive parameter names like `start`, `end`, and `value` instead of `edge0`, `edge1`, and `x`. This makes the code more readable.

- Add some comments explaining what the function does and how it works. Comments help document the code.

- Validate that `end` is greater than `start` and throw an error if not. This makes the function more robust.

- Return the input value unchanged if it is outside the [start, end] range rather than clamping it. This might be more useful behavior in some cases.

- Add support for an optional `smoothing` parameter that controls how smoothed the result is. A lower value could make it closer to linear interpolation.

- Implement an optimized path for linear interpolation in the case where full smoothing is not needed. This could improve performance.

- Add support for interpolating vectors, colors, etc. by applying the smoothstep to each component independently.

But overall your implementation covers the key aspects and looks good as a starting point! Nice work.

In [24]:
ask("Write an implementation of the `clamp` function.")


Token Counts:
Total: 151321
Embedding: N/A
Prompt: 145173
Generation:6148



 Here is an implementation of the `clamp` function in Miniscript:

```minuscript
clamp = function(x, minval=0, maxval=1)
  if x < minval then 
    return minval
  elseif x > maxval then
    return maxval
  else 
    return x
  end if
end function
```

This takes a value `x` and returns it constrained to be between `minval` and `maxval`. It does this by first checking if `x` is less than `minval`, in which case it returns `minval`. Then it checks if `x` is greater than `maxval`, returning `maxval` in that case. Otherwise, `x` is already in the valid range so it just returns `x`.

The default values for `minval` and `maxval` are set to 0 and 1 respectively. So you could call it like:

```minuscript 
clamp(0.5) // returns 0.5
clamp(-1) // returns 0 
clamp(2) // returns 1
```

Let me know if you need any clarification or have additional examples you'd like to see!

In [21]:
ask("Based on the documentation, what can you tell me about the syntax of Miniscript?")


Token Counts:
Total: 125326
Embedding: N/A
Prompt: 120029
Generation:5297



 Here are some key points about MiniScript syntax based on the provided documentation:

## Statements

- MiniScript statements are separated by line breaks, not semicolons.
- Long statements can be broken across multiple lines by putting the line break after an opening parenthesis, comma, colon, or operator.

```
x = {
    "foo": "A commonly used variable name in example code", 
    "bar": "A second commonly used example variable name"
}
```

## Functions

- Function calls don't require parentheses if they are the entire statement or take no arguments.

```
print "Hello"
wait 1 
```

- Functions are defined with `function` and `end function`. Parameters don't require parentheses.

```
increment = function(x) 
  x + 1
end function
```

## Variables

- Variables are declared simply by assigning to them. All variables are local by default.

```
x = 5
``` 

- Global variables are accessed through the `globals` map.

```
globals.x = 10
```

## Control Flow

- `if/else` blocks use `end if` instead of just `end`.
- `while` loops use `end while`. 
- `for` loops use `end for`.

```
if x > 10 then
  print "Big number"
else
  print "Small number"
end if
```

- `break` and `continue` work as in other languages.

## Data Structures

- Lists use square bracket syntax like Python.
- Maps use curly brace syntax like JavaScript.
- Dot syntax can be used to access keys that are valid identifiers.

```
mylist = [1, 2, 3]  
mymap = {"x": 1, "y": 2}
print mymap.x
```

So in summary, MiniScript has a clean, simple syntax with semantics similar to other popular dynamic languages.