In [2]:
from langchain.llms import OpenAI
from typing import Any, Dict
from langchain.docstore.document import Document
import requests
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
import pathlib
import subprocess
import tempfile
import os
from pprint import pprint
import openai
from utils.helpers import get_secret
from langchain import PromptTemplate, OpenAI, LLMChain
from dotenv import load_dotenv

load_dotenv()

def get_secret(name):
    if name in os.environ:
        return os.environ[name]
    return os.getenv(name)

# create .env in project dir and set api keys there
openai.api_key = get_secret('OPENAI_API_KEY')

## Collect docs from a random github repo

In [3]:
def get_github_docs(repo_owner, repo_name):
    with tempfile.TemporaryDirectory() as d:
        subprocess.check_call(
            f"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .",
            cwd=d,
            shell=True,
        )
        git_sha = (
            subprocess.check_output("git rev-parse HEAD", shell=True, cwd=d)
            .decode("utf-8")
            .strip()
        )
        repo_path = pathlib.Path(d)
        markdown_files = list(repo_path.glob("*/*.md")) + list(
            repo_path.glob("*/*.mdx")
        )
        for markdown_file in markdown_files:
            with open(markdown_file, "r") as f:
                relative_path = markdown_file.relative_to(repo_path)
                github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
                yield Document(page_content=f.read(), metadata={"source": github_url})

sources = get_github_docs("yirenlu92", "deno-manual-forked")

source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
for source in sources:
    for chunk in splitter.split_text(source.page_content):
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))

Cloning into '.'...


## Create vectorstore

In [4]:
search_index = DeepLake.from_documents(source_chunks, OpenAIEmbeddings())

./deeplake/ loaded successfully.





Deep Lake Dataset in ./deeplake/ already exists, loading from the storage
Dataset(path='./deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (634, 1536)  float32   None   
    ids      text     (634, 1)      str     None   
 metadata    json     (634, 1)      str     None   
   text      text     (634, 1)      str     None   


Evaluating ingest: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00

Dataset(path='./deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (951, 1536)  float32   None   
    ids      text     (951, 1)      str     None   
 metadata    json     (951, 1)      str     None   
   text      text     (951, 1)      str     None   





In [6]:
from langchain.chains import LLMChain
prompt_template = """Use the context below to write a 400 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic"]
)

llm = OpenAI(temperature=0)

chain = LLMChain(llm=llm, prompt=PROMPT)

## for doc in similar_docs: model(doc + "environment variables") 
## simply put: runs a batch of each most similar doc + topic name and returns a batch of output

In [7]:
def generate_blog_post(topic):
    docs = search_index.similarity_search(topic, k=2)
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    print(chain.apply(inputs))
    
generate_blog_post("environment variables")

[{'text': '\n\nEnvironment variables are a great way to store and access sensitive information in your applications. They are also a great way to keep your codebase clean and organized. In this blog post, we\'ll discuss how to use environment variables in Deno.\n\nDeno offers built-in support for environment variables with `Deno.env`. `Deno.env` has getter and setter methods that allow you to set and retrieve environment variables. Here is an example of how to use `Deno.env`:\n\n```ts\nDeno.env.set("FIREBASE_API_KEY", "examplekey123");\nDeno.env.set("FIREBASE_AUTH_DOMAIN", "firebasedomain.com");\n\nconsole.log(Deno.env.get("FIREBASE_API_KEY")); // examplekey123\nconsole.log(Deno.env.get("FIREBASE_AUTH_DOMAIN")); // firebasedomain.com\n```\n\nYou can also store environment variables in a `.env` file and retrieve them using `dotenv` in the standard library. To access the environment variables in'}, {'text': '\n\nEnvironment variables are a great way to store and access sensitive informat

## Take Solidity repo and split it to docs

In [8]:
repo_dir = "data/2023-04-frankencoin mini"  # this is just an example, for full repo use: "data/2023-04-frankencoin"
processed_repo_path = "data/2023-04-frankencoin-preprocessed"
import os
from langchain.document_loaders import TextLoader


docs = []
for dirpath, dirnames, filenames in os.walk(repo_dir):
    for file in filenames:
        try: 
            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e: 
            pass

text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
my_documents = text_splitter.split_documents(docs)
db = DeepLake.from_documents(my_documents, embedding=OpenAIEmbeddings(), dataset_path=processed_repo_path) # saves to processed_repo_path
# to load already preprocessed:
# DeepLake(processed_repo_path, embedding_function=OpenAIEmbeddings())

Created a chunk of size 1191, which is longer than the specified 1024
Created a chunk of size 1252, which is longer than the specified 1024
Created a chunk of size 1118, which is longer than the specified 1024
Created a chunk of size 1540, which is longer than the specified 1024


data/2023-04-frankencoin-preprocessed loaded successfully.





Deep Lake Dataset in data/2023-04-frankencoin-preprocessed already exists, loading from the storage
Dataset(path='data/2023-04-frankencoin-preprocessed', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (819, 1536)  float32   None   
    ids      text     (819, 1)      str     None   
 metadata    json     (819, 1)      str     None   
   text      text     (819, 1)      str     None   


Evaluating ingest: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00

Dataset(path='data/2023-04-frankencoin-preprocessed', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (888, 1536)  float32   None   
    ids      text     (888, 1)      str     None   
 metadata    json     (888, 1)      str     None   
   text      text     (888, 1)      str     None   





In [9]:
prompt_template = """Gas optimize the solidity code surrounded by triple backticks:
    Context: {context}
    Solidity code: ```{solidity_code}```
    Gas optimized solidity code:"""

solidity_code = """function loop(uint[] memory arr) external pure returns (uint sum) {
    for (uint i = 0; i < arr.length; i++) {
        sum += arr[i];
    }
}"""

In [8]:
from langchain.chains import LLMChain

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "solidity_code"]
)

llm = OpenAI(temperature=0)
chain = LLMChain(llm=llm, prompt=PROMPT)

def optimize_solidity_code_with_vectore_store(vector_store, solidity_code):
    docs = vector_store.similarity_search(solidity_code, k=2)
    inputs = [{"context": doc.page_content, "solidity_code": solidity_code} for doc in docs]
    print(chain.apply(inputs))
    
optimize_solidity_code_with_vectore_store(search_index, solidity_code)

[{'text': ' ```function loop(uint[] memory arr) external pure returns (uint sum) {\n    uint length = arr.length;\n    for (uint i = 0; i < length; i++) {\n        sum += arr[i];\n    }\n}```'}, {'text': ' ```function loop(uint[] memory arr) external pure returns (uint sum) {\n    uint length = arr.length;\n    for (uint i = 0; i < length; i++) {\n        sum += arr[i];\n    }\n}```'}]


# How those docs look like inside

In [9]:
docs = search_index.similarity_search(solidity_code, k=2)

print(len(docs))
for i in range(len(docs)):
    print(f"doc # {i}:\n```\n{docs[i].page_content}\n```\n")

2
doc # 0:
```
"./deps.ts";

function totalCost(outbound: number, inbound: number, tax: number): number {
 return multiply(add(outbound, inbound), tax);
}

console.log(totalCost(19, 31, 1.2));
console.log(totalCost(45, 27, 1.15));

/**
 * Output
 *
 * 60
 * 82.8
 */
```
```

doc # 1:
```
"./deps.ts";

function totalCost(outbound: number, inbound: number, tax: number): number {
 return multiply(add(outbound, inbound), tax);
}

console.log(totalCost(19, 31, 1.2));
console.log(totalCost(45, 27, 1.15));

/**
 * Output
 *
 * 60
 * 82.8
 */
```
```



In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

class SolidityTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Python syntax."""

    def __init__(self, **kwargs: Any):
        """Initialize a MarkdownTextSplitter."""
        separators = [
            # First, try to split along class definitions
            "\ncontract ",
            "\nfunction ",
            "\n\tfunction ",
            "\nconstructor ",
            "\n\tconstructor ",
            "\nmodifier ",
            "\n\tmodifier ",
            # Now split by the normal type of lines
            "\n\n",
            "\n",
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)

In [11]:
text_splitter = SolidityTextSplitter(chunk_size=300, chunk_overlap=0, length_function = len)
my_documents = text_splitter.split_documents(docs)
better_search_index = DeepLake.from_documents(my_documents, embedding=OpenAIEmbeddings(), dataset_path=processed_repo_path) # saves to processed_repo_path

data/2023-04-frankencoin-preprocessed loaded successfully.





Deep Lake Dataset in data/2023-04-frankencoin-preprocessed already exists, loading from the storage
Dataset(path='data/2023-04-frankencoin-preprocessed', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (817, 1536)  float32   None   
    ids      text     (817, 1)      str     None   
 metadata    json     (817, 1)      str     None   
   text      text     (817, 1)      str     None   


Evaluating ingest: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00

Dataset(path='data/2023-04-frankencoin-preprocessed', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (819, 1536)  float32   None   
    ids      text     (819, 1)      str     None   
 metadata    json     (819, 1)      str     None   
   text      text     (819, 1)      str     None   





In [12]:
docs = better_search_index.similarity_search(solidity_code, k=2)
print(f"number of retrieved docs: {len(docs)}")
for i in range(len(docs)):
    print(f"doc # {i}:\n\n{docs[i].page_content}\n\n")

number of retrieved docs: 2
doc # 0:

function votes(address sender, address[] calldata helpers) public view returns (uint256) {
        uint256 _votes = votes(sender);
        for (uint i=0; i<helpers.length; i++){
            address current = helpers[i];
            require(current != sender);
            require(canVoteFor(sender, current));
            for (uint j=i+1; j<helpers.length; j++){
                require(current != helpers[j]); // ensure helper unique
            }
            _votes += votes(current);
        }
        return _votes;
    }


doc # 1:

function votes(address sender, address[] calldata helpers) public view returns (uint256) {
        uint256 _votes = votes(sender);
        for (uint i=0; i<helpers.length; i++){
            address current = helpers[i];
            require(current != sender);
            require(canVoteFor(sender, current));
            for (uint j=i+1; j<helpers.length; j++){
                require(current != helpers[j]); // ensure hel

## Compound docs and pass to input with original Solidity code

In [13]:
def optimize_solidity_code_with_vectore_store(vector_store, solidity_code):
    docs = vector_store.similarity_search(solidity_code, k=2)
    context = '\n'.join([doc.page_content for doc in docs])
#     chain.apply(inputs)
    result = chain({"context": context, "solidity_code": solidity_code})
    print(f"result.keys(): {result.keys()}")
    pprint(result)
    
optimize_solidity_code_with_vectore_store(better_search_index, solidity_code)

result.keys(): dict_keys(['context', 'solidity_code', 'text'])
{'context': 'function votes(address sender, address[] calldata helpers) public '
            'view returns (uint256) {\n'
            '        uint256 _votes = votes(sender);\n'
            '        for (uint i=0; i<helpers.length; i++){\n'
            '            address current = helpers[i];\n'
            '            require(current != sender);\n'
            '            require(canVoteFor(sender, current));\n'
            '            for (uint j=i+1; j<helpers.length; j++){\n'
            '                require(current != helpers[j]); // ensure helper '
            'unique\n'
            '            }\n'
            '            _votes += votes(current);\n'
            '        }\n'
            '        return _votes;\n'
            '    }\n'
            'function votes(address sender, address[] calldata helpers) public '
            'view returns (uint256) {\n'
            '        uint256 _votes = votes(sender)