In [2]:
!pip install transformers langchain

Collecting langchain
  Downloading langchain-0.0.170-py3-none-any.whl (834 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m834.2/834.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
Collecting aiohttp<4.0.0,>=3.8.3
  Downloading aiohttp-3.8.4-cp310-cp310-macosx_11_0_arm64.whl (336 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.9/336.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hCollecting openapi-schema-pydantic<2.0,>=1.2
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0.0,>=4.0.0
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting tenacity<9.0.0,>=8.1.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Collecting pydantic<2,>=1
  Download

In [None]:
EMB_INSTRUCTOR_XL = "hkunlp/instructor-xl"
LLM_FASTCHAT_T5_XL = "lmsys/fastchat-t5-3b-v1.0"

In [3]:
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
import os

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
class RepoQA:
    question_check_template = """Given the following pieces of context, determine if the question is able to be answered by the information in the context.
Respond with 'yes' or 'no'.
{context}
Question: {question}
"""
    QUESTION_CHECK_PROMPT = PromptTemplate(
        template=question_check_template, input_variables=["context", "question"]
    )
    def __init__(self, config: dict={}):
        self.config = config
        self.embedding = None
        self.vectordb = None
        self.llm = None
        self.qa = None
    
    # The following class methods are useful to create global GPU model instances
    # This way we don't need to reload models in an interactive app,
    # and the same model instance can be used across multiple user sessions
    @classmethod
    def create_instructor_xl(cls):
        return HuggingFaceInstructEmbeddings(model_name=EMB_INSTRUCTOR_XL, model_kwargs={"device": "cuda"})

    @classmethod
    def create_fastchat_t5_xl(cls, load_in_8bit=False):
        return pipeline(
            task="text2text-generation",
            model = LLM_FASTCHAT_T5_XL,
            model_kwargs={"device_map": "auto", "load_in_8bit": load_in_8bit, "max_length": 512, "temperature": 0.}
        )
    
    def init_models(self) -> None:
        load_in_8bit = self.config["load_in_8bit"]

        if self.config["embedding"] == EMB_INSTRUCTOR_XL:
            if self.embedding is None:
                self.embedding = RepoQA.create_instructor_xl()
        else:
            raise ValueError("Invalid config")
        
        if self.config["llm"] == LLM_FASTCHAT_T5_XL:
            if self.llm is None:
                self.llm = RepoQA.create_fastchat_t5_xl(load_in_8bit=load_in_8bit)
        else:
            raise ValueError("Invalid config")
    
    def vectorize_repo(repo_path: str):
        print(repo_path)
        text_splitter = RecursiveCharacterTextSplitter(
            # Set a really small chunk size, just to show.
            chunk_size = 100,
            chunk_overlap  = 20,
            length_function = len,
        )
        for root, dir_names, file_names in os.walk(repo_path):
            for f in file_names:
                fname = os.path.join(root, f)
                if f.endswith() == 'post.go':
                    with open(fname) as myfile:
                        doco = myfile.read()
                    texts = text_splitter.create_documents([doco])
                    for text in texts:
                        print(text)


    def get_answer(self, question: str) -> str:
        hf_llm = HuggingFacePipeline(pipeline=self.llm)
        self.qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff", retriever=self.vectordb.as_retriever(search_kwargs={"k":4}))

In [12]:
RepoQA.vectorize_repo("/Users/tommycalvy/aws/gatekeeper/crud_service")

/Users/tommycalvy/aws/gatekeeper/crud_service
The directory name is: ['protos', 'post', 'user', 'service']
The directory name is: []
The directory name is: []
132
The directory name is: []
The directory name is: []
