In [1]:
import os
import json
from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

from tqdm import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from approaches.azureretriver import AzureRetrieveApproach
from utils.tagreader import read_tags_file

In [3]:
ctags_path = './repo/langchain/libs/langchain/tags'
ctags_root_path = os.path.dirname(ctags_path)
assert os.path.isfile(ctags_path), "Please run `zsh download_example_rpo.sh` first"

In [4]:
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_TINY_INDEX = os.environ["AZURE_SEARCH_TINY_INDEX"]
AZURE_SEARCH_BIGGER_INDEX = os.environ["AZURE_SEARCH_BIGGER_INDEX"]

## Helper functions

In [5]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # vector length 384
embedding_dimension = embeddings.client.get_sentence_embedding_dimension()

def get_embeddings(text: str, normalize=True) -> list:
    embeddings.encode_kwargs = {'normalize_embeddings': normalize}
    return embeddings.embed_query(text)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import inspect
import importlib.util
import os

def get_source_code(function_name, function_path, ctags_root_path=ctags_root_path):
    spec=importlib.util.spec_from_file_location(function_name, os.path.join(ctags_root_path, function_path))
    foo = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(foo)
    return inspect.getsource(foo)

In [7]:
def create_code_file_text(metadatas : list[dict]):
    code_file_text = ""
    for i, metadata in enumerate(metadatas):
        code_file_text += f'==== File {i+1}/{len(metadata)} ====\n'
        code_file_text += f'File path: {metadata["file_name"]}\n'
        code_file_text += f'Tag name: {metadata["tag_name"]}\n'
        code_string = get_source_code(metadata["tag_name"], metadata["file_name"])
        code_file_text += f'Code: {code_string}\n'
        code_file_text += "\n"
    return code_file_text

In [8]:
template = """You are a hupful bot that fuilfill the human' program task:

The following is releative code:
{code_file_text}

User: {user_prompt}
Ai:
"""

## Init Azure client

In [9]:
azure_retriever = AzureRetrieveApproach()

### Create index

In [10]:
azure_retriever.create_index(indedx_name="poc_20231202", embedding_dimension=embedding_dimension)

poc_20231202 created


### Read ctag, embedding and upload to Azure

In [11]:
ctags_root_path = os.path.dirname(ctags_path)
tags = read_tags_file(ctags_path, accept_file=[".py"])

print(f"Total tags: {len(tags)}")

# tags = tags[:5000]

documents = []
idx = 0
for tag in tqdm(tags):
    documents.append(
        dict(
            id=str(idx),
            title=tag['file_name'],
            metadata=json.dumps(tag),
            content=f"{tag['file_name']} | {tag['tag_name']} | ",
            category="code",
            titleVector=get_embeddings(f"{tag['file_name']} | {tag['tag_name']}"),
            contentVector=get_embeddings(tag['tag_name'])
        )
    )
    idx+=1

Total tags: 13400


100%|██████████| 13400/13400 [08:10<00:00, 27.32it/s]


In [12]:
azure_retriever.batch_update(documents=documents, index_name="poc_20231202")

### Search using vector similarity

In [13]:
text = "How can i add a Custom Prompt Template in this repository? also add the unit-test. Give me an example"

In [14]:
results = azure_retriever.search(
    index_name="poc_20231202", 
    vector=get_embeddings(text), 
    fields="contentVector", 
    top=2
)

In [15]:
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Metadata: {result['metadata']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  

Title: langchain/chains/summarize/refine_prompts.py
Score: 0.768639
Metadata: {"tag_name": "prompt_template", "file_name": "langchain/chains/summarize/refine_prompts.py", "pattern": "/^prompt_template = \"\"\"Write a concise summary of the following:$/;\""}
Content: langchain/chains/summarize/refine_prompts.py | prompt_template
Category: code

Title: langchain/retrievers/document_compressors/chain_extract_prompt.py
Score: 0.768639
Metadata: {"tag_name": "prompt_template", "file_name": "langchain/retrievers/document_compressors/chain_extract_prompt.py", "pattern": "/^prompt_template = \"\"\"Given the following question and context, extract any part of the context */;\""}
Content: langchain/retrievers/document_compressors/chain_extract_prompt.py | prompt_template
Category: code



### Search using hybrid (text keyword and vector similarity)

In [16]:
results = azure_retriever.hybrid_search(
    index_name="poc_20231202", 
    text=text,
    vector=get_embeddings(text), 
    fields="contentVector", 
    top=2
)

In [17]:
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Metadata: {result['metadata']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  

Title: langchain/chains/summarize/refine_prompts.py
Score: 0.01666666753590107
Metadata: {"tag_name": "prompt_template", "file_name": "langchain/chains/summarize/refine_prompts.py", "pattern": "/^prompt_template = \"\"\"Write a concise summary of the following:$/;\""}
Content: langchain/chains/summarize/refine_prompts.py | prompt_template
Category: code

Title: langchain/docstore/in_memory.py
Score: 0.01666666753590107
Metadata: {"tag_name": "add", "file_name": "langchain/docstore/in_memory.py", "ex_command": "^    def add(self, texts: Dict[str, Document]) -> None:$", "tag_kind": "m", "extension_fields": "class:InMemoryDocstore"}
Content: langchain/docstore/in_memory.py | add | 
Category: code



### Search using hybrid_reranking_search

In [18]:
results = azure_retriever.hybrid_reranking_search(
    index_name="poc_20231202", 
    text=text,
    vector=get_embeddings(text), 
    fields="contentVector", 
    top=2
)

In [19]:
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Metadata: {result['metadata']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  

    captions = result.get("@search.captions")
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

Title: langchain/chains/summarize/refine_prompts.py
Score: 0.01666666753590107
Metadata: {"tag_name": "prompt_template", "file_name": "langchain/chains/summarize/refine_prompts.py", "pattern": "/^prompt_template = \"\"\"Write a concise summary of the following:$/;\""}
Content: langchain/chains/summarize/refine_prompts.py | prompt_template
Category: code

Caption: langchain/chains/summarize/refine_prompts.py. code. langchain/chains/summarize/refine_prompts.py | prompt_template.

Title: langchain/chains/chat_vector_db/prompts.py
Score: 0.016393441706895828
Metadata: {"tag_name": "prompt_template", "file_name": "langchain/chains/chat_vector_db/prompts.py", "pattern": "/^prompt_template = \"\"\"Use the following pieces of context to answer the question at the end. If y/;\""}
Content: langchain/chains/chat_vector_db/prompts.py | prompt_template
Category: code

Caption: langchain/chains/chat_vector_db/prompts.py. code. langchain/chains/chat_vector_db/prompts.py |<em> prompt_template.</em>



## Example

In [20]:
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage, AIMessage


llm = AzureChatOpenAI(
    azure_deployment=os.environ.get("DEPLOYMENT_NAME"),
    temperature=0.5,
)



In [21]:
def ask(user_prompt: str, retriever_type: str = "vector_search") -> str:
    # find docs similar to user_prompt

    if retriever_type == "vector_search":
        results = azure_retriever.search(
            index_name="poc_20231202", 
            vector=get_embeddings(user_prompt), 
            fields="contentVector", 
            top=10
        )
    elif retriever_type == "hybrid_search" or retriever_type == "hybrid_reranking_search":
        results = azure_retriever.hybrid_reranking_search(
            index_name="poc_20231202", 
            text=user_prompt,
            vector=get_embeddings(user_prompt), 
            fields="contentVector", 
            top=10
        )
    else:
        raise ValueError(f"retriever_type: {retriever_type} is not supported")

    metadatas = []
    for result in results:
        result_dict = json.loads(result['metadata'])
        if result_dict['file_name'].endswith(".py"):
            metadatas.append(result_dict)
    metadatas = metadatas[:3]

    citations = [metadata["file_name"] for metadata in metadatas]

    user_prompt = template.format(code_file_text=create_code_file_text(metadatas), user_prompt=user_prompt)
    
    # call openai api here
    message = HumanMessage(content=user_prompt)
    final_message = llm([message]).content  

    citations_str = "\n".join(citations)
    final_message += f"\n\nCitations:\n{citations_str}"

    return final_message

In [23]:
user_question = "What is Langchain design for?"
result = ask(user_question, retriever_type="hybrid_reranking_search")

print("="*20)
print(f"👩‍💻 : {user_question}")
print("="*20)
print(f"🤖 : {result}")

AttributeError: 'NoneType' object has no attribute '__dict__'

In [24]:
user_question = "How can i add a Custom Prompt Template in this repository? also add the unit-test. Give me an example"
result = ask(user_question, retriever_type="hybrid_reranking_search")

print("="*20)
print(f"👩‍💻 : {user_question}")
print("="*20)
print(f"🤖 : {result}")

👩‍💻 : How can i add a Custom Prompt Template in this repository? also add the unit-test. Give me an example
🤖 : To add a custom prompt template in this repository, you can follow these steps:

1. Create a new file in the relevant directory (e.g., langchain/chains/summarize/) and name it appropriately.
2. In the new file, define a string variable that represents your prompt template. You can use triple quotes (""") to create multi-line strings for more complex templates.
3. Use the `PromptTemplate.from_template()` method to create a prompt template object from your string template.
4. Save the prompt template object to a variable for later use.

Here's an example of adding a custom prompt template in the langchain/chains/summarize/refine_prompts.py file:

```python
from langchain_core.prompts import PromptTemplate

custom_prompt_template = """\
Your custom prompt template here
"""

CUSTOM_PROMPT = PromptTemplate.from_template(custom_prompt_template)
```

To add unit tests for your custo