In [1]:
import os
import json
from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

from tqdm import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from approaches.azureretriver import AzureRetrieveApproach

In [3]:
# Path to the ctags `tags` file inside the local ./repo checkout.
ctags_path = './repo/langchain/libs/langchain/tags'
# Directory that holds the tags file; get_source_code() joins tagged file paths onto it by default.
ctags_root_path = os.path.dirname(ctags_path)
# Stop early when the tags file is missing.
# NOTE(review): the script name in the message may be a typo of `download_example_repo.sh` — confirm against the repo.
assert os.path.isfile(ctags_path), "Please run `zsh download_example_rpo.sh` first"

In [4]:
# Required settings read from the process environment (load_dotenv() runs in the first cell).
# Indexing os.environ raises KeyError when a variable is unset.
# NOTE(review): none of these three names is referenced by the cells visible in this notebook,
# which hard-code the index name "poc_20231202" — confirm whether they are still needed.
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_TINY_INDEX = os.environ["AZURE_SEARCH_TINY_INDEX"]
AZURE_SEARCH_BIGGER_INDEX = os.environ["AZURE_SEARCH_BIGGER_INDEX"]

## Helper functions

In [5]:
def read_tags_file(file_path: str) -> list[dict]:
    """Parse a tab-separated ctags file into a list of tag records.

    Lines starting with ``!`` are skipped, as are lines with fewer than four
    tab-separated fields. Undecodable bytes are dropped while reading.

    Args:
        file_path: Path of the tags file to read.

    Returns:
        One dict per accepted line with the keys ``tag_name``, ``file_name``
        and ``pattern`` taken from the first three fields, in file order.
    """
    records = []
    with open(file_path, 'r', errors='ignore') as handle:
        for raw_line in handle:
            # Header/metadata lines are marked with a leading "!".
            if raw_line.startswith('!'):
                continue
            fields = raw_line.split('\t')
            if len(fields) < 4:
                continue
            name, path, pattern = fields[:3]
            records.append({"tag_name": name, "file_name": path, "pattern": pattern})
    return records

In [6]:
# Embedding model object shared by every get_embeddings() call.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # vector length 384
# Vector size reported by the underlying client; only the (commented-out) index-creation cell uses it.
embedding_dimension = embeddings.client.get_sentence_embedding_dimension()

def get_embeddings(text: str, normalize=True) -> list:
    """Embed ``text`` with the shared module-level ``embeddings`` object.

    The call overwrites ``embeddings.encode_kwargs`` with the requested
    ``normalize_embeddings`` flag, so that setting stays on the shared object
    after the function returns.

    Args:
        text: The string to embed.
        normalize: Value passed as ``normalize_embeddings``.

    Returns:
        Whatever ``embeddings.embed_query`` returns for ``text``.
    """
    embeddings.encode_kwargs = dict(normalize_embeddings=normalize)
    query_vector = embeddings.embed_query(text)
    return query_vector

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import inspect
import importlib.util
import os

def get_source_code(function_name, function_path, ctags_root_path=ctags_root_path):
    """Return the full text of the file in which a tag is defined.

    The earlier version imported the file as a module and called
    ``inspect.getsource`` on the module object. That executed the file's
    top-level code, failed whenever the import failed (for example on relative
    imports, because the file was loaded outside its package), and could not
    handle paths that are not importable Python files. The result was still
    just the whole file's text, so the file is now read directly.

    Args:
        function_name: Tag name. No longer used; kept so existing positional
            callers continue to work.
        function_path: Path of the file, relative to ``ctags_root_path``.
        ctags_root_path: Directory that ``function_path`` is resolved against.

    Returns:
        The file contents as a string.

    Raises:
        OSError: If the file cannot be opened.
    """
    source_path = os.path.join(ctags_root_path, function_path)
    # errors="replace" keeps a stray undecodable byte from aborting the whole lookup.
    with open(source_path, "r", encoding="utf-8", errors="replace") as source_file:
        return source_file.read()

In [8]:
def create_code_file_text(metadatas : list[dict]):
    """Render tag records and their source text into one prompt-ready string.

    Args:
        metadatas: Tag records; each must provide the keys ``"file_name"`` and
            ``"tag_name"``.

    Returns:
        One section per record, in order. Each section has a
        ``==== File i/N ====`` header, the file path, the tag name, the text
        returned by ``get_source_code`` and a trailing blank line. Returns an
        empty string for an empty list.
    """
    # N in the header is the number of records. The earlier code used
    # len(metadata), i.e. the number of keys in a single record.
    total = len(metadatas)
    sections = []
    for position, metadata in enumerate(metadatas, start=1):
        code_string = get_source_code(metadata["tag_name"], metadata["file_name"])
        sections.append(
            f'==== File {position}/{total} ====\n'
            f'File path: {metadata["file_name"]}\n'
            f'Tag name: {metadata["tag_name"]}\n'
            f'Code: {code_string}\n'
            '\n'
        )
    return "".join(sections)

In [9]:
# Prompt for the chat model; filled with template.format(code_file_text=..., user_prompt=...).
# The wording was corrected for spelling so the model receives a clean instruction.
template = """You are a helpful bot that fulfills the human's programming task.

The following is the relevant code:
{code_file_text}

User: {user_prompt}
Ai:
"""

## Init Azure client

In [10]:
# Project-local retriever object; every search call in the cells below goes through it.
azure_retriever = AzureRetrieveApproach()

### Create index

In [11]:
# azure_retriever.create_index(indedx_name="poc_20231202", embedding_dimension=embedding_dimension)

### Read ctags, compute embeddings, and upload to Azure

In [12]:
# ctags_root_path = os.path.dirname(ctags_path)
# tags = read_tags_file(ctags_path)

# print(f"Total tags: {len(tags)}")

# tags = tags[:5000]

# documents = []
# idx = 0
# for tag in tqdm(tags):
#     documents.append(
#         dict(
#             id=str(idx),
#             title=tag['file_name'],
#             metadata=json.dumps(tag),
#             content=f"{tag['file_name']} | {tag['tag_name']}",
#             category="code",
#             titleVector=get_embeddings(f"{tag['file_name']} | {tag['tag_name']}"),
#             contentVector=get_embeddings(tag['tag_name'])
#         )
#     )
#     idx+=1

In [13]:
# azure_retriever.batch_update(documents=documents, index_name="poc_20231202")

### Search using vector similarity

In [14]:
# Sample question reused by the vector, hybrid and hybrid-reranking search cells below.
text = "How can i add a Custom Prompt Template in this repository? also add the unit-test. Give me an example"

In [15]:
# Vector-only query: pass the embedded question to the retriever's `search` method,
# naming "contentVector" as the field and requesting 2 results.
results = azure_retriever.search(
    index_name="poc_20231202", 
    vector=get_embeddings(text), 
    fields="contentVector", 
    top=2
)

In [16]:
# Print title, score, metadata, content and category of each hit, followed by a blank line.
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Metadata: {result['metadata']}",
        f"Content: {result['content']}",
        f"Category: {result['category']}\n",
        sep="\n",
    )

Title: langchain/chains/query_constructor/prompt.py
Score: 0.7941357
Metadata: {"tag_name": "EXAMPLE_PROMPT_TEMPLATE", "file_name": "langchain/chains/query_constructor/prompt.py", "pattern": "/^EXAMPLE_PROMPT_TEMPLATE = \"\"\"\\\\$/;\""}
Content: langchain/chains/query_constructor/prompt.py | EXAMPLE_PROMPT_TEMPLATE
Category: code

Title: langchain/chains/natbot/prompt.py
Score: 0.7826364
Metadata: {"tag_name": "_PROMPT_TEMPLATE", "file_name": "langchain/chains/natbot/prompt.py", "pattern": "/^_PROMPT_TEMPLATE = \"\"\"$/;\""}
Content: langchain/chains/natbot/prompt.py | _PROMPT_TEMPLATE
Category: code



### Search using hybrid search (keyword text and vector similarity)

In [17]:
# Hybrid query: the raw question text and its embedding are both passed to `hybrid_search`,
# again naming "contentVector" as the field and requesting 2 results.
results = azure_retriever.hybrid_search(
    index_name="poc_20231202", 
    text=text,
    vector=get_embeddings(text), 
    fields="contentVector", 
    top=2
)

In [18]:
# Print title, score, metadata, content and category of each hit, followed by a blank line.
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Metadata: {result['metadata']}",
        f"Content: {result['content']}",
        f"Category: {result['category']}\n",
        sep="\n",
    )

Title: langchain/chains/query_constructor/prompt.py
Score: 0.01666666753590107
Metadata: {"tag_name": "EXAMPLE_PROMPT_TEMPLATE", "file_name": "langchain/chains/query_constructor/prompt.py", "pattern": "/^EXAMPLE_PROMPT_TEMPLATE = \"\"\"\\\\$/;\""}
Content: langchain/chains/query_constructor/prompt.py | EXAMPLE_PROMPT_TEMPLATE
Category: code

Title: tests/integration_tests/examples/example-utf8.html
Score: 0.01666666753590107
Metadata: {"tag_name": "Chase the red dot", "file_name": "tests/integration_tests/examples/example-utf8.html", "pattern": "/^    <h2>Chase the red dot<\\/h2>$/;\""}
Content: tests/integration_tests/examples/example-utf8.html | Chase the red dot
Category: code



### Search using hybrid_reranking_search

In [19]:
# Same inputs as the hybrid query above, sent to `hybrid_reranking_search` instead.
# NOTE(review): the reranking step itself lives in the project-local retriever and is not visible here.
results = azure_retriever.hybrid_reranking_search(
    index_name="poc_20231202", 
    text=text,
    vector=get_embeddings(text), 
    fields="contentVector", 
    top=2
)

In [20]:
# Print title, score, metadata, content and category of each hit, followed by a blank line.
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Metadata: {result['metadata']}",
        f"Content: {result['content']}",
        f"Category: {result['category']}\n",
        sep="\n",
    )

Title: langchain/chains/flare/prompts.py
Score: 0.0117647061124444
Metadata: {"tag_name": "PROMPT", "file_name": "langchain/chains/flare/prompts.py", "pattern": "/^PROMPT = PromptTemplate($/;\""}
Content: langchain/chains/flare/prompts.py | PROMPT
Category: code

Title: langchain/chains/summarize/refine_prompts.py
Score: 0.012048192322254181
Metadata: {"tag_name": "PROMPT", "file_name": "langchain/chains/summarize/refine_prompts.py", "pattern": "/^PROMPT = PromptTemplate.from_template(prompt_template)$/;\""}
Content: langchain/chains/summarize/refine_prompts.py | PROMPT
Category: code



## Example

In [21]:
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage, AIMessage


# Chat model used by ask(). The deployment name is read from the DEPLOYMENT_NAME
# environment variable; os.environ.get returns None when it is unset.
# NOTE(review): the remaining Azure OpenAI connection settings are presumably taken
# from the environment as well — confirm.
llm = AzureChatOpenAI(
    azure_deployment=os.environ.get("DEPLOYMENT_NAME"),
    temperature=0.5,
)



In [22]:
def ask(
    user_prompt: str,
    retriever_type: str = "vector_search",
    index_name: str = "poc_20231202",
    top: int = 10,
    max_files: int = 3,
) -> str:
    """Answer ``user_prompt`` with the chat model, using retrieved code as context.

    The question is embedded and sent to the retriever. Hits whose file name
    ends in ``.py`` are kept (at most ``max_files``), their source is rendered
    into the prompt template, and the model's reply is returned with a list of
    the cited files appended.

    Args:
        user_prompt: The user's question.
        retriever_type: ``"vector_search"``, ``"hybrid_search"`` or
            ``"hybrid_reranking_search"``. Each value calls the retriever
            method it names; previously ``"hybrid_search"`` silently ran the
            reranking search.
        index_name: Search index to query.
        top: Number of hits requested from the retriever.
        max_files: Maximum number of ``.py`` hits placed in the prompt.

    Returns:
        The model's answer followed by a ``Citations:`` section that lists each
        cited file path once.

    Raises:
        ValueError: If ``retriever_type`` is not one of the supported values.
    """
    supported_types = ("vector_search", "hybrid_search", "hybrid_reranking_search")
    if retriever_type not in supported_types:
        raise ValueError(f"retriever_type: {retriever_type} is not supported")

    # Arguments shared by all three retriever calls.
    search_kwargs = dict(
        index_name=index_name,
        vector=get_embeddings(user_prompt),
        fields="contentVector",
        top=top,
    )
    if retriever_type == "vector_search":
        results = azure_retriever.search(**search_kwargs)
    elif retriever_type == "hybrid_search":
        results = azure_retriever.hybrid_search(text=user_prompt, **search_kwargs)
    else:
        results = azure_retriever.hybrid_reranking_search(text=user_prompt, **search_kwargs)

    # Only Python files are used as context; each hit stores its tag record as JSON.
    metadatas = []
    for result in results:
        tag_record = json.loads(result['metadata'])
        if tag_record['file_name'].endswith(".py"):
            metadatas.append(tag_record)
    metadatas = metadatas[:max_files]

    # Several tags can come from the same file; list each path once, in order.
    citations = list(dict.fromkeys(metadata["file_name"] for metadata in metadatas))

    # Use a separate name so the caller's question is not overwritten by the full prompt.
    prompt_text = template.format(
        code_file_text=create_code_file_text(metadatas),
        user_prompt=user_prompt,
    )

    message = HumanMessage(content=prompt_text)
    answer = llm([message]).content

    citations_str = "\n".join(citations)
    return f"{answer}\n\nCitations:\n{citations_str}"

In [23]:
# Demo: ask a general question through ask() with the hybrid reranking retriever
# and print the question and the answer.
user_question = "What is Langchain design for?"
result = ask(user_question, retriever_type="hybrid_reranking_search")

print("="*20)
print(f"👩‍💻 : {user_question}")
print("="*20)
print(f"🤖 : {result}")

👩‍💻 : What is Langchain design for?
🤖 : Langchain is designed to facilitate the creation and execution of conversational AI systems. It provides a framework for building chains of components that can process and transform input, carry on conversations, and generate responses. The design allows for flexibility in incorporating different language models and memory systems, enabling the development of sophisticated conversational agents.

Citations:
langchain/chains/transform.py
langchain/vectorstores/sklearn.py
langchain/chains/conversation/base.py


In [24]:
# Demo: ask a how-to question about the repository through ask() with the hybrid
# reranking retriever and print the question and the answer.
user_question = "How can i add a Custom Prompt Template in this repository? also add the unit-test. Give me an example"
result = ask(user_question, retriever_type="hybrid_reranking_search")

print("="*20)
print(f"👩‍💻 : {user_question}")
print("="*20)
print(f"🤖 : {result}")

👩‍💻 : How can i add a Custom Prompt Template in this repository? also add the unit-test. Give me an example
🤖 : To add a custom prompt template in this repository, you can follow these steps:

1. Create a new Python file in the appropriate directory (e.g., `langchain/chains/custom/prompts.py`).
2. Define your custom prompt template using the `PromptTemplate` class from `langchain_core.prompts`.
3. Write your custom prompt code, including any necessary input variables and template strings.
4. Add unit tests for your custom prompt template in the appropriate test file (e.g., `tests/chains/custom/test_prompts.py`).

Here's an example of how you can add a custom prompt template:

==== File 1/2 ====
File path: langchain/chains/custom/prompts.py
Tag name: PROMPT
Code:
```python
from langchain_core.prompts import PromptTemplate

CUSTOM_PROMPT_TEMPLATE = """\
This is a custom prompt template.

Input 1: {input1}
Input 2: {input2}
Input 3: {input3}

>>> INPUT 1: {input1}
>>> INPUT 2: {input2}
>>