In [1]:
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
)
from typing import List, Dict, Union
from tqdm.auto import tqdm, trange

MODELS = {
    "re_ranker": "BAAI/bge-reranker-large",
    "completion": "teknium/OpenHermes-2.5-Mistral-7B",
    "embedding": "BAAI/bge-large-en-v1.5",
}

## Embedding

- Embedding using **bge-large-en-v1.5**
- Saving Embeddings to Pinecone


In [2]:
class Embedding:

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, sentences: List[Dict], batch_size: int = 64):
        for ix in trange(0, len(sentences), batch_size):
            tokenized_inputs = self.tokenizer(
                sentences[ix:ix + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                model_output = self.model(tokenized_inputs)
            for i, ixd in enumerate(range(ix, ix + batch_size)):
                sentences[ixd]["embedding"] = model_output[i].tolist()
        return sentences

In [3]:
import pinecone
from typing import List, Dict, Union
from tqdm.auto import tqdm, trange


class PineconeDB:
    def __init__(
        self,
        api_key: str,
        environment: str,
        index_name: str,
        namespace: Union[str, None] = None,
        batch_size: int = 50,
    ):
        pinecone.init(api_key=api_key, environment=environment)
        self.index = pinecone.Index(index_name)
        self.batch_size = batch_size
        self.namespace = namespace

    def __store__(self, embeddings: List[Dict]):
        for ix in trange(0, len(embeddings), self.batch_size, desc="Storing Vectors"):
            pvs = []
            for ixe, embs in enumerate(embeddings[ix : ix + self.batch_size]):
                if len(embs.get("embedding")) > 0:
                    e = embs.get("embedding")
                    del embs["embedding"]
                    pvs.append((str(ix + ixe), e, {**embs}))
            if self.namespace:
                self.index.upsert(vectors=pvs, namespace=self.namespace)
            else:
                self.index.upsert(vectors=pvs)

    def __call__(self, embeddings: List[Dict]):
        self.__store__(embeddings)

ModuleNotFoundError: No module named 'pinecone'

## Completion

**Completion** and **Function Calling** using **_OpenHeremes-2.5-Mistral-7B_**


In [31]:
import json
import xml.etree.ElementTree as ET
import re


class OpenHermesInference:
    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __inference__(self, messages: List[Dict]):
        tokens = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to(
            self.model.device
        )
        input_size = tokens.numel()
        print("Input Tokens: ", input_size)
        with torch.inference_mode():
            generated_tokens = self.model.generate(
                tokens,
                use_cache=True, do_sample=True, temperature=0.2, top_p=1.0, top_k=0, max_new_tokens=512, eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.eos_token_id
            )
        # print("Generated Tokens: ", len(generated_tokens.squeeze()))
        print("Generated New Tokens: ", len(generated_tokens.squeeze()[input_size:]))
        # print("GENERATED NEW TOKENS: ", generated_tokens.squeeze()[input_size:])
        # print(self.tokenizer.decode(generated_tokens.squeeze()))
        return self.tokenizer.decode(
            generated_tokens.squeeze()[input_size:], skip_special_tokens=True
        )


class FunctionCall(OpenHermesInference):
    def __init__(self, model, tokenizer):
        super().__init__(self, FunctionCall)
        self.system_prompt = """You are a helpful assistant with access to the following functions:
        
            {functions}
        
            To use these functions respond with:
            <multiplefunctions>
                <functioncall> {{fn}} </functioncall>
                <functioncall> {{fn}} </functioncall>
                ...
            </multiplefunctions>
            
            Edge cases you must handle:
            - If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>

            Refer the below provided output example for function calling
            Question: What's the weather difference in NY and LA?
            <multiplefunctions>
                <functioncall> {{"name": "getWeather", "parameters": {{"city": "NY"}}}} </functioncall>
                <functioncall> {{"name": "getWeather", "parameters": {{"city": "LA"}}}} </functioncall>
            </multiplefunctions>
            
        """

    def functionCall(self, messages: List[Dict], functions: List[Dict]):
        functions_texts = "\n\n".join(
            [f"{json.dumps(function)}" for function in functions]
        )
        if messages[0].get("role") == "system":
            new_system_prompt = (
                self.system_prompt.format(functions=functions_texts)
                + "\n"
                + messages[0].get("content")
            )
            messages[0]["content"] = new_system_prompt
        else:
            messages = [
                {
                    "role": "system",
                    "content": self.system_prompt.format(functions=functions_texts),
                }
            ] + messages
        output_text = self.__inference__(messages)
        return output_text


class NormalCompletion(OpenHermesInference):
    def __init__(self, model, tokenizer):
        super().__init__(self, NormalCompletion)

    def normalCompletion(self, messages: List[str]):
        output_text = self.__inference__(messages)
        return output_text


class FunctionExtraction:
    def __call__(self, text: str):
        completion = text.strip()
        pattern = r"(<multiplefunctions>(.*?)</multiplefunctions>)"
        match = re.search(pattern, completion, re.DOTALL)
        if not match:
            return None
        multiplefn = match.group(1)
        root = ET.fromstring(multiplefn)
        functions = root.findall("functioncall")
        return [json.loads(fn.text) for fn in functions]


class Completion(FunctionCall, NormalCompletion, FunctionExtraction):
    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
        super().__init__(self, Completion)
        self.model = model
        self.tokenizer = tokenizer

    def chatCompletion(self, messages: List[Dict], functions: Union[None, List] = None):
        if functions:
            function_call_text = self.functionCall(messages, functions)
            # print(function_call_text)
            functions = FunctionExtraction()(function_call_text)
            # functions = self.extractFunctions(function_call_text)
            return functions
        else:
            return self.normalCompletion(messages)

## Reranking

- Re-ranking usin the **bge-reranker-large**


In [5]:
class ReRanker:

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, query: str, matches: List[str]):
        pairs = [[query, match] for match in matches]
        with torch.no_grad():
            inputs = self.tokenizer(
                pairs,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            scores = (self.model(**inputs,
                                 return_dict=True).logits.view(-1, ).float())
        sorted_indices = scores.argsort(descending=True).tolist()
        output_ranks = [matches[i] for i in sorted_indices]
        return output_ranks

In [6]:
DEPENDENCY_PROMPT = """You're a ChatGPT powered query planning agent. Given a user message provide all the question or context dependencies that would need to be addressed to provide a response to the user.
You've to break down questions into its dependent queries such that the answers of the dependent query can be used to inform the parent question.
You don't need to answer the questions, simply provide the correct sequence of questions to ask and relevant dependencies.
Call the function with appropriate data i.e. the dependencies.
"""

RAG_ANSWER_PROMPT = """You are a ChatGPT powered answering agent. You will be provided with a question and multiple passages in descending order or relevance. Based on those you have to answer the question if possible. If not then you can politely respond saying you cannot help."""

RAG_FINAL_ANSWER_PROMPT = """You are a ChatGPT powered answering agent. You will be provided with a question along with multiple dependencies of that question ans answers for that. Using these you've to provide a coherent answer for the main question if possible. 
If there is no content the answers of the sub questions that can help answer the main question then respond politely that you cannot help."""

In [7]:
# embedding_tokenizer = AutoTokenizer.from_pretrained(MODELS["embedding"])
# embedding_model = AutoModel.from_pretrained(
#     MODELS["embedding"], device_map="auto"
# ).eval()

# rerank_tokenizer = AutoTokenizer.from_pretrained(MODELS["re_ranker"])
# rerank_model = AutoModelForSequenceClassification.from_pretrained(
#     MODELS["re_ranker"]
# ).eval()

completion_tokenizer = AutoTokenizer.from_pretrained(MODELS["completion"])
completion_model = AutoModelForCausalLM.from_pretrained(
    MODELS["completion"], torch_dtype=torch.bfloat16, device_map="auto"
).eval()

tokenizer_config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [32]:
# embedding_obj = Embedding(embedding_model, embedding_tokenizer)

# reranking_obj = ReRanker(rerank_model, rerank_tokenizer)

completion_obj = Completion(completion_model, completion_tokenizer)

In [33]:
from pydantic import BaseModel, Field
from typing import List, Dict


class QueryDependency(BaseModel):
    id: int = Field(..., description="Unique Integer Id for the Query")
    question: str = Field(
        ...,
        description=
        "Question we want to ask to get a better context or more background about the main question.",
    )


class Dependencies(BaseModel):
    dependencies: List[QueryDependency] = Field(
        ...,
        description=
        "A list of query dependencies in the correct sequence to fetch more background information about the main question.",
    )
    # questions: List[str]


functions = [{
    "name": "dependencyPlanning",
    "description":
    "Plan a sequential list of all the sub-questions that once answered can provide more background to answer the main question.",
    "parameters": Dependencies.schema()
}]

/tmp/ipykernel_194/3132235189.py:27: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  "parameters": Dependencies.schema()


In [34]:
question = "what's the distance between the capital of France and capital of United Kingdom?"
completion_messages = [
    {
        "role": "user",
        "content": f"""{DEPENDENCY_PROMPT}

        Question: {question}
        """,
    },
    {"role": "assistant", "content": ""}
]
function_calls = completion_obj.chatCompletion(completion_messages, functions)
function_calls

Input Tokens:  574
Generated New Tokens:  97


[{'name': 'dependencyPlanning',
  'parameters': {'dependencies': [{'id': 1,
     'question': 'What is the capital of France?'},
    {'id': 2, 'question': 'What is the capital of the United Kingdom?'},
    {'id': 3,
     'question': 'What is the distance between the two capital cities?'}]}}]

In [35]:
print(function_calls)

[{'name': 'dependencyPlanning', 'parameters': {'dependencies': [{'id': 1, 'question': 'What is the capital of France?'}, {'id': 2, 'question': 'What is the capital of the United Kingdom?'}, {'id': 3, 'question': 'What is the distance between the two capital cities?'}]}}]


In [36]:
question = "provide comparison between GPT-4 and Mistral-7B models benchmarks"
completion_messages = [
    {
        "role": "user",
        "content": f"""{DEPENDENCY_PROMPT}

        Question: {question}
        """,
    },
    {"role": "assistant", "content": ""}
]
function_calls = completion_obj.chatCompletion(completion_messages, functions)
function_calls

Input Tokens:  574
Generated New Tokens:  106


[{'name': 'dependencyPlanning',
  'parameters': {'dependencies': [{'id': 1, 'question': 'What is GPT-4?'},
    {'id': 2, 'question': 'What is Mistral-7B?'},
    {'id': 3,
     'question': 'What are the benchmarks used to compare AI models?'}]}}]

In [37]:
question = "compare iPhone 14 pro with iPhone 15 pro"
completion_messages = [
    {
        "role": "user",
        "content": f"""{DEPENDENCY_PROMPT}

        Question: {question}
        """,
    },
    {"role": "assistant", "content": ""}
]
function_calls = completion_obj.chatCompletion(completion_messages, functions)
function_calls

Input Tokens:  570
Generated New Tokens:  180


[{'name': 'dependencyPlanning',
  'parameters': {'dependencies': [{'id': 1,
     'question': 'What are the features of iPhone 14 Pro?'},
    {'id': 2, 'question': 'What are the features of iPhone 15 Pro?'}]}}]