In [1]:
# Import the necessary modules
import bs4
from pathlib import Path
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.document_transformers import LongContextReorder, EmbeddingsClusteringFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain.document_transformers import (
    EmbeddingsRedundantFilter,
)
from os.path import expanduser
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

### Configuration Parameters

In [2]:

# ---------------------------------------------------------------------------
# Configuration: every value a reader might want to tune lives in this cell.
# ---------------------------------------------------------------------------

# Storage locations
db_dir = "/home/zjc1002/Mounts/data/"   # vector indices are persisted under this directory
llm_dir = "/home/zjc1002/Mounts/llms/"  # local directory holding the LLMs used for retrieval, filtering and generation

# Source data: the page that is indexed and queried by the RAG framework
source_data_dict = dict(
    source_data_url="https://lilianweng.github.io/posts/2023-06-23-agent/",
    page_components_2_parse=["post-content", "post-title", "post-header"],
)

# Foundational model used for generation
repo_id = "shaowenchen/llama-2-7b-langchain-chat-gguf"
repo_type = "model"
filename = 'llama-2-7b-langchain-chat.Q4_K.gguf'
local_dir_use_symlinks = False
local_dir = f"{llm_dir}/llama-2-7b-langchain-chat-gguf"
modelpath = Path(local_dir) / filename   # Path object pointing at the GGUF weights
model_path = expanduser(modelpath)       # same location, as handed to LlamaCpp later on

# Prompt used by the summarization chain; `{text}` is the only placeholder
template = """
Write a concise summary of the text, return your responses with 5 lines that cover the key points of the text.
```{text}```
SUMMARY:
"""

# End-user queries used to exercise retrieval and generation
query_list = [
    "Is there any framework available to tackle the climate change?",
    "what is huggingGPT?",
]

# Model memory settings
n_gpu_layers = 4  # Change this value based on your model and your GPU VRAM pool.
n_batch = 10      # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Document chunking settings
doc_split_info = dict(chunk_size=512, chunk_overlap=100)


def _retriever_spec(db_name, model_name):
    """Return the settings dict for one embedding model that backs a vector index.

    All retrievers share the same device, distance space, search type and
    search kwargs; only the name and the embedding model differ.
    """
    return {
        'db_name': db_name,
        'model_name': model_name,
        "model_kwargs": {"device": "cuda"},
        "collection_meta": {"hnsw:space": "cosine"},
        "persist_directory": f"{db_dir}/{db_name}_tst",
        "search_type": "similarity",
        "search_kwargs": {"k": 2, "include_metadata": True},
        "filter": False,
    }


# Retriever definitions. The last entry ("filter": True) is not a retriever:
# it names the model (gpt2) whose embeddings are used to filter merged results.
db_info = [
    _retriever_spec('mini', "sentence-transformers/all-MiniLM-L6-v2"),
    _retriever_spec('miniqa', "sentence-transformers/multi-qa-MiniLM-L6-dot-v1"),
    _retriever_spec('bge', "BAAI/bge-large-en-v1.5"),
    {
        'db_name': 'gpt2',
        'model_name': "gpt2",
        "model_kwargs": {"device": "cuda"},
        "filter": True,
    },
]



### Load Data
1. use langchain web data loader and parsing with bs4
2. use recursive character text splitting to generate context-aware chunks of data

In [3]:
# LOAD SAMPLE DATA
# Web loader restricted to the page components listed in the config.
# The CSS classes are passed as a tuple rather than a generator expression:
# a generator can only be iterated once, whereas a tuple is a stable,
# re-usable filter value.
loader = WebBaseLoader(
    web_paths=(source_data_dict["source_data_url"],),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=tuple(source_data_dict["page_components_2_parse"])
        )
    ),
)

# Fetch and parse the page
docs = loader.load()

# Chunking strategy: recursive character splitting with the configured
# chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=doc_split_info['chunk_size'],
    chunk_overlap=doc_split_info["chunk_overlap"],
)

# Split the loaded documents into chunks; these chunks are what gets indexed
splits = text_splitter.split_documents(docs)

# Fail early with a readable message instead of building empty indices later
assert splits, "No chunks were produced - check the source URL and the page components to parse"
print(len(splits))


129


### Generate inputs to use in the merged retriever
1) **Load the embeddings associated with each model** 
    - *Note: I load them using HuggingFaceEmbeddings and point to the local directory where all my LLMs are saved (i.e. you don't need internet access if the models are already cached).*
2) **Initialize a vector store for each retriever and encode the source documents into each** 
3) **Save the vector store for each retriever to disk**; the vector stores are stored within the 'persist_directory'  
4) **Create retrievers from each vector store to use as inputs to the merged retriever**

In [4]:
# Specs of the models that back a vector index (everything except the filter model)
retriever_dbs = [_db for _db in db_info if not _db['filter']]

# 1. Load the embedding models.
# Several small models are used to reduce bias and increase diversity of results.
# NOTE: everything is kept in dictionaries keyed by db_name so the number of
# retrievers in the merged retriever can be grown or shrunk from the config alone.
# The filter model is loaded here too; it is used later, not indexed.
embeddings_ = {
    _db["db_name"]: HuggingFaceEmbeddings(
        model_name=_db['model_name'],
        model_kwargs=_db['model_kwargs'],
        cache_folder=llm_dir,
    )
    for _db in db_info
}


def _index_on_disk(persist_directory):
    """Return True when `persist_directory` exists and contains at least one entry."""
    index_path = Path(persist_directory)
    return index_path.is_dir() and any(index_path.iterdir())


def _build_or_load_store(_db):
    """Return the Chroma store for one retriever spec, indexing `splits` only once.

    `Chroma.from_documents` adds the documents to whatever already lives in the
    persist directory, so calling it on every run would store the same chunks
    again and let searches return duplicates. When an index is already on disk
    it is opened as-is instead.

    NOTE: an existing index is trusted; delete the persist directory to force a
    rebuild after changing the source page or the chunking settings.
    """
    if _index_on_disk(_db['persist_directory']):
        return Chroma(
            persist_directory=_db['persist_directory'],
            embedding_function=embeddings_[_db['db_name']],
        )
    return Chroma.from_documents(
        splits,
        embeddings_[_db['db_name']],
        collection_metadata=_db['collection_meta'],
        persist_directory=_db['persist_directory'],
    )


# 2. Index the documents with each retriever model (using Chroma)
vector_stores = {_db["db_name"]: _build_or_load_store(_db) for _db in retriever_dbs}

# 3. Load the vector indices back from their persist directories for the demo
vec_indcies = {
    _db["db_name"]: Chroma(
        persist_directory=_db['persist_directory'],
        embedding_function=embeddings_[_db['db_name']],
    )
    for _db in retriever_dbs
}

# 4. Create one retriever per vector index, using the search settings from the config
vecindex_retrievers = {
    _db["db_name"]: vec_indcies[_db['db_name']].as_retriever(
        search_type=_db["search_type"],
        search_kwargs=_db['search_kwargs'],
    )
    for _db in retriever_dbs
}

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name /home/zjc1002/Mounts/llms/gpt2. Creating a new one with MEAN pooling.


### Create the Merged Retriever

#### Merged Retriever Overview
The MergerRetriever (LOTR) merges the outputs from different retrievers in a round-robin fashion. Below is a high-level overview of the process used to retrieve documents:
   - a. Get the relevant documents from all retrievers and merge them. 
   - b. The merged results are a list of documents that are relevant to the query and that have been ranked by the different retrievers. 
   - c. EmbeddingsClusteringFilter then divides the merged documents into clusters ("centers" of meaning), and the document closest to each center is picked for the final results.

#### Code Steps 
1) Initialize the merged retriever by passing in all individual retrievers created in the prior step 
2) Initialize a set of 'filter' embeddings used to cluster the MergerRetriever responses and remove redundant responses from consideration (we use gpt2 for the filter)
3) Initialize ***DocumentCompressorPipeline*** to define the order of operations that occur after the initial retrieval 
4) Create the final merged retriever with postprocessing/filtering enabled via ***ContextualCompressionRetriever***
5) Pass each query defined in the config to the retriever and print the top-k results to illustrate the Information Retrieval component of RAG

In [8]:
# 1. Generate the LORD OF ALL RETRIEVERS (aka an ensemble of retrievers)
lotr = MergerRetriever(retrievers=list(vecindex_retrievers.values()))


# 2. Embedding model used for clustering and filtering the merged results.
# `embeddings_` is keyed by db_name, so the lookup must use db_name as well
# (looking it up by model_name only works while the two happen to be equal).
filter_embeddings = [embeddings_[_db['db_name']] for _db in db_info if _db['filter']][0]
# Reuse the EOS token as the padding token - presumably needed because the
# gpt2 tokenizer does not define a pad token of its own (TODO confirm).
filter_embeddings.client.tokenizer.pad_token = filter_embeddings.client.tokenizer.eos_token


# 3. Remove redundant results coming from the different retrievers using yet
# another embedding. Using multiple embeddings in different steps could help
# reduce biases. `sorted=True` keeps the final documents ordered by the
# original retriever scores.
filter_ordered_by_retriever = EmbeddingsClusteringFilter(
    embeddings=filter_embeddings,
    num_clusters=3,
    num_closest=1,
    sorted=True,
)

# 4. Compile the final pipeline.
# Contextual compression post-processes what the base retriever returns so the
# generator only sees the most useful documents: here the merged results are
# first reduced by the clustering filter and then reordered.
# (An EmbeddingsRedundantFilter could be added to the pipeline as one more step.)
reordering = LongContextReorder()
pipeline = DocumentCompressorPipeline(
    transformers=[filter_ordered_by_retriever, reordering]
)

# Final merged retriever to be used for IR.
# Only the compressor pipeline and the base retriever are passed; the retriever
# pulls its documents from `lotr` at query time.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline,
    base_retriever=lotr,
)


# 5. Print the top IR results (THIS IS NOT GENERATION)
for _query in query_list:
    for chunk in compression_retriever.get_relevant_documents(_query):
        print(f":::{_query}:::")
        print(chunk.page_content)
        print('\n')

:::Is there any framework available to tackle the climate change?:::
ChatGPT Plugins and OpenAI API  function calling are good examples of LLMs augmented with tool use capability working in practice. The collection of tool APIs can be provided by other developers (as in Plugins) or self-defined (as in function calls).
HuggingGPT (Shen et al. 2023) is a framework to use ChatGPT as the task planner to select models available in HuggingFace platform according to the model descriptions and summarize the response based on the execution results.


:::Is there any framework available to tackle the climate change?:::
remaining questions.\n{Next question}\nIf everything is sufficiently clear, only answer \"Nothing more to clarify.\"."


:::Is there any framework available to tackle the climate change?:::
Another quite distinct approach, LLM+P (Liu et al. 2023), involves relying on an external classical planner to do long-horizon planning. This approach utilizes the Planning Domain Definition La

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


### GENERATION

##### Demonstration of using the LLAMACPP library and MapReduce to generate summaries of text documents using a summarization chain.

1. ***LLAMACPP library is imported and initialized*** with certain parameters, such as the model path, the number of GPU layers, the batch size, and the use of 16-bit key-value (f16_kv) encoding.

2. ***A prompt template is created*** using the *PromptTemplate* class. 
    - Note: This template will be used to instruct the LLAMACPP library on how to summarize the text. It takes an input variable called 'text' and a template string.

3. ***A summarization chain is loaded*** using the *load_summarize_chain* function. 
    - Note: This function takes the LLAMACPP instance, the chain type (in this case, 'stuff'), the prompt template, and a verbose flag. The chain is responsible for generating summaries of the top documents returned by a retriever.

4. Iterate over each query in the 'query_list'. Inside the loop, the relevant documents are retrieved using a ***'compression_retriever'*** object. 
    - Note: These documents will be passed to the LLAMACPP summarization chain for summarization and distillation into a single answer.

5. The 'chain' object is called with the retrieved documents as input. This generates a summary or answer for the given documents.
    - Note: The generated answer is printed to the console.

6. The query and the generated answer are appended to the 'results' list.

7. After the loop finishes, the 'results' list holds the query and the corresponding generated answer for each query; it can be printed to inspect them.

In [12]:
# Using LlamaCpp to illustrate how flexible LangChain can be (you don't have to use OpenAI!)
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
)

# Summarization prompt that instructs llama2 how to summarize the retrieved text
prompt = PromptTemplate(
    input_variables=['text'],
    template=template,
)

# 'stuff' summarization chain: produces one summary from the top documents
# returned by the merged retriever
chain = load_summarize_chain(
    llm,
    chain_type='stuff',
    prompt=prompt,
    verbose=False,
)

# Generate an answer for each original query in the query list
results = []
for query_ in query_list:

    # Retrieve the top documents for THIS query (not always the first one in
    # query_list). A dedicated name is used so the `docs` loaded from the web
    # page earlier in the notebook are not overwritten.
    retrieved_docs = compression_retriever.get_relevant_documents(query_)

    # Final augmented answer: the retrieved documents distilled into one summary
    result = chain(retrieved_docs)

    print('THIS IS AN AWNSER FROM LLAMACPP')
    print("\n")
    print(result['output_text'])
    print("\n")
    results.append({'query': query_, 'awnser': result['output_text']})

# Uncomment to dump every (query, answer) pair collected above
#print(results)
############
##END DEMO##
############

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/zjc1002/Mounts/llms/llama-2-7b-langchain-chat-gguf/llama-2-7b-langchain-chat.Q4_K.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0

THIS IS AN AWNSER FROM LLAMACPP


1. Plugins and OpenAI API are examples of LLMs (Large Language Models) that have been augmented with tool use capability, working in practice. These tool APIs can be provided by other developers or self-defined.
2. HuggingGPT is a framework to use ChatGPT as the task planner to select models available in the HuggingFace platform according to the model descriptions and summarize the response based on the execution results.
3. Another approach is LLM+P, which involves relying on an external classical planner to do long-horizon planning using PDDL (Planning Domain Definition Language) as an intermediate interface to describe the planning problem. This approach translates the problem into "Problem PDDL," requests a classical planner for a PDDL plan based on an existing "Domain PDDL", and finally translates the PDDL plan back into natural language.
4. These


THIS IS AN AWNSER FROM LLAMACPP


- ChatGPT Plugins and OpenAI API are good examples of Large Langu


llama_print_timings:        load time =     408.17 ms
llama_print_timings:      sample time =      77.86 ms /   196 runs   (    0.40 ms per token,  2517.18 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   25481.48 ms /   196 runs   (  130.01 ms per token,     7.69 tokens per second)
llama_print_timings:       total time =   26002.83 ms
