# Install Packages and Setup Variables


In [3]:
!pip install -q openai==1.59.6 tiktoken==0.8.0 llama-index==0.12.12 llama-index-vector-stores-qdrant==0.4.3

In [4]:
import os

# Set the OpenAI API key in the process environment; it is used later in the notebook.
# Replace the "[OPENAI_API_KEY]" placeholder with your own key before running,
# and avoid committing or sharing the notebook with a real key filled in.
os.environ["OPENAI_API_KEY"] = "[OPENAI_API_KEY]"

# Alternative for Google Colab: read the key from the notebook's stored user data.
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

In [5]:
import nest_asyncio

# NOTE(review): presumably required so that asyncio-based library calls can run
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Load a Model


In [6]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Register the default LLM and embedding model on the global Settings object.
# Settings.llm is reused further down by the KeywordExtractor in the ingestion pipeline.
Settings.llm = OpenAI(temperature=0.9, model="gpt-4o-mini", max_tokens=512)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Create a Vector Store


In [7]:
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore

# In-memory alternative:
# qdrant_client = QdrantClient(location=":memory:")
# Or use persistent storage: the client is pointed at the local path "/content/".
# The collection directory under /content/collection/ is zipped later in the notebook.
qdrant_client = QdrantClient(path="/content/")

In [8]:
# Wrap the Qdrant client as a llama-index vector store bound to the
# "ai_tutor_knowledge" collection; the ingestion pipeline and the index both use it.
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="ai_tutor_knowledge")

# Load the Dataset (JSON)


## Download


In [9]:
from huggingface_hub import hf_hub_download
# Download the JSONL knowledge-base file from the Hugging Face dataset repository
# into /content and keep the path returned by the download call.
file_path = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    filename="ai_tutor_knowledge.jsonl",
    repo_type="dataset",
    local_dir="/content",
)

ai_tutor_knowledge.jsonl:   0%|          | 0.00/6.96M [00:00<?, ?B/s]

## Read File


In [10]:
import json
# Parse the JSON Lines file: one JSON object per line.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale, and blank lines (e.g. a trailing empty line) are
# skipped instead of making json.loads raise.
with open(file_path, "r", encoding="utf-8") as file:
    ai_tutor_knowledge = [json.loads(line) for line in file if line.strip()]
# Preview the text of the second record.
ai_tutor_knowledge[1]['content']

"Github Repo: https://github.com/vaibhawkhemka/ML-Umbrella/tree/main/NLP/Product-Categorization   From e-commerce to Customer support  all businesses require some kind of NER model to process huge amounts of texts from users.   To automate this whole  one requires NER models to extract relevant and important entities from text.   Final Result/OutputInput text = EL D68 (Green  32 GB) 3 GB RAM [3 GB RAM U+007C 32 GB ROM U+007C Expandable Upto 128 GB  15.46 cm (6.088 inch) Display  13MP Rear Camera U+007C 8MP Front Camera  4000 mAh Battery  Quad-Core Processor]   Output =   Green ->>>> COLOR 32 GB ->>>> STORAGE 3 GB RAM ->>>> RAM 3 GB RAM ->>>> RAM 32 GB ROM ->>>> STORAGE Expandable Upto 128 GB ->>>> EXPANDABLE_STORAGE 15.46 cm (6.088 inch) ->>>> SCREEN_SIZE 13MP Rear Camera ->>>> BACK_CAMERA 8MP Front Camera ->>>> FRONT_CAMERA 4000 mAh Battery ->>>> BATTERY_CAPACITY Quad-Core Processor ->>>> PROCESSOR_CORE   Data PreparationA tool for creating this dataset (https://github.com/tecoholic/n

In [11]:
# The full dataset is not required: keep the first 100 records together with
# every record from index 500 onward.
documents = [*ai_tutor_knowledge[:100], *ai_tutor_knowledge[500:]]

# Transforming


In [12]:
from typing import List
from llama_index.core import Document

def create_docs_from_list(data_list: List[dict]) -> List[Document]:
    """Turn raw knowledge-base records into ``Document`` objects.

    Args:
        data_list: Records that each provide the keys ``doc_id``, ``content``,
            ``url``, ``name``, ``tokens`` and ``source``.

    Returns:
        One ``Document`` per record, in input order. The record's ``name`` is
        stored under the ``title`` metadata key.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
    """
    # Metadata keys passed as excluded_llm_metadata_keys / excluded_embed_metadata_keys.
    llm_excluded = ("title", "tokens", "source")
    embed_excluded = ("url", "tokens", "source")

    return [
        Document(
            doc_id=record["doc_id"],
            text=record["content"],
            metadata={  # type: ignore
                "url": record["url"],
                "title": record["name"],
                "tokens": record["tokens"],
                "source": record["source"],
            },
            # A fresh list per document, so no list object is shared between documents.
            excluded_llm_metadata_keys=list(llm_excluded),
            excluded_embed_metadata_keys=list(embed_excluded),
        )
        for record in data_list
    ]

# Build Document objects from the selected records and display the third one.
doc = create_docs_from_list(documents)
doc[2]

Document(id_='45501b72-9391-529e-8e5e-59a2604ba26e', embedding=None, metadata={'url': 'https://towardsai.net/p/machine-learning/adaboost-explained-from-its-original-paper', 'title': 'AdaBoost Explained From Its Original Paper', 'tokens': 1697, 'source': 'tai_blog'}, excluded_embed_metadata_keys=['url', 'tokens', 'source'], excluded_llm_metadata_keys=['title', 'tokens', 'source'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="This publication is meant to show a very popular ML algorithm in complete detail  how it works  the math behind it  how to execute it in Python and an explanation of the proofs of the original paper. There will be math and code  but it is written in a way that allows you to decide which are the fun parts.   A bit on the origins of the algorithm: It was proposed by Yoav Freund and Robert E. Schapire in a 1997 paper  A Decision-Theoretic Generalization of On-Line Learning a

In [13]:
from llama_index.core.node_parser import TokenTextSplitter

# Define the splitter object that splits the text into segments of 512 tokens,
# with a 128-token overlap between the segments.
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

In [14]:
from llama_index.core.extractors import (
    KeywordExtractor,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

# Create the pipeline that applies the transformations to each chunk and
# stores the transformed nodes in the Qdrant vector store.
pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        # Uses the globally configured LLM; the resulting nodes carry an
        # "excerpt_keywords" metadata entry that the metadata filters rely on.
        KeywordExtractor(keywords=10, llm=Settings.llm),
        # Reuse the globally configured embedding model (text-embedding-3-small)
        # so ingestion-time and query-time embeddings cannot drift apart.
        Settings.embed_model,
    ],
    vector_store=vector_store,
)

# Run the transformation pipeline.
nodes = pipeline.run(documents=doc, show_progress=True)

Parsing nodes:   0%|          | 0/362 [00:00<?, ?it/s]

100%|██████████| 1181/1181 [05:49<00:00,  3.38it/s]


Generating embeddings:   0%|          | 0/1181 [00:00<?, ?it/s]

  self._client.create_payload_index(


In [15]:
!zip ai_tutor_knowledge_metadata.zip /content/collection/ai_tutor_knowledge

  adding: content/collection/ai_tutor_knowledge/ (stored 0%)


In [16]:
# Number of nodes returned by the ingestion pipeline run.
len(nodes)

1181

In [17]:
# Inspect the metadata of the first node, including the extracted "excerpt_keywords".
nodes[0].metadata

{'url': 'https://towardsai.net/p/machine-learning/bert-huggingface-model-deployment-using-kubernetes-github-repo-03-07-2024',
 'title': 'BERT HuggingFace Model Deployment using Kubernetes [ Github Repo]  03/07/2024',
 'tokens': 768,
 'source': 'tai_blog',
 'excerpt_keywords': 'BERT, HuggingFace, Kubernetes, Model Deployment, FastAPI, Docker, Inference, Containerization, Scalability, ML Pipeline'}

In [18]:
from llama_index.core import VectorStoreIndex

# Create the index based on the vector store that the ingestion pipeline
# was configured to write to.
index = VectorStoreIndex.from_vector_store(vector_store)

In [19]:
# Baseline query without metadata filters; similarity_top_k=10 is passed to the
# query engine to control how many nodes are retrieved per query.
query_engine = index.as_query_engine(similarity_top_k=10)
res = query_engine.query("Explain how Advance RAG works?")

# Display the generated answer text.
res.response

'Advanced Retrieval-Augmented Generation (RAG) systems enhance the basic RAG approach by improving the retrieval and generation processes, leading to higher quality outputs. Systems like Self-Reflective RAG (Self-RAG) and Corrective RAG (CRAG) are examples of these advancements. \n\nSelf-RAG introduces self-reflection tags that help the language model assess the relevance of retrieved documents dynamically, allowing it to critique the context before generating responses. On the other hand, CRAG employs an external evaluator to refine the quality of retrieved documents, ensuring that only the most relevant and high-quality context is used for generation.\n\nMoreover, Speculative RAG further optimizes the RAG process by decomposing tasks into drafting and verification phases. A specialized RAG drafter is used to generate multiple drafts based on subsets of retrieved documents, which are then verified by a larger generalist language model. This method minimizes redundancy and accelerates 

# Metadata Filtering


In [20]:
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, FilterOperator

# Metadata filter: text-match the value "PEFT" against each node's
# "excerpt_keywords" metadata (the keywords added during ingestion).
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="excerpt_keywords",
            operator=FilterOperator.TEXT_MATCH,
            value="PEFT",
        ),
    ]
)

# Query Dataset


In [21]:
# Define a query engine that is responsible for retrieving related pieces of text
# (restricted by the metadata filters defined above) and using an LLM to
# formulate the final answer.
query_engine = index.as_query_engine(filters=filters)

res = query_engine.query("How Parameter efficient fine tuning (PEFT) Works?")

In [22]:
# Display the generated answer text for the filtered PEFT query.
res.response

"Parameter efficient fine tuning (PEFT) involves training large language models in a way that minimizes computational costs while still allowing for effective adaptation to specific tasks. It does this by focusing on a subset of parameters rather than fine-tuning the entire model. The main approaches within PEFT include:\n\n1. **Selective Fine-tuning**: This method involves selecting a subset of parameters from the initial model to fine-tune, thereby reducing the number of adjustments needed.\n\n2. **Reparameterization**: This approach utilizes a low-rank representation to reparameterize model weights, which can significantly decrease the number of parameters that require fine-tuning.\n\n3. **Additive Methods**: This includes the use of adapter modules, which are small neural network components inserted into certain layers of the pre-trained model to help it learn specific tasks without altering the core model significantly. Additionally, prompt tuning is another technique where specif

In [23]:
# Show the retrieved nodes
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    # Label the keywords as such; they were previously printed under a second "Score" label.
    print("Keywords\t", src.metadata["excerpt_keywords"])
    print("-_" * 20)

Node ID	 14a51133-e63f-482a-b2f6-759b550ec3f0
Title	 Fine-Tuning and Evaluating Large Language Models: Key Benchmarks and Metrics
Text	 SAMSUM is one of the datasets that FLAN T5 uses. There are several pre-trained FLAN T5 models that have been fine-tuned on SAMSUM  including Phil Schmid/flan-t5-base-samsum and jasonmcaffee/flan-t5-large-samsum on Hugging Face. If we want to fine-tune the FLAN T5 model specifically for formal dialogue conversations  we can do so using the DIALOGUESUM dataset.   Models fine-tuned on DialogSum can be applied to areas like customer support  meeting minutes generation  chatbot summarization  and more.   2. PEFT (Parameter efficient fine tuning)Training LLMs is computationally intensive. Full finetuning is computationally expensive as it might change each weight in the model. First  we start with a pretrained LLM like GPT-3. This model already has a vast amount of knowledge and understanding of language. Then we provide task-specific datasets  which could b

# Filter Metadata (source)


In [24]:
from llama_index.core.vector_stores import MetadataFilter,MetadataFilters,FilterOperator,FilterCondition

# Combine two metadata filters with AND: the node's "excerpt_keywords" must
# text-match "BERT" and its "source" must equal "tai_blog".
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="excerpt_keywords",
            operator=FilterOperator.TEXT_MATCH,
            value="BERT",
        ),
        MetadataFilter(
            key="source", operator=FilterOperator.EQ, value="tai_blog"
        ),
    ],
    condition=FilterCondition.AND,
)

In [25]:
# Define a query engine that is responsible for retrieving related pieces of text
# (restricted by the combined keyword + source filters) and using an LLM to
# formulate the final answer.
query_engine = index.as_query_engine(filters=filters)

result = query_engine.query("Explain BERT?")

In [26]:
# Display the generated answer text for the filtered BERT query.
result.response

"BERT, which stands for Bidirectional Encoder Representations from Transformers, is a language model developed by Google that employs a transformer architecture to better understand and generate human-like language. Its design allows it to process text in both directions simultaneously, which enhances its ability to capture context compared to traditional unidirectional models that read text sequentially from either left to right or right to left.\n\nFor instance, when analyzing a sentence with a missing word, a unidirectional model would rely mainly on the preceding context, potentially leading to ambiguous interpretations. In contrast, BERT's bidirectional capability enables it to consider the entire sentence, including words before and after the blank, thus allowing for more accurate predictions. \n\nBERT has two primary pre-training tasks: Masked Language Modeling (MLM), where around 15% of input tokens are masked and the model learns to predict them based on surrounding context, a

In [27]:
# Show the retrieved nodes
for src in result.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    # Label the keywords as such; they were previously printed under a second "Score" label.
    print("Keywords\t", src.metadata["excerpt_keywords"])
    print("-_" * 20)

Node ID	 2d38e051-fee5-4444-b75a-8568820b2190
Title	 Attention is all you need: How Transformer Architecture in NLP started.
Text	 GPT-2  GPT-3  and GPT-4  which were decoder-only architectures. Another well-known example is BERT (Bidirectional Encoder Representations from Transformers)  an encoder-only transformer mode used as a component in sentence embedding models.   Lets talk about BERT!BERT stands for Bidirectional Encoder Representations from Transformers. It is a language model by Google that uses a transformer architecture to understand and generate human-like language. BERT is designed to simultaneously process text in both directions  allowing it to capture context more effectively than traditional unidirectional models  which read text sequentially from left to right or right to left.   Example of Bidirectional CapabilityConsider the sentence:    The bank is situated on the _______ of the river.   In a unidirectional model  understanding the blank would primarily rely on th

In [28]:
# When there is a mismatch between the filter keyword (value) and the query:
# the filters still require "BERT" in excerpt_keywords, while the question asks
# about PEFT, so the retrieved context is not relevant to the question.

filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="excerpt_keywords",
            operator=FilterOperator.TEXT_MATCH,
            value="BERT",
        ),
        MetadataFilter(
            key="source", operator=FilterOperator.EQ, value="tai_blog"
        ),
    ],
    condition=FilterCondition.AND,
)

query_engine = index.as_query_engine(filters=filters)

result = query_engine.query("Explain PEFT?")

print(result.response)


# Show the retrieved nodes
for src in result.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    # Label the keywords as such; they were previously printed under a second "Score" label.
    print("Keywords\t", src.metadata["excerpt_keywords"])
    print("-_" * 20)


The provided excerpts do not contain information relevant to PEFT (Parameter-Efficient Fine-Tuning). As such, a specific explanation of PEFT cannot be provided based on the context information given.
Node ID	 2d38e051-fee5-4444-b75a-8568820b2190
Title	 Attention is all you need: How Transformer Architecture in NLP started.
Text	 GPT-2  GPT-3  and GPT-4  which were decoder-only architectures. Another well-known example is BERT (Bidirectional Encoder Representations from Transformers)  an encoder-only transformer mode used as a component in sentence embedding models.   Lets talk about BERT!BERT stands for Bidirectional Encoder Representations from Transformers. It is a language model by Google that uses a transformer architecture to understand and generate human-like language. BERT is designed to simultaneously process text in both directions  allowing it to capture context more effectively than traditional unidirectional models  which read text sequentially from left to right or right t