# Create AI-Tutor vector database


In [None]:
from dotenv import load_dotenv

# Load environment variables from the .env file one directory up.
# NOTE(review): presumably this supplies the API keys for the OpenAI calls below — confirm.
load_dotenv("../.env")

In [None]:
import nest_asyncio

# Patch asyncio so event loops can be nested inside the notebook's running loop.
# NOTE(review): presumably required by the use_async=True calls further down — confirm.
nest_asyncio.apply()

### Clean data


In [None]:
import json
import tiktoken
from collections import OrderedDict


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        Number of token ids produced for ``string``.

    Every special token of the encoding except ``<|endoftext|>`` is passed
    as ``disallowed_special``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    forbidden_specials = tokenizer.special_tokens_set - {"<|endoftext|>"}
    token_ids = tokenizer.encode(string, disallowed_special=forbidden_specials)
    return len(token_ids)


def clean_jsonl_file(input_filepath, output_filepath):
    """Filter a JSONL file by token count and write the surviving records.

    Each record's ``content`` is tokenized with the ``cl100k_base`` encoding.
    A record is kept when it has more than 7 tokens, except when it has
    exactly 92 tokens and its ``name`` is ``"Transformers"``. Kept records
    are written one JSON object per line, with the computed ``tokens`` count
    as the first key. The original and cleaned line counts are printed.

    Args:
        input_filepath: Path of the JSONL file to read.
        output_filepath: Path of the JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    cleaned_data = []
    original_line_count = 0

    with open(input_filepath, "r", encoding="utf-8") as file:
        for line in file:
            # Count lines during the single read pass; reopening the file
            # afterwards just to count would leave an unclosed handle behind.
            original_line_count += 1

            # Blank lines are not JSON; skip them instead of crashing.
            if not line.strip():
                continue

            json_obj = json.loads(line)
            # A missing or null "content" is treated as empty text, which
            # then fails the token threshold below and is dropped.
            content = json_obj.get("content") or ""
            token_count = num_tokens_from_string(content, "cl100k_base")

            # Check conditions for keeping the line
            if token_count > 7 and not (
                token_count == 92 and json_obj.get("name") == "Transformers"
            ):
                # 'tokens' goes first; a pre-existing 'tokens' field in the
                # input must not overwrite the freshly computed count.
                new_obj = OrderedDict([("tokens", token_count)])
                new_obj.update(
                    (key, value) for key, value in json_obj.items() if key != "tokens"
                )
                cleaned_data.append(new_obj)

    with open(output_filepath, "w", encoding="utf-8") as file:
        for item in cleaned_data:
            json.dump(item, file)
            file.write("\n")

    print(f"Original number of lines: {original_line_count}")
    print(f"Cleaned number of lines: {len(cleaned_data)}")


# Usage: run the cleaning step, reading the raw JSONL and writing the
# filtered copy alongside it with a "_cleaned" suffix.
input_filepath = "../hf_transformers_v4_42_0.jsonl"
output_filepath = "../hf_transformers_v4_42_0_cleaned.jsonl"
clean_jsonl_file(input_filepath, output_filepath)

### Merge lines by 'url' and create a new file with the merged data

The merged entry keeps the 'name' (and other fields) of the first entry for each URL.


In [None]:
import json
from collections import defaultdict
import tiktoken


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        Number of token ids produced for ``string``.

    Every special token of the encoding except ``<|endoftext|>`` is passed
    as ``disallowed_special``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    forbidden_specials = tokenizer.special_tokens_set - {"<|endoftext|>"}
    token_ids = tokenizer.encode(string, disallowed_special=forbidden_specials)
    return len(token_ids)


def should_not_merge(url):
    """Return True when ``url`` contains one of the exclusion substrings.

    The substrings checked are ``"model_doc"``, ``"internal"`` and
    ``"main_classes"``; entries whose URL matches are never merged.
    """
    for fragment in ("model_doc", "internal", "main_classes"):
        if fragment in url:
            return True
    return False


def merge_jsonl(input_file, output_file):
    """Group JSONL records by ``url`` and merge the content of each group.

    Records are grouped in first-seen URL order. A group is written back
    unchanged (one line per record, each with ``retrieve_doc`` set to
    ``False``) when it holds a single record or when ``should_not_merge``
    accepts its URL. Otherwise the group collapses into one record: a copy
    of the first entry whose ``content`` is every entry's content joined by
    a blank line, whose ``tokens`` is recomputed with ``cl100k_base``, and
    whose ``retrieve_doc`` is ``True``.

    Args:
        input_file: Path of the JSONL file to read; every record needs a
            ``url`` key, and merged records need ``content``.
        output_file: Path of the JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``url`` key.
    """
    # Dictionary to store merged data, keyed by URL
    merged_data = defaultdict(list)

    # Read and process the input file
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not JSON; skip them instead of crashing.
            if not line.strip():
                continue
            data = json.loads(line)
            merged_data[data["url"]].append(data)

    # Write the merged data to the output file
    with open(output_file, "w", encoding="utf-8") as f:
        for url, entries in merged_data.items():
            if len(entries) == 1 or should_not_merge(url):
                # Single entry, or a URL that must stay split: write as is.
                for entry in entries:
                    entry["retrieve_doc"] = False
                    json.dump(entry, f)
                    f.write("\n")
            else:
                # Merge the entries; the first one supplies the other fields.
                merged_entry = entries[0].copy()
                merged_entry["content"] = "\n\n".join(
                    entry["content"] for entry in entries
                )
                # The per-chunk counts no longer apply to the joined text.
                merged_entry["tokens"] = num_tokens_from_string(
                    merged_entry["content"], "cl100k_base"
                )
                merged_entry["retrieve_doc"] = True
                json.dump(merged_entry, f)
                f.write("\n")


# Usage: run the merge step on the cleaned file produced above and write
# the result alongside it with a "_merged" suffix.
input_file = "../hf_transformers_v4_42_0_cleaned.jsonl"
output_file = "../hf_transformers_v4_42_0_merged.jsonl"
merge_jsonl(input_file, output_file)

### Count tokens of lines in merged file


In [None]:
# import json
# import tiktoken


# def num_tokens_from_string(string: str, encoding_name: str) -> int:
#     """Returns the number of tokens in a text string."""
#     encoding = tiktoken.get_encoding(encoding_name)
#     num_tokens = len(
#         encoding.encode(
#             string, disallowed_special=(encoding.special_tokens_set - {"<|endoftext|>"})
#         )
#     )
#     return num_tokens


# def count_tokens(input_file):

#     # Read and process the input file
#     with open(input_file, "r") as f:
#         for i, line in enumerate(f):
#             data = json.loads(line)
#             content = data["content"]
#             nb_tokens = num_tokens_from_string(content, "cl100k_base")
#             # print(i + 1, data["url"], nb_tokens)
#             if nb_tokens > 2000:
#                 print(i + 1, data["url"], data["name"], nb_tokens)
#             # if nb_tokens < 8:
#             # print(nb_tokens)
#             # print(data["url"])
#             # print(data["content"])


# # Usage
# input_file = "../hf_transformers_v4_42_0_merged.jsonl"
# # input_file = "../hf_transformers_v4_42_0.jsonl"
# count_tokens(input_file)

### Create a set of llama-index Documents


In [None]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
import json


def create_docs(input_file):
    """Build one llama-index ``Document`` per line of a JSONL file.

    Each line must be a JSON object with ``content``, ``url``, ``name``,
    ``tokens`` and ``retrieve_doc`` keys. ``content`` becomes the document
    text; the other four become metadata (``name`` is stored as ``title``).
    All four metadata keys are excluded from both the LLM and the embedding
    views of the document.

    Args:
        input_file: Path of the JSONL file to read.

    Returns:
        List of ``Document`` objects in file order.
    """

    def to_document(raw_line):
        record = json.loads(raw_line)
        body = record["content"]
        meta = {
            "url": record["url"],
            "title": record["name"],
            "tokens": record["tokens"],
            "retrieve_doc": record["retrieve_doc"],
        }
        # Every metadata key is hidden, so the exclusion lists are simply
        # the metadata keys in insertion order.
        return Document(
            text=body,
            metadata=meta,
            excluded_llm_metadata_keys=list(meta),
            excluded_embed_metadata_keys=list(meta),
        )

    with open(input_file, "r") as f:
        return [to_document(raw_line) for raw_line in f]


# Load the merged JSONL into llama-index Documents and show the first one
# as a sanity check.
documents = create_docs("../hf_transformers_v4_42_0_merged.jsonl")
print(documents[0])
print(documents[0].metadata)

# Lookup from doc_id to Document; the retrieval cell below uses it to swap
# a retrieved chunk for its full source document via ref_doc_id.
document_dict = {doc.doc_id: doc for doc in documents}

In [None]:
# import chromadb

# # create client and a new collection
# # chromadb.EphemeralClient saves data in-memory.
# chroma_client = chromadb.PersistentClient(path="./ai-tutor-dataset")
# chroma_collection = chroma_client.create_collection("ai-tutor-dataset")

# from llama_index.vector_stores.chroma import ChromaVectorStore
# from llama_index.core import StorageContext

# # Define a storage context object using the created vector database.
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

# Build index / generate embeddings using OpenAI embedding model.
# Documents are split by SentenceSplitter with chunk_size=800 and
# chunk_overlap=400 (overlap is half the chunk size) before embedding.
# storage_context is left commented out, so no Chroma store is attached here.
index = VectorStoreIndex.from_documents(
    documents,
    # embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
    transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],
    show_progress=True,
    use_async=True,
    # storage_context=storage_context,
)

In [None]:
# from llama_index.llms.openai import OpenAI

# llm = OpenAI(temperature=1, model="gpt-3.5-turbo", max_tokens=None)
# query_engine = index.as_query_engine(
#     llm=llm,
#     similarity_top_k=5,
#     embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
#     use_async=True,
# )
# Retriever over the index built above, returning the 10 most similar chunks.
# The embedding model is configured the same way as at index build time
# (text-embedding-3-large, mode="similarity").
retriever = index.as_retriever(
    similarity_top_k=10,
    use_async=True,
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
    # embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="text_search"),
)

In [None]:
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore

# res = query_engine.query("What is the LLaMa model?")
# res.response

# query = "fine-tune a pretrained model"
# query = "fine-tune an llm"
# Question used for both retrieval and answer synthesis.
query = "how to fine-tune an llm?"

# nodes_context collects the final context entries; it is filled by the
# loop below from the chunks returned by the retriever.
nodes_context = []
nodes = retriever.retrieve(query)


# # Filter nodes with the same ref_doc_id
# def filter_nodes_by_unique_doc_id(nodes):
#     unique_nodes = {}
#     for node in nodes:
#         doc_id = node.node.ref_doc_id
#         if doc_id is not None and doc_id not in unique_nodes:
#             unique_nodes[doc_id] = node
#     return list(unique_nodes.values())


# nodes = filter_nodes_by_unique_doc_id(nodes)

# Print every retrieved chunk and build nodes_context: a chunk whose
# metadata has retrieve_doc set is replaced by a node carrying the full text
# of its source document (same metadata and score); other chunks are kept.
for node in nodes:
    print("Node ID\t", node.node_id)
    print("Title\t", node.metadata["title"])
    print("Text\t", node.text)
    print("Score\t", node.score)
    print("Metadata\t", node.metadata)
    print("-_" * 20)
    if node.metadata["retrieve_doc"]:
        print("This node will be replaced by the document")
        doc = document_dict[node.node.ref_doc_id]
        # print(doc.text)
        # No trailing comma after the call: nodes_context must hold
        # NodeWithScore objects, not 1-tuples wrapping them.
        new_node = NodeWithScore(
            node=Node(text=doc.text, metadata=node.metadata), score=node.score
        )
        nodes_context.append(new_node)
    else:
        nodes_context.append(node)

In [None]:
# from llama_index.core.schema import TextNode

# for src in res.source_nodes:
#     print(src.node.ref_doc_id)
#     # print(src.node.get_metadata_str())
#     print("Node ID\t", src.node_id)
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("Metadata\t", src.metadata)
#     print("-_" * 20)
#     break

In [None]:
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore
from llama_index.core import get_response_synthesizer
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI

from tutor_prompts import (
    TEXT_QA_TEMPLATE,
)


# llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=None)
# llm = Gemini(model="models/gemini-1.5-pro", temperature=1, max_tokens=None)
# llm = OpenAI(temperature=1, model="gpt-3.5-turbo", max_tokens=None)
# Answer with gpt-4o at temperature 1; the commented lines above are the
# other models that were tried.
llm = OpenAI(temperature=1, model="gpt-4o", max_tokens=None)

# Synthesizer using the "simple_summarize" response mode and the project's
# TEXT_QA_TEMPLATE prompt.
response_synthesizer = get_response_synthesizer(
    llm=llm, response_mode="simple_summarize", text_qa_template=TEXT_QA_TEMPLATE
)

# NOTE(review): `nodes` (the raw retrieved chunks) is passed here, while
# `nodes_context` (built in the retrieval cell with full-document
# replacements) is never used — confirm which one was intended.
response = response_synthesizer.synthesize(
    query,
    nodes=nodes,
    # nodes=[
    #     NodeWithScore(
    #         node=Node(text="LLama2 model has a total of 2B parameters."), score=1.0
    #     ),
    # ],
    # text_chunks=["text1", "text2", "text3"],
)
print(response.response)
# for src in response.source_nodes:
#     print(src.node.ref_doc_id)
#     print("Node ID\t", src.node_id)
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("Metadata\t", src.metadata)
#     print("-_" * 20)

In [None]:
# import chromadb

# # create client and a new collection
# # chromadb.EphemeralClient saves data in-memory.
# chroma_client = chromadb.PersistentClient(path="./ai-tutor-db")
# chroma_collection = chroma_client.create_collection("ai-tutor-db")

In [None]:
# from llama_index.vector_stores.chroma import ChromaVectorStore
# from llama_index.core import StorageContext

# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# # Define a storage context object using the created vector store.
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# import json
# from llama_index.core.schema import TextNode


# def load_jsonl_create_nodes(filepath):
#     nodes = []  # List to hold the created node objects
#     with open(filepath, "r") as file:
#         for line in file:
#             # Load each line as a JSON object
#             json_obj = json.loads(line)
#             # Extract required information
#             title = json_obj.get("title")
#             url = json_obj.get("url")
#             content = json_obj.get("content")
#             source = json_obj.get("source")
#             # Create a TextNode object and append to the list
#             node = TextNode(
#                 text=content,
#                 metadata={"title": title, "url": url, "source": source},
#                 excluded_embed_metadata_keys=["title", "url", "source"],
#                 excluded_llm_metadata_keys=["title", "url", "source"],
#             )
#             nodes.append(node)
#     return nodes

In [None]:
# filepath = "../combined_data.jsonl"
# nodes = load_jsonl_create_nodes(filepath)

# print(f"Loaded {len(nodes)} nodes/chunks from the JSONL file\n ")

# node = nodes[0]
# print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

# print("\n")

# node = nodes[-10000]
# print(f"ID: {node.id_} \nText: {node.text}, \nMetadata: {node.metadata}")

In [None]:
# # Create the pipeline to apply the transformation on each chunk,
# # and store the transformed text in the chroma vector store.
# pipeline = IngestionPipeline(
#     transformations=[
#         text_splitter,
#         QuestionsAnsweredExtractor(questions=3, llm=llm),
#         SummaryExtractor(summaries=["prev", "self"], llm=llm),
#         KeywordExtractor(keywords=10, llm=llm),
#         OpenAIEmbedding(),
#     ],
#     vector_store=vector_store
# )

# nodes = pipeline.run(documents=documents, show_progress=True);

In [None]:
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.core import VectorStoreIndex

# # embeds = OpenAIEmbedding(model="text-embedding-3-small", mode="similarity")
# # embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="similarity")
# embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
# # embeds = OpenAIEmbedding(model="text-embedding-ada-002", mode="similarity")

# # Build index / generate embeddings using OpenAI.
# index = VectorStoreIndex(
#     nodes=nodes,
#     show_progress=True,
#     use_async=True,
#     storage_context=storage_context,
#     embed_model=embeds,
#     insert_batch_size=3000,
# )

In [None]:
# from llama_index.llms.openai import OpenAI

# llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=None)
# query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, embed_model=embeds)

In [None]:
# res = query_engine.query("What is the LLaMa model?")

In [None]:
# res.response

In [None]:
# for src in res.source_nodes:
#     print("Node ID\t", src.node_id)
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("Metadata\t", src.metadata)
#     print("-_" * 20)

# Load DB from disk


In [None]:
# import logging

# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)


# import chromadb
# from llama_index.vector_stores.chroma import ChromaVectorStore

# # Create your index
# db2 = chromadb.PersistentClient(path="./ai-tutor-db")
# chroma_collection = db2.get_or_create_collection("ai-tutor-db")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [None]:
# # Create your index
# from llama_index.core import VectorStoreIndex

# index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.llms.openai import OpenAI
# from llama_index.core.vector_stores import (
#     ExactMatchFilter,
#     MetadataFilters,
#     MetadataFilter,
#     FilterOperator,
#     FilterCondition,
# )

# filters = MetadataFilters(
#     filters=[
#         MetadataFilter(key="source", value="lanchain_course"),
#         MetadataFilter(key="source", value="langchain_docs"),
#     ],
#     condition=FilterCondition.OR,
# )

# llm = OpenAI(temperature=0, model="gpt-3.5-turbo", max_tokens=None)
# embeds = OpenAIEmbedding(model="text-embedding-3-large", mode="text_search")
# # query_engine = index.as_query_engine(
# #     llm=llm, similarity_top_k=5, embed_model=embeds, verbose=True, streaming=True, filters=filters
# # )
# query_engine = index.as_query_engine(
#     llm=llm,
#     similarity_top_k=5,
#     embed_model=embeds,
#     verbose=True,
# )

In [None]:
# res = query_engine.query("What is the LLama model?")

# # history = ""
# # for token in res.response_gen:
# #     history += token
# #     print(history)

In [None]:
# res.response

In [None]:
# for src in res.source_nodes:
#     print("Node ID\t", src.node_id)
#     print("Source\t", src.metadata["source"])
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("-_" * 20)

In [None]:
# from IPython.display import Markdown, display


# # define prompt viewing function
# def display_prompt_dict(prompts_dict):
#     for k, p in prompts_dict.items():
#         text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
#         display(Markdown(text_md))
#         print(p.get_template())
#         display(Markdown("<br><br>"))

In [None]:
# prompts_dict = query_engine.get_prompts()

In [None]:
# display_prompt_dict(prompts_dict)