## Process :: Loading docs & credentials -> Indexing -> retrieval -> updating

### Installing and importing

In [1]:
# installation

In [110]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain openai jq trulens-eval pydantic



In [2]:
# importing
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [3]:
from typing import *

In [36]:
# credential loading and set to environment
import json
def get_credentials(file_path:str="credentials.json"):
    """Load the credentials JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file holding the credentials.

    Returns:
        The decoded top-level JSON value; callers index it by key name.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly (as load_document_as_array does) instead of
    # relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

import os
# Read the OpenAI key stored under 'openai_api_Key_aweqa' in the credentials file
# (default path: credentials.json).
openai_api_key = get_credentials()['openai_api_Key_aweqa']
# Export the key through the environment.
# NOTE(review): presumably the OpenAI clients created later read it from there — confirm.
os.environ['OPENAI_API_KEY'] = openai_api_key

In [7]:
# os.environ['OPENAI_API_KEY']

### load document -> splitting -> indexing

In [8]:
from pathlib import Path
def load_document_as_array(file_path:str = "./data/scraped_data.jsonl"):
    """Read a JSON Lines file into a list, one decoded JSON value per line.

    Args:
        file_path: Path of the ``.jsonl`` file to read (UTF-8 encoded).

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. an empty line at the end of the file) carry no
        # record; skip them instead of letting json.loads fail on empty input.
        return [json.loads(line) for line in file if line.strip()]

len(load_document_as_array())

2580

In [10]:
# Peek at the first record. Only the last expression of a cell is displayed, so the
# former `type(...)` line was dropped: its value was discarded and it parsed the
# whole file a second time.
load_document_as_array()[0]

{'content': "Skip to content\n          (#start-building) \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  (https://github.com/duneanalytics/docs/edit/master/docs/index.md) \n Welcome \n Dune is a web-based platform that allows you to query public blockchain data and aggregate it into beautiful dashboards. \n \n Quickstart \n To get started with Dune in 5 minutes, see the  Quickstart  (quickstart/) . \n \n \n \n The world's blockchain data at your fingertips! \n \n Blockchains are open and transparent, but each chain is unique—making it difficult to understand, ingest, and aggregate data. Dune gives you the proper tools to analyze cross-chain data for different tokens, wallets, and protocols. You can also easily share your work with the community. \n ",
 'url': 'https://dune.com/docs/'}

In [11]:
# Directory name used for the persisted vector store.
# NOTE(review): the original comment spoke of checking whether it already exists,
# but no such check is performed in this cell.
persist_directory = 'vector_db'
# OpenAI embedding client.
# NOTE(review): the indexing cell builds its own OpenAIEmbeddings() instead of reusing this one.
embedding = OpenAIEmbeddings()

### Load the documents and split them into chunks

In [12]:
# meta data function :: add source url as metadata instead of disk path
def metadata_func(record: dict, metadata: dict) -> dict:
    """Point the ``source`` metadata entry at the record's URL.

    Args:
        record: One decoded JSON record; its ``"url"`` value (or ``None`` when
            the key is absent) becomes the new source.
        metadata: Metadata dict to adjust. It is modified in place, and only
            when it already contains a ``"source"`` key.

    Returns:
        The same ``metadata`` object that was passed in.
    """
    if "source" not in metadata:
        return metadata
    metadata["source"] = record.get("url")
    return metadata

In [15]:
# import JSONLoader for JSON Line loading
from langchain_community.document_loaders import JSONLoader

# Read the scraped JSON Lines file, one document per line (json_lines=True);
# metadata_func replaces each document's "source" metadata with the record's URL.
# NOTE(review): with jq_schema='.' and text_content=False the whole record (not only
# its "content" field) presumably becomes the document text — confirm against the
# JSONLoader documentation.
loader = JSONLoader(
    file_path='./data/scraped_data.jsonl',
    jq_schema='.',
    text_content=False,
    json_lines=True,
    metadata_func=metadata_func)

# Result of loading the whole file; passed to the text splitter in the next cell.
document = loader.load()

In [16]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter built via from_tiktoken_encoder with chunk_size=300 and chunk_overlap=50.
# NOTE(review): presumably both values are measured in tiktoken tokens — confirm.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)

# Split the loaded documents into chunks; these are what gets embedded and indexed.
splits = text_splitter.split_documents(document)

In [18]:
#splits

In [21]:
# indexing and persist
persist_directory = 'vector_db'
embedding_function = OpenAIEmbeddings()
index_dir = Path(persist_directory)  # Path is imported in the document-loading cell

if index_dir.is_dir() and any(index_dir.iterdir()):
    # An index persisted by an earlier run exists: reuse it. Calling from_documents
    # again would re-embed every chunk (paid API calls) and add the same chunks to
    # the persisted collection a second time, which duplicates retrieval results.
    # Delete the directory to force a rebuild after the source data changes.
    vectorstore = Chroma(persist_directory=persist_directory,
                         embedding_function=embedding_function)
else:
    # First run: embed all chunks and persist the index to disk.
    vectorstore = Chroma.from_documents(documents=splits,
                                        embedding=embedding_function,
                                        persist_directory=persist_directory)

# retriever of vector db
retriever = vectorstore.as_retriever()

### Retrieval

#### User input: edit the query in the `user_query` variable below

In [22]:
# User input: edit this string to describe the change that the documents must reflect.
# It is used for retrieval and is rewritten by the query-revision chain.
user_query = """We removed the ability to archive queries, and instead added the ability to completely delete them. 
Update all relevant knowledge"""

In [23]:
# Fetch the chunks most relevant to the user query from the vector store.
# `invoke` is the Runnable entry point used by the other chains in this notebook and
# replaces the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke(user_query)

In [25]:
#retrieved_docs

In [26]:
# Several chunks can come from the same page: collapse them to the distinct source URLs.
unique_urls = {doc.metadata['source'] for doc in retrieved_docs}

In [27]:
unique_urls

{'https://dune.com/docs/api/api-reference/',
 'https://dune.com/docs/api/api-reference/edit-queries/',
 'https://dune.com/docs/api/api-reference/edit-queries/archive-query/'}

In [28]:
# Reload the full corpus and keep, in file order, the records whose URL was retrieved.
document_list = load_document_as_array()
to_be_updated_docs = [doc for doc in document_list if doc["url"] in unique_urls]

# to_be_updated_docs

In [30]:
# to_be_updated_docs

In [49]:
# Try to revise and restructure user query for communicating with language model
from langchain.prompts import ChatPromptTemplate, PromptTemplate

# Prompt asking the model to rephrase the raw user instruction into a clearer one.
# The template text is sent to the model as-is; {question} is its only placeholder.
user_query_revise_prompt_template = """User writes a query instruction to update his original document. Our system updates that original documents 
based on that query. You are a smart query modifiying assistant. Your task is to rephrase and restructure the given query so that it become a better 
prompt for communicating with a language model. If you can't, just say that you can't do. Keep the answer concise.
Query: {question}
"""
# Chat prompt built from the template above.
user_query_revise_prompt = ChatPromptTemplate.from_template(user_query_revise_prompt_template)

In [32]:
# GPT-4 chat model, created with temperature=0.
# NOTE(review): llm_gpt4 is not referenced by any other cell shown in this notebook.
llm_gpt4 = ChatOpenAI(model_name="gpt-4-1106-preview", temperature=0)

In [37]:
# GPT-3.5 chat model (temperature=0); the query-revision chain and the
# document-update chains are all built on it.
llm_gpt3 = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [38]:
# Query-revision chain: the revision prompt piped into the GPT-3.5 model.
chain = user_query_revise_prompt | llm_gpt3

In [39]:
# The revision prompt has a single {question} placeholder, so that is the only
# input passed (the former empty "context" entry was never used by the template).
response = chain.invoke({"question": user_query})

In [40]:
# Content of the model's reply: the rewritten query. get_llm_response and the
# document-update loop read this variable as the update instruction.
user_updated_query = response.content

In [41]:
user_updated_query

'Please update all relevant information to reflect the change from archiving queries to deleting them entirely.'

### Update documents

In [42]:
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Estimate the token footprint of ``string``.

    Args:
        string: Text to measure.
        encoding_name: Name of the tiktoken encoding used to tokenize the text.

    Returns:
        The number of tokens produced by the encoding plus a fixed allowance of 5,
        so an empty string yields 5 rather than 0.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    # 5 is the max additional tokens
    return len(token_ids) + 5

In [43]:
def convert_to_str(docs: list):
    """Concatenate the documents into one string, each wrapped in square brackets.

    Args:
        docs: Items to serialize; each is rendered with its ``str()`` form.

    Returns:
        ``"[doc1][doc2]..."`` with no separator, or ``""`` for an empty list.
    """
    return "".join(f"[{doc}]" for doc in docs)

In [123]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Parse model describing one document as the LLM is asked to return it; it is
# handed to JsonOutputParser to generate the format instructions.
# NOTE(review): documented with comments rather than a docstring because a class
# docstring would presumably end up in the schema sent to the model — confirm.
class Document(BaseModel):
    url: str = Field(description="url")  # URL identifying the document
    content: str = Field(description="content")  # text of the document

# Parse model for a batch answer: a list of Document entries under "docs".
# The original `docs: list = List[Document]` had annotation and default swapped:
# it declared an untyped list whose default value was the typing object itself.
# (Comments are used instead of a docstring to keep the generated schema lean.)
class DocumentList(BaseModel):
    docs: List[Document]
    

In [44]:
# Batch variant of the update step: the caller groups documents under a token
# budget and calls this once per group instead of once per document.
def get_llm_response(attached_doc_dict: dict, query: Optional[str] = None):
    """Ask the LLM to update a batch of documents according to the query.

    Args:
        attached_doc_dict: Mapping whose values are the document records of the
            batch (the callers key it by URL); only the values are sent.
        query: Update instruction. When omitted, the notebook-level
            ``user_updated_query`` is used, which keeps existing calls working.

    Returns:
        The model's answer as parsed by ``JsonOutputParser`` configured with the
        ``DocumentList`` schema.
    """
    if query is None:
        # Fall back to the global set by the query-revision cell.
        query = user_updated_query

    # prompt (sent verbatim to the model, indentation included)
    doc_update_prompt_template = """Update the given document list based on given query. Update one by one.
    {format_instructions}
    Query: {query}
    Document list:
    {context}
    """
    parser = JsonOutputParser(pydantic_object=DocumentList)
    doc_update_prompt = PromptTemplate(template=doc_update_prompt_template,
                                               input_variables=["query", "context"],
                                               partial_variables={"format_instructions": parser.get_format_instructions()})

    chain = doc_update_prompt | llm_gpt3 | parser

    # Serialize the batch as "[doc1][doc2]..." for the {context} placeholder.
    context_str = convert_to_str(list(attached_doc_dict.values()))

    return chain.invoke({"context": context_str, "query": query})

In [45]:
# Parser for a single updated document.
# `Document` is already defined next to `DocumentList` in the cell above
# get_llm_response; the identical second definition that used to live here only
# shadowed that class, so it was removed.
parser = JsonOutputParser(pydantic_object=Document)

In [122]:
# Update every retrieved document with the LLM and collect the answers in
# `results` (url -> updated document), which the merge step reads afterwards.
num_retrieved_docs = len(to_be_updated_docs)

# final resultset
results = {}

if num_retrieved_docs > 25:
    # Many documents: send them in batches. Documents are added to the current
    # batch until its serialized size reaches the 1000-token threshold.
    index = 0
    updated_dict = []       # parsed LLM answer of every batch
    attached_doc_dict = {}  # url -> document of the batch being assembled
    while index < num_retrieved_docs:
        list_str = "".join(str(dict_) for dict_ in attached_doc_dict.values())

        if num_tokens_from_string(list_str) < 1000:
            # threshold not reached yet: add the next document to the batch
            next_doc = to_be_updated_docs[index]
            attached_doc_dict[next_doc["url"]] = next_doc
            index += 1
        else:
            # batch is full: update it and start a new one. `index` is not
            # advanced, so the pending document opens the next batch.
            res_content = get_llm_response(attached_doc_dict)
            updated_dict.append(res_content)
            attached_doc_dict = {}

    # now the rest one
    if attached_doc_dict:
        res_content = get_llm_response(attached_doc_dict)
        updated_dict.append(res_content)

    # Fold the batch answers into `results`. The answer is expected to follow the
    # DocumentList schema ({"docs": [...]}); a bare list is accepted too, and
    # entries without both "url" and "content" are ignored.
    for batch_response in updated_dict:
        if isinstance(batch_response, dict):
            batch_docs = batch_response.get("docs") or []
        elif isinstance(batch_response, list):
            batch_docs = batch_response
        else:
            batch_docs = []
        for updated_doc in batch_docs:
            if isinstance(updated_doc, dict) and "url" in updated_doc and "content" in updated_doc:
                results[updated_doc["url"]] = updated_doc

else: # total number of retrieved documents is at most 25
    # to openAI one by one
    # The prompt, parser and chain do not depend on the document, so they are
    # built once instead of on every iteration.
    doc_update_prompt_template = """Update the given document based on the given query.
        {format_instructions}
        Query:{query}
        Document:{doc}
        """

    parser = JsonOutputParser(pydantic_object=Document)

    doc_update_prompt = PromptTemplate(template=doc_update_prompt_template,
                                       input_variables=["query", "doc"],
                                       partial_variables={"format_instructions": parser.get_format_instructions()})

    chain = doc_update_prompt | llm_gpt3 | parser

    # go through all of them separately
    for doc in to_be_updated_docs:
        response = chain.invoke({"doc":doc,"query":user_updated_query})
        # save updated document against url to keep track
        results[doc["url"]] = response

In [59]:
# results

In [163]:
# Batch the documents to update under a token threshold and send each batch to
# the LLM; the parsed answers are collected in `updated_dict`.
retrieve_doc_count = len(to_be_updated_docs)

attached_doc_dict = {}  # url -> document of the batch being assembled
updated_dict = []       # parsed LLM answer of every batch
index = 0
while index < retrieve_doc_count:
    list_str = "".join(str(dict_) for dict_ in attached_doc_dict.values())

    if num_tokens_from_string(list_str) < 1000:
        # threshold not reached yet: add the next document to the batch
        attached_doc_dict[to_be_updated_docs[index]["url"]] = to_be_updated_docs[index]
        index += 1

    else:
        # batch is full: update it, store the answer of THIS call (the original
        # appended the unrelated `response.content`), then start a new batch
        res_content = get_llm_response(attached_doc_dict)
        updated_dict.append(res_content)

        # clear all data
        attached_doc_dict = {}

# now the rest one; skipped when there is nothing left (e.g. no documents at all)
# so that no LLM call is made with an empty document list
if attached_doc_dict:
    res_content = get_llm_response(attached_doc_dict)
    updated_dict.append(res_content)

In [57]:
#updated_dict

In [58]:
#attached_doc_dict

### Update the main document and write two files: one contains the diffs and the other contains the updated JSON array

In [62]:
# Fresh copy of the corpus; updated records are swapped in below.
original_document = load_document_as_array()

# Flat list alternating original content and updated content for every changed record.
doc_diffs = []

# Walk the corpus and replace every record for which an update was produced.
for index, entry in enumerate(original_document):
    url = entry["url"]
    if url not in results:
        continue
    # original text first ...
    doc_diffs.append(entry["content"])
    # ... then its updated counterpart
    doc_diffs.append(results[url]["content"])
    # the updated record takes the place of the original one
    original_document[index] = results[url]

# doc_diffs

In [70]:
# write to a file :: updated json to new jsonl file
def write_to_jsonl(documents: list, file_name: str):
    """Write ``documents`` to ``file_name`` in JSON Lines format.

    Args:
        documents: JSON-serializable objects; each becomes one line of the file.
        file_name: Path of the output file; an existing file is overwritten.

    Prints a confirmation message once the file has been written.
    """
    with open(file_name, 'w') as file:
        file.writelines(json.dumps(json_obj) + '\n' for json_obj in documents)
    print(f"Write to {file_name} done!")

In [73]:
from datetime import datetime
# Name the output after today's date in ISO form (YYYY-MM-DD) and write the corpus.
today_ = datetime.today().date()
file_name = today_.isoformat() + "_updated_document.jsonl"
write_to_jsonl(original_document, file_name)

Write to 2024-04-05_updated_document.jsonl done!


In [84]:
# print all diffs

In [87]:
# doc_diffs

In [120]:
# NOTE(review): this cell held a bare `import`, which is a SyntaxError. The recorded
# output below is a trulens_eval version check, so the module name was presumably
# trulens_eval — confirm. It needs pydantic>=2,<3 to import successfully.
import trulens_eval

VersionConflict: Package pydantic is installed but has a version conflict:
    Requirement: pydantic<3,>=2
    Installed: 1.10.4
This package is required for trulens_eval. Please resolve the conflict by
installing a compatible version with:

    ```bash
    pip install 'pydantic<3,>=2'
    ```

If you are running trulens_eval in a notebook, you may need to restart the
kernel after resolving the conflict. If your distribution is in a bad place
beyond this package, you may need to reinstall trulens_eval so that all of the
dependencies get installed and hopefully corrected:
    
    ```bash
    pip uninstall -y trulens_eval
    pip install trulens_eval
    ```
