# DocsGPT

In [16]:
import os
import chromadb
import subprocess
import langchain.indexes
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms.openai import OpenAI
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chromadb.config import Settings

# Root directory for local data: the repositories are cloned beneath it and it
# is walked recursively to collect the files to index.
DATA_DIR = './data'

# Uncomment to set the OpenAI API key inline ("YOUR_KEY_HERE" is a placeholder).
# NOTE(review): presumably the OpenAI embedding/LLM clients used below read this
# environment variable — confirm, and avoid committing a real key.
# os.environ["OPENAI_API_KEY"] = "YOUR_KEY_HERE"

## Load the data

In [3]:
# Git repositories to mirror under DATA_DIR. Each entry is a clone URL; the
# local directory name is derived from the last path segment of the URL.
REPOS = [
    'https://github.com/kubecost/docs.git',
    'https://github.com/kubecost/cost-analyzer-helm-chart.git',
    'https://github.com/kubecost/poc-common-configurations.git',
    'https://github.com/kubecost/cluster-turndown.git',
    'https://github.com/kubecost/cost-prediction-action.git',
    'https://github.com/kubecost/kubectl-cost.git',
    'https://github.com/kubecost/kubecost-lens-extension.git',
    'https://github.com/opencost/opencost.git',
    'https://github.com/opencost/opencost-helm-chart.git',
    'https://github.com/opencost/opencost-website.git'
]

for repo in REPOS:
    # Derive the local directory name from the last URL segment, stripping only
    # a trailing ".git" so repository names that contain dots stay intact.
    repo_name = repo.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    target_dir = DATA_DIR + '/' + repo_name

    # git clone or git pull
    if os.path.exists(target_dir):
        print(f"Directory '{target_dir}' already exists. Performing git pull instead.")
        # `cwd=` makes git run inside the checkout; a `cd` issued in its own
        # subprocess would not carry over to this call, so none is needed.
        result = subprocess.run(['git', 'pull'], cwd=target_dir)
    else:
        print(f"Cloning to '{target_dir}'")
        result = subprocess.run(['git', 'clone', '-q', repo, target_dir])

    # Surface git failures instead of silently continuing with stale or
    # missing data; keep going so one bad repository does not block the rest.
    if result.returncode != 0:
        print(f"WARNING: git exited with status {result.returncode} for '{target_dir}'")

Directory './data/docs' already exists. Performing git pull instead.
Already up to date.
Directory './data/cost-analyzer-helm-chart' already exists. Performing git pull instead.
Already up to date.
Directory './data/poc-common-configurations' already exists. Performing git pull instead.
Already up to date.
Directory './data/cluster-turndown' already exists. Performing git pull instead.
Already up to date.
Directory './data/cost-prediction-action' already exists. Performing git pull instead.
Already up to date.
Directory './data/kubectl-cost' already exists. Performing git pull instead.
Already up to date.
Directory './data/kubecost-lens-extension' already exists. Performing git pull instead.
Already up to date.
Directory './data/opencost' already exists. Performing git pull instead.
Already up to date.
Directory './data/opencost-helm-chart' already exists. Performing git pull instead.
Already up to date.
Directory './data/opencost-website' already exists. Performing git pull instead.
A

In [4]:
# File extensions to index. Matching is on the end of the file name, so add
# further extensions here if other file kinds should be picked up.
FILE_TYPES = ['.md', '.txt', '.yaml', '.go', '.sh']
loaders = []

# Recursively load all FILE_TYPES files from DATA_DIR
for root, dirs, files in os.walk(DATA_DIR):
    # Prune git metadata in place so os.walk never descends into it.
    dirs[:] = [d for d in dirs if d != '.git']
    for file in files:
        # Test the file name's suffix rather than searching the whole path for
        # a substring: this avoids false hits (e.g. '.go' inside
        # '.golangci.yml') and guarantees each file is queued at most once.
        if file.endswith(tuple(FILE_TYPES)):
            file_path = os.path.join(root, file)
            loaders.append(TextLoader(file_path, encoding='utf8'))
            print(file_path)

./data/state_of_the_union.txt
./data/cost-analyzer-helm-chart/kubecost.yaml
./data/cost-analyzer-helm-chart/kustomization.yaml
./data/cost-analyzer-helm-chart/helm.yaml
./data/cost-analyzer-helm-chart/README.md
./data/cost-analyzer-helm-chart/cost-analyzer/values-windows-node-affinity.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/values-eks-cost-monitoring.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/Chart.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/values-agent.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/README.md
./data/cost-analyzer-helm-chart/cost-analyzer/values-thanos.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/values-custom-pricing.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/values.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/values-amp.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/values-cloud-agent.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/charts/grafana/Chart.yaml
./data/cost-analyzer-helm-chart/cost-analyzer/charts/g

## Create a vectorstore index

In [15]:
# QUICKSTART METHOD

# index = VectorstoreIndexCreator().from_loaders(loaders)

In [25]:
# Read every queued loader and flatten the results into one list of documents
docs = [doc for loader in loaders for doc in loader.load()]

# Break the documents into smaller chunks (chunk_size=1000, no overlap)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
sub_docs = splitter.split_documents(docs)

In [26]:
# Build the Chroma vectorstore from the document chunks, embedding them with
# OpenAIEmbeddings, and persist it under ./db.
# NOTE(review): re-running this cell against an existing ./db presumably adds
# the same chunks again — confirm before re-indexing.
store_settings = Settings(
    chroma_db_impl='duckdb+parquet',
    persist_directory='./db',
    anonymized_telemetry=False,
)
vectorstore = Chroma.from_documents(
    documents=sub_docs,
    embedding=OpenAIEmbeddings(),
    persist_directory='./db',
    client_settings=store_settings,
)
vectorstore.persist()

# EXAMPLE Similarity Search
# query = "What did the president say about Ketanji Brown Jackson"
# docs = vectorstore.similarity_search(query)
# print(docs)

Using embedded DuckDB with persistence: data will be stored in: ./db


## [optional] Load vectorstore from disk

In [27]:
# Open the vectorstore persisted under ./db, using the same client settings
# and embedding function as when it was created.
load_settings = Settings(
    chroma_db_impl='duckdb+parquet',
    persist_directory='./db',
    anonymized_telemetry=False,
)
vectordb = Chroma(
    persist_directory='./db',
    embedding_function=OpenAIEmbeddings(),
    client_settings=load_settings,
)

Using embedded DuckDB with persistence: data will be stored in: ./db


## Run a query

In [38]:
# Candidate questions: keep exactly one `query = ...` assignment uncommented.
# query = "What should my cloud-integration.json secret look like?"
query = "What's the prometheus query used to determine a PV's hourly cost?"
# query = "I've just installed my cloud billing integration, but I don't see any reconciled prices. Why?"
# query = "What's the most recent Chart.yaml version for kubecost-cost-analyzer?"
# query = "Which file in the codebase is responsible for metric emission? And can you explain the contents of the file?"
# query = """I am deploying Kubecost with AMP, I have a service account annotated with an IAM Role that allow access to the S3 bucket that has spot data feed. When I check the spot data feed diagnostic I get the following error:
#  operation error S3: ListObjects, https response error StatusCode: 403, RequestID: , HostID: , api error InvalidAccessKeyId: The AWS Access Key Id you provided does not exist in our records.
# The s3 client should have picked up the IRSA credentials and used them to read from the S3 bucket. Unless the issue seem to be from the sigv4 proxy. However, I see in the helm chart value the need to supply the aws credential for access key and secret key.
# """
# query = "What is happening in cluster.go?"

In [39]:
# QUICKSTART METHOD

# index.query_with_sources(query)

In [None]:
# Similarity Search
# Bind the hits to their own name so the `docs` list built when loading the
# corpus is not overwritten if cells are re-run out of order.
similar_docs = vectordb.similarity_search(query)
print(similar_docs)

In [40]:
# Retrieval & QA
# Build a question-answering chain over the persisted vectorstore's retriever
# and run the selected query through it.
qa = RetrievalQAWithSourcesChain.from_chain_type(llm=OpenAI(temperature=0), retriever=vectordb.as_retriever())
res = qa(query)

# Print the result
answer, sources = res['answer'], res['sources']
print("\n\n> Question:")
print(query)
print("\n> Answer:")
print(answer)

# Print the relevant sources used for the answer
print("\n> Sources:")
for source in sources.split(","):
    source = source.strip()
    # An empty `sources` string (or a stray comma) yields blank entries;
    # skip them rather than printing an empty bullet.
    if source:
        print(' - ' + source)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).




> Question:
What's the prometheus query used to determine a PV's hourly cost?

> Answer:
 The Prometheus query used to determine a PV's hourly cost is "avg(node_cpu_hourly_cost) by (cluster_id)".


> Sources:
 - ./data/docs/gcp-gmp-integration.md
