# DocsGPT

In [2]:
import os
import chromadb
import tiktoken
import subprocess
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator

# Root data directory: the docs repo is cloned into DATA_DIR + '/docs' and the
# loader cell walks this directory recursively for text files.
DATA_DIR = './data'

## Load the data

In [3]:
# Clone the Kubecost "Docs" repo, or refresh the checkout if it already exists
repository_url = 'https://github.com/kubecost/docs.git'
target_directory = DATA_DIR + '/docs'

if os.path.exists(target_directory):
    print(f"Directory '{target_directory}' already exists. Performing git pull instead.")
    # `cwd=` makes git run inside the checkout. A separate `cd` subprocess is
    # pointless: it only changes the directory of its own short-lived shell.
    pull_result = subprocess.run(['git', 'pull'], cwd=target_directory)
    if pull_result.returncode != 0:
        # The existing checkout is still usable, so warn instead of aborting.
        print(f"Warning: 'git pull' exited with code {pull_result.returncode}; "
              f"continuing with the existing checkout.")
else:
    # Without a successful clone there are no docs to index, so fail loudly
    # (check=True raises subprocess.CalledProcessError on a non-zero exit).
    subprocess.run(['git', 'clone', '-q', repository_url, target_directory], check=True)

Directory './data/docs' already exists. Performing git pull instead.
Already up to date.


In [4]:
loaders = []

# Recursively load all ".md" and ".txt" files from DATA_DIR
for root, dirs, files in os.walk(DATA_DIR):
    # Prune git metadata in place so os.walk does not descend into it.
    dirs[:] = [d for d in dirs if d != '.git']
    for file in files:
        # Test the file name's suffix rather than searching the whole path for
        # a substring, which would also match e.g. "notes.mdx", "a.md.bak", or
        # any file below a directory whose name contains ".md" / ".txt".
        if file.endswith(('.md', '.txt')):
            file_path = os.path.join(root, file)
            print(file_path)
            loaders.append(TextLoader(file_path, encoding='utf8'))

./data/state_of_the_union.txt
./data/docs/http-profiling.md
./data/docs/storage.md
./data/docs/contactus.md
./data/docs/assets.md
./data/docs/secondary-clusters.md
./data/docs/api-request-right-sizing-v2.md
./data/docs/prometheus.md
./data/docs/api-request-right-sizing.md
./data/docs/allocation.md
./data/docs/user-metrics.md
./data/docs/architecture.md
./data/docs/node-pricing.md
./data/docs/asset-diff.md
./data/docs/kubecost-cloud-architecture.md
./data/docs/long-term-storage-aws.md
./data/docs/ksm-metrics.md
./data/docs/savings.md
./data/docs/federated-etl.md
./data/docs/long-term-storage.md
./data/docs/cost-model-deprecated.md
./data/docs/csv-pricing.md
./data/docs/add-key.md
./data/docs/install-on-plural.md
./data/docs/resource-consumption.md
./data/docs/saved-reports.md
./data/docs/open-source-deps.md
./data/docs/custom-grafana.md
./data/docs/SUMMARY.md
./data/docs/sharing-etl-backups.md
./data/docs/troubleshoot-install.md
./data/docs/user-management.md
./data/docs/audit-api.md
./

## Create a vectorstore index

In [5]:
# Build the index from every loader collected so far
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders(loaders)

Using embedded DuckDB without persistence: data will be transient


## Query the index

In [6]:
# Query the index. The call is the cell's last expression, so its result
# (question, answer and sources) is displayed as the cell output.
# Another question to try:
#   "What should my cloud-integration.json secret look like?"
query = "I've just installed my cloud billing integration, but I don't see any reconciled prices. Why?"
index.query_with_sources(query)

{'question': "I've just installed my cloud billing integration, but I don't see any reconciled prices. Why?",
 'answer': ' You may need to verify that the Helm values of `kubecostModel.etlCloudUsage` or `kubecostModel.etlCloudAsset` are not set to false. Cloud billing data may also lag by 24-48 hours.\n',
 'sources': './data/docs/cloud-integration.md, ./data/docs/setup/frequently-asked-questions.md, ./data/docs/assets.md'}