In [None]:
import tiktoken
import matplotlib.pyplot as plt
import pandas as pd
from src.utils.vector_store_manager import VectorStoreManager
from src.utils.configuration import LoaderConfiguration
from langchain_core.runnables import RunnableConfig
from dotenv import load_dotenv
load_dotenv()

config = LoaderConfiguration().from_runnable_config(
    RunnableConfig(configurable={"index_name": "au-blog-rag-fine-tuned"})
)
document_processor = VectorStoreManager(index_name="au-blog-rag-fine-tuned", configuration=config)

def count_tokens(text, model="gpt-4o-mini"):
    encoding = tiktoken.encoding_for_model(model)
    t = encoding.encode(text)
    return len(t)

docs = document_processor.get_all_documents()
print(f"Unique URLS: {len(set([doc['metadata']['source'] for doc in docs]))}")

lengths = [len(doc['content']) for doc in docs]
tokens = [count_tokens(doc['content']) for doc in docs]


fig, axes = plt.subplots(1, 2, figsize=(14, 6))
pd.Series(lengths).hist(bins=20, alpha=0.7, color='blue', edgecolor='black', ax=axes[0])
axes[0].set_title('Document Lengths')
axes[0].set_xlabel('Length (characters)')
axes[0].set_ylabel('Count')

pd.Series(tokens).hist(bins=20, alpha=0.7, color='green', edgecolor='black', ax=axes[1])
axes[1].set_title('Document Tokens')
axes[1].set_xlabel('Tokens')
axes[1].set_ylabel('Count')


print(f"Total documents: {len(lengths)}\n")

print(f"Total length: {sum(lengths)}")
print(f"Average length: {sum(lengths)/len(lengths)}")
print(f"Max length: {max(lengths)}\n")

print(f"Total tokens: {sum(tokens)}")
print(f"Average tokens: {sum(tokens)/len(tokens)}")
print(f"Max tokens: {max(tokens)}")

In [5]:
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("LANGCHAIN_API_KEY")
api_url = os.getenv("API_URL")

print(api_key)
print(api_url)

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
au-blog-rag-copy-06f82eec52425b5b86d88a43a223305a.us.langgraph.app


In [7]:
import requests

url = f"https://{api_url}/threads/"
payload = {
    "thread_id": "",
    "metadata": {},
    "if_exists": "raise"
}
headers = {"X-Api-Key": api_key}

response = requests.post(url, headers=headers, json=payload)

print(response)

<Response [503]>


In [4]:
thread_id = response.json()["thread_id"]
print(thread_id)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [31]:
import requests

url = f"https://{api_url}/threads/{thread_id}/runs/wait"

headers = {"X-Api-Key": api_key}
payload = {
    "thread_id": thread_id,
    "assistant_id": "loader_graph",
    "input": {
        "sitemap": "https://tech.appunite.com/blog/blog-sitemap.xml"
    },
    "config": {
        # "index_name": "au-blog-rag-fine-tuned",
        "index_name": "au-blog-rag",
        # "embedding_model": "wylupek/au-blog-rag-embedder",
        "embedding_model": "openai/text-embedding-3-small",
    }
}

response = requests.post(url, headers=headers, json=payload, timeout=None)
print(response)
print(response.json())

<Response [409]>


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [29]:
# get run

run_id = "1efe85bd-2c3d-6249-9e1b-0561db9adb47"
url = f"https://{api_url}/threads/{thread_id}/runs/{run_id}"

headers = {"X-Api-Key": api_key}
payload = {
    "thread_id": thread_id,
    "run_id": run_id
}

response = requests.get(url, headers=headers, json=payload)
print(response)
print(response.json())

<Response [200]>
{'run_id': '1efe85bd-2c3d-6249-9e1b-0561db9adb47', 'thread_id': '543211dd-13ab-447c-9352-7af4f867f3b9', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75', 'created_at': '2025-02-11T09:37:34.753531+00:00', 'updated_at': '2025-02-11T09:37:34.753531+00:00', 'metadata': {'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75'}, 'status': 'running', 'kwargs': {'input': {'sitemap': 'https://tech.appunite.com/blog/blog-sitemap.xml'}, 'config': {'metadata': {'graph_id': 'loader_graph', 'created_by': 'system', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75'}, 'index_name': 'au-blog-rag', 'configurable': {'run_id': '1efe85bd-2c3d-6249-9e1b-0561db9adb47', 'user_id': '439a3269-29e9-488d-8ea0-54f768db2e3b', 'graph_id': 'loader_graph', 'x-scheme': 'https', 'thread_id': '543211dd-13ab-447c-9352-7af4f867f3b9', 'x-real-ip': '10.0.0.134', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75', 'x-request-id': 'b793893745f348ad2b0a7d6a1752a571', 'x-forwarded-for': '10.0.0.134'

In [28]:
# Delete run

run_id = "1efe85bd-2c3d-6249-9e1b-0561db9adb47"
thread_id = "543211dd-13ab-447c-9352-7af4f867f3b9"
url = f"https://{api_url}/threads/{thread_id}/runs/{run_id}"

headers = {"X-Api-Key": api_key}
payload = {
    "thread_id": thread_id,
    "run_id": run_id
}

response = requests.delete(url, headers=headers, json=payload)
print(response)
print(response.json())

<Response [404]>


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [21]:
# List runs

run_id = "1efe8524-fe96-6d48-a78a-828aedfd8e08"
url = f"https://{api_url}/threads/{thread_id}/runs"

headers = {"X-Api-Key": api_key}
payload = {
    "thread_id": thread_id
}

response = requests.get(url, headers=headers, json=payload)
print(response)
print(response.json())

[{'run_id': '1efe8521-a6f2-63bd-b48d-7ad96cdeaf76', 'thread_id': '543211dd-13ab-447c-9352-7af4f867f3b9', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75', 'created_at': '2025-02-11T08:28:00.027017+00:00', 'updated_at': '2025-02-11T08:28:00.027017+00:00', 'metadata': {'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75'}, 'status': 'success', 'kwargs': {'input': {'sitemap': 'https://tech.appunite.com/blog/blog-sitemap.xml'}, 'config': {'metadata': {'created_by': 'system', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75'}, 'index_name': 'au-blog-rag-fine-tuned', 'configurable': {'run_id': '1efe8521-a6f2-63bd-b48d-7ad96cdeaf76', 'user_id': '439a3269-29e9-488d-8ea0-54f768db2e3b', 'graph_id': 'loader_graph', 'x-scheme': 'https', 'thread_id': '543211dd-13ab-447c-9352-7af4f867f3b9', 'x-real-ip': '10.0.0.48', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75', 'x-request-id': 'fea55d1ec1a11a6ff304ec55b9ba6fdd', 'x-forwarded-for': '10.0.0.48', 'x-forwarded-host': 'au-blog-rag-

In [27]:
# Background run
import requests

url = f"https://{api_url}/threads/{thread_id}/runs"

headers = {"X-Api-Key": api_key}
payload = {
    "thread_id": thread_id,
    "assistant_id": "loader_graph",
    "input": {
        "sitemap": "https://tech.appunite.com/blog/blog-sitemap.xml"
    },
    "config": {
        # "index_name": "au-blog-rag-fine-tuned",
        "index_name": "au-blog-rag",
        # "embedding_model": "wylupek/au-blog-rag-embedder",
        "embedding_model": "openai/text-embedding-3-small",
    },
    "on_disconnect": "continue",
    "if_not_exists": "create",
}

response = requests.post(url, headers=headers, json=payload)
print(response)
print(response.json())

<Response [200]>
{'run_id': '1efe85bd-2c3d-6249-9e1b-0561db9adb47', 'thread_id': '543211dd-13ab-447c-9352-7af4f867f3b9', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75', 'created_at': '2025-02-11T09:37:34.753531+00:00', 'updated_at': '2025-02-11T09:37:34.753531+00:00', 'metadata': {'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75'}, 'status': 'pending', 'kwargs': {'input': {'sitemap': 'https://tech.appunite.com/blog/blog-sitemap.xml'}, 'config': {'metadata': {'graph_id': 'loader_graph', 'created_by': 'system', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75'}, 'index_name': 'au-blog-rag', 'configurable': {'run_id': '1efe85bd-2c3d-6249-9e1b-0561db9adb47', 'user_id': '439a3269-29e9-488d-8ea0-54f768db2e3b', 'graph_id': 'loader_graph', 'x-scheme': 'https', 'thread_id': '543211dd-13ab-447c-9352-7af4f867f3b9', 'x-real-ip': '10.0.0.134', 'assistant_id': 'b85351c3-b194-5002-83e4-a1fd576e2c75', 'x-request-id': 'b793893745f348ad2b0a7d6a1752a571', 'x-forwarded-for': '10.0.0.134'