In [None]:
import tiktoken
import matplotlib.pyplot as plt
import pandas as pd
from src.utils.vector_store_manager import VectorStoreManager
from src.utils.configuration import LoaderConfiguration
from langchain_core.runnables import RunnableConfig
from dotenv import load_dotenv
load_dotenv()

# Build the loader configuration for the fine-tuned blog index, then hand it
# to the vector store manager that is queried for documents further down.
runnable_config = RunnableConfig(
    configurable={"index_name": "au-blog-rag-fine-tuned"},
)
config = LoaderConfiguration().from_runnable_config(runnable_config)
document_processor = VectorStoreManager(
    index_name="au-blog-rag-fine-tuned",
    configuration=config,
)

def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Return how many tokens *text* encodes to for the given model.

    Args:
        text: The string to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model`` to
            select the encoding.

    Returns:
        The length of the token sequence produced by the encoding.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    return len(tokenizer.encode(text))

# Fetch every stored document. Each entry is read below as a mapping with a
# 'content' string and a 'metadata' mapping that has a 'source' key.
docs = document_processor.get_all_documents()
unique_sources = {doc['metadata']['source'] for doc in docs}
print(f"Unique URLS: {len(unique_sources)}")

# Per-document size measured two ways: raw characters and model tokens.
lengths = [len(doc['content']) for doc in docs]
tokens = [count_tokens(doc['content']) for doc in docs]

if lengths:
    # Side-by-side histograms: characters on the left, tokens on the right.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    histogram_specs = (
        (axes[0], lengths, 'blue', 'Document Lengths', 'Length (characters)'),
        (axes[1], tokens, 'green', 'Document Tokens', 'Tokens'),
    )
    for ax, values, color, title, xlabel in histogram_specs:
        pd.Series(values).hist(bins=20, alpha=0.7, color=color, edgecolor='black', ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')

    print(f"Total documents: {len(lengths)}\n")

    print(f"Total length: {sum(lengths)}")
    print(f"Average length: {sum(lengths)/len(lengths)}")
    print(f"Max length: {max(lengths)}\n")

    print(f"Total tokens: {sum(tokens)}")
    print(f"Average tokens: {sum(tokens)/len(tokens)}")
    print(f"Max tokens: {max(tokens)}")
else:
    # With no documents the averages would divide by zero and max() would
    # raise ValueError, so report the empty result instead of crashing.
    print(f"Total documents: {len(lengths)}\n")
    print("No documents found; skipping histograms and statistics.")

In [1]:
from sentence_transformers import SentenceTransformer

# Load the fine-tuned embedder and report the size of the vectors it emits.
model = SentenceTransformer("wylupek/au-blog-rag-embedder")
embedding_dimension = model.get_sentence_embedding_dimension()
print(embedding_dimension)

# One question followed by three blog excerpts to compare it against.
sentences = [
    (
        'Explain the importance of having a "Portfolio" section on a website. '
        'How can showcasing past work benefit a business or individual?'
    ),
    "### Sitemap\n\n- Home\n- About us\n- How we work\n- Services\n- Portfolio\n- Blog\n- Career",
    (
        "Business keeps asking questions about what has been done or what is delivered to our clients.\n\n"
        "What you can do:"
    ),
    (
        "Appunite always pushes boundaries and is very creative in ensuring that there is knowledge shared amongst the workspace and to our external stakeholders. "
        "This extends itself to overcoming the problem of articles, which are a longer form of writing, that they can be time consuming. "
        "There has recently been the addition of the TIL section on our website and is thanks to the initiative of a few Appuniter who said they would like to make something like this happen. "
        "They took on the task and are now live on our site:  "
        "This is a valuable add to companies where peoples main focus is not writing and is somewhere else, in this case Software development. "
        "It allows for shortened pieces which are compacted with knowledge and what one learns is shared regularly.\n\n"
        "### Akai\n\n"
        "As Appunite believes that the future is in the hands and minds of the youth, what better way to take advantage of this and benefit from a fruitful collaboration."
    ),
]

# Embed all texts, then score every text against every other one.
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)

  from .autonotebook import tqdm as notebook_tqdm


384
torch.Size([4, 4])
