In [1]:
import os
import sys
import nltk
import random
from langchain_community.document_loaders import DirectoryLoader
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset.persona import Persona
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.testset import TestsetGenerator
import pandas as pd

sys.path.insert(1, '/home/jovyan/work/code')

path = "docs-aspire/"
num_qa_to_generate=10
openai_model='gpt-4o' #
inspect_docs = False
shuffle_docs = True

md_loader = DirectoryLoader(path, glob="**/*.md")
yml_loader = DirectoryLoader(path, glob="**/*.yml")

docs = md_loader.load() + yml_loader.load()
if len(docs) > 0 and inspect_docs is True:
    print("Sample document structure:")
    print(f"Type: {type(docs[0])}")
    print(f"Available attributes: {dir(docs[0])}")
    print(f"Content preview: {docs[0].page_content[:20]}")
    print(f"Metadata: {docs[0].metadata}")

if shuffle_docs:
    random.shuffle(docs)
    
    print(f"Total {len(docs)} documents loaded and randomised.")

processed_docs = []
for doc in docs:
    # Create a new document with headlines if they don't exist
    metadata = doc.metadata.copy()
    if 'headlines' not in metadata:
        metadata['headlines'] = []  # or extract headlines from content if needed
        #print(f"No headlines: {doc.metadata}")
    processed_docs.append(Document(page_content=doc.page_content, metadata=metadata))



Total 165 documents loaded and randomised.


In [4]:
#https://docs.ragas.io/en/stable/howtos/customizations/testgenerator/_persona_generator/#personas-in-testset-generation

personas = [
    Persona(
        name="Technical Analyst",
        role_description="Focuses on detailed system specifications and API documentation"
    ),
    Persona(
        name="Novice User",
        role_description="Asks simple questions using layman terms and basic functionality"
    ),
    Persona(
        name="Security Auditor",
        role_description="Focuses on compliance, data protection, and access control aspects"
    ),
    Persona(
        name="Docker expert",
        role_description="Has in depth experience with Docker and DSocker compose and expert at cloud native concepts"
    )
]

generator_llm = LangchainLLMWrapper(ChatOpenAI(model=openai_model, temperature=0.1))  
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    persona_list=personas
)

In [9]:
#dataset = generator.generate_with_langchain_docs(docs, testset_size=num_qa_to_generate)

# https://docs.ragas.io/en/stable/references/synthesizers/
# 
from ragas.testset.synthesizers import (
    SingleHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer
)
query_distribution = [
    (SingleHopSpecificQuerySynthesizer(), 0.4),  # Simple questions
    (MultiHopSpecificQuerySynthesizer(), 0.4),   # Complex questions
    (MultiHopAbstractQuerySynthesizer(), 0.2)    # Reasoning questions
]

dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=100,
    query_distribution=query_distribution
)


Applying HeadlinesExtractor:   0%|          | 0/107 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/165 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to ap

Applying SummaryExtractor:   0%|          | 0/142 [00:00<?, ?it/s]

Property 'summary' already exists in node 'c8c7e4'. Skipping!
Property 'summary' already exists in node '17792d'. Skipping!
Property 'summary' already exists in node '31305f'. Skipping!
Property 'summary' already exists in node '1c4af2'. Skipping!
Property 'summary' already exists in node '597935'. Skipping!
Property 'summary' already exists in node '486a94'. Skipping!
Property 'summary' already exists in node '842267'. Skipping!
Property 'summary' already exists in node '7b9b09'. Skipping!
Property 'summary' already exists in node '9e6906'. Skipping!
Property 'summary' already exists in node 'c3a812'. Skipping!
Property 'summary' already exists in node '82d1ed'. Skipping!
Property 'summary' already exists in node '781df7'. Skipping!
Property 'summary' already exists in node 'bf1bc4'. Skipping!
Property 'summary' already exists in node 'f4ac95'. Skipping!
Property 'summary' already exists in node 'f06c57'. Skipping!
Property 'summary' already exists in node 'cca167'. Skipping!
Property

Applying CustomNodeFilter:   0%|          | 0/185 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/510 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '1c4af2'. Skipping!
Property 'summary_embedding' already exists in node 'c8c7e4'. Skipping!
Property 'summary_embedding' already exists in node 'c3a812'. Skipping!
Property 'summary_embedding' already exists in node '31305f'. Skipping!
Property 'summary_embedding' already exists in node '597935'. Skipping!
Property 'summary_embedding' already exists in node '9e6906'. Skipping!
Property 'summary_embedding' already exists in node '17792d'. Skipping!
Property 'summary_embedding' already exists in node '842267'. Skipping!
Property 'summary_embedding' already exists in node '486a94'. Skipping!
Property 'summary_embedding' already exists in node '7b9b09'. Skipping!
Property 'summary_embedding' already exists in node '82d1ed'. Skipping!
Property 'summary_embedding' already exists in node 'bf1bc4'. Skipping!
Property 'summary_embedding' already exists in node 'f4ac95'. Skipping!
Property 'summary_embedding' already exists in node '781df7'. Sk

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
dataset.to_pandas()
type(dataset)

file_name= f"{openai_model}_ReducedAspireDocs_{num_qa_to_generate}"

df = dataset.to_pandas()
df.to_pickle(f"{file_name}.pkl")
df.to_csv(f"{file_name}.csv")

df1 =  pd.read_pickle(f"{file_name}.pkl")
df1


Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What are the key features of MSTest in the con...,[title: Write your first .NET Aspire test desc...,MSTest is one of the testing frameworks availa...,single_hop_specifc_query_synthesizer
1,What does the DistributedApplicationTestingBui...,[Test resource environment variables To furthe...,The DistributedApplicationTestingBuilder follo...,single_hop_specifc_query_synthesizer
2,What is the purpose of the DOTNET_DASHBOARD_OT...,[title: Enable browser telemetry description: ...,The DOTNET_DASHBOARD_OTLP_ENDPOINT_URL is the ...,single_hop_specifc_query_synthesizer
3,How does HTTP work with OTLP endpoints in the ...,[OTLP endpoint security Dashboard OTLP endpoin...,HTTP OTLP requests to the dashboard must inclu...,single_hop_specifc_query_synthesizer
4,What Qdrant do?,[title: .NET Aspire Qdrant integration descrip...,Qdrant is an open-source vector similarity sea...,single_hop_specifc_query_synthesizer
...,...,...,...,...
95,How do you configure frontend authentication f...,[<1-hop>\n\ntitle: .NET Aspire dashboard confi...,Frontend authentication for the .NET Aspire da...,multi_hop_abstract_query_synthesizer
96,What are the dashboard configuration options f...,[<1-hop>\n\nOTLP authentication The OTLP endpo...,The dashboard configuration options for log en...,multi_hop_abstract_query_synthesizer
97,How does the .NET Aspire MongoDB database inte...,[<1-hop>\n\ntitle: .NET Aspire MongoDB databas...,The .NET Aspire MongoDB database integration e...,multi_hop_abstract_query_synthesizer
98,What are the key features of the .NET Aspire A...,[<1-hop>\n\ntitle: .NET Aspire Azure Cosmos DB...,The .NET Aspire Azure Cosmos DB integration pr...,multi_hop_abstract_query_synthesizer
