In [1]:
import os
import sys
from pathlib import Path

from datasets import Dataset
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
)
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from ragas import evaluate
from ragas.metrics import answer_correctness, faithfulness
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator

sys.path.append(str(Path.cwd().parents[1]))  # nopep8
from utilities.ollama import verify_ollama_model_present
from utilities.project_paths import sample_document_dirpath

MODEL_NAME = "openchat"
# https://ollama.com/library/openchat

generator_llm = Ollama(model=MODEL_NAME)
embeddings = OllamaEmbeddings(model=MODEL_NAME)

# make sure that the ollama is available to use locally (might need to ollama pull)
verify_ollama_model_present("openchat")


  from .autonotebook import tqdm as notebook_tqdm


validating model exists...


In [10]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader, TextLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

loader = DirectoryLoader(
    path=sample_document_dirpath,
    loader_cls=TextLoader,
    show_progress=True
)
raw_documents = loader.load()

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)

# split it into chunks
docs = []
for i in raw_documents:
    docs.extend(markdown_splitter.split_text(i.page_content))


generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=generator_llm,
    embeddings=embeddings
)

100%|██████████| 11/11 [00:00<00:00, 4398.22it/s]


In [14]:
# reduce the size of docs to process
docs = docs[:40]

In [16]:
# generate testset
testset = generator.generate_with_langchain_docs(docs,
                                                 test_size=len(docs), distributions={
                                                     simple: 0.5, reasoning: 0.3, multi_context: 0.2})

df = testset.to_pandas()
df.to_excel('synthetic_testset.xlsx')
print(df)

embedding nodes:   0%|          | 0/80 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.                 
Generating:  82%|████████▎ | 33/40 [35:08<10:26, 89.45s/it]  max retries exceeded for SimpleEvolution(generator_llm=LangchainLLMWrapper(run_config=RunConfig(timeout=60, max_retries=15, max_wait=90, max_workers=16, exception_types=<class 'Exception'>)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x000002788200F6D0>, nodes=[Node(page_content='# AWS Chalice and AWS Lambda REST API example  \n## Purpose  \nShows how to use AWS Chalice with the AWS SDK for Python (Boto3) to\ncreate a serverless REST API that uses Amazon API Gateway, AWS Lambda, and\nAmazon DynamoDB. The REST API simulates a system that tracks daily cases\nof COVID-19 in the United States, using fictional data. Learn how to:  \n* Use AWS Chalice to define routes in AWS Lambda functions that\nare called to handle REST requests that come through Amazon API Gateway.\n* Use AWS Lambda functions to retrieve and

                                             question  \
0    "What happens when there are 10 or fewer work...   
1    "What is the role of the Flask web framework ...   
2    "How can you create the `work_items` table us...   
3    "What is the purpose of using Amazon DynamoDB...   
4    "What command should be run to install prereq...   
5    "What is the role of an AWS Lambda function i...   
6    "How do I register a verified email address w...   
7    "What role do the ItemList and Report classes...   
8    "What is the purpose of the AWS Chalice and A...   
9    "What role does the Boto3 package play in the...   
10   "What is the purpose of using AWS Chalice wit...   
11   "What is the process for registering a verifi...   
12   "What is the role of the item tracker plugin ...   
13   "What role does evaporation play in the water...   
14   "What are the key components and actions invo...   
15   "What are the steps involved in deploying a R...   
16   "How do you create a work_

In [6]:
documents

[Document(page_content='AWS Chalice and AWS Lambda REST API example\n\nPurpose\n\nShows how to use AWS Chalice with the AWS SDK for Python (Boto3) to \ncreate a serverless REST API that uses Amazon API Gateway, AWS Lambda, and \nAmazon DynamoDB. The REST API simulates a system that tracks daily cases\nof COVID-19 in the United States, using fictional data. Learn how to:\n\nUse AWS Chalice to define routes in AWS Lambda functions that\n are called to handle REST requests that come through Amazon API Gateway.\n\nUse AWS Lambda functions to retrieve and store data in an Amazon DynamoDB \ntable to serve REST requests.\n\nDefine table structure and security role resources in an AWS CloudFormation template.\n\nUse AWS Chalice and AWS CloudFormation to package and deploy all necessary resources.\n\nUse AWS CloudFormation to clean up all created resources.\n\nThis example brings together some of the same information you can find in the\ntutorials in the \nAWS Chalice GitHub repository.\n\nPrer