In [6]:
import os

# Locate the 'json_files_updated' folder relative to the working directory
# the notebook kernel was started in.
current_directory = os.getcwd()
json_files_updated_path = os.path.join(
    current_directory,
    'json_files_updated',
)

In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import JSONLoader

In [2]:
from bs4 import BeautifulSoup

def metadata_func(record: dict, metadata: dict) -> dict:
    """Build metadata and combined body text from one parsed JSON record.

    Args:
        record: The JSON object for one file. The keys read here are
            ``TitleNumber``, ``TitleName`` and ``ChapterList``; every entry of
            ``ChapterList`` is queried with ``.get`` for its chapter, article
            and section fields and for ``Body``.
        metadata: Metadata dict supplied by the caller. It is updated in place
            and also returned under the ``"metadata"`` key.

    Returns:
        A dict with two keys: ``"metadata"`` (the updated ``metadata`` dict)
        and ``"page_content"`` (every chapter ``Body`` joined with newlines).

    NOTE(review): LangChain's JSONLoader documents ``metadata_func`` as
    returning the updated metadata dict itself, so this wrapper dict presumably
    ends up nested inside ``Document.metadata`` -- confirm that is intended.
    The return shape is left unchanged here.
    """
    metadata["TitleNumber"] = record.get("TitleNumber")
    metadata["TitleName"] = record.get("TitleName")

    # A missing *or* explicit-null "ChapterList" is treated as "no chapters";
    # a plain .get(key, []) would hand back None for a null value and make the
    # loop below raise TypeError.
    chapter_list = record.get("ChapterList") or []

    chapter_fields = (
        'ChapterNum',
        'ChapterName',
        'ArticleNum',
        'ArticleName',
        'SectionNumber',
        'SectionTitle',
    )

    hrefs = []
    body_contents = []
    for chapter in chapter_list:
        # NOTE(review): these keys are overwritten on every iteration, so the
        # metadata ends up describing only the LAST chapter of the record.
        for field in chapter_fields:
            metadata[field] = chapter.get(field)

        # Normalise a null/missing Body to "" so that neither the HTML parser
        # nor the final str.join receives None.
        body_content = chapter.get("Body") or ""
        body_contents.append(body_content)

        # Extract href from Body if available; an empty body has no links,
        # so parsing is skipped for it.
        if body_content:
            soup = BeautifulSoup(body_content, 'html.parser')
            hrefs.extend(link['href'] for link in soup.find_all('a', href=True))

    # Convert hrefs list into a single string
    metadata['Hrefs'] = "; ".join(hrefs)

    return {
        "metadata": metadata,
        "page_content": "\n".join(body_contents)
    }

In [7]:
# Read every *.json file in the folder with JSONLoader. The jq schema '.'
# selects the whole JSON object of each file, which is then passed to
# metadata_func.
loader = DirectoryLoader(
    json_files_updated_path,
    glob='*.json',
    loader_cls=JSONLoader,
    loader_kwargs=dict(
        jq_schema='.',
        content_key=None,
        metadata_func=metadata_func,
        text_content=False,
    ),
)

# Materialise all documents in one go.
documents = loader.load()

### Generate Test Set

In [10]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_community.llms import Ollama
from langchain_community.embeddings import HuggingFaceEmbeddings

# Two Ollama-served models: "llama2" is bound to the critic role and "llama3"
# to the generator role.
critic_llm = Ollama(model="llama2")
generator_llm = Ollama(model="llama3")

# Embedding model configuration: run on CPU and leave the vectors
# un-normalised.
model_name = "multi-qa-mpnet-base-dot-v1"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [12]:
# Wire the generator LLM, critic LLM and embedding model into a ragas
# test-set generator.
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=hf,
)