# Generate Vector

In [1]:
from pymilvus import model
from pprint import pp

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment;
# later cells read their settings through os.getenv.
load_dotenv()

True

In [3]:
# Embedding function shared by the search helper and the indexing cell.
# `model` here is the module imported from pymilvus in the first cell.
embedding_fn = model.DefaultEmbeddingFunction()

In [4]:
from pymilvus import MilvusClient
import os

# Connect to Milvus at the address given by the MILVUS_ADDR environment variable.
# NOTE(review): os.getenv returns None when the variable is unset — confirm
# MILVUS_ADDR is defined (e.g. in the .env file) before running this cell.
client = MilvusClient(uri=os.getenv('MILVUS_ADDR'))

## Search

In [5]:
def search(query, collection, threshold=0.5, limit=3):
    """Return the hits for `query` in `collection` that score at least `threshold`.

    Args:
        query: Text to embed with ``embedding_fn.encode_queries``.
        collection: Name of the collection to search.
        threshold: Minimum ``distance`` a hit must have to be kept (inclusive).
        limit: Maximum number of hits requested from the search call.

    Returns:
        A list of the hits for the single query whose ``distance`` is greater
        than or equal to ``threshold``, in the order the search returned them.
    """
    encoded_query = embedding_fn.encode_queries([query])
    # Exactly one query vector is sent, so only the first hit list is used.
    hits = client.search(
        collection_name=collection,
        data=encoded_query,
        limit=limit,
        output_fields=["text", "metadata"],
    )[0]

    kept = []
    for hit in hits:
        if hit['distance'] >= threshold:
            kept.append(hit)
    return kept

# Load PDF

In [6]:
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyPDFParser

# Build a loader that picks up every file matching "**/*.pdf" under
# ../datasets and parses each one with PyPDFParser.
loader = GenericLoader(
    blob_loader=FileSystemBlobLoader(
        path="../datasets",
        glob="**/*.pdf",
    ),
    blob_parser=PyPDFParser(),
)
# Read and parse the matched files; `docs` feeds both the chunking cell and
# the test set generation cell further down.
docs = loader.load()

Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
# Splitter used to cut the loaded documents into chunks before embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # target chunk size, measured with length_function
    chunk_overlap=200, # Overlap to maintain context between chunks
    length_function=len,  # len() on a string, so sizes are in characters
    is_separator_regex=False,
)

In [9]:
# Split the loaded documents into chunks and build one record per chunk with
# the fields inserted into Milvus: id, vector, text and metadata.
chunks = text_splitter.split_documents(docs)

is_debug = False  # set to True to print every chunk and its metadata

# Embed all chunk texts with a single call instead of one call per chunk.
# The guard keeps an empty document set from reaching the encoder at all.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_vectors = embedding_fn.encode_documents(chunk_texts) if chunk_texts else []
assert len(chunk_vectors) == len(chunks), "expected one vector per chunk"

data = [
    {
        "id": i,
        "vector": vector,
        "text": chunk.page_content,
        "metadata": chunk.metadata,
    }
    for i, (chunk, vector) in enumerate(zip(chunks, chunk_vectors))
]

if is_debug:
    for i, chunk in enumerate(chunks):
        print(i)
        print(chunk.page_content)
        pp(chunk.metadata)
        print('=====')

In [10]:
# Drop any existing collection first so this cell starts from an empty
# collection on every run, then insert the records built above.
COLLECTION_NAME = "pdf_collection"

if client.has_collection(collection_name=COLLECTION_NAME):
    client.drop_collection(collection_name=COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    dimension=embedding_fn.dim,  # vector size reported by the embedding function
)
pp(client.list_collections())
res = client.insert(collection_name=COLLECTION_NAME, data=data)

['pdf_collection']


# Test Set Generation

In [12]:
from llm import ModelGardenLLM
from embeddings import ModelGardenEmbeddings, OllamaRagasEmbeddings
from langchain_ollama import OllamaLLM, OllamaEmbeddings

# Select the LLM and embedding backends for test set generation from
# environment variables.
llm_type = os.getenv('LLM_TYPE')
# Named `llm_model` rather than `model`: the first cell imports `model` from
# pymilvus, and rebinding that name to a string here would break a re-run of
# the cell that calls model.DefaultEmbeddingFunction().
llm_model = os.getenv('MODEL_GARDEN_MODEL')
embedding = os.getenv('EMBEDDING_MODEL')

if llm_type == "model_garden":
    url = os.getenv('MODEL_GARDEN_URL')
    embed_url = os.getenv('EMBEDDING_URL')
    llm = ModelGardenLLM(api_url=url, model=llm_model)
    embeds = ModelGardenEmbeddings(api_url=embed_url, model=embedding)
elif llm_type == "ollama":
    # MODEL_GARDEN_MODEL also supplies the model name for the Ollama backend.
    llm = OllamaLLM(model=llm_model, temperature=0.4)
    embeds = OllamaRagasEmbeddings(model=embedding)
else:
    raise ValueError(f"Unsupported LLM type: {llm_type}")
pp(llm_type)

'ollama'


In [13]:
from ragas.testset import TestsetGenerator

# Generate a synthetic test set from the loaded PDF documents using the LLM
# and embeddings configured in the previous cell, then save it as CSV.
generator = TestsetGenerator(llm=llm, embedding_model=embeds)
# NOTE(review): the recorded output of the next cell shows three samples even
# though testset_size=1 — confirm how ragas interprets this value.
dataset = generator.generate_with_langchain_docs(docs, testset_size=1)
dataset.to_csv('dataset.csv')

Applying SummaryExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/4 [00:00<?, ?it/s]

Applying EmbeddingExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

  property_name, property_value = await self.extract(node)


Applying ThemesExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying NERExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying CosineSimilarityBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Show the generated samples as a table for a quick manual inspection.
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,persona_name,query_style,query_length,synthesizer_name
0,H Ho ow w w we e b bu ui il lt t ‘ ‘B BA AR...,[H Ho ow w w we e b bu ui il lt t ‘ ‘B BA A...,The logging infrastructure was designed to sca...,Amina,MISSPELLED,MEDIUM,single_hop_specific_query_synthesizer
1,How d'd BARITO dev'pmt addr'ss ELK scal'g iss'...,[<1-hop>\n\nH Ho ow w w we e b bu ui il lt t...,BARITO dev'pmt addr'ss'd ELK scal'g iss'z by p...,,,,multi_hop_abstract_query_synthesizer
2,How did GoTo Financial address technical chall...,"[<1-hop>\n\nGoPay, as part of Indonesia’s tech...",GoTo Financial addressed technical challenges ...,,,,multi_hop_specific_query_synthesizer
