In [71]:
import os
import uuid
import torch
import nltk
import weaviate
from weaviate.embedded import EmbeddedOptions
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor

load_dotenv()

True

In [72]:
openai_api_key = os.getenv("OPENAI_API_KEY")

In [73]:
def embed_text(text: list[str], tokenizer, model):
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens).last_hidden_state
    avg_pooled = output.mean(dim=1)
    return avg_pooled.tolist()[0]

In [74]:
def embed_chunk(chunk, tokenizer, model):
    tokens = tokenizer(chunk, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens).last_hidden_state
    avg_pooled = output.mean(dim=1)
    return {"chunk": chunk, "embeddings": avg_pooled.tolist()[0]}

In [75]:
def embed_chunked_text(chunked_text):
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    with ThreadPoolExecutor() as executor:
        chunk_and_embeddings = list(executor.map(embed_chunk, chunked_text, [tokenizer]*len(chunked_text), [model]*len(chunked_text)))

    return chunk_and_embeddings

In [76]:
def chunk_text(text, max_chunk_length, overlap=0):
    tokens = nltk.word_tokenize(text)
    step_size = max_chunk_length - overlap
    num_chunks = (len(tokens) - overlap) // step_size + 1

    chunks = [
        tokens[i * step_size : i * step_size + max_chunk_length]
        for i in range(num_chunks)
    ]

    chunked_text = [" ".join(chunk) for chunk in chunks]

    return chunked_text

In [77]:
def process_pdf(pdf_id: str, pdf_text_list: list[str], chunk_threshold: int, token_overlap: int):
    chunked_pdf_data = []
    for text_index, text in enumerate(pdf_text_list):
        chunked_text = chunk_text(text, chunk_threshold, token_overlap)
        chunked_and_embedded_text = embed_chunked_text(chunked_text)
        for chunk_index, chunk in enumerate(chunked_and_embedded_text):
            chunked_pdf_data.append(
                {
                    "source": pdf_id,
                    "chunk": chunk["chunk"],
                    "chunk_index": chunk_index,
                    "embeddings": chunk["embeddings"],
                }
            )
    
    return chunked_pdf_data

In [78]:
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers={"X-OpenAI-Api-Key": openai_api_key},
)

embedded weaviate is already listening on port 8079


In [79]:
client.schema.delete_class("Testing")

In [80]:
class_name = "Testing"
class_definition = {
    "class": class_name,
    "vectorizer": "text2vec-openai",
}
client.schema.create_class(class_definition)

In [91]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [92]:
text_list = [
    " A Simple PDF File \n This is a small demonstration .pdf file - \n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\n",
    " Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring.  More, a little more text. The end, and just as well. \n",
]

In [93]:
pdf_id = uuid.uuid4()
print(pdf_id)
chunked_embedded_text = process_pdf(pdf_id.__str__(), text_list, 10, 5)

12abfc78-0c3d-4496-9120-bdcf468ed9c0


In [94]:
for data in chunked_embedded_text:
    client.data_object.create(data, "Testing", vector=data["embeddings"])

In [95]:
text = "what page"
vector = embed_text([text], tokenizer, model)

In [96]:
# client.query.get(
#     class_name="Chatpdf", properties=["chunk", "file_name"]
# ).with_near_text({"concepts": ["sample pdf"]}).with_limit(3).do()

In [97]:
# client.schema.get("Testing")

In [100]:
client.query.get(class_name="Testing", properties=["chunk", "source"]).with_hybrid(
    query=text, vector=vector
).with_where({
    "path": ["source"],
    "operator": "Equal",
    "valueText": "12abfc78-0c3d-4496-9120-bdcf468ed9c0"
}).with_limit(3).do()

{'data': {'Get': {'Testing': [{'chunk': 'Continued on page 2 ...',
     'source': '12abfc78-0c3d-4496-9120-bdcf468ed9c0'},
    {'chunk': 'continued from page 1 . Yet more text . And',
     'source': '12abfc78-0c3d-4496-9120-bdcf468ed9c0'},
    {'chunk': 'text . Even more . Continued on page 2 ...',
     'source': '12abfc78-0c3d-4496-9120-bdcf468ed9c0'}]}}}

In [99]:
client.query.get("Testing", properties=["chunk"]).do()

{'data': {'Get': {'Testing': [{'chunk': 'more text . And more text . And more text'},
    {'chunk': 'text . And more text . And more text .'},
    {'chunk': 'text . And more text . And more text .'},
    {'chunk': '. And more text . And more text . And'},
    {'chunk': 'But not as boring as watching paint dry . And'},
    {'chunk': 'And more text . And more text . And more'},
    {'chunk': 'file - just for use in the Virtual Mechanics tutorials'},
    {'chunk': '. And more text . And more text . And'},
    {'chunk': 'text . Even more . Continued on page 2 ...'},
    {'chunk': 'text . And more text . And more text .'},
    {'chunk': 'And more text . Boring , zzzzz . And more'},
    {'chunk': 'more text . And more text . And more text'},
    {'chunk': 'more text . And more text . And more text'},
    {'chunk': 'watching paint dry . And more text . And more'},
    {'chunk': 'text . Oh , how boring typing this stuff .'},
    {'chunk': 'Yet more text . And more text . And more'},
    {'chun