In [312]:
import os
import torch
import nltk
import weaviate
from weaviate.embedded import EmbeddedOptions
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor

load_dotenv()

True

In [313]:
openai_api_key = os.getenv("OPENAI_API_KEY")

In [314]:
def embed_text(text: list[str], tokenizer, model):
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens).last_hidden_state
    avg_pooled = output.mean(dim=1)
    return avg_pooled.tolist()[0]

In [315]:
def embed_chunk(chunk, tokenizer, model):
    tokens = tokenizer(chunk, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens).last_hidden_state
    avg_pooled = output.mean(dim=1)
    return {"chunk": chunk, "embeddings": avg_pooled.tolist()[0]}

In [316]:
def embed_chunked_text(chunked_text):
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    with ThreadPoolExecutor() as executor:
        chunk_and_embeddings = list(executor.map(embed_chunk, chunked_text, [tokenizer]*len(chunked_text), [model]*len(chunked_text)))

    return chunk_and_embeddings

In [317]:
def chunk_text(text, max_chunk_length, overlap=0):
    tokens = nltk.word_tokenize(text)
    step_size = max_chunk_length - overlap
    num_chunks = (len(tokens) - overlap) // step_size + 1

    chunks = [
        tokens[i * step_size : i * step_size + max_chunk_length]
        for i in range(num_chunks)
    ]

    chunked_text = [" ".join(chunk) for chunk in chunks]

    return chunked_text

In [318]:
def process_pdf(pdf_text_list: list[str], chunk_threshold: int, token_overlap: int):
    chunked_pdf_data = []
    for text_index, text in enumerate(pdf_text_list):
        chunked_text = chunk_text(text, chunk_threshold, token_overlap)
        chunked_and_embedded_text = embed_chunked_text(chunked_text)
        for chunk_index, chunk in enumerate(chunked_and_embedded_text):
            chunked_pdf_data.append(
                {
                    "chunk": chunk["chunk"],
                    "chunk_index": chunk_index,
                    "embeddings": chunk["embeddings"],
                }
            )
    
    return chunked_pdf_data

In [319]:
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers={"X-OpenAI-Api-Key": openai_api_key},
)

embedded weaviate is already listening on port 8079


In [320]:
client.schema.delete_class("Testing")

In [321]:
class_name = "Testing"
class_definition = {
    "class": class_name,
    "vectorizer": "text2vec-openai",
}
client.schema.create_class(class_definition)

{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"testing_9cPyXMvVLpdB","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-18T22:02:11-05:00","took":66378}


In [322]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [323]:
text_list = [
    " A Simple PDF File \n This is a small demonstration .pdf file - \n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\n",
    " Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring.  More, a little more text. The end, and just as well. \n",
]

In [324]:
chunked_embedded_text = process_pdf(text_list, 10, 5)

In [325]:
for data in chunked_embedded_text:
    client.data_object.create({"chunk": data["chunk"]}, "Testing", vector=data["embeddings"])

In [326]:
text = "what page"
vector = embed_text([text], tokenizer, model)

In [327]:
# client.query.get(
#     class_name="Chatpdf", properties=["chunk", "file_name"]
# ).with_near_text({"concepts": ["sample pdf"]}).with_limit(3).do()

In [331]:
client.schema.get("Testing")

{'class': 'Testing',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-openai': {'baseURL': 'https://api.openai.com',
   'model': 'ada',
   'modelVersion': '002',
   'type': 'text',
   'vectorizeClassName': True}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'description': "This property was generated by Weaviate's auto-schema feature on Mon Dec 18 22:02:15 2023",
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-openai': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'chunk',
   'tokenization': 'word'}],
 'replicationConfig': {'factor': 1},
 'shardingConfig': {'virtualPerPhysical': 128,
  'desiredCount': 1,
  'actualCount': 1,
  'desiredVirtualCount': 128,
  'actualVirtualCount': 128,
  'key': '_id',
  'strategy': 'hash',
  'function': 'murmur3'},
 'vecto

{"action":"requests_total","api":"rest","class_name":"Chatpdf","error":"update vector: connection to: OpenAI API failed with status: 429 error: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.","level":"error","msg":"unexpected error","query_type":"objects","time":"2023-12-18T22:21:47-05:00"}


In [328]:
client.query.get(class_name="Testing", properties=["chunk"]).with_hybrid(query=text, vector=vector).with_limit(3).do()

{'data': {'Get': {'Testing': [{'chunk': 'Continued on page 2 ...'},
    {'chunk': 'continued from page 1 . Yet more text . And'},
    {'chunk': 'text . Even more . Continued on page 2 ...'}]}}}

In [330]:
client.query.get("Testing", properties=["chunk"]).do()

{'data': {'Get': {'Testing': [{'chunk': '. More text . And more text . And more'},
    {'chunk': 'little more text . The end , and just as'},
    {'chunk': '. And more text . And more text . Boring'},
    {'chunk': 'more text . And more text . Oh , how'},
    {'chunk': 'end , and just as well .'},
    {'chunk': 'text . Even more . Continued on page 2 ...'},
    {'chunk': 'watching paint dry . And more text . And more'},
    {'chunk': '. And more text . And more text . And'},
    {'chunk': '. And more text . And more text . And'},
    {'chunk': 'And more text . And more text . And more'},
    {'chunk': 'text . Oh , how boring typing this stuff .'},
    {'chunk': 'in the Virtual Mechanics tutorials . More text . And'},
    {'chunk': 'more text . And more text . And more text'},
    {'chunk': 'file - just for use in the Virtual Mechanics tutorials'},
    {'chunk': 'text . And more text . And more text .'},
    {'chunk': 'text . And more text . And more text .'},
    {'chunk': '. And more 