In [1]:
import weaviate
import weaviate.classes as wvc
import os
import requests
import json
from dotenv import load_dotenv

load_dotenv()
openai_api_key = os.getenv("OPENAI_KEY")

client = weaviate.connect_to_local(
    port=8080,
    grpc_port=50051,
    additional_config=weaviate.config.AdditionalConfig(timeout=(10, 25)),
    headers={
        "X-OpenAI-Api-Key": openai_api_key  # Replace with your inference API key
    }
)

In [2]:
assert client.is_live()

In [3]:
client.get_meta()

{'hostname': 'http://[::]:8080',
 'modules': {'generative-cohere': {'documentationHref': 'https://docs.cohere.com/reference/generate',
   'name': 'Generative Search - Cohere'},
  'generative-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'Generative Search - OpenAI'},
  'generative-palm': {'documentationHref': 'https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts',
   'name': 'Generative Search - Google PaLM'},
  'qna-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'OpenAI Question & Answering Module'},
  'ref2vec-centroid': {},
  'reranker-cohere': {'documentationHref': 'https://txt.cohere.com/rerank/',
   'name': 'Reranker - Cohere'},
  'text2vec-cohere': {'documentationHref': 'https://docs.cohere.ai/embedding-wiki/',
   'name': 'Cohere Module'},
  'text2vec-huggingface': {'documentationHref': 'https://huggingface.co/docs/api-inference/detailed_para

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter


# embeddings = OllamaEmbeddings()

path_to_pdf = 'Pregnancy and social welfare payments.pdf'
loader = PyPDFLoader(path_to_pdf)
pages = loader.load_and_split()
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
docs = text_splitter.split_documents(pages)

In [5]:
for d in docs:
    print(d.metadata)
    print(d.page_content)
    print(d.metadata['source'])
    print(d.metadata['page'])
    break

{'source': 'Pregnancy and social welfare payments.pdf', 'page': 0}
4/12/24, 2:46 PM Pregnancy and social welfare payments
https://www.citizensinformation.ie/en/social-welfare/families-and-children/pregnancy-and-social-welfare-payments/#5011b8 1/3Pregnancy and social welfare payments
Introduc on
Pregnancy and jobseeker's payments
Pregnancy and Illness Beneﬁt
Pregnancy and One-Parent Family Payment
Pregnancy and employment schemes
Introduc on
If you are pregnant and ge ng a social welfare payment, you may be able to con nue to get your
payment as long as you sa sfy the condi ons, or transfer to another payment that be er suits your
needs.
This page explains how pregnancy is treated if you are ge ng Jobseeker’s Beneﬁt, Jobseeker’s
Allowance, Illness Beneﬁt, One-Parent Family Payment or are on an employment scheme.
Pregnancy and Maternity Beneﬁt
If you are on maternity leave from work and have paid enough social insurance (PRSI), you can get
Maternity Beneﬁt. You should apply for Maternity

In [6]:
client.collections.delete("Documents") 

In [8]:
from weaviate.classes.config import Configure, Property, DataType, Tokenization

client.collections.delete("Documents") 

client.collections.create(
    "Documents",
    # vectorizer_config=Configure.Vectorizer.text2vec_openai(),
    properties=[  # properties configuration is optional
        Property(
            name="source",
            data_type=DataType.TEXT,
            vectorize_property_name=True,
            tokenization=Tokenization.LOWERCASE
        ),
        Property(
            name="page", 
            data_type=DataType.TEXT,
            vectorize_property_name=True,
            tokenization=Tokenization.LOWERCASE),

        Property(name="content", data_type=DataType.TEXT),
    ],

    vectorizer_config=[
        Configure.NamedVectors.text2vec_openai(
            name="title", source_properties=['title']
        ),
        Configure.NamedVectors.text2vec_openai(
            name="content", source_properties=['content']
        )
    ]
)

<weaviate.collections.collection.Collection at 0x7fd0f6432bf0>

In [14]:
with client.batch as batch:
    for i, d in enumerate(docs):
        # Extracting data from your document's structure
        source = d.metadata['source']
        page = str(d.metadata['page'])  # Converting page number to string if it's not already
        content = d.page_content

        # Create a data object for each document
        properties = {
            "source": d['source'],
            "page": d['page'],
            "content": d['content']
        }

        # Insert the data object into the Weaviate collection
        batch.add_data_object(properties, "Question")

AttributeError: 'WeaviateClient' object has no attribute 'data_object_creator'

In [12]:
df

Unnamed: 0,description,content
0,page_content,"4/12/24, 2:46 PM Pregnancy and social welfare ..."
1,metadata,{'source': 'Pregnancy and social welfare payme...
2,type,Document
