In [1]:
import os

import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
from weaviate.classes.query import MetadataQuery
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import torch.nn.functional as F

# Read settings from a local .env file; OPENAI_KEY is forwarded to Weaviate
# as a request header when it is set.
load_dotenv()
openai_api_key = os.getenv("OPENAI_KEY")

# The PDFs are addressed relative to the working directory. Step up to the
# parent directory only while 'pdf_docs' is not reachable from the current
# one, so re-running this cell does not keep climbing the directory tree.
# NOTE(review): assumes the notebook sits one level below the folder that
# contains 'pdf_docs' - confirm against the repository layout.
path_to_pdf = 'pdf_docs'
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
if not os.path.isdir(path_to_pdf):
    os.chdir(parent_dir)

client = weaviate.connect_to_local(
    port=8080,
    grpc_port=50051,
    additional_config=weaviate.config.AdditionalConfig(timeout=(60, 180)),
    # Only send the inference-key header when a key is actually configured,
    # instead of passing a header whose value is None.
    headers={"X-OpenAI-Api-Key": openai_api_key} if openai_api_key else None,
)

# The splitter settings do not depend on the file, so build it once.
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)

# Flat list of chunk documents across every PDF in the folder.
documents_text = []

# sorted() makes the chunk order reproducible across runs and platforms.
for doc in sorted(os.listdir(path_to_pdf)):
    # Skip anything that is not a PDF (hidden files, checkpoints, ...);
    # PyPDFLoader would fail on those.
    if not doc.lower().endswith('.pdf'):
        continue

    doc_path = f'{path_to_pdf}/{doc}'
    loader = PyPDFLoader(doc_path)
    pages = loader.load_and_split()
    docs = text_splitter.split_documents(pages)
    # extend() keeps the list flat, so no separate flattening pass is needed.
    documents_text.extend(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sentence-embedding model; the query cells reuse it so questions and chunks
# are embedded the same way.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Embed every chunk text in a single encode() call so the library can batch
# the work, then unpack the result into a list with one vector per chunk,
# index-aligned with documents_text.
chunk_texts = [d.page_content for d in documents_text]
body_vectors = list(model.encode(chunk_texts))

# Property dicts for the objects to be stored, one per chunk and in the same
# order as body_vectors.
document_objs = [
    {
        "page": str(d.metadata['page']),  # page number stored as text
        "title": d.metadata['source'],
        "body": d.page_content,
    }
    for d in documents_text
]

In [3]:
# Every object is stored with its own pre-computed vector, so the two lists
# must line up one-to-one. Check this before the existing collection is
# dropped, so a stale kernel state cannot leave a half-filled collection.
if len(document_objs) != len(body_vectors):
    raise ValueError(
        f"{len(document_objs)} objects but {len(body_vectors)} vectors; "
        "re-run the embedding cell before importing"
    )

# Recreate the collection from scratch so re-running this cell does not
# accumulate duplicate objects.
client.collections.delete("citizens_info_docs")
client.collections.create(
    "citizens_info_docs",

    properties=[
        Property(name="page", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ]
)

collection = client.collections.get("citizens_info_docs")

# Import each property dict together with its matching vector.
with collection.batch.dynamic() as batch:
    for data_row, vector in zip(document_objs, body_vectors):
        batch.add_object(
            properties=data_row,
            vector=vector.tolist(),
        )

# Per the Weaviate batch-import documentation, objects that fail to import
# are collected on the batch rather than raised; report them so a partial
# import does not go unnoticed.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} objects failed to import; first failure: {failed_objects[0]}")

In [4]:
# Read objects back from the collection, asking for each object's stored
# vector to be returned alongside its properties.
# NOTE(review): no limit is passed here, so presumably only a default-sized
# page of objects comes back - confirm if the full collection is expected.
data_object = collection.query.fetch_objects(include_vector=True)

In [5]:
collection = client.collections.get("citizens_info_docs")

# Embed the question with the same model used for the stored chunk vectors.
question = "What do I do if my neighbour is having a party"
question_output = model.encode(question).tolist()

# Fetch the two nearest chunks and their distance to the query vector.
response = collection.query.near_vector(
    near_vector=question_output,  # a single query vector, as a list of floats
    target_vector='default',
    return_properties=['body', 'title'],
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

# Show each hit's properties followed by its distance.
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'title': 'pdf_docs/Noise pollution and noise complaints.pdf', 'body': 'If your pet is distressed by ﬁreworks, dogstrust.ie have ﬁrework advice and resources available.\nNoise from homes\nDomes\x00c noise is noise from a domes\x00c se\x00ng, like a home. Depending on the cause of the noise, you will\ncomplain in a diﬀerent way. This is outlined below.\nIf the noise is coming from a person’s home, the Gardaí can ask them to lower noise but they can’t enter the\nproperty with the sole inten\x00on of asking them to lower the noise.\nIf the noise con\x00nues, you can complain to the District Court (see above).\nThere is useful informa\x00on in the leaﬂet on neighbour disputes (pdf) published by FLAC, the Free Legal Advice\nCentres.\nNoise from rented proper\x00es\nIf there is noise coming from a rented home, you should approach the tenant ﬁrst and ask them to lower the\nnoise. If this doesn’t work, you can complain to the landlord. This may be a:\nPrivate landlord\nLocal authority\nHousing

In [6]:
collection = client.collections.get("citizens_info_docs")

# Embed the question with the same model used for the stored chunk vectors.
question = "How do I apply for maternity leave"
question_output = model.encode(question).tolist()

# Fetch the two nearest chunks and their distance to the query vector.
response = collection.query.near_vector(
    near_vector=question_output,  # a single query vector, as a list of floats
    target_vector='default',
    return_properties=['body', 'title'],
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

# Show each hit's properties followed by its distance.
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'title': 'pdf_docs/Pregnancy and social welfare payments.pdf', 'body': "4/12/24, 2:46 PM Pregnancy and social welfare payments\nhttps://www.citizensinformation.ie/en/social-welfare/families-and-children/pregnancy-and-social-welfare-payments/#5011b8 1/3Pregnancy and social welfare payments\nIntroduc\x00on\nPregnancy and jobseeker's payments\nPregnancy and Illness Beneﬁt\nPregnancy and One-Parent Family Payment\nPregnancy and employment schemes\nIntroduc\x00on\nIf you are pregnant and ge\x00ng a social welfare payment, you may be able to con\x00nue to get your\npayment as long as you sa\x00sfy the condi\x00ons, or transfer to another payment that be\x00er suits your\nneeds.\nThis page explains how pregnancy is treated if you are ge\x00ng Jobseeker’s Beneﬁt, Jobseeker’s\nAllowance, Illness Beneﬁt, One-Parent Family Payment or are on an employment scheme.\nPregnancy and Maternity Beneﬁt\nIf you are on maternity leave from work and have paid enough social insurance (PRSI), you can get\nMat

In [7]:
collection = client.collections.get("citizens_info_docs")

# Embed the question with the same model used for the stored chunk vectors.
question = "How do I ship from the UK"
question_output = model.encode(question).tolist()

# Fetch the two nearest chunks and their distance to the query vector.
response = collection.query.near_vector(
    near_vector=question_output,  # a single query vector, as a list of floats
    target_vector='default',
    return_properties=['body', 'title'],
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

# Show each hit's properties followed by its distance.
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

{'title': 'pdf_docs/Shipping your personal belongings back to Ireland.pdf', 'body': '4/17/24, 8:43 AM Shipping your personal belongings back to Ireland\nhttps://www.citizensinformation.ie/en/returning-to-ireland/planning-your-journey-home/shipping-your-belongings-back-to-ireland/ 4/5If you are travelling from the UK to Ireland with your belongings via passenger ferry, go to customs\nwhen you arrive at the port.\nFreight ferry\nIf shipping your belongings by freight ferry, you must give the ferry company your ‘Pre-Boarding\nNo\x00ﬁca\x00on (PBN) ID’. You can get your PBN ID from Revenue – simply request it when sending your\ncompleted ‘Transfer of residence’ form. See ‘Moving to Ireland from outside the EU’ above.\nRead more in Revenue’s guide to the procedure at importa\x00on.\nHow do I bring my car to Ireland?\nIf you bring your vehicle to Ireland, you must register the vehicle at a Na\x00onal Car Tes\x00ng Service\n(NCTS) centre. You may also have to pay Vehicle Registra\x00on Tax (V