In [1]:
import weaviate
import weaviate.classes as wvc
import os
from dotenv import load_dotenv
from weaviate.classes.config import Property, DataType
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import MetadataQuery

import torch.nn.functional as F

# Folder holding the source PDFs, given relative to the working directory
# used by the rest of the notebook.
path_to_pdf = 'pdf_docs'

cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
# Step up one directory only while the PDF folder is not reachable from the
# current one; an unconditional chdir climbs one more level every time this
# cell is re-run on the same kernel.
# NOTE(review): presumably the notebook is started from a sub-folder of the
# project root -- confirm.
if not os.path.isdir(path_to_pdf):
    os.chdir(parent_dir)

load_dotenv()
openai_api_key = os.getenv("OPENAI_KEY")

client = weaviate.connect_to_local(
    port=8080,
    grpc_port=50051,
    additional_config=weaviate.config.AdditionalConfig(timeout=(60, 180)),
    # Send the inference-key header only when a key is configured, so an
    # unset OPENAI_KEY does not place None in the header value.
    headers={"X-OpenAI-Api-Key": openai_api_key} if openai_api_key else None,
)

# The splitter is stateless configuration, so build it once for all files.
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)

documents_text = []

# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# chunk positions reproducible between runs.
for doc in sorted(os.listdir(path_to_pdf)):
    # Skip anything that is not a PDF (hidden files, checkpoints, ...).
    if not doc.lower().endswith('.pdf'):
        continue

    doc_path = f'{path_to_pdf}/{doc}'
    loader = PyPDFLoader(doc_path)
    # NOTE(review): load_and_split() is called without a splitter and its
    # result is then split again below -- confirm the double split is intended.
    pages = loader.load_and_split()
    docs = text_splitter.split_documents(pages)
    # extend() keeps documents_text flat; no separate flattening pass needed.
    documents_text.extend(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One property dict per chunk, in the same order as documents_text:
# the page number (stored as text), the source path as title, and the chunk text.
document_objs = [
    {
        "page": str(chunk.metadata['page']),
        "title": chunk.metadata['source'],
        "body": chunk.page_content,
    }
    for chunk in documents_text
]

In [3]:
from transformers import BitsAndBytesConfig, AutoModel, AutoTokenizer, BitsAndBytesConfig
from torch.nn.functional import Tensor
import torch

# Quantisation settings handed to the model loader below: 4-bit "nf4" weights
# with double quantisation enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the embedding model with the quantisation config above;
# device_map='auto' leaves device placement to the loader.
encoder = AutoModel.from_pretrained(
    'Salesforce/SFR-Embedding-Mistral',
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
)

def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Pick one hidden-state vector per sequence: the one at its last attended position.

    Args:
        last_hidden_states: Tensor indexed as [row, position, ...].
        attention_mask: Tensor indexed as [row, position]; entries are summed,
            so attended positions are expected to be 1 and padding 0.

    Returns:
        For each row, the hidden state at the final position when every row's
        final mask entry is set (left padding); otherwise the hidden state at
        index ``mask_row.sum() - 1``.
    """
    row_count = attention_mask.shape[0]

    # If the last column of the mask is set for every row, all sequences end
    # at the final position and a plain slice is enough.
    if attention_mask[:, -1].sum() == row_count:
        return last_hidden_states[:, -1]

    # Otherwise gather, row by row, the position of the last attended token.
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

Loading checkpoint shards: 100%|██████████| 3/3 [00:15<00:00,  5.22s/it]


In [4]:
tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-Mistral')
max_length = 4096


def _embed_text(text):
    """Encode a single string and return its pooled embedding as a NumPy array.

    The text is tokenised (truncated to ``max_length``), run through
    ``encoder`` without gradient tracking, pooled with ``last_token_pool`` and
    moved to the CPU as float32.
    """
    batch_dict = tokenizer(text, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
    with torch.no_grad():
        outputs = encoder(**batch_dict)
    pooled = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # Batch of one: take row 0 and hand back a plain array.
    return pooled[0].float().cpu().detach().numpy()


body_vectors = []
title_vectors = []
# The title is the source path and repeats for every chunk of the same PDF,
# so each distinct title is encoded once and reused.
_title_cache = {}

for d in documents_text:

    body = d.page_content
    title = d.metadata['source']

    body_vectors.append(_embed_text(body))

    if title not in _title_cache:
        _title_cache[title] = _embed_text(title)
    # Chunks sharing a title share the same array object; copy before mutating.
    title_vectors.append(_title_cache[title])

torch.cuda.empty_cache()

In [5]:
# Drop and recreate the collection so re-running this cell starts from an
# empty collection instead of adding the same objects again.
client.collections.delete("citizens_info_docs") 
client.collections.create(
    "citizens_info_docs",

    properties=[  
        Property(name="page", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ]
)

collection = client.collections.get("citizens_info_docs")

# Objects and vectors are paired purely by position; fail early if the two
# lists were built from different runs and no longer line up.
assert len(document_objs) == len(body_vectors), (
    f"{len(document_objs)} objects but {len(body_vectors)} vectors"
)

with collection.batch.dynamic() as batch:
    for data_row, body_vector in zip(document_objs, body_vectors):
        batch.add_object(
            properties=data_row,
            # Self-provided embedding, converted from a NumPy array to a list.
            vector = body_vector.tolist(),
        )

# Surface objects that did not make it into the collection instead of
# leaving a silently incomplete index.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} of {len(document_objs)} objects failed to import")
    print(failed_objects[0])

In [6]:
# Read objects back from the collection together with their stored vectors.
data_object = collection.query.fetch_objects(include_vector=True)

In [7]:
def convert_text_to_tokens(text:str, tokenizer, max_length):
    """Embed ``text`` with the global ``encoder`` and return the vector.

    Despite the name, the return value is the pooled embedding, not token ids.

    Args:
        text: String to embed.
        tokenizer: Tokenizer used to encode ``text``.
        max_length: Truncation limit passed to the tokenizer.

    Returns:
        The pooled embedding of ``text`` as a float32 NumPy array on the CPU.
    """
    # NOTE(review): the SFR-Embedding-Mistral model card formats queries with
    # an "Instruct: ...\nQuery: ..." prefix -- confirm whether passing the raw
    # question is intended.
    batch_dict = tokenizer(text, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
    # Inference only: skip autograd bookkeeping, as the document-embedding
    # loop does, so no graph is kept alive for the forward pass.
    with torch.no_grad():
        output = encoder(**batch_dict)
    embeddings = last_token_pool(output.last_hidden_state, batch_dict['attention_mask'])[0].float().cpu().detach().numpy()
    return embeddings

In [8]:
# Embed the question and fetch the two nearest stored chunks.
collection = client.collections.get("citizens_info_docs")
question = "What do I do if my neighbour is having a party"
question_embeddings = convert_text_to_tokens(question, tokenizer, max_length)

response = collection.query.near_vector(
    near_vector=question_embeddings.tolist(),  # NumPy array -> plain list
    limit=2,
    target_vector='default',
    return_properties=['body', 'title'],
    return_metadata=MetadataQuery(distance=True),
)

# Show each hit's properties followed by its distance to the query vector.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'title': 'pdf_docs/Noise pollution and noise complaints.pdf', 'body': 'If your pet is distressed by ﬁreworks, dogstrust.ie have ﬁrework advice and resources available.\nNoise from homes\nDomes\x00c noise is noise from a domes\x00c se\x00ng, like a home. Depending on the cause of the noise, you will\ncomplain in a diﬀerent way. This is outlined below.\nIf the noise is coming from a person’s home, the Gardaí can ask them to lower noise but they can’t enter the\nproperty with the sole inten\x00on of asking them to lower the noise.\nIf the noise con\x00nues, you can complain to the District Court (see above).\nThere is useful informa\x00on in the leaﬂet on neighbour disputes (pdf) published by FLAC, the Free Legal Advice\nCentres.\nNoise from rented proper\x00es\nIf there is noise coming from a rented home, you should approach the tenant ﬁrst and ask them to lower the\nnoise. If this doesn’t work, you can complain to the landlord. This may be a:\nPrivate landlord\nLocal authority\nHousing

In [9]:
# Embed the question and fetch the two nearest stored chunks.
collection = client.collections.get("citizens_info_docs")
question = "What is the cheapeast way to return back to Ireland"
question_embeddings = convert_text_to_tokens(question, tokenizer, max_length)

response = collection.query.near_vector(
    near_vector=question_embeddings.tolist(),  # NumPy array -> plain list
    limit=2,
    target_vector='default',
    return_properties=['body', 'title'],
    return_metadata=MetadataQuery(distance=True),
)

# Show each hit's properties followed by its distance to the query vector.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'title': 'pdf_docs/Shipping your personal belongings back to Ireland.pdf', 'body': '4/17/24, 8:43 AM Shipping your personal belongings back to Ireland\nhttps://www.citizensinformation.ie/en/returning-to-ireland/planning-your-journey-home/shipping-your-belongings-back-to-ireland/ 5/5Dublin\nIreland\nHomepage: h\x00ps://www.revenue.ie/en/contact-us/index.aspx\nPage edited: 12 January 2024\nManage cookie preferences\nManage preferences'}
0.2523820400238037
{'title': 'pdf_docs/Shipping your personal belongings back to Ireland.pdf', 'body': '4/17/24, 8:43 AM Shipping your personal belongings back to Ireland\nhttps://www.citizensinformation.ie/en/returning-to-ireland/planning-your-journey-home/shipping-your-belongings-back-to-ireland/ 1/5Shipping your personal belongings back to Ireland\nIntroduc\x00on\nHow much does it cost to ship belongings to Ireland?\nDo I have to pay customs duty or tax?\nClaim relief from customs duty and VAT\nWhat documents do I need when shipping my items?\nHow do 