In [1]:
import weaviate
import weaviate.classes as wvc
import os
import requests
import json
import torch
from dotenv import load_dotenv
from weaviate.classes.config import Configure, Property, DataType, Tokenization
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import Tensor
import torch.nn.functional as F

def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

load_dotenv()
openai_api_key = os.getenv("OPENAI_KEY")

client = weaviate.connect_to_local(
    port=8080,
    grpc_port=50051,
    additional_config=weaviate.config.AdditionalConfig(timeout=(60, 180)),
    headers={
        "X-OpenAI-Api-Key": openai_api_key  # Replace with your inference API key
    }
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
path_to_pdf = 'pdf_docs'

documents_text = []

for doc in os.listdir(path_to_pdf):

    doc_path = f'{path_to_pdf}/{doc}'
    loader = PyPDFLoader(doc_path)
    pages = loader.load_and_split()
    text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
    docs = text_splitter.split_documents(pages)
    documents_text.append(docs)

documents_text = [item for sublist in documents_text for item in sublist]

In [30]:
client.collections.delete("citizens_info_docs") 
client.collections.create(
    "citizens_info_docs",

    properties=[  
        Property(name="page", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ]
)

<weaviate.collections.collection.Collection at 0x7f83e6f43750>

In [None]:
document_objs = list()

for d in documents_text:
    # Extract data from each document
    source = d.metadata['source']
    page = str(d.metadata['page'])  # page number to string
    content = d.page_content

    # Prepare properties for the data object
    document_objs.append({
        "page": page,
        "title": source,
        "body": content
    })

# Assuming you have a collection named 'Documents' set up in Weaviate
documents = client.collections.get("citizens_info_docs")
documents.data.insert_many(document_objs)  # Insert all documents at once

BatchObjectReturn(all_responses=[UUID('538c0ed0-6bab-4943-a12e-d1aebad3822f'), UUID('e0ec4d43-0f44-4d43-8beb-c603c6e33a4d'), UUID('113d1da1-d0e3-43e9-9a74-115b0b3fe24b'), UUID('70791ef7-3a4e-4b03-a38d-2990ebec7396'), UUID('aa6c5ee9-b922-4c73-add2-07782740a72c'), UUID('24be04d4-905b-4188-961b-13d8ce572e6a'), UUID('c3e97d9d-c7dd-4477-9c2a-35b2ea2a601c'), UUID('a9046fdb-a8dc-4469-a513-80fac5b6e010'), UUID('59bf8978-ec59-461d-a494-6a9a46a284a5'), UUID('45364b91-4ade-40ec-b8c6-2330f9cac8ec'), UUID('9a078cad-5906-47c5-9f80-94ee62d48d0a'), UUID('47f5fc8d-0756-40b4-889d-1b040e13e42c'), UUID('b277543a-d342-4336-8f3c-65e910c39ec1'), UUID('1c4c5860-7f5d-428e-8d19-57907919acac'), UUID('182df9c9-f1ca-475d-836e-0e68777bf2e0'), UUID('ba3bfb11-0595-4015-b806-1ba7bd650939'), UUID('427c01d1-bebb-45ee-af50-492b836fb061'), UUID('c10a3b39-cf27-433e-83af-8872632d07ed'), UUID('4fd36b5c-c344-4647-a6ac-313a3378bad6')], elapsed_seconds=0.8575887680053711, errors={}, uuids={0: UUID('538c0ed0-6bab-4943-a12e-d1aeb

In [4]:
import torch
from transformers import BitsAndBytesConfig, AutoModel

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

encoder = AutoModel.from_pretrained(
    'Salesforce/SFR-Embedding-Mistral',
    trust_remote_code=True,
    device_map='auto',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.87s/it]


In [6]:
tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-Mistral')
max_length = 4096
body_vectors = []
title_vectors = []

for d in documents_text:

    body = d.page_content
    title = d.metadata['source']
    body_batch_dict = tokenizer(body, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
    title_batch_dict = tokenizer(title, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
    body_outputs = encoder(**body_batch_dict)
    title_outputs = encoder(**title_batch_dict)
    body_embs = last_token_pool(body_outputs.last_hidden_state, body_batch_dict['attention_mask'])[0].float().cpu().detach().numpy(force=True)
    title_embs = last_token_pool(title_outputs.last_hidden_state, title_batch_dict['attention_mask'])[0].float().cpu().detach().numpy(force=True)

    body_vectors.append(body_embs)
    title_vectors.append(title_embs)
    torch.cuda.empty_cache()

In [11]:
### Maybe not needed
# flattened_body_vectors = [vector for vector_list in body_vectors for vector in vector_list]
# flattened_title_vectors = [vector for vector_list in title_vectors for vector in vector_list]

In [None]:
# Create the properties which is the text of the 3 fields of my object (class)
# Create the two vectors and insert them

'''

with collection.batch.dynamic() as batch:
    for i, data_row in enumerate(data_rows):
        batch.add_object(
            properties=data_row,
            vector={
                "title": title_vectors[i],
                "body": body_vectors[i],
            }
        )

'''

In [33]:
document_objs = list()

for d in documents_text:
    # Extract data from each document
    source = d.metadata['source']
    page = str(d.metadata['page'])  # page number to string
    content = d.page_content

    # Prepare properties for the data object
    document_objs.append({
        "page": page,
        "title": source,
        "body": content
    })

In [35]:
collection = client.collections.get("citizens_info_docs")

with collection.batch.dynamic() as batch:
    for i, data_row in enumerate(document_objs):
        batch.add_object(
            properties=data_row,
            vector={
                "title": title_vectors[i],
                "body": body_vectors[i],
            }
        )

In [69]:
vector_names = ["title", "body"]

data_object = collection.query.fetch_objects(
    include_vector=vector_names  # Specify names of the vectors to include
)

In [90]:
random_uuid = '16191569-10df-4e3a-8ccb-a6a16587d26c'

one_data_object = collection.query.fetch_object_by_id(
    random_uuid,
    include_vector=vector_names
)

print(one_data_object)

ObjectSingleReturn(uuid=_WeaviateUUIDInt('16191569-10df-4e3a-8ccb-a6a16587d26c'), metadata=MetadataSingleObjectReturn(creation_time=datetime.datetime(2024, 4, 17, 13, 41, 16, 706000, tzinfo=datetime.timezone.utc), last_update_time=datetime.datetime(2024, 4, 17, 13, 41, 16, 706000, tzinfo=datetime.timezone.utc), is_consistent=None), properties={'title': 'pdf_docs/Pregnancy and social welfare payments.pdf', 'page': '0', 'body': "4/12/24, 2:46 PM Pregnancy and social welfare payments\nhttps://www.citizensinformation.ie/en/social-welfare/families-and-children/pregnancy-and-social-welfare-payments/#5011b8 1/3Pregnancy and social welfare payments\nIntroduc\x00on\nPregnancy and jobseeker's payments\nPregnancy and Illness Beneﬁt\nPregnancy and One-Parent Family Payment\nPregnancy and employment schemes\nIntroduc\x00on\nIf you are pregnant and ge\x00ng a social welfare payment, you may be able to con\x00nue to get your\npayment as long as you sa\x00sfy the condi\x00ons, or transfer to another p

In [91]:
one_data_object.vector

{'title': [3.46875,
  1.1484375,
  2.5,
  0.9765625,
  -1.578125,
  -0.9375,
  -2.25,
  3.53125,
  9.6875,
  -1.109375,
  -2.109375,
  2.125,
  2.359375,
  3.40625,
  1.0546875,
  -4.5,
  3.25,
  -0.71484375,
  3.109375,
  -3.96875,
  -7.09375,
  -11.125,
  -1.8203125,
  1.4375,
  3.0625,
  -4.15625,
  -2.21875,
  -4.5625,
  -1.71875,
  6.09375,
  -1.6171875,
  8.625,
  5.0,
  2.1875,
  3.796875,
  -2.953125,
  -2.34375,
  -4.21875,
  -0.8828125,
  -0.6640625,
  -3.9375,
  -3.5,
  4.40625,
  -0.484375,
  4.3125,
  6.71875,
  6.46875,
  2.140625,
  -2.03125,
  -3.125,
  0.56640625,
  3.59375,
  -0.5703125,
  32.0,
  -1.2421875,
  0.6484375,
  8.1875,
  2.921875,
  3.109375,
  -1.515625,
  -1.7421875,
  7.78125,
  -4.5625,
  -6.6875,
  -0.224609375,
  5.3125,
  3.0625,
  0.94921875,
  -5.0,
  -0.9296875,
  1.0625,
  -7.65625,
  -4.71875,
  3.546875,
  6.625,
  -1.5234375,
  -4.46875,
  -1.2890625,
  0.8125,
  -4.84375,
  3.1875,
  -2.203125,
  5.28125,
  1.578125,
  -1.1953125,
  4.375,


In [81]:
data_object.objects[0].vector['body']

[0.73046875,
 3.484375,
 5.84375,
 2.578125,
 -3.4375,
 -1.71875,
 -0.9296875,
 4.75,
 8.375,
 -1.1875,
 0.4921875,
 0.9765625,
 4.5,
 2.96875,
 -0.0791015625,
 -3.96875,
 4.03125,
 -0.318359375,
 4.5,
 -4.8125,
 -5.1875,
 -6.625,
 0.6796875,
 -1.171875,
 5.78125,
 2.484375,
 -1.0625,
 1.921875,
 1.484375,
 6.6875,
 0.9765625,
 3.15625,
 4.375,
 0.1728515625,
 -0.263671875,
 -3.78125,
 -1.546875,
 -1.6484375,
 -6.34375,
 2.71875,
 -0.26171875,
 -1.375,
 2.0,
 -1.0078125,
 4.8125,
 3.65625,
 8.875,
 4.28125,
 1.609375,
 -4.25,
 -1.2734375,
 2.609375,
 -0.0306396484375,
 20.375,
 -2.625,
 -0.5859375,
 4.375,
 -4.5,
 1.1171875,
 -3.984375,
 -1.609375,
 9.1875,
 -3.734375,
 -5.875,
 4.09375,
 6.8125,
 2.765625,
 2.546875,
 -3.15625,
 -5.125,
 0.12451171875,
 -7.90625,
 -7.375,
 0.0849609375,
 4.84375,
 -4.25,
 -4.875,
 3.4375,
 -1.8984375,
 -2.34375,
 1.1796875,
 -5.375,
 5.71875,
 1.84375,
 -1.3203125,
 10.375,
 2.125,
 -4.59375,
 7.09375,
 -4.25,
 -0.74609375,
 -6.125,
 5.1875,
 2.65625,

In [111]:
question = f"What do I do if my neighbour is having a party"

question_batch_dict = tokenizer(question, max_length=max_length, padding=True, truncation=True, return_tensors="pt").to('cuda')
question_output = encoder(**question_batch_dict)
question_embs = last_token_pool(body_outputs.last_hidden_state, body_batch_dict['attention_mask'])[0].float().cpu().detach().numpy()[0]

In [112]:
from weaviate.classes.query import MetadataQuery

# Convert the NumPy array to a list with a single element
question_embs_list = [question_embs.tolist()]

response = collection.query.near_vector(
    near_vector=question_embs_list,  # Pass the list of vectors
    target_vector="body",  # Specify the vector properties as a comma-separated string
    return_properties=['body', 'title'],
    limit=2,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

In [100]:
len(question_embs_list)

4096

In [109]:
response

QueryReturn(objects=[])