In [1]:
import weaviate
import weaviate.classes as wvc
import os
import torch
from dotenv import load_dotenv
from weaviate.classes.config import Configure, Property, DataType, Tokenization
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import Tensor
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BitsAndBytesConfig, AutoModel
from sentence_transformers import SentenceTransformer

import torch.nn.functional as F

# Read settings from a .env file (if one is found) so the OpenAI key is taken
# from the environment instead of being hard-coded in the notebook.
load_dotenv()
openai_api_key = os.getenv("OPENAI_KEY")

# Work from the parent of the directory the kernel started in, so relative
# paths used below resolve from there. The parent is computed only once per
# kernel: recomputing it on every run of this cell would climb one directory
# higher each time the cell is re-executed.
if "parent_dir" not in globals():
    cwd = os.getcwd()
    parent_dir = os.path.dirname(cwd)
os.chdir(parent_dir)

# Connect to the local Weaviate instance. The OpenAI header is only attached
# when a key is actually configured, because None is not a valid header value.
client = weaviate.connect_to_local(
    port=8080,
    grpc_port=50051,
    additional_config=weaviate.config.AdditionalConfig(timeout=(60, 180)),
    headers={"X-OpenAI-Api-Key": openai_api_key} if openai_api_key else None,
)
path_to_pdf = 'pdf_docs'

# The splitter settings are the same for every file, so build it once instead
# of once per PDF.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Flat list of chunk documents across all PDFs.
documents_text = []

# os.listdir returns entries in arbitrary order; sorting keeps the chunk order
# (and therefore the import order) reproducible between runs.
for doc in sorted(os.listdir(path_to_pdf)):
    # Skip anything that is not a PDF (hidden files, sub-directories, ...)
    # rather than handing it to the PDF loader.
    if not doc.lower().endswith('.pdf'):
        continue

    # Keep the original "<dir>/<file>" form: this string is what the loader
    # receives as the path of each document.
    doc_path = f'{path_to_pdf}/{doc}'
    loader = PyPDFLoader(doc_path)
    # NOTE(review): load_and_split() presumably applies its own default
    # splitter before the CharacterTextSplitter pass below - confirm the
    # double split is intended.
    pages = loader.load_and_split()
    docs = text_splitter.split_documents(pages)
    documents_text.extend(docs)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sentence-embedding model used to vectorise every chunk body.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode all chunk texts in a single call so the model can batch them, instead
# of invoking it once per chunk. The result is unpacked into a list with one
# vector per chunk, in the same order as documents_text.
bodies = [chunk.page_content for chunk in documents_text]
body_vectors = list(model.encode(bodies)) if bodies else []

# Build one property dict per chunk: the chunk's `source` metadata becomes the
# title, its `page` metadata is stored as a string, and the chunk text is the body.
document_objs = []
for chunk in documents_text:
    meta = chunk.metadata
    source = meta['source']
    page_label = str(meta['page'])
    document_objs.append(
        {"page": page_label, "title": source, "body": chunk.page_content}
    )

In [3]:
# Rebuild the collection from scratch: drop the existing one, then recreate it
# with three text properties (page, title, body).
client.collections.delete("citizens_info_docs")
client.collections.create(
    "citizens_info_docs",
    properties=[
        Property(name="page", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ],
)

collection = client.collections.get("citizens_info_docs")

# Import every chunk together with its precomputed embedding.
with collection.batch.dynamic() as batch:
    for i, data_row in enumerate(document_objs):
        batch.add_object(
            properties=data_row,
            # Send a plain list of floats rather than the NumPy array itself.
            vector=body_vectors[i].tolist(),
        )

# Objects rejected during the batch are collected on the batch object rather
# than raised, so report them instead of letting the import fail silently.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} objects failed to import; first failure: {failed_objects[0]}")

In [4]:
# Start from a clean slate: delete the collection and create it again with the
# page / title / body text properties.
client.collections.delete("citizens_info_docs")
client.collections.create(
    "citizens_info_docs",
    properties=[
        Property(name="page", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="body", data_type=DataType.TEXT),
    ],
)

collection = client.collections.get("citizens_info_docs")

# Add each property dict with its matching embedding, converted to a list of floats.
with collection.batch.dynamic() as batch:
    for i, data_row in enumerate(document_objs):
        batch.add_object(
            properties=data_row,
            vector=body_vectors[i].tolist(),
        )

# Failed inserts are recorded on the batch object instead of raising; surface
# them so a partially imported collection does not go unnoticed.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} objects failed to import; first failure: {failed_objects[0]}")

In [5]:
# Read objects back from the collection, requesting that each object's vector
# is returned along with it.
data_object = collection.query.fetch_objects(include_vector=True)

In [None]:
from weaviate.classes.query import MetadataQuery

collection = client.collections.get("citizens_info_docs")
question = "What do I do if my neighbour is having a party"

# Embed the question with `model`, the same SentenceTransformer that produced
# the stored chunk vectors, so the query vector lives in the same space and has
# the same dimensionality as the vectors it is compared against. (The previous
# version relied on an `encoder` and a `last_token_pool` helper that are not
# defined in this notebook, and on a different embedding model.)
question_embs = model.encode(question)

# Plain list of floats for the query.
q_emb_list = question_embs.tolist()

# The collection is created without named vectors, so the search runs against
# the default vector and no target vector is specified.
response = collection.query.near_vector(
    near_vector=q_emb_list,
    return_properties=['body', 'title'],
    limit=2,
    return_metadata=MetadataQuery(distance=True),
)

# Show the two closest chunks and their distance to the question.
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)