In [4]:
from dotenv import load_dotenv
from conversationService import get_embedding
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import loggingService
import os
import weaviate

# Load variables from a .env file into the process environment so the
# os.getenv lookups in the following cells can see them.
load_dotenv()
# Logger obtained from the project-local loggingService module; later cells log through it.
logger = loggingService.get_logger()


In [None]:
# Configuration read from the environment (each with a local-development default).
# NOTE(review): "GEANAI_KEY" may be a misspelling — confirm the intended variable name.
apikey = os.getenv("GEANAI_KEY", None)
# Prefer the correctly spelled WEAVIATE_CLASS, but keep honouring the legacy
# misspelled WEVIATE_CLASS so existing .env files continue to work.
class_name = os.getenv("WEAVIATE_CLASS") or os.getenv("WEVIATE_CLASS", 'LivrosVectorizer')
path = os.getenv("DATA_PATH", "data")
weaviate_url = os.getenv("WEAVIATE_URL", 'http://127.0.0.1:8080')

client = weaviate.Client(
    url=weaviate_url,
)

# Filled by a later cell with one plain dict per document chunk.
pages = []
pdf_loader = PyPDFDirectoryLoader(path)
# Separators are tried in order and the scan stops at "" (which always matches),
# so "" must come last and longer separators must precede their own substrings;
# otherwise the multi-newline variants are never reached.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    separators=["\n\n\n", "\n\n", "\n \n \n", "\n \n", "\n", " ", ""],
)
documents = pdf_loader.load_and_split(text_splitter=text_splitter)
print(len(documents))
# Guard the preview: an empty data directory yields no chunks at all.
if documents:
  print(documents[0])

In [None]:
def pdf_text_splitter(pdf_text) -> dict:
  """Flatten one loaded document chunk into a plain dict.

  Args:
    pdf_text: object exposing a ``page_content`` attribute and a ``metadata``
      mapping that contains the keys ``'source'`` and ``'page'``.

  Returns:
    A dict with exactly the keys ``'content'``, ``'source'`` and ``'page'``.

  Raises:
    AttributeError: if ``pdf_text`` lacks ``page_content`` or ``metadata``.
    KeyError: if ``metadata`` has no ``'source'`` or ``'page'`` entry.
  """
  content = pdf_text.page_content
  metadata = pdf_text.metadata

  # The key must be spelled 'source': the import cell reads page["source"],
  # and a misspelled key would leave that field empty for every object.
  return {
    'content': content,
    'source': metadata['source'],
    'page': metadata['page'],
  }

# Rebuild `pages` from scratch instead of appending to the existing list,
# so re-running this cell does not duplicate every chunk.
pages = [pdf_text_splitter(doc) for doc in documents]

# Preview the first record; skipped when no documents were loaded.
if pages:
  print(pages[0])

In [None]:
def _text_property(name: str, description: str = "") -> dict:
  """Build one `text` property definition with text2vec-transformers settings."""
  return {
    "dataType": [
      "text"
    ],
    "description": description,
    "moduleConfig": {
      "text2vec-transformers": {
        "skip": False,
        "vectorizePropertyName": False
      }
    },
    "name": name
  }


def _livros_class(name: str, vectorizer: str) -> dict:
  """Build the class definition for PDF chunks; only name and vectorizer vary."""
  return {
    "class": name,
    "description": "Dados do documento PDF",
    "moduleConfig": {
      "text2vec-transformers": {
        "poolingStrategy": "masked_mean",
        "vectorizeClassName": False
      }
    },
    "properties": [
      _text_property("content", "Conteúdo do documento PDF"),
      _text_property("source"),
      _text_property("page"),
    ],
    "vectorizer": vectorizer
  }


# Class whose vectors are supplied by the client at import time.
class_document = _livros_class("Livros", "none")

# Same layout, but with the text2vec-transformers vectorizer configured.
class_document2 = _livros_class("LivrosVectorizer", "text2vec-transformers")

# Create the class only when the server does not have it yet, so this cell can
# be re-run against an already-populated Weaviate instance without raising.
# To rebuild from scratch, first run: client.schema.delete_class('Livros')
existing_classes = {
  definition["class"]
  for definition in (client.schema.get().get("classes") or [])
}
if class_document["class"] not in existing_classes:
  client.schema.create_class(class_document)
else:
  logger.info(f"class {class_document['class']} already exists; skipping creation")


In [None]:
# Import every page dict into the 'Livros' class, 10 objects per batch.
client.batch.configure(batch_size=10)
with client.batch as batch:
  for position, page in enumerate(pages, start=1):
    logger.info(f"importing question: {position}")

    payload = {
      "content": page["content"],
      "page": str(page["page"]),  # stored as text, matching the schema property type
      "source": page["source"],
    }

    # The vector is computed here and passed explicitly with the object.
    embedding = get_embedding(page["content"])

    batch.add_data_object(payload, 'Livros', vector=embedding)

In [None]:
# The response is keyed by the class that was queried, so use one name for both
# the query and the lookup (indexing with `class_name` fails whenever it differs).
queried_class = 'Livros'

result = (client.query
  .get(queried_class, ["content", "source", "page"])
  .with_additional(["certainty", "distance"]) # note that certainty is only supported if distance==cosine
  .with_near_vector({
    "vector": get_embedding('o que significa casmurro'),
    "certainty": 0.8
  })
  .with_limit(4)
  .do()
)

matches = result['data']['Get'][queried_class]
print(matches)

# Join the content of every match, in the order returned, one per line;
# an empty result yields an empty string.
retorno = "\n".join(match['content'] for match in matches)

logger.info(retorno)


In [None]:
# Print the aggregate meta count for the 'Livros' class.
livros_count = client.query.aggregate('Livros').with_meta_count().do()
print(livros_count)

# Raw excerpt used as the search text for the near-vector lookup below.
excerpt = 'um vasto círculo de saias ou, t repadas no a r, choviam pés e pernas sobre a minha cabeça. Assim fui \naté madruga da. Não dorm i mais;'

# Build the query step by step, then execute it.
query = client.query.get('Livros', ["content", "source", "page"])
# note that certainty is only supported if distance==cosine
query = query.with_additional(["certainty", "distance"])
query = query.with_near_vector({
  "vector": get_embedding(excerpt),
  "certainty": 0.8
})
query = query.with_limit(4)
result = query.do()

print(result)
print(result['data']['Get']['Livros'])

In [None]:
# Fetch 'Livros' objects starting at offset 300, with vectors included,
# then inspect the first object returned.
result = client.data_object.get(
  class_name='Livros',
  offset=300,
  with_vector=True,
)

first_object = result['objects'][0]
print(first_object['properties'])
print(len(first_object['vector']))