Skip to content

Commit

Permalink
returning the image gsurl
Browse files (browse the repository at this point in the history)
  • Loading branch information
MarkEdmondson1234 committed Mar 25, 2024
1 parent 9e42e3c commit 04594d1
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 4 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

# Define your base version
version = '0.32.4'
version = '0.32.5'

setup(
name='sunholo',
Expand Down
3 changes: 3 additions & 0 deletions sunholo/chunker/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,6 @@ def upload_doc_images(metadata):
)
os.remove(temp_image.name)
logging.info(f"Uploaded image to GCS: {image_gsurl}")

return image_gsurl

9 changes: 6 additions & 3 deletions sunholo/chunker/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ def chunk_doc_to_docs(documents: list, extension: str = ".md", min_size: int = 8
for document in documents:
content = remove_whitespace(document.page_content)

# look for images and upload them for later extraction
upload_doc_images(document.metadata)
# look for images and upload them for later extraction, add metadata of location
image_gsurl = upload_doc_images(document.metadata)
if image_gsurl:
document.metadata["image_gsurl"] = image_gsurl

if len(content) < min_size:
combined_documents_content += content + "\n"
logging.info(f"Appending document as its smaller than {min_size}: length {len(content)}")
logging.info(f"Appending document as its smaller than {min_size}: length {len(content)} - appended doc length {len(combined_documents_content)}")
else:
if combined_documents_content:
combined_documents.append(Document(page_content=combined_documents_content, metadata=document.metadata))
Expand Down Expand Up @@ -76,6 +78,7 @@ def chunk_doc_to_docs(documents: list, extension: str = ".md", min_size: int = 8
logging.info(f"Appending chunk as its smaller than {min_size}: length {len(chunk)}")
continue

logging.info(f"Adding chunk of length {len(chunk)}")
document.metadata["chunk_number"] = chunk_number
source_chunks.append(Document(page_content=chunk, metadata=document.metadata))

Expand Down

0 comments on commit 04594d1

Please sign in to comment.