28 changes: 27 additions & 1 deletion apps/tasks.py
@@ -30,7 +30,9 @@ def add_data_entry_task(datasource: DataSource, datasource_entry_items: List[Dat
)
try:
result = datasource_entry_handler.add_entry(datasource_entry_item)
datasource_entry_object.config = result.config
datasource_entry_config = result.config
datasource_entry_config["input"] = datasource_entry_item.dict()
datasource_entry_object.config = datasource_entry_config
datasource_entry_object.size = result.size
datasource_entry_object.status = DataSourceEntryStatus.READY
datasource_entries_size += result.size
@@ -85,6 +87,30 @@ def delete_data_entry_task(datasource: DataSource, entry_data: DataSourceEntry):
datasource.save()
return datasource_entry_items

def resync_data_entry_task(datasource: DataSource, entry_data: DataSourceEntry):
logger.info(f'Resyncing task for data_source_entry: {entry_data}')

datasource_entry_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
datasource.type,
)
datasource_entry_handler: DataSourceProcessor = datasource_entry_handler_cls(
datasource,
)
entry_data.status = DataSourceEntryStatus.PROCESSING
entry_data.save()
old_size = entry_data.size

result = datasource_entry_handler.resync_entry(entry_data.config)
entry_data.size = result.size
config_entry = result.config
config_entry["input"] = entry_data.config["input"]
entry_data.config = config_entry
entry_data.status = DataSourceEntryStatus.READY
entry_data.save()

datasource.size = datasource.size - old_size + result.size
datasource.save()


def delete_data_source_task(datasource):
datasource_type = datasource.type
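For reference, the resync task above depends on the "input" key that add_data_entry_task now stores alongside the handler's own config. A minimal sketch of the assumed shape of DataSourceEntry.config after this change — the keys outside "input" vary by handler and are placeholders here; "input" mirrors the DataSourceEntryItem fields visible in this diff:

# Sketch only: assumed shape of DataSourceEntry.config once add_data_entry_task has run.
# Keys outside "input" come from the handler's result.config and differ per data source;
# "input" is the DataSourceEntryItem dict that resync_data_entry_task feeds back into add_entry.
example_entry_config = {
    'document_ids': ['...'],              # hypothetical, handler-specific
    'input': {
        'name': 'https://example.com',    # hypothetical URL entry
        'metadata': {},
        'data': {'url': 'https://example.com'},
    },
}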
25 changes: 22 additions & 3 deletions client/src/pages/data.jsx
@@ -15,6 +15,7 @@ import { TextareaAutosize } from "@mui/base";

import DeleteOutlineOutlinedIcon from "@mui/icons-material/DeleteOutlineOutlined";
import AddOutlinedIcon from "@mui/icons-material/AddOutlined";
import SyncOutlinedIcon from "@mui/icons-material/SyncOutlined";
import PeopleOutlineOutlinedIcon from "@mui/icons-material/PeopleOutlineOutlined";
import PersonOutlineOutlinedIcon from "@mui/icons-material/PersonOutlineOutlined";

@@ -313,6 +314,8 @@ export default function DataPage() {
title: "Action",
key: "operation",
render: (record) => {
const isAdhocSyncSupported = record?.sync_config;

return (
<Box>
<IconButton
@@ -330,8 +333,22 @@
setDeleteConfirmationModalOpen(true);
}}
>
<DeleteOutlineOutlinedIcon />
<DeleteOutlineOutlinedIcon className="delete-dataentry-icon" />
</IconButton>
{isAdhocSyncSupported && (
<IconButton
onClick={() => {
axios()
.post(`/api/datasource_entries/${record.uuid}/resync`)
.then((response) => {
reloadDataSourceEntries();
});
}}
>
<SyncOutlinedIcon className="resync-dataentry-icon" />
</IconButton>
)}
</Box>
);
},
@@ -348,8 +365,10 @@
onRow={(record, rowIndex) => {
return {
onClick: (event) => {
setDataSourceEntryData(record);
setDataSourceEntryDrawerOpen(true);
if (event.target.tagName === "TD") {
setDataSourceEntryData(record);
setDataSourceEntryDrawerOpen(true);
}
},
};
}}
13 changes: 12 additions & 1 deletion datasources/apis.py
@@ -13,7 +13,7 @@
from .serializers import DataSourceEntrySerializer
from .serializers import DataSourceSerializer
from .serializers import DataSourceTypeSerializer
from apps.tasks import add_data_entry_task
from apps.tasks import add_data_entry_task, resync_data_entry_task
from apps.tasks import delete_data_entry_task
from apps.tasks import delete_data_source_task
from common.utils.utils import extract_urls_from_sitemap
@@ -87,7 +87,18 @@ def text_content(self, request, uid):
)
return DRFResponse({'content': content, 'metadata': metadata})

def resync(self, request, uid):
datasource_entry_object = get_object_or_404(
DataSourceEntry, uuid=uuid.UUID(uid),
)
if datasource_entry_object.datasource.owner != request.user:
return DRFResponse(status=404)

resync_data_entry_task(
datasource_entry_object.datasource, datasource_entry_object,
)

return DRFResponse(status=202)
class DataSourceViewSet(viewsets.ModelViewSet):
queryset = DataSource.objects.all()
serializer_class = DataSourceSerializer
22 changes: 20 additions & 2 deletions datasources/handlers/datasource_type_interface.py
@@ -1,3 +1,4 @@
from enum import Enum
import json
import logging
from string import Template
@@ -56,13 +57,23 @@ class DataSourceEntryItem(BaseModel):
metadata: dict = {}
data: Optional[dict] = None


class DataSourceSyncType(str, Enum):
FULL = 'full'
INCREMENTAL = 'incremental'

class DataSourceSyncConfiguration(_Schema):
sync_type: DataSourceSyncType = 'full'

class DataSourceProcessor(ProcessorInterface[BaseInputType, None, None]):

@classmethod
def get_content_key(cls) -> str:
datasource_type_interface = cls.__orig_bases__[0]
return datasource_type_interface.__args__[0].get_content_key()

@classmethod
def get_sync_configuration(cls) -> Optional[dict]:
return None

@classmethod
def get_weaviate_schema(cls, class_name: str) -> dict:
@@ -158,6 +169,7 @@ def _get_document_embeddings(self, text: str) -> OpenAIEmbeddingOutput:
return None

def add_entry(self, data: dict) -> Optional[DataSourceEntryItem]:
logger.info(f'Adding data_source_entry: {data}')
documents = self.get_data_documents(data)

documents = map(
@@ -206,7 +218,13 @@ def delete_entry(self, data: dict) -> None:
self.vectorstore._client.data_object.delete(
document_id, self.datasource_class_name,
)


def resync_entry(self, data: dict) -> Optional[DataSourceEntryItem]:
# Delete old data
self.delete_entry(data)
# Add new data
return self.add_entry(DataSourceEntryItem(**data["input"]))

def delete_all_entries(self) -> None:
self.vectorstore._client.schema.delete_class(
self.datasource_class_name,
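Taken together, a handler opts into ad-hoc resync by overriding get_sync_configuration and relying on the base-class resync_entry, which deletes the old documents and re-runs add_entry from the stored "input". A minimal hypothetical sketch (MyCustomSchema and MyCustomDataSource are illustrative names, not part of this PR, and the other required handler methods are omitted):

# Sketch only: how a concrete handler would advertise resync support.
from typing import Optional

from datasources.handlers.datasource_type_interface import (
    DataSourceProcessor,
    DataSourceSchema,
    DataSourceSyncConfiguration,
    DataSourceSyncType,
)


class MyCustomSchema(DataSourceSchema):
    pass


class MyCustomDataSource(DataSourceProcessor[MyCustomSchema]):

    @classmethod
    def get_sync_configuration(cls) -> Optional[dict]:
        # Returning a sync config makes the serializers expose sync_config,
        # which is what the data.jsx resync button checks before rendering.
        return DataSourceSyncConfiguration(sync_type=DataSourceSyncType.FULL).dict()

    # No resync_entry override is needed: the base implementation deletes the
    # old documents and calls add_entry(DataSourceEntryItem(**config['input'])).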
41 changes: 25 additions & 16 deletions datasources/handlers/website/url.py
@@ -1,5 +1,5 @@
import logging
from typing import List
from typing import Any, List
from typing import Optional

from pydantic import Field
@@ -9,7 +9,7 @@
from common.utils.text_extract import ExtraParams
from common.utils.splitter import SpacyTextSplitter
from common.utils.utils import extract_urls_from_sitemap
from datasources.handlers.datasource_type_interface import DataSourceEntryItem
from datasources.handlers.datasource_type_interface import DataSourceEntryItem, DataSourceSyncConfiguration, DataSourceSyncType
from datasources.handlers.datasource_type_interface import DataSourceSchema
from datasources.handlers.datasource_type_interface import DataSourceProcessor
from datasources.handlers.datasource_type_interface import WEAVIATE_SCHEMA
@@ -54,6 +54,26 @@ def name() -> str:
@staticmethod
def slug() -> str:
return 'url'

@classmethod
def get_sync_configuration(cls) -> Optional[dict]:
return DataSourceSyncConfiguration(sync_type=DataSourceSyncType.FULL).dict()

def get_url_data(self, url: str) -> Optional[DataSourceEntryItem]:
if not url.startswith('https://') and not url.startswith('http://'):
url = f'https://{url}'

text = extract_text_from_url(
url, extra_params=ExtraParams(openai_key=self.openai_key),
)
docs = [
Document(
page_content_key=self.get_content_key(), page_content=t, metadata={
'source': url,
},
) for t in SpacyTextSplitter(chunk_size=1500, length_func=len).split_text(text)
]
return docs

def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:
entry = URLSchema(**data)
@@ -83,22 +103,11 @@ def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:

return list(map(lambda entry: DataSourceEntryItem(name=entry, data={'url': entry}), urls + sitemap_urls))



def get_data_documents(self, data: DataSourceEntryItem) -> Optional[DataSourceEntryItem]:
url = data.data['url']
if not url.startswith('https://') and not url.startswith('http://'):
url = f'https://{url}'

text = extract_text_from_url(
url, extra_params=ExtraParams(openai_key=self.openai_key),
)
docs = [
Document(
page_content_key=self.get_content_key(), page_content=t, metadata={
'source': url,
},
) for t in SpacyTextSplitter(chunk_size=1500, length_func=len).split_text(text)
]
return docs
return self.get_url_data(url)

def similarity_search(self, query: str, *args, **kwargs) -> List[dict]:
return super().similarity_search(query, *args, **kwargs)
25 changes: 23 additions & 2 deletions datasources/serializers.py
@@ -11,6 +11,7 @@
class DataSourceTypeSerializer(serializers.ModelSerializer):
entry_config_schema = serializers.SerializerMethodField()
entry_config_ui_schema = serializers.SerializerMethodField()
sync_config = serializers.SerializerMethodField()

def get_entry_config_schema(self, obj):
datasource_type_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
@@ -27,12 +28,20 @@ def get_entry_config_ui_schema(self, obj):
if datasource_type_handler_cls is None:
return {}
return datasource_type_handler_cls.get_input_ui_schema()

def get_sync_config(self, obj):
datasource_type_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
obj,
)
if datasource_type_handler_cls is None:
return None
return datasource_type_handler_cls.get_sync_configuration()

class Meta:
model = DataSourceType
fields = [
'id', 'name', 'description',
'entry_config_schema', 'entry_config_ui_schema',
'entry_config_schema', 'entry_config_ui_schema', 'sync_config'
]


@@ -53,10 +62,22 @@ class Meta:

class DataSourceEntrySerializer(serializers.ModelSerializer):
datasource = DataSourceSerializer()
sync_config = serializers.SerializerMethodField()

def get_sync_config(self, obj):
datasource_type_handler_cls = DataSourceTypeFactory.get_datasource_type_handler(
obj.datasource.type,
)
if datasource_type_handler_cls is None:
return None
if "input" not in obj.config:
return None

return datasource_type_handler_cls.get_sync_configuration()

class Meta:
model = DataSourceEntry
fields = [
'uuid', 'datasource', 'config',
'name', 'size', 'status', 'created_at', 'updated_at',
'name', 'size', 'status', 'created_at', 'updated_at', 'sync_config'
]
4 changes: 4 additions & 0 deletions datasources/urls.py
@@ -48,4 +48,8 @@
'api/datasource_entries/<str:uid>/text_content',
apis.DataSourceEntryViewSet.as_view({'get': 'text_content'}),
),
path(
'api/datasource_entries/<str:uid>/resync',
apis.DataSourceEntryViewSet.as_view({'post': 'resync'}),
),
]
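With the route registered, the new endpoint can be exercised end to end. A minimal sketch using requests — host, port, and the session cookie are hypothetical, and authentication/CSRF handling follows whatever the rest of the app already uses; the real client path is the SyncOutlinedIcon button in data.jsx:

# Sketch only: manually triggering a resync for one entry.
import requests

entry_uuid = '00000000-0000-0000-0000-000000000000'  # placeholder entry UUID
resp = requests.post(
    f'http://localhost:9000/api/datasource_entries/{entry_uuid}/resync',  # hypothetical host/port
    cookies={'sessionid': '...'},  # assumes the app's session-based auth
)
# resync_data_entry_task runs synchronously inside the view, so a 202 here means the
# entry has already been deleted and re-added from its stored "input" config.
print(resp.status_code)  # expected: 202 (404 if the entry is missing or not owned by the caller)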