# Improve Weaviate Interoperability

We want to make it easier for Weaviate developers to move their data between Weaviate and other data stores and formats, such as the HuggingFace Hub or JSON files.

Down the road, I think this line of thinking is also relevant for adding Unity Catalog support to Weaviate.

# HuggingFace Hub

### Load Weaviate Collection from a HuggingFace Dataset

```python
# Proposed API sketch: build the "Code" collection from one Hub dataset.
code_collection = weaviate_client.collections.from_hub(
    collection_name="Code",
    hub_path="weaviate/dspy_code",
    keys=["content", "summary"],  # default all
)
```



### Load Weaviate Collection from multiple HuggingFace Datasets

```python
# Proposed API sketch: combine several Hub datasets into one collection.
# `collection_name` is given explicitly, as in the other loader sketches,
# so the target collection is not left unspecified.
knowledge_base_collection = weaviate_client.collections.from_hub(
    collection_name="KnowledgeBase",
    hub_paths=["weaviate/dspy_code", "weaviate/dspy_docs"],
    keys=["content", "summary"],  # default all
)
```

### Weaviate Collection to HuggingFace Hub

```python
# Proposed API sketch: publish a Weaviate collection to a HuggingFace Hub repo.
code_collection.to_hub(
    repo_id="CShorten/KnowledgeBase",
    hf_token="hf_foobar",  # placeholder token value
    keys=["content", "summary"] # default all
)
```

# JSON

### Load Weaviate Collection from a JSON file

```python
# Proposed API sketch: build the "Code" collection from a single JSON file.
code_collection = weaviate_client.collections.from_json(
    collection_name="Code",
    filepath="dspy_code.json"
)
```

### Load Weaviate Collection from multiple JSON files

```python
# Proposed API sketch: combine several JSON files into one "KnowledgeBase"
# collection.
knowledge_base_collection = weaviate_client.collections.from_jsons(
    collection_name="KnowledgeBase",
    filepaths=["dspy_code.json", "dspy_docs.json"],
    keys=["content", "summary"] # default all
)
```

### Weaviate Collection to JSON file

```python
# Proposed API sketch: write a collection's objects to a JSON file.
# The receiver is `code_collection`, the name the loader sketches assign;
# a bare `code` variable is never defined.
code_collection.to_json(
    filepath="dspy_code.json",
    keys=["content", "summary"],  # default all
)
```


# Other Ideas

### Merge Weaviate Collections
```python
# Proposed API sketch: merge several source collections into one target.
# `to_collection` is singular and a single collection is returned, so it
# takes one name rather than a list.
knowledge_base_collection = weaviate_client.collections.merge_collections(
    from_collections=["Code", "Docs"],
    to_collection="KnowledgeBase",
    keys=["content", "summary"],
)
```

### Load Weaviate Collection from a List of Dictionaries in memory

```python
# Proposed API sketch: `data` is a list of dictionaries already held in memory.
code_collection = weaviate_client.collections.from_list(
    collection_name="Code",
    data=data
)
```

# Save Weaviate Collection to a JSON File

In [28]:
import json

import weaviate

weaviate_client = weaviate.connect_to_local()
collection = weaviate_client.collections.get("Code")

# Flatten each object into one dict: its UUID (as a string) first, then
# every property copied across under its own name.
data = [
    {"uuid": str(obj.uuid), **obj.properties}
    for obj in collection.iterator()
]

my_collection_name = "dspy_code.json"

# Write the records out as indented JSON.
with open(my_collection_name, "w") as json_file:
    json.dump(data, json_file, indent=4)

# Save Weaviate Collection to HuggingFace Hub

In [37]:
## Again, get data out of Weaviate

import json

import weaviate
from huggingface_hub import HfApi

weaviate_client = weaviate.connect_to_local()

collection = weaviate_client.collections.get("KnowledgeBase")

# One flat dict per object: its UUID (as a string) plus all of its properties.
data = [
    {"uuid": str(item.uuid), **item.properties}
    for item in collection.iterator()
]

## Save to File

my_collection_name = "dspy_knowledge_base.json"

with open(my_collection_name, "w") as json_file:
    json.dump(data, json_file, indent=4)

## Import File to HF Hub

# Define the target repo and the token once, so the create and upload calls
# cannot drift apart (the repo id used to be spelled two different ways).
hf_repo_id = "CShorten/knowledge_base"
hf_token = "hf_foobar"  # placeholder -- never commit a real token

hf_api = HfApi(
    endpoint="https://huggingface.co",
    token=hf_token,
)

# exist_ok=True keeps this cell re-runnable once the dataset repo exists.
hf_api.create_repo(
    repo_id=hf_repo_id,
    repo_type="dataset",
    exist_ok=True,
)

# Upload the file written above under the same name inside the repo.
hf_api.upload_file(
    path_or_fileobj=my_collection_name,
    path_in_repo=my_collection_name,
    repo_id=hf_repo_id,
    repo_type="dataset",
)

CommitInfo(commit_url='https://huggingface.co/datasets/CShorten/knowledge_base/commit/e017d5771672ec0e0e5460d0bc80587aea7d72e2', commit_message='Upload dspy_knowledge_base.json with huggingface_hub', commit_description='', oid='e017d5771672ec0e0e5460d0bc80587aea7d72e2', pr_url=None, pr_revision=None, pr_num=None)

# `weaviate_client.collections.from_jsons`

In [12]:
from typing import List
import json

def read_json(filepath: str) -> List:
    """Load and return the parsed contents of a JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized JSON document. The files loaded in this notebook
        are indexed as lists of records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, "r", encoding="utf-8") as opened_file:
        return json.load(opened_file)

# Load the two exported datasets from the working directory.
dspy_docs = read_json("dspy_docs.json")
dspy_code = read_json("dspy_code.json")

In [13]:
dspy_docs[0]

{'id': '002b20a8-031f-4a13-b7d7-219090c3352e',
 'content': 'link: {"type": "generated-index", "description": "Retrieval Models in DSPy"}',
 'summary': 'This provides a link to a generated index with a description of Retrieval Models in DSPy, offering additional context and resources.'}

In [14]:
dspy_code[0]

{'id': '001e04e2-6c05-4ce4-bff6-37bd04c5be26',
 'content': 'import random\nimport re\nfrom typing import Union\n\nimport numpy as np\n\nfrom dsp.modules import LM\nfrom dsp.utils.utils import dotdict\n\n\nclass DummyLM(LM):\n    """Dummy language model for unit testing purposes."""',
 'summary': 'The document begins with importing necessary libraries and modules, followed by the definition of the DummyLM class, which is a dummy language model intended for unit testing purposes.'}

In [24]:
'''
Also maybe an idea to add an "Are you sure?" interface to this
'''
# Delete the "KnowledgeBase" collection; the call asks for no confirmation.
weaviate_client.collections.delete("KnowledgeBase")

In [25]:
import weaviate
import weaviate.classes.config as wvcc

weaviate_client = weaviate.connect_to_local()

# Vectorizer configuration: Cohere text2vec with the model named below.
cohere_vectorizer = wvcc.Configure.Vectorizer.text2vec_cohere(
    model="embed-english-v3.0"
)

# All three properties are plain TEXT. `_source` records which dataset an
# object came from -- `_property_name` for internal properties?
text_properties = [
    wvcc.Property(name=property_name, data_type=wvcc.DataType.TEXT)
    for property_name in ("content", "summary", "_source")
]

knowledge_base_collection = weaviate_client.collections.create(
    name="KnowledgeBase",
    vectorizer_config=cohere_vectorizer,
    properties=text_properties,
)

In [26]:
from uuid import uuid4

from weaviate.util import get_valid_uuid

# Insert both datasets into `knowledge_base_collection` (the handle returned
# by `collections.create`), tagging each object with the dataset it came
# from. The two loops were identical apart from that tag, so they share one
# body here.
for source_label, records in (("Docs", dspy_docs), ("Code", dspy_code)):
    for obj in records:
        knowledge_base_collection.data.insert(
            properties={
                "content": obj["content"],
                "summary": obj["summary"],
                "_source": source_label,
            },
            # A fresh random UUID per object; the `id` stored in the JSON
            # files is not reused.
            uuid=get_valid_uuid(uuid4()),
        )

In [27]:
# Count the objects in the collection. `knowledge_base_collection` is the
# handle returned by `collections.create`; no `dspy_collection` is defined.
response = knowledge_base_collection.aggregate.over_all(total_count=True)
# The \033[92m ... \033[0m escapes colour the collection name on ANSI terminals.
print(f"{response.total_count} objects in the Weaviate \033[92m`KnowledgeBase`\033[0m collection")

1660 objects in the Weaviate `KnowledgeBase` collection
