[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weaviate/recipes/blob/main/integrations/data-platforms/huggingface/weaviate-collection-to-hf-dataset-hub.ipynb)

# Weaviate Collection to HuggingFace Dataset Hub

This notebook will show you how to upload the data stored in a Weaviate Collection onto the HuggingFace Dataset Hub!

In [2]:
import weaviate
import os
from weaviate.classes.init import Auth
import weaviate.classes.config as wvcc
import re
from weaviate.util import get_valid_uuid
from uuid import uuid4

In [4]:
# Cluster URL and API key come from the environment; each is None when the
# corresponding variable is not set.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")

# Connect to the Weaviate Cloud cluster, authenticating with the API key.
weaviate_client = weaviate.connect_to_weaviate_cloud(
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    cluster_url=WEAVIATE_URL,
)

# Print the result of the client's readiness check.
print(weaviate_client.is_ready())

True


### Replace `"WeaviateBlogChunks"` in the next cell with the name of the Weaviate Collection you want to upload!

In [5]:
# Name of the Weaviate collection whose objects will be exported.
COLLECTION_NAME = "WeaviateBlogChunks"
blogs_collection = weaviate_client.collections.get(COLLECTION_NAME)

In [8]:
import time

# One flat dict per object in the collection, in iteration order.
data = []

start_time = time.time()

for obj in blogs_collection.iterator():
    # Start each record with the object's UUID as a string, then copy every
    # property on top. A property that is itself named "uuid" would replace
    # the UUID entry, because the later assignment wins.
    record = {"uuid": str(obj.uuid)}
    record.update(obj.properties)
    data.append(record)

end_time = time.time()
execution_time = end_time - start_time

# The \033[32m ... \033[0m escape codes render the summary line in green.
print(f"\033[32mProcessed {len(data)} items in {execution_time:.2f} seconds\033[0m")

[32mProcessed 2160 items in 1.88 seconds[0m


In [10]:
import json

# Write the exported records to a local JSON file that is uploaded in a later cell.
# - encoding="utf-8" makes the file encoding explicit instead of relying on the
#   platform default.
# - default=str is a fallback for property values that json cannot serialise
#   natively (for example datetime or UUID objects); they are written as their
#   string form instead of raising TypeError. Values that are already
#   JSON-serialisable are written exactly as before.
with open("./weaviate-blogs-with-synthetic-questions.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4, default=str)

### Upload to HuggingFace Dataset Hub!

In [11]:
from huggingface_hub import HfApi

# Target dataset repository and the exported file to publish.
HF_REPO_ID = "weaviate/weaviate-blogs-with-synthetic-questions"
HF_FILE_NAME = "weaviate-blogs-with-synthetic-questions.json"

# Access token read from the environment; None when HUGGINGFACE_TOKEN is unset.
hf_token = os.getenv("HUGGINGFACE_TOKEN")

hf_api = HfApi(
    endpoint="https://huggingface.co",
    token=hf_token
)

# exist_ok=True keeps this cell re-runnable: without it, create_repo raises
# when the dataset repository already exists (e.g. on a second run).
hf_api.create_repo(
    repo_id=HF_REPO_ID,
    token=hf_token,
    repo_type="dataset",
    exist_ok=True
)

# Upload the local JSON export to the root of the dataset repository.
# As the last expression in the cell, the returned commit info is displayed.
hf_api.upload_file(
    path_or_fileobj=f"./{HF_FILE_NAME}",
    path_in_repo=HF_FILE_NAME,
    repo_id=HF_REPO_ID,
    repo_type="dataset"
)

CommitInfo(commit_url='https://huggingface.co/datasets/weaviate/weaviate-blogs-with-synthetic-questions/commit/dde65e64bb9ee3c9ff7585fbd3fc988fb4b27bc0', commit_message='Upload weaviate-blogs-with-synthetic-questions.json with huggingface_hub', commit_description='', oid='dde65e64bb9ee3c9ff7585fbd3fc988fb4b27bc0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/weaviate/weaviate-blogs-with-synthetic-questions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='weaviate/weaviate-blogs-with-synthetic-questions'), pr_revision=None, pr_num=None)

# ![HuggingFace Dataset](./images/huggingface-datasets.png "HuggingFace Dataset")