# Export Vectorized Data to Parquet Files

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

load_dotenv()

WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")

print(f"Weaviate Key: {WEAVIATE_KEY}")
print(f"OpenAI URL: {OPENAI_URL}")

Weaviate Key: root-user-key
OpenAI URL: http://host.docker.internal:11434


In [2]:
import weaviate
from weaviate.classes.init import Auth

# Connect to Weaviate
client = weaviate.connect_to_local(
    host="localhost",
    port=8080,
    grpc_port=50051,
    auth_credentials=Auth.api_key(WEAVIATE_KEY),
    headers={
        "X-OpenAI-Api-Key": OPENAI_API_KEY,
        "X-OpenAI-BaseURL": OPENAI_URL
    }
)

print(f"Connected: {client.is_ready()}")

Connected: True


In [3]:
# Get Wiki collection and check count
wiki = client.collections.get("Wiki")
total_count = len(wiki)
print(f"Found {total_count} items in Wiki collection")

Found 0 items in Wiki collection


In [4]:
# Create output directory
output_dir = "wiki-data/weaviate/nomic-embed-text"
os.makedirs(output_dir, exist_ok=True)
print(f"Exporting to: {output_dir}")

Exporting to: wiki-data/weaviate/nomic-embed-text


In [5]:
# Export data in batches
all_data = []
batch_size = 1000

print("Exporting data...")
for i in tqdm(range(0, total_count, batch_size), desc="Fetching batches"):
    response = wiki.query.fetch_objects(
        limit=batch_size,
        offset=i,
        include_vector=True
    )

    for obj in response.objects:
        all_data.append({
            "title": obj.properties["title"],
            "text": obj.properties["text"],
            "wiki_id": obj.properties["wiki_id"],
            "url": obj.properties["url"],
            "vector": obj.vector["default"]  # Get the 768-dim vector
        })

print(f"Collected {len(all_data)} items")

Exporting data...


Fetching batches: 0it [00:00, ?it/s]

Collected 0 items





In [6]:
# Convert to DataFrame and save as parquet files
df = pd.DataFrame(all_data)

# Split into files of 25k each (like original)
items_per_file = 25000
total_files = (len(df) + items_per_file - 1) // items_per_file

print(f"Splitting into {total_files} files...")
for i in range(total_files):
    start_idx = i * items_per_file
    end_idx = min((i + 1) * items_per_file, len(df))

    file_df = df.iloc[start_idx:end_idx]
    filename = f"{output_dir}/{i+1:04d}.parquet"
    file_df.to_parquet(filename, index=False)
    print(f"✓ Saved {filename} with {len(file_df)} items")

print(f"✅ Export completed! {total_files} files saved to {output_dir}/")
print(f"Total items exported: {len(all_data)}")

Splitting into 0 files...
✅ Export completed! 0 files saved to wiki-data/weaviate/nomic-embed-text/
Total items exported: 0


In [None]:
# Verify the exported data
test_file = f"{output_dir}/0001.parquet"
test_df = pd.read_parquet(test_file)
print(f"Sample file shape: {test_df.shape}")
print(f"Columns: {test_df.columns.tolist()}")
print(f"Vector dimensions: {len(test_df['vector'].iloc[0])}")

In [None]:
client.close()
print("✓ Done!")