# Export Vectorized Data to Parquet Files

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

# Credentials and endpoint configuration; each is None when the variable is unset.
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")

# Report only whether the secrets are present: printing the key itself would
# store it in the notebook's saved output.
print(f"Weaviate Key: {'set' if WEAVIATE_KEY else 'MISSING'}")
print(f"OpenAI API Key: {'set' if OPENAI_API_KEY else 'MISSING'}")
print(f"OpenAI URL: {OPENAI_URL}")

In [None]:
import weaviate
from weaviate.classes.init import Auth

# Connect to the Weaviate instance on localhost (HTTP 8080, gRPC 50051),
# authenticating with the API key and passing the OpenAI settings as
# additional headers.
weaviate_auth = Auth.api_key(WEAVIATE_KEY)
openai_headers = {
    "X-OpenAI-Api-Key": OPENAI_API_KEY,
    "X-OpenAI-BaseURL": OPENAI_URL,
}

client = weaviate.connect_to_local(
    host="localhost",
    port=8080,
    grpc_port=50051,
    auth_credentials=weaviate_auth,
    headers=openai_headers,
)

# Print the readiness flag reported by the client.
connection_ready = client.is_ready()
print(f"Connected: {connection_ready}")

In [None]:
# Look up the "Wikinew" collection and record how many objects it holds;
# later cells use `wiki` for fetching and `total_count` for sizing the export.
collection_name = "Wikinew"
wiki = client.collections.get(collection_name)

total_count = len(wiki)
print(f"Found {total_count} items in Wikinew collection")

In [None]:
# Directory that receives the exported parquet files; it is created together
# with any missing parents, and an already-existing directory is accepted.
output_dir = "wiki-data/weaviate/nomic-embed-text"
os.makedirs(name=output_dir, exist_ok=True)

print(f"Exporting to: {output_dir}")

In [None]:
# Export remaining data starting from offset 50,000
#
# Pages through the collection with fetch_objects(limit, offset), collecting
# each object's properties and its "default" vector into `all_data` (a list of
# dicts consumed by the parquet-writing cell). The export is best-effort: the
# first failing batch stops the loop and whatever was collected so far is kept.
#
# NOTE(review): offset-based paging in Weaviate is subject to a server-side
# maximum-results limit (QUERY_MAXIMUM_RESULTS). If fetches fail at high
# offsets, the cursor-based `wiki.iterator(include_vector=True)` is the
# documented alternative -- confirm against the deployment's configuration.
all_data = []

# Single source of truth for the resume point (previously repeated as a literal).
start_offset = 50000
batch_size = 100

# Property names copied from each object, in output column order.
property_names = ("title", "text", "wiki_id", "url")

# Clamp at zero so a collection smaller than start_offset reports 0 items
# instead of a negative count.
remaining_items = max(total_count - start_offset, 0)

print("Exporting remaining data using fetch_objects with offset...")
print(f"Attempting to get {remaining_items} remaining items...")

for i in tqdm(range(0, remaining_items, batch_size), desc="Fetching remaining batches"):
    current_offset = start_offset + i
    try:
        response = wiki.query.fetch_objects(
            # The final batch may be shorter than batch_size.
            limit=min(batch_size, remaining_items - i),
            offset=current_offset,
            include_vector=True,
        )

        if not response.objects:
            print(f"No more items at offset {current_offset}")
            break

        # Build the whole batch first so a malformed object never leaves a
        # partially appended batch behind.
        batch_records = []
        for obj in response.objects:
            record = {name: obj.properties[name] for name in property_names}
            record["vector"] = obj.vector["default"]
            batch_records.append(record)
        all_data.extend(batch_records)

    except Exception as e:
        # Deliberate best-effort boundary: report, keep what we have, stop.
        print(f"Error at offset {current_offset}: {e}")
        print(f"Collected {len(all_data)} items before hitting limit")
        break

print(f"Collected {len(all_data)} additional items")

In [None]:
# Turn the collected records into a DataFrame and write it out as a series of
# parquet files named 0001.parquet, 0002.parquet, ... inside output_dir.
#
# NOTE(review): numbering always restarts at 0001 even though the data was
# fetched from a non-zero offset; if an earlier export wrote into the same
# directory, same-numbered files would be overwritten -- confirm this is intended.
df = pd.DataFrame(all_data)

# At most 25k rows per file (like original)
items_per_file = 25000
full_files, leftover_rows = divmod(len(df), items_per_file)
total_files = full_files + (1 if leftover_rows else 0)

print(f"Splitting into {total_files} files...")
chunk_starts = range(0, len(df), items_per_file)
for file_number, chunk_start in enumerate(chunk_starts, start=1):
    # iloc slicing clamps at the end of the frame, so the last chunk may be short.
    chunk = df.iloc[chunk_start:chunk_start + items_per_file]
    filename = f"{output_dir}/{file_number:04d}.parquet"
    chunk.to_parquet(filename, index=False)
    print(f"✓ Saved {filename} with {len(chunk)} items")

print(f"✅ Export completed! {total_files} files saved to {output_dir}/")
print(f"Total items exported: {len(all_data)}")

In [None]:
# Sanity-check the export by reading the first parquet file back and
# printing its shape, column names and the length of the first row's vector.
test_file = f"{output_dir}/0001.parquet"
test_df = pd.read_parquet(test_file)

column_names = test_df.columns.tolist()
first_vector = test_df['vector'].iloc[0]

print(f"Sample file shape: {test_df.shape}")
print(f"Columns: {column_names}")
print(f"Vector dimensions: {len(first_vector)}")

In [None]:
# Close the Weaviate client connection opened earlier in the notebook.
client.close()
print("✓ Done!")