# Recreate Wiki Collection with nomic-embed-text

In [None]:
import os
from dotenv import load_dotenv

# Read settings from a local .env file (if present) into the process environment.
load_dotenv()

# Connection and model settings; each is None when the variable is not set.
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

# Do not echo the API key itself: cell output is stored with the notebook and
# is easily shared or committed. Only report whether a key was found.
print(f"Weaviate Key: {'<set>' if WEAVIATE_KEY else '<missing>'}")
print(f"OpenAI URL: {OPENAI_URL}")
print(f"Embedding Model: {EMBEDDING_MODEL}")

In [None]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure

# Extra request headers carrying the OpenAI key and base URL read from the environment.
openai_headers = {
    "X-OpenAI-Api-Key": OPENAI_API_KEY,
    "X-OpenAI-BaseURL": OPENAI_URL,
}

# Open a connection to the Weaviate instance on localhost (HTTP 8080, gRPC 50051),
# authenticating with the API key.
client = weaviate.connect_to_local(
    host="localhost",
    port=8080,
    grpc_port=50051,
    auth_credentials=Auth.api_key(WEAVIATE_KEY),
    headers=openai_headers,
)

# Report whether the server answers the readiness check.
is_ready = client.is_ready()
print(f"Connected: {is_ready}")

In [None]:
# Drop the "Wikinew" collection so it can be recreated below.
# Any failure is reported and the notebook carries on (best effort).
try:
    client.collections.delete("Wikinew")
except Exception as err:
    print(f"Collection didn't exist or error: {err}")
else:
    print("✓ Deleted existing Wiki collection")

In [None]:
# Create the scratch "TestCollection" only when it is not already present.
# Nothing in this notebook deletes it, and creating a collection whose name
# already exists is rejected by the server, so an unguarded create breaks
# every re-run of the notebook.
if not client.collections.exists("TestCollection"):
    client.collections.create(
        name="TestCollection",

        # Using local ollama embedding model
        vector_config=Configure.Vectors.text2vec_ollama(
            model="nomic-embed-text",
            api_endpoint=OPENAI_URL
        ),

        # Using local ollama generative model
        generative_config=Configure.Generative.ollama(
            model="qwen2.5:0.5b",
            api_endpoint=OPENAI_URL
        ),
    )

In [None]:
# Vectorizer settings: Ollama embedding model "nomic-embed-text", reached at OPENAI_URL.
ollama_vectorizer = Configure.Vectors.text2vec_ollama(
    model="nomic-embed-text",
    api_endpoint=OPENAI_URL,
)

# Generative settings: Ollama model "qwen2.5:0.5b", reached at the same endpoint.
ollama_generative = Configure.Generative.ollama(
    model="qwen2.5:0.5b",
    api_endpoint=OPENAI_URL,
)

# Create the "Wikinew" collection from the two configurations above.
collection = client.collections.create(
    name="Wikinew",
    vector_config=ollama_vectorizer,
    generative_config=ollama_generative,
)

print("✓ Created new Wiki collection with nomic-embed-text")

In [None]:
# Load data and import
from datasets import load_dataset
from tqdm import tqdm
from weaviate.util import generate_uuid5

print("Loading data...")
# Load every parquet shard under the data directory as a single in-memory split.
dataset = load_dataset('parquet',
                      data_files={'train': ['wiki-data/weaviate/snowflake-arctic-v2/*.parquet']},
                      split="train",
                      streaming=False)

print(f"Loaded {len(dataset)} items")

# Import data (let Weaviate vectorize)
print("Importing and vectorizing...")
import_aborted = False
with collection.batch.fixed_size(batch_size=200, concurrent_requests=2) as batch:
    for item in tqdm(dataset, desc="Importing"):
        # Only text fields, no vectors
        properties = {
            "title": item["title"],
            "text": item["text"],
            "wiki_id": item["wiki_id"],
            "url": item["url"]
        }

        # UUID derived from wiki_id, so the same article always gets the same id.
        object_uuid = generate_uuid5(item["wiki_id"])
        batch.add_object(properties, uuid=object_uuid)

        # Stop early instead of pushing the whole dataset through a failing import.
        if batch.number_errors > 10:
            print(f"Too many errors: {batch.number_errors}")
            import_aborted = True
            break

# Objects rejected during the batch are only visible here, after the batch
# context has closed; surface them instead of silently reporting success.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"✗ {len(failed_objects)} objects failed to import")
    print(f"First failed object: {failed_objects[0]}")

if import_aborted or failed_objects:
    print(f"⚠ Import finished with errors. Collection count: {len(collection)}")
else:
    print(f"✓ Import completed. Collection count: {len(collection)}")

In [None]:
# Close the Weaviate client connection opened earlier in the notebook.
client.close()
print("✓ Done!")