In [None]:
import weaviate

# Open the notebook-wide Weaviate client via weaviate.connect_to_local()
# (called with its default arguments). Later cells reuse this `client` object.
client = weaviate.connect_to_local()

# Last expression of the cell, so the result of the readiness check is shown
# as the cell output.
client.is_ready()

In [None]:
from weaviate.classes.config import Configure #, Property, DataType

# Drop any existing "Wikipedia" collection first so this cell can be re-run.
client.collections.delete("Wikipedia")

# Vectorizer settings for the new collection: Cohere's multilingual model.
cohere_vectorizer = Configure.Vectorizer.text2vec_cohere(
    model="embed-multilingual-v3.0",
)

# Optional - an explicit property schema can also be passed to create().
# Example (needs Property and DataType imported from weaviate.classes.config):
#
#   properties=[
#       Property(name="text", data_type=DataType.TEXT),
#       Property(name="title", data_type=DataType.TEXT, skip_vectorization=True),
#       Property(name="wiki_id", data_type=DataType.INT, skip_vectorization=True),
#       Property(name="url", data_type=DataType.TEXT, skip_vectorization=True),
#       Property(name="lang", data_type=DataType.TEXT, skip_vectorization=True),
#       Property(name="lang_id", data_type=DataType.INT, skip_vectorization=True),
#       Property(name="views", data_type=DataType.NUMBER, skip_vectorization=True),
#   ]
client.collections.create(name="Wikipedia", vectorizer_config=cohere_vectorizer)

In [None]:
from datasets import load_dataset
from tqdm import tqdm

def import_wiki_data(lang, max_rows, skip_rows=0):
    """Stream Wikipedia rows for one language into the "Wikipedia" collection.

    Rows are read in streaming mode from the Hugging Face dataset
    ``Cohere/wikipedia-2023-11-embed-multilingual-v3`` and added through a
    fixed-size batch. Each object is inserted with the vector taken from the
    row's ``emb`` field.

    Relies on the notebook-level names ``client``, ``load_dataset`` and ``tqdm``.

    Args:
        lang: Dataset configuration name passed to ``load_dataset`` (e.g.
            ``"en"``); also stored in the ``lang`` property of every object.
        max_rows: Absolute row position at which to stop. Rows from
            ``skip_rows`` up to, but not including, ``max_rows`` are imported.
        skip_rows: Number of leading rows to skip, e.g. to continue a
            previous import. Defaults to 0.

    Returns:
        None. Progress and a summary are printed.
    """
    from itertools import islice

    rows_to_import = max_rows - skip_rows
    if rows_to_import <= 0:
        # Empty window: return before opening the dataset stream or a batch,
        # so that not even a single row gets inserted.
        print(f"Nothing to import for {lang}: max_rows={max_rows} <= skip_rows={skip_rows}")
        print("-----------------------------------")
        return

    print(f"Importing {rows_to_import} data items for {lang}")

    dataset = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3", lang, split="train", streaming=True)
    dataset = dataset.skip(skip_rows)

    wikipedia = client.collections.get("Wikipedia")

    # Counts only the rows actually handed to the batch (skipped rows excluded).
    imported = 0

    with wikipedia.batch.fixed_size(batch_size=1000, concurrent_requests=4) as batch:
        # islice stops the stream after `rows_to_import` rows, so the limit is
        # enforced before a row is fetched rather than after it was added.
        # The progress bar keeps absolute positions: it starts at `skip_rows`
        # and completes at `max_rows`.
        for item in tqdm(islice(dataset, rows_to_import), initial=skip_rows, total=max_rows):
            batch.add_object(
                properties={
                    "wiki_id": item["_id"],
                    "text": item["text"],
                    "title": item["title"],
                    "url": item["url"],
                    "lang": lang,
                },
                vector=item["emb"],
            )
            imported += 1

    # Check for errors once the batch context has exited.
    failed = wikipedia.batch.failed_objects
    if failed:
        print("Final error check")
        print(f"Some errors {len(failed)}")
        print(failed[-1])

    print(f"Imported {imported} items for {lang}")
    print("-----------------------------------")

In [None]:
# Row limit handed to import_wiki_data() as max_rows for every language.
import_per_country = 25_000

# Languages to load. "es" and "it" are currently left out; add them to the
# tuple to import those editions too.
for lang_code in ("en", "de", "fr"):
    import_wiki_data(lang_code, import_per_country)