This notebook sets up a Weaviate database and installs the Python modules used by the crossword generator project.



In [None]:
### INSTALL THE WEAVIATE CLIENT

# Uncomment to clear your current pip cache
# !pip cache purge

# Uncomment to upgrade pip
# !pip install --upgrade pip

# Install client from public released
!pip3 install --no-cache -U "weaviate-client==4.*"

# Check installed client version
!pip show weaviate-client | grep Version

In [None]:
### INSTALL ADDITIONAL LIBRARIES

# Import the huggingface datasets
!pip install datasets

# Import tqdm progress monitor
!pip install tqdm

# Import pandas
!pip install pandas

# Import spacy named entity recognition for puzzle generation
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
# CLIENT CONNECTION -- LOCALHOST
#
# Creates the `client` object that later cells use to talk to Weaviate.

import weaviate

# Connect to a local instance
# NOTE(review): relies on the defaults of connect_to_local(); presumably a
# Weaviate server is already running on this machine -- confirm before running.
client = weaviate.connect_to_local()

# Check connection (as the last expression of the cell, the result is shown as the cell output)
client.is_ready()


In [4]:
# CONSTANTS AND REUSED VALUES

# Name of the Weaviate collection this notebook works with
collection: str = "WikipediaSimple"

In [None]:
# LIST ALL COLLECTIONS

# Ask the Weaviate instance for its collections (full, non-simplified form)
# and print one line per entry yielded by iterating the response.
response = client.collections.list_all(simple=False)
for collection_name in response:
    print(f"Collection: {collection_name}")


In [None]:
# COLLECTION DEFINITION

from weaviate.classes.config import Configure

# Start from a clean slate: drop any earlier version of this collection
if client.collections.exists(collection):
    client.collections.delete(collection)
    print(f"Removed old collection: {collection}")

# Module settings, built up front so the create() call stays short
vectorizer_config = Configure.Vectorizer.text2vec_cohere(
    model="embed-multilingual-v3.0"
)
generative_config = Configure.Generative.cohere()

# Create the collection
articles = client.collections.create(
    name=collection,
    description="Wikipedia articles",
    vectorizer_config=vectorizer_config,
    generative_config=generative_config,
    # Uncomment to compress the collection data
    # vector_index_config=Configure.VectorIndex.hnsw(
    #     quantizer=Configure.VectorIndex.Quantizer.sq()
    # ),
)


In [None]:
# CHECK COLLECTION DEFINITION

import pprint as pp

# Fetch the collection definition from the server
collection_definition = client.collections.export_config(collection)

# The summary line is plain text, so use print(): pprint() on a str emits its
# repr (wrapped in quotes, and split into several chunks if it is too wide).
print(f"Name: {collection_definition.name}     Description: {collection_definition.description}")

# Collection properties haven't been configured yet
pp.pprint(collection_definition.properties)


In [None]:
### IMPORT WIKIPEDIA

from datasets import load_dataset
from tqdm import tqdm

# Handle to the target collection; import_wiki_data() reads this module-level name
wikipedia = client.collections.get(collection)

def import_wiki_data(lang, num_rows, skip_rows):
    """Stream one language variant of the Cohere Wikipedia dataset into Weaviate.

    Rows are read from the streaming ``train`` split of
    ``Cohere/wikipedia-2023-11-embed-multilingual-v3`` and added, together with
    the vector stored in their ``emb`` field, to the module-level ``wikipedia``
    collection using fixed-size batches.

    Args:
        lang: Dataset configuration to load (e.g. ``"simple"``); also stored
            in the ``lang`` property of every imported object.
        num_rows: Row position at which the import stops (exclusive). Rows
            ``skip_rows`` .. ``num_rows - 1`` of the dataset are imported.
        skip_rows: Number of rows at the start of the dataset to skip, e.g. to
            resume an earlier import.

    Prints the number of failed objects (and the last failure) if there are
    any, followed by the number of rows added by this call.
    """
    from itertools import islice

    dataset = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3", lang, split="train", streaming=True)

    # Skip exactly once. Calling skip() again in the loop header would drop
    # 2 * skip_rows rows instead of skip_rows.
    dataset = dataset.skip(skip_rows)

    # Edit to change the batch size
    batch_size = 1000

    # Rows still to import; clamped so num_rows <= skip_rows imports nothing
    rows_to_import = max(num_rows - skip_rows, 0)
    imported = 0

    with wikipedia.batch.fixed_size(batch_size=batch_size, concurrent_requests=4) as batch:
        for item in tqdm(islice(dataset, rows_to_import),
                         initial=skip_rows,
                         total=num_rows
                        ):
            batch.add_object(
                properties={
                    "wiki_id": item["_id"],
                    "text": item["text"],
                    "title": item["title"],
                    "url": item["url"],
                    "lang": lang,
                },
                vector=item["emb"],
            )
            imported += 1

    # Check for errors only after the batch context has exited, so failures
    # from the final batch are reported too.
    failed_objects = wikipedia.batch.failed_objects
    if failed_objects:
        print(f"Errors {len(failed_objects)}")
        print(failed_objects[-1])

    # Report the rows added by this call (skipped rows are not counted)
    print(f"Imported {imported} items for {lang}")

# Import size -- edit to load more or fewer rows
#   The 'simple' variant has 646424 rows
num_rows = 650_000

# Rows to skip at the start -- edit to begin in the middle of the data set
skip_rows = 0

# Run the import; uncomment a language below to load data for it as well
import_wiki_data("simple", num_rows=num_rows, skip_rows=skip_rows)
# import_wiki_data("en", num_rows, skip_rows)
# import_wiki_data("es", num_rows, skip_rows)
# import_wiki_data("de", num_rows, skip_rows)
# import_wiki_data("fr", num_rows, skip_rows)


In [None]:
### CHECK THE UPLOAD

# Re-fetch the collection handle so this cell can run on its own
wikipedia = client.collections.get(collection)

# Aggregate over the whole collection, requesting only the object count
response = wikipedia.aggregate.over_all(total_count=True)
collection_size = response.total_count
print(f"Collection size: {collection_size}")

In [None]:
# PRINT THE FIRST FEW OBJECTS
import pprint as pp

# # Uncomment if the client and collection object are undefined
# wikipedia = client.collections.get(collection)

# Fetch five objects together with their stored vectors
response = wikipedia.query.fetch_objects(limit=5, include_vector=True)

# Show each object's properties followed by its vector
for obj in response.objects:
    pp.pprint(obj.properties)
    print(obj.vector)
