This notebook sets up a Weaviate database and installs the Python modules used by the crossword generator project.



### Install the Weaviate client

In [None]:
# Uncomment to clear your current pip cache
# !pip cache purge

# Uncomment to upgrade pip
# !pip install --upgrade pip

# Install client from public released
!pip3 install --no-cache -U "weaviate-client==4.*"

# Check installed client version
!pip show weaviate-client | grep Version

### Install additional Python libraries

In [None]:
# # Import the huggingface datasets
# !pip install datasets

# # Import tqdm progress monitor
# !pip install tqdm

# # Import pandas
# !pip install pandas

# Import the Ollama python client
!pip install ollama

# Import spacy named entity recognition for puzzle generation
#    It might take a few minutes to build spacy and setup the data
!pip install spacy
!python -m spacy download en_core_web_sm

### Connect the client to a local Weaviate instance


In [None]:
import weaviate

# Create the client with connect_to_local(), called with no arguments.
# Later cells use `client` for every call to the Weaviate instance.
client = weaviate.connect_to_local()

# Check the connection; as the last expression in the cell, the result is displayed
client.is_ready()

### Check if the Ollama module is enabled

Verify that the Ollama modules are configured.

If they are not configured, enable the `text2vec-ollama` module and the `generative-ollama` module in your Weaviate [configuration file](/developers/weaviate/installation#configuration-files).

In [30]:
# Ask the instance for its metadata and pick out the module listing
meta_info = client.get_meta()
enabled_modules = meta_info["modules"]

# Each required module is paired with the reminder printed when it is absent
required_modules = {
    "text2vec-ollama": "Enable the text2vec-ollama module.",
    "generative-ollama": "Enable the generative-ollama module.",
}

for module_name, reminder in required_modules.items():
    if module_name not in enabled_modules:
        print(reminder)


### Set the collection name

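If a previous version of the collection exists, uncomment the code to delete it. Note that the collection definition cell below also removes any existing collection with this name before it creates the new one.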

In [31]:
# Set the collection name
#   Later cells refer to the collection through this variable
collection = "CrosswordPuzzles"

# # Uncomment to remove old versions of this collection
# if (client.collections.exists(collection)):
#     client.collections.delete(collection)
#     print(f"Removed old collection: {collection}")

### Define a collection

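The local collection holds Wikipedia articles, imported later in this notebook from the [Cohere/wikipedia-2023-11-embed-multilingual-v3](https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3) dataset.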

In [None]:
# COLLECTION DEFINITION

from weaviate.classes.config import Configure

# Remove old version of this collection if there is one,
# so that re-running this cell starts from an empty collection
if client.collections.exists(collection):
    client.collections.delete(collection)
    print(f"Removed old collection: {collection}")

# Define the collection
articles = client.collections.create(
    name=collection,
    description="Create a connection to Ollama",
    # NOTE(review): the import cell supplies its own vectors taken from a Cohere
    # embed-multilingual-v3 dataset, so the vectorizer is left on the matching Cohere
    # model. Confirm this is intended, given that the notebook checks for text2vec-ollama.
    vectorizer_config=Configure.Vectorizer.text2vec_cohere(
        model="embed-multilingual-v3.0"
    ),
    # Exactly one generative_config may be passed; the Ollama one is kept
    # (the original cell also passed Configure.Generative.cohere(), which made the call invalid)
    generative_config=Configure.Generative.ollama(
        api_endpoint="http://host.docker.internal:11434",  # If using Docker, use this to contact your local Ollama instance
        model="llama3"  # The model to use, e.g. "phi3", or "mistral", "command-r-plus", "gemma"
    ),
    # Uncomment to compress the collection data
    # vector_index_config=Configure.VectorIndex.hnsw(
    # quantizer=Configure.VectorIndex.Quantizer.sq()
    # ),
)


In [None]:
# CHECK COLLECTION DEFINITION

import pprint as pp

# Export the stored configuration of the collection
collection_definition = client.collections.export_config(collection)
# Use plain print for the summary line: pprint would show the string's quoted repr
print(f"Name: {collection_definition.name}     Description: {collection_definition.description}")
# Collection properties haven't been configured yet
pp.pprint(collection_definition.properties)


In [None]:
### IMPORT WIKIPEDIA

from datasets import load_dataset
from tqdm import tqdm

# Handle to the collection that receives the imported rows
wikipedia = client.collections.get(collection)

def import_wiki_data(lang, num_rows, skip_rows):
    """Stream rows of the Cohere Wikipedia embedding dataset into the collection.

    Rows are read in streaming mode and added through a fixed-size batch, each
    one with the vector taken from the row's "emb" field.

    Args:
        lang: Dataset configuration name passed to load_dataset (e.g. "simple");
            also stored in each object's "lang" property.
        num_rows: Row index at which the import stops (exclusive), counted from
            the start of the dataset; rows skip_rows .. num_rows-1 are imported.
        skip_rows: Number of rows at the start of the dataset to skip.
    """
    # Nothing to do when the requested range is empty
    if num_rows <= skip_rows:
        print(f"Imported 0 items for {lang}")
        return

    dataset = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3", lang, split="train", streaming=True)
    # Skip exactly once: skipping here and again in the loop would drop 2 * skip_rows rows
    dataset = dataset.skip(skip_rows)

    # Edit to change the batch size
    batch_size = 1000

    # Position in the dataset, so it can be compared directly with num_rows
    counter = skip_rows

    with wikipedia.batch.fixed_size(batch_size=batch_size, concurrent_requests=4) as batch:
        for item in tqdm(dataset,
                         initial=skip_rows,
                         total=num_rows
                        ):
            vector = item["emb"]

            data_to_insert = {
                "wiki_id": item["_id"],
                "text": item["text"],
                "title": item["title"],
                "url": item["url"],
                "lang": lang,
            }

            batch.add_object(
                properties=data_to_insert,
                vector=vector
            )

            # stop once the position reaches num_rows
            counter += 1
            if counter >= num_rows:
                break

    # Check for errors only after the batch context has exited, so that
    # objects sent in the final flush are included
    failed_objects = wikipedia.batch.failed_objects
    if failed_objects:
        print(f"Errors {len(failed_objects)}")
        print(failed_objects[-1])

    # Report the number of rows sent, not the position reached in the dataset
    print(f"Imported {counter - skip_rows} items for {lang}")

# Row index at which the import stops; edit to change the import size
#   The 'simple' variant has 646424 rows
num_rows = 650_000

# Number of leading rows to skip; edit to start in the middle of the data set
skip_rows = 0

# Uncomment the language to load data for it
import_wiki_data(lang="simple", num_rows=num_rows, skip_rows=skip_rows)
# import_wiki_data(lang="en", num_rows=num_rows, skip_rows=skip_rows)
# import_wiki_data(lang="es", num_rows=num_rows, skip_rows=skip_rows)
# import_wiki_data(lang="de", num_rows=num_rows, skip_rows=skip_rows)
# import_wiki_data(lang="fr", num_rows=num_rows, skip_rows=skip_rows)


In [None]:
### CHECK THE UPLOAD

# Re-fetch the collection handle so this cell also works without running the import cell
wikipedia = client.collections.get(collection)
# Aggregate over the whole collection, requesting only the total object count
response = wikipedia.aggregate.over_all(total_count=True)
print(f"Collection size: {response.total_count}")

In [None]:
# PRINT THE FIRST FEW OBJECTS
import pprint as pp

# # Uncomment if the client and collection object are undefined
# wikipedia = client.collections.get(collection)

# Fetch a handful of objects, asking for their vectors as well
response = wikipedia.query.fetch_objects(limit=5, include_vector=True)

# Show each object's properties followed by its vector
for obj in response.objects:
    pp.pprint(obj.properties)
    print(obj.vector)


#   *************###############*************

In [None]:
# LIST ALL COLLECTIONS            DEBUG

# Ask the Weaviate instance for all of its collections (simple=False)
response = client.collections.list_all(simple=False)

# Print one line per entry yielded by iterating the response
for entry in response:
    print(f"Collection: {entry}")
