This notebook sets up a database and installs python modules to use with the crossword generator project. 



### Install the Weaviate client

In [None]:
# Uncomment to clear your current pip cache
# !pip cache purge

# Uncomment to upgrade pip
# !pip install --upgrade pip

# Install client from public released
!pip3 install --no-cache -U "weaviate-client==4.*"

# Check installed client version
!pip show weaviate-client | grep Version

### Install additional Python libraries

In [None]:
# # Install the tqdm progress monitor
# !pip install tqdm

# # Install the Ollama Python client
# !pip install ollama

# # Install spaCy (named entity recognition) for puzzle generation
# #   It might take a few minutes to build spaCy and set up the data
# !pip install spacy
# !python -m spacy download en_core_web_sm

### Install Ollama

Install an LLM and an embedding model.

Ollama should start after you install it. To check if Ollama is running, open localhost:11434 in a browser.

In [None]:
# # Uncomment to download the models with Ollama
# !ollama pull llama3         # The LLM
# !ollama pull all-minilm     # For embeddings

### Connect the client to a local Weaviate instance


In [None]:
import weaviate

# Connect to the local Weaviate instance; `client` is reused by the cells below
client = weaviate.connect_to_local()

# Check the connection; as the last expression in the cell, the result is displayed
client.is_ready()

### Check if the Ollama module is enabled in Weaviate

If the Ollama modules are not configured, enable the `text2vec-ollama` module and the `generative-ollama` module in your Weaviate [configuration file](/developers/weaviate/installation#configuration-files).

In [3]:
# Ask the server for its metadata, which lists the enabled modules
meta_info = client.get_meta()

# Reminder to show for each Ollama module this notebook relies on
module_reminders = {
    'text2vec-ollama': "Enable the text2vec-ollama module.",
    'generative-ollama': "Enable the generative-ollama module.",
}

for module_name, reminder in module_reminders.items():
    if module_name not in meta_info["modules"]:
        print(reminder)


### Set the collection name

You will need a collection to store your data. This code lets you choose a collection name and cleans up any earlier versions if they exist.

In [None]:
# Name of the collection that stores the puzzle source texts
collection_name = "CrosswordPuzzles"

# Start from a clean slate: drop a collection of the same name left by an earlier run.
# Comment this block out to keep existing data.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)
    print(f"Removed old collection: {collection_name}")

### Define a collection

The local collection holds some books from [Project Gutenberg](https://www.gutenberg.org/).

This definition is very basic. When the books in the database are converted to vector embeddings below, they aren't given any meta-data to record as properties here.  

In [None]:

from weaviate.classes.config import Property, DataType, Configure

# Let's create the collection, specifying the Ollama base URL for both integrations.
# NOTE(review): if Weaviate runs inside a container, "localhost" may not reach the
# Ollama server on the host — confirm the endpoint for your setup.
collection = client.collections.create(
    name=collection_name,
    description="Source texts for puzzles",
    properties=[
        # The only declared property: the source text itself
        Property(name="text", data_type=DataType.TEXT),
    ],
    # Vectorizer: the all-minilm model served through the Ollama endpoint
    vectorizer_config=Configure.Vectorizer.text2vec_ollama(
        api_endpoint="http://localhost:11434",
        model="all-minilm"
    ),
    # Generative module: the llama3 model served through the same endpoint
    generative_config=Configure.Generative.ollama(
        api_endpoint="http://localhost:11434",
        model="llama3"
    )
)

# # Uncomment to check the collection definition
# collection_definition = client.collections.export_config(collection_name)
# print(f"Name: {collection_definition.name}     Description: {collection_definition.description}")


### Import the data into the collection

Process some text files (books from Project Gutenberg) to use as project-specific data.

In [20]:
import os

# Get a list of the sources (regular files only; sub-directories are skipped)
source_dir = "../inputs/"
source_files = [
    f for f in os.listdir(source_dir)
    if os.path.isfile(os.path.join(source_dir, f))
]
# print(source_files)

for sf in source_files:
    # os.listdir() returns bare file names, so the directory must be joined back on;
    # otherwise the file is looked up in the current working directory.
    # The encoding is explicit so reading does not depend on the platform default
    # (assumes the Project Gutenberg texts are UTF-8 — TODO confirm).
    with open(os.path.join(source_dir, sf), 'r', encoding="utf-8") as f:
        source_text = f.read()

    # Report the file name and the number of characters read
    print(f"{sf}: {len(source_text)}")


['pg69700.txt', 'pg48320.txt', 'pg108.txt', 'pg66991.txt']


In [None]:
### IMPORT WIKIPEDIA

from datasets import load_dataset
from tqdm import tqdm

# Look the collection up by name: `collection` is the object returned by
# client.collections.create(), not a collection name.
wikipedia = client.collections.get(collection_name)

def import_wiki_data(lang, num_rows, skip_rows):
    """Stream rows of the Cohere Wikipedia embeddings dataset into the `wikipedia` collection.

    Args:
        lang: Dataset configuration to load (for example "simple" or "en"). It is also
            stored in the "lang" property of every imported object.
        num_rows: Absolute row index at which the import stops; rows
            skip_rows .. num_rows - 1 are imported.
        skip_rows: Number of rows at the start of the dataset to skip.

    Each object is inserted with the vector taken from the row's "emb" field.

    NOTE(review): the collection created earlier in this notebook declares only a "text"
    property and an Ollama vectorizer. Confirm that the extra properties and the
    dataset's own vectors are what you want stored in that collection.
    """
    dataset = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3", lang, split="train", streaming=True)
    # Skip exactly once. Skipping again in the loop header would drop 2 * skip_rows rows.
    dataset = dataset.skip(skip_rows)

    # Edit to change the batch size
    batch_size = 1000

    # Tracks the absolute row position so it lines up with the progress bar
    counter = skip_rows

    with wikipedia.batch.fixed_size(batch_size=batch_size, concurrent_requests=4) as batch:
        for item in tqdm(dataset, initial=skip_rows, total=num_rows):
            # Stop once the requested end row is reached. Checking before the insert
            # also means nothing is imported when num_rows <= skip_rows.
            if counter >= num_rows:
                break

            batch.add_object(
                properties={
                    "wiki_id": item["_id"],
                    "text": item["text"],
                    "title": item["title"],
                    "url": item["url"],
                    "lang": lang,
                },
                vector=item["emb"],
            )
            counter += 1

    # Check for errors only after the batch context has exited, so that objects from
    # the final flushed batch are included.
    failed_objects = wikipedia.batch.failed_objects
    if len(failed_objects) > 0:
        print(f"Errors {len(failed_objects)}")
        print(failed_objects[-1])

    # Report the rows actually sent, not the absolute position reached
    print(f"Imported {counter - skip_rows} items for {lang}")

# Row index at which the import stops
#   The 'simple' variant has 646424 rows
num_rows = 650_000

# Number of leading rows to skip; raise it to start in the middle of the data set
skip_rows = 0

# One language is active at a time; move the comment marker to load a different one
import_wiki_data("simple", num_rows=num_rows, skip_rows=skip_rows)
# import_wiki_data("en", num_rows, skip_rows)
# import_wiki_data("es", num_rows, skip_rows)
# import_wiki_data("de", num_rows, skip_rows)
# import_wiki_data("fr", num_rows, skip_rows)


In [None]:
### CHECK THE UPLOAD

# Fetch the collection by name (`collection` holds the collection object, not its name)
wikipedia = client.collections.get(collection_name)

# Count every object stored in the collection
response = wikipedia.aggregate.over_all(total_count=True)
print(f"Collection size: {response.total_count}")

In [None]:
# PRINT THE FIRST FEW OBJECTS
import pprint as pp

# # Uncomment if the `wikipedia` collection object is undefined
# wikipedia = client.collections.get(collection_name)

# Fetch a handful of objects together with their vectors
response = wikipedia.query.fetch_objects(limit=5, include_vector=True)

# Show each object's properties, followed by its vector
for obj in response.objects:
    pp.pprint(obj.properties)
    print(obj.vector)


#   *************###############*************

In [None]:
# LIST ALL COLLECTIONS            DEBUG

# Ask the Weaviate instance for every collection it holds, then print one line per entry
response = client.collections.list_all(simple=False)
for entry in response:
    print(f"Collection: {entry}")
