This notebook sets up a database and installs python modules to use with the crossword generator project. 



### Install the Weaviate client

In [None]:
# Uncomment to clear your current pip cache
# !pip cache purge

# Uncomment to upgrade pip
# !pip install --upgrade pip

# Install client from public released
!pip3 install --no-cache -U "weaviate-client==4.*"

# Check installed client version
!pip show weaviate-client | grep Version

### Install additional Python libraries

In [None]:
# # Import the Ollama python client
# !pip install ollama

# # Import spacy named entity recognition for puzzle generation
# #   It might take a few minutes to build spacy and setup the data
# !pip install spacy
# !python -m spacy download en_core_web_sm

### Install Ollama

Install an LLM and an embedding model.

Ollam should start after you install it. To check if Ollama is running, open localhost:11434 in a browser

In [None]:
# # Uncomment to install Ollama
# !ollama pull llama3         # The LLM
# !ollama pull all-minilm     # For embeddings

### Connect the client to a local Weaviate instance


In [None]:
import weaviate

client = weaviate.connect_to_local()

# # Uncomment to check the connection
# client.is_ready()

### Check if the Ollama module is enabled in Weaviate

If the Ollama modules are not configured, enable the `text2vec-ollama` module and the `generative-ollama` module in your Weaviate [configuration file](/developers/weaviate/installation#configuration-files).

In [None]:
meta_info = client.get_meta()
if_ok = True

if 'text2vec-ollama' not in meta_info["modules"] :
    print("Enable the text2vec-ollama module.")
    if_ok = False

if 'generative-ollama' not in meta_info["modules"] :
    print("Enable the generative-ollama module.")
    if_ok = False

if if_ok:
    print("Modules are installed")

### Set the collection name

You will need a collection to store your data. This code lets you choose a collection name and cleans up any earlier versions if they exist.

In [None]:
# Set the collection name
collection_name = "CrosswordPuzzles"

# Uncomment to remove old versions of this collection
if (client.collections.exists(collection_name)):
    client.collections.delete(collection_name)
    print(f"Removed old collection: {collection_name}")

### Define a collection

The local collection holds some books from [Project Gutenberg](https://www.gutenberg.org/).

This definition is very basic. When the books in the database are converted to vector embeddings below, they aren't given any meta-data to record as properties here.  

In [18]:

from weaviate.classes.config import Property, DataType, Configure

# lets create the collection, specifing our base url accordingling
collection = client.collections.create(
    name=collection_name,
    description="Source texts for puzzles",
    properties=[
        Property(name="text", data_type=DataType.TEXT),
    ],
    vectorizer_config=Configure.Vectorizer.text2vec_ollama(
        api_endpoint="http://localhost:11434",
        model="all-minilm"
    ),
    generative_config=Configure.Generative.ollama(
        api_endpoint="http://localhost:11434",
        model="llama3"
    )
)

# # Uncomment to check the collection definition
# collection_definition = client.collections.export_config(collection_name)
# print(f"Name: {collection_definition.name}     Description: {collection_definition.description}")


### Import the data into the collection

Process some text files (books from Project Guttenberg) to use as project specific data. 

In [13]:
import os
import spacy
import ollama


# Initiate spacy to process the files
nlp = spacy.load('en_core_web_sm')

# Get a list of the sources
source_dir = "../inputs/"
source_files = [f for f in os.listdir(source_dir) if os.path.isfile(source_dir + f)]

for sf in source_files:
    with open(source_dir + sf, 'r') as f:
        sentences = []
        header_flag = True
        source_text = f.read()

        # Split each source file into sentence-like strings
        for s in nlp(source_text).sents:
            # Uncomment to show progress
            print("Starting new text")

            s = str(s)
            counter = 0
            new_line = 0

            # Don't include file header information
            if header_flag:
                if s.startswith('*** START'):
                    header_flag = False
                continue
            else:
                if len(s) > 0:
                    sentences.append(s)

            # Create embeddings for the sentences
            with collection.batch.dynamic() as batch:
                for snt in sentences:
                    response = ollama.embeddings(model="all-minilm", prompt=snt)
                    embedding = response["embedding"]

                    # Uncomment to show vectorizing progress
                    counter += 1
                    if (counter  == 20):
                        print(f".", end=" ")
                        counter = 0
                        new_line += 1
                        if (new_line == 2):
                            new_line = 0
                            print()

                    # Batch add objects to the collection
                    batch.add_object(
                        properties = {"text": snt},
                        vector = embedding,
                    )


Next file
. . . . . . . . . . . . . . . . . . . . . . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 
. . 


KeyboardInterrupt: 

### Check the data upload

In [None]:
# Uncomment to print the number of objects
collection = client.collections.get(collection_name)
response = collection.aggregate.over_all(total_count=True)
print(f"Collection size: {response.total_count}")

# Uncomment to print the first 3 objects
response = collection.query.fetch_objects(
    limit=3,
    include_vector=True
    )
for o in response.objects:
    pp.pprint(o.properties)
    print(o.vector)