In [1]:

import weaviate
from weaviate.util import generate_uuid5
import json
from collections import Counter

In [2]:
def json_print(data):
    print(json.dumps(data, indent=2))

In [3]:
# Instatiate the client with rest API
client = weaviate.Client("http://localhost:8080")

In [4]:
# Print the client information to confirm the modules are loaded.
meta_info = client.get_meta()
print(json.dumps(meta_info, indent=2))

{
  "hostname": "http://[::]:8080",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "name": "OpenAI Module"
    }
  },
  "version": "1.22.2"
}


In [5]:
data_chunks = []
data_dict = {}

In [6]:
#upload and prepare data
with open("/Users/tdubon/GenTextAnalyzer/illiad.txt", newline="") as f: 
    data = f.readlines()

In [7]:
len(data)

14189

In [8]:
# Convert data to json - line number, text
for idx, value in enumerate(data):
    data_dict[idx] = value.strip()
    if data_dict[idx] == "":
        del(data_dict[idx])



In [9]:
len(data_dict)


12825

In [10]:
#to read in a dict to json
datastore = json.dumps(data_dict, indent=4)
print(datastore[0:200])

{
    "0": "\ufeffThe Project Gutenberg eBook of The Iliad",
    "2": "This ebook is for the use of anyone anywhere in the United States and",
    "3": "most other parts of the world at no cost and wi


In [25]:
for i, d in data_dict.items():
    if i == 125:
        print(d)

In [27]:
"""Setup the schema, an outline requiring the data type, vectorizer and the list of classes. Note that it is essential to have your data cleaned and the categories clearly identified for this step. If using your own vectorizer, "none" should be specified for "vectorizer". """


client.schema.delete_class("Illiad")
def add_items(batch_size=50):
    client.batch.configure(batch_size=1)
    with client.batch as batch:
        for i, d in data_dict.items():
            print(f"importing items: {i+1}")
            properties = {
                "line_no": i,
                "text": d
            }
            illiad_uuid = generate_uuid5(
                'Illiad', str(i) + d)

            batch.add_data_object(
                data_object=properties,
                class_name="Illiad",
                uuid=illiad_uuid)
    client.batch.create_objects()


add_items(1)

importing items: 1
importing items: 3
importing items: 4
importing items: 5
importing items: 6
importing items: 7
importing items: 8
importing items: 9
importing items: 11
importing items: 14
importing items: 16
importing items: 17
importing items: 18
importing items: 20
importing items: 21
importing items: 23
importing items: 25
importing items: 28
importing items: 32
importing items: 33
importing items: 36
importing items: 37
importing items: 39
importing items: 40
importing items: 41
importing items: 43
importing items: 45
importing items: 46
importing items: 47
importing items: 49
importing items: 54
importing items: 55
importing items: 56
importing items: 59
importing items: 61
importing items: 63
importing items: 64
importing items: 65
importing items: 66
importing items: 67
importing items: 68
importing items: 69
importing items: 70
importing items: 71
importing items: 72
importing items: 73
importing items: 74
importing items: 75
importing items: 76
importing items: 77
importin

## Querying objects

In [439]:
#Retrieve uuid, vector
result = (
    client.query.get("Illiad",["line_no", "text"])
    .with_additional("vector")
    .with_additional("id").with_limit(1).do())

In [440]:
json_print(result)

{
  "data": {
    "Get": {
      "Illiad": [
        {
          "_additional": {
            "id": "000f798b-1ae4-5dfc-a263-f038ee31a8ef",
            "vector": [
              -0.005871944,
              -0.015564189,
              0.013893229,
              -0.008314377,
              0.011818004,
              0.0091768075,
              -0.018663552,
              -0.001494937,
              0.013738261,
              -0.0043660584,
              0.011353099,
              0.012269433,
              0.03142484,
              -0.007701242,
              -0.010032501,
              -0.0030926247,
              0.014526577,
              0.028055968,
              0.014310969,
              -0.0132261915,
              0.0037562924,
              0.008887085,
              0.00910943,
              0.0010569836,
              -0.0018444575,
              0.014782611,
              0.011117278,
              -0.032934096,
              0.03444335,
              -0.031047525,
         

In [441]:
#Query object count to verify number of records 
count = client.query.aggregate("Illiad").with_meta_count().do()
json_print(count)

{
  "data": {
    "Aggregate": {
      "Illiad": [
        {
          "meta": {
            "count": 12825
          }
        }
      ]
    }
  }
}


In [442]:
#Query object - semantic search using near text - dense search
response = (
    client.query
    .get("Illiad",["line_no","text"])
    .with_near_text({"concepts": "food"})
    .with_additional('distance')
    .with_limit(2)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Illiad": [
        {
          "_additional": {
            "distance": 0.1778084
          },
          "line_no": 5030,
          "text": "man his meal."
        },
        {
          "_additional": {
            "distance": 0.18595463
          },
          "line_no": 10647,
          "text": "food and wine; for thence is vigour and might. For no man fasting from"
        }
      ]
    }
  }
}


In [383]:
#Query object - keyword search using BM25 sparse search
response = (
    client.query
    .get("Illiad",["line_no","text"])
    .with_bm25(query="food")
    .with_additional('distance')
    .with_limit(20)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Illiad": [
        {
          "_additional": null,
          "line_no": 10647,
          "text": "food and wine; for thence is vigour and might. For no man fasting from"
        },
        {
          "_additional": null,
          "line_no": 6241,
          "text": "his soul, and desire of sweet food taketh his heart, even then the"
        },
        {
          "_additional": null,
          "line_no": 7441,
          "text": "them ambrosial food to graze withal, and golden tethers he bound about"
        },
        {
          "_additional": null,
          "line_no": 10548,
          "text": "the steerage of the ships, or were stewards there and dealt out food,"
        },
        {
          "_additional": null,
          "line_no": 13205,
          "text": "taking thought of neither food nor rest? good were even a woman\u2019s"
        },
        {
          "_additional": null,
          "line_no": 10686,
          "text": "avenged the shame. 

In [385]:
#Hybrid sparse/dense serch
response = (
    client.query
    .get("Illiad",["line_no","text"])
    .with_hybrid(query="food", alpha=0.5)
    .with_limit(15)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Illiad": [
        {
          "line_no": 10647,
          "text": "food and wine; for thence is vigour and might. For no man fasting from"
        },
        {
          "line_no": 10548,
          "text": "the steerage of the ships, or were stewards there and dealt out food,"
        },
        {
          "line_no": 7441,
          "text": "them ambrosial food to graze withal, and golden tethers he bound about"
        },
        {
          "line_no": 10686,
          "text": "avenged the shame. Till then down my throat at least nor food nor drink"
        },
        {
          "line_no": 10648,
          "text": "food shall be able to fight with the foe all day till the going down of"
        },
        {
          "line_no": 10651,
          "text": "in his going fail. But the man who having his fill of food and wine"
        },
        {
          "line_no": 10738,
          "text": "to be the food of fishes; but Achilles arose up and spake in 

In [387]:
#Query object - generative search 
response = (
    client.query
    .get("Illiad",["line_no","text"])
    .with_near_text({"concepts": "family"})
    .with_additional('distance')
    .with_generate(grouped_task="Tell me about why and when family was mentioned")
    .with_limit(2)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Illiad": [
        {
          "_additional": {
            "distance": 0.19485354,
            "generate": {
              "error": null,
              "groupedResult": "Family is mentioned in the given text in two instances. \n\nIn the first instance, family is mentioned in the phrase \"kinsfolk and my daughter in her girlhood and the lovely company of mine.\" Here, family is mentioned to refer to the narrator's relatives, specifically their daughter during her childhood. The phrase suggests that the narrator enjoyed the pleasant company of their family members, including their daughter, during that time.\n\nIn the second instance, family is mentioned in the phrase \"former husband and her city and parents.\" Here, family is mentioned to refer to the narrator's ex-spouse, their daughter's city, and her parents (presumably the narrator's in-laws). The mention of family in this context indicates that the narrator is discussing their daughter's connecti

In [None]:
#Query object - generative search with ner id
response = (
    client.query
    .get("Illiad",["line_no","text"])
    .with_near_text({"concepts": "main characters"})
    .with_additional('distance')
    .with_generate(grouped_task="What can you tell me about the characters in the list {unique_ents} and their roles")
    .with_limit(2)
    .do()
)

In [369]:
json_print(response["data"]["Get"]["Illiad"][0]["_additional"]["generate"]["groupedResult"])

"From the given list, we can identify the following characters and their roles:\n\n1. The Host: The role of the host is not specified in the given information. More context is needed to determine their significance or involvement in a particular story or narrative.\n\n2. Kaineus: Kaineus is a character from Greek mythology. He was originally born as a woman named Caenis but was transformed into a man by the gods. Kaineus is known for his invulnerability and his participation in the battle of Lapiths and Centaurs.\n\n3. Exadios: There is limited information available about Exadios. It is unclear if this character is from a specific mythological story or if they have a significant role in any particular narrative.\n\n4. Polyphemos: Polyphemos, also known as Polyphemus, is a cyclops from Greek mythology. He is most famously known for his encounter with the hero Odysseus in Homer's epic poem, \"The Odyssey.\" Polyphemos is depicted as a giant with a single eye in the middle of his forehead

In [444]:
prompt = "Summarize the role of the character Zeus, provide quotes of his most significant actions"

response = (
    client.query
    .get("Illiad",["line_no","text"])
    .with_near_text({"concepts": ["main characters", "main character actions", "main character emotions"]})
    .with_generate(grouped_task=prompt)
    .with_additional('distance')
    .with_limit(15)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Illiad": [
        {
          "_additional": {
            "distance": 0.1875931,
            "generate": {
              "error": "send POST request: Post \"https://api.openai.com/v1/chat/completions\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)",
              "groupedResult": null
            }
          },
          "line_no": 2270,
          "text": "as were chieftains gathered around him in a circle, the godlike hero"
        },
        {
          "_additional": {
            "distance": 0.18809581,
            "generate": null
          },
          "line_no": 8998,
          "text": "heroes, men of Ares\u2019 company, play the man, my friends, and be mindful"
        },
        {
          "_additional": {
            "distance": 0.19188082,
            "generate": null
          },
          "line_no": 438,
          "text": "all this tale of strife between you twain that are chiefest of the"
        },
      

### NER - Importing related vectors 

In [241]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [301]:
all_ent_labels = []

for i, d in data_dict.items():
    doc = nlp(d.strip())
    for ent in doc.ents:
        all_ent_labels.append({"ent_type": ent.label_, "ent": ent.text})

In [400]:
print(all_ent_labels[0]["ent"])
len(all_ent_labels)

Project Gutenberg eBook


8686

In [409]:
person_ents = []
counter = ["count", "name"]

In [410]:
#retrieve all unique entities
for number, dict_entry in enumerate(all_ent_labels):
    for i in dict_entry.keys():
        if i == "ent_type" and dict_entry[i] == "PERSON":
            person_ents.append(all_ent_labels[number]["ent"])
        

In [421]:
unique_ents = set(person_ents)
len(unique_ents)

568

In [420]:
c = Counter(person_ents)
print(c)

Counter({'Zeus': 470, 'Achilles': 276, 'Priam': 177, 'Patroklos': 120, 'Ares': 96, 'Aias': 88, 'Odysseus': 82, 'Agamemnon': 80, 'Aineias': 61, 'Hera': 59, 'Kronos': 50, 'Nestor': 48, 'Project Gutenberg': 46, 'Ida': 39, 'Iris': 38, 'Trojan': 34, 'Poseidon': 32, 'Teukros': 25, 'Telamon': 23, 'Athene': 22, 'Deiphobos': 21, 'Apollo': 18, 'Glaukos': 18, 'Thetis': 17, 'Thou': 16, 'Gerenia': 16, 'Lykia': 16, 'Pylos': 12, 'Agamemnon king': 12, 'Machaon': 12, 'Eurypylos': 12, 'Fate': 12, 'god': 11, 'Shaker': 11, 'Briseis': 10, 'Diomedes': 10, 'Gutenberg': 10, 'Tlepolemos': 9, 'Laomedon': 9, 'Helen': 8, 'thou hast': 8, 'Alexandros': 8, 'Oileus': 7, 'Pandaros': 7, 'Leto': 7, 'Aiakos': 7, 'Menoitios': 7, 'Chryseis': 6, 'Idomeneus': 6, 'Euaimon': 6, 'Aphrodite': 6, 'Simoeis': 6, 'Hephaistos': 6, 'Skamandros': 6, 'Tros': 6, 'thou shalt': 6, 'Hekabe': 6, 'Telamonian Aias': 6, 'Lo': 6, 'Messenger': 6, 'Achilles Peleus': 5, 'keepeth': 5, 'Kronion': 5, 'whate’er': 5, 'Menestheus': 5, 'Mekisteus': 5, 'El