In [20]:
#!pip install -U weaviate-client
!pip install "weaviate-client==3.*" # Version 4 of the client does not work

Collecting weaviate-client==3.*
  Downloading weaviate_client-3.26.2-py3-none-any.whl.metadata (3.4 kB)
Collecting validators<1.0.0,>=0.21.2 (from weaviate-client==3.*)
  Downloading validators-0.22.0-py3-none-any.whl.metadata (4.7 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client==3.*)
  Downloading Authlib-1.3.0-py2.py3-none-any.whl.metadata (3.8 kB)
Downloading weaviate_client-3.26.2-py3-none-any.whl (120 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.4/120.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Authlib-1.3.0-py2.py3-none-any.whl (223 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.7/223.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading validators-0.22.0-py3-none-any.whl (26 kB)
Installing collected packages: validators, authlib, weaviate-client
Successfully installed authlib-1.3.0 validators-0.22.0 weaviate-client-3.26.2


In [None]:
!pip install openai

In [1]:
import pandas as pd
import numpy as np
import os

import weaviate

In [21]:
# Connection settings and secrets.
# Values are read from environment variables so real keys never have to be typed
# into (or committed with) the notebook. The original placeholders remain as
# fallbacks, so the cell still runs when the variables are not set.
WCS_ENDPOINT = os.environ.get("WCS_ENDPOINT", "https://my-sandbox1-1486fdzz.weaviate.network/")
WCS_API_KEY = os.environ.get("WCS_API_KEY", "YOUR-API-KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", 'YOUR-API-KEY')

# Name of the Weaviate class (collection) that stores the OHS Act chunks.
VECTOR_DB_NAME = 'OHS_ACT_VDB'

In [5]:
# Directory holding the input files; the read cell further down builds its file path from it.
# NOTE(review): the folder is named 'movie-summaries-cmu' but the recorded directory
# listing shows OHS Act files — confirm this is the intended dataset path.
base_path = '../input/movie-summaries-cmu/'

In [4]:
# List the input directory via `base_path` instead of repeating the literal,
# so the listing always matches the path the rest of the notebook reads from.
os.listdir(base_path)

['Amended Act - Occupational Health and Safety.doc', 'ohs-act.txt']

## Read the file

In [9]:
# NOTE(review): the recorded directory listing shows 'ohs-act.txt' directly inside
# base_path, with no 'ohs-act-data/' sub-folder — confirm this path on a fresh run.
file_path = base_path + 'ohs-act-data/ohs-act.txt'

# Read explicitly as UTF-8: the text contains non-ASCII characters (U+2028 line
# separators show up in the query results), so the platform default encoding
# must not be relied on.
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# The document is split on the '#' symbol; each piece becomes one chunk,
# with surrounding whitespace removed.
lines = content.split("#")
chunk_list = [line.strip() for line in lines]

len(chunk_list)

In [15]:
# Show the first chunk to confirm the split produced readable text.
chunk_list[0]

'REPUBLIC OF SOUTH AFRICA {section title}\n\nNo. 85 of 1993: Occupational Health and Safety Act\nas amended by\nOccupational Health and Safety Amendment Act, No. 181 Of 1993\n\nACT\nTo provide for the health and safety of persons at work and for the health and safety of persons in connection with the use of plant and machinery; the protection of persons other than persons at work against hazards to health and  safety arising out of or in connection with the activities of persons at work; to establish an advisory council for occupational health and safety; and to provide for matters connected therewith.'

## Create a list of dicts

In [None]:
# Illustration only: the shape Weaviate expects for uploads is a list of
# flat dicts, one dict per object, with property names as keys.

data = [
    dict(
        title="Object0",
        foo=99,
        quote_text="The quick brown fox jumps over the lazy dog.",
    ),
    dict(
        title="Object1",
        foo=77,
        quote_text="A nimble red fox leaped over the sleeping hound.",
    ),
]
   

In [16]:
# Wrap every chunk in a record that also carries its position in the document.
data_list = [
    {"chunk_id": position, "chunk_text": text}
    for position, text in enumerate(chunk_list)
]

len(data_list)

48

In [19]:
# Show the first record to confirm it has the `chunk_id` / `chunk_text` keys.
data_list[0]

{'chunk_id': 0,
 'chunk_text': 'REPUBLIC OF SOUTH AFRICA {section title}\n\nNo. 85 of 1993: Occupational Health and Safety Act\nas amended by\nOccupational Health and Safety Amendment Act, No. 181 Of 1993\n\nACT\nTo provide for the health and safety of persons at work and for the health and safety of persons in connection with the use of plant and machinery; the protection of persons other than persons at work against hazards to health and  safety arising out of or in connection with the activities of persons at work; to establish an advisory council for occupational health and safety; and to provide for matters connected therewith.'}

## Create the Weaviate vector database

In [22]:
import weaviate
import json

# Both values come from the configuration cell: the Weaviate instance API key
# authenticates against the cluster, and the OpenAI key is forwarded as a
# header so the instance can call the inference API.
weaviate_auth = weaviate.auth.AuthApiKey(api_key=WCS_API_KEY)
inference_headers = {"X-OpenAI-Api-Key": OPENAI_API_KEY}

client = weaviate.Client(
    url=WCS_ENDPOINT,
    auth_client_secret=weaviate_auth,
    additional_headers=inference_headers,
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [23]:
# Start from a clean slate.
# CAUTION: this deletes the existing collection together with its objects.
if client.schema.exists(VECTOR_DB_NAME):
    client.schema.delete_class(VECTOR_DB_NAME)

# In Weaviate a "class" plays the role of a database table.
# Both OpenAI modules are enabled with their default settings:
#   - text2vec-openai embeds the objects (with vectorizer "none" you would
#     have to supply vectors yourself; any other "text2vec-*" module also works)
#   - generative-openai is required for the generative queries later on
module_config = {
    "text2vec-openai": {},
    "generative-openai": {},
}

class_obj = {
    "class": VECTOR_DB_NAME,
    "vectorizer": "text2vec-openai",
    "moduleConfig": module_config,
}

client.schema.create_class(class_obj)

## Upload the data to the vector database

The vectorization will be done by the database using OpenAI embeddings.

In [None]:
    # Reference only (not executed): the shape of each record in `data_list`
    # that the next cell uploads. The bare indented fragment that used to be
    # here raised an IndentationError when the notebook was run top to bottom.
    #
    # data = {
    #     "chunk_id": i,
    #     "chunk_text": chunk
    # }

In [24]:
import requests
import json

# Objects are sent to Weaviate in groups of five.
client.batch.configure(batch_size=5)

with client.batch as batch:
    for position, record in enumerate(data_list, start=1):
        print(f"importing chunk: {position}")
        batch.add_data_object(
            data_object={
                "chunk_id": record["chunk_id"],
                "chunk_text": record["chunk_text"],
            },
            class_name=VECTOR_DB_NAME,
        )

importing chunk: 1
importing chunk: 2
importing chunk: 3
importing chunk: 4
importing chunk: 5
importing chunk: 6
importing chunk: 7
importing chunk: 8
importing chunk: 9
importing chunk: 10
importing chunk: 11
importing chunk: 12
importing chunk: 13
importing chunk: 14
importing chunk: 15
importing chunk: 16
importing chunk: 17
importing chunk: 18
importing chunk: 19
importing chunk: 20
importing chunk: 21
importing chunk: 22
importing chunk: 23
importing chunk: 24
importing chunk: 25
importing chunk: 26
importing chunk: 27
importing chunk: 28
importing chunk: 29
importing chunk: 30
importing chunk: 31
importing chunk: 32
importing chunk: 33
importing chunk: 34
importing chunk: 35
importing chunk: 36
importing chunk: 37
importing chunk: 38
importing chunk: 39
importing chunk: 40
importing chunk: 41
importing chunk: 42
importing chunk: 43
importing chunk: 44
importing chunk: 45
importing chunk: 46
importing chunk: 47
importing chunk: 48


In [25]:
# Confirm the upload: ask Weaviate how many objects the collection now holds.

count_query = client.query.aggregate(VECTOR_DB_NAME).with_meta_count()
response = count_query.do()

print(response)

{'data': {'Aggregate': {'OHS_ACT_VDB': [{'meta': {'count': 48}}]}}}


## Run a similarity search

In [31]:
# Run a similarity search

import weaviate
import json

# Fresh client for the search, built from the same endpoint and keys as before.
client = weaviate.Client(
    url=WCS_ENDPOINT,
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=WCS_API_KEY),
    additional_headers={"X-OpenAI-Api-Key": OPENAI_API_KEY},
)

query_text = "What is an inspector supposed to do?"

# Fetch the five chunks nearest to the question, each with its distance and
# object id. (Add "vector" to the additional fields to also get the embedding.)
search = client.query.get(VECTOR_DB_NAME, ["chunk_id", "chunk_text"])
search = search.with_near_text({"concepts": [query_text]})
search = search.with_limit(5)
search = search.with_additional(["distance", "id"])
response = search.do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "OHS_ACT_VDB": [
                {
                    "_additional": {
                        "distance": 0.18719542,
                        "id": "015fea51-69b5-49d6-b509-b3a7973c915e"
                    },
                    "chunk_id": 26,
                    "chunk_text": "Functions of inspectors {section title}\u2028\nAn inspector may, for the purposes of this Act-\u2028\nwithout previous notice, at all reasonable times, enter any premises which are occupied or used by an employer or on or in which an employee performs any work or any plant or machinery is used, or which he suspects to be such premises;\u2028\nquestion any person who is or was on or in such premises, either alone or in the presence of any other person, on any matter to which this Act relates;\u2028\nrequire from any person who has control over or custody of a book, record or other document on or in those premises, to produce to him forthwith, or at such time and pl

## Create a natural language answer

In [36]:
# Generative search: retrieve the nearest chunks, then have Weaviate's
# generative module answer one question over all of them (a "grouped task").
# https://weaviate.io/developers/weaviate/quickstart#generative-search-grouped-task
# NOTE: no model is chosen here, so it is unclear which OpenAI model writes
# the answer — it is not necessarily ChatGPT.

query_text1 = "What is the definition of listed work?"  # used for retrieval
query_text2 = "What is the definition of listed work?"  # used as the generation task

generative_query = client.query.get(VECTOR_DB_NAME, ["chunk_id", "chunk_text"])
generative_query = generative_query.with_near_text({"concepts": [query_text1]})
generative_query = generative_query.with_generate(grouped_task=query_text2)
generative_query = generative_query.with_limit(5)
# Add "vector" to the additional fields to also return the embedding.
generative_query = generative_query.with_additional(["distance", "id"])
response = generative_query.do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "OHS_ACT_VDB": [
                {
                    "_additional": {
                        "distance": 0.17520219,
                        "generate": {
                            "error": null,
                            "groupedResult": "Listed work refers to any work that has been declared by the Minister, through a notice in the Gazette, to be listed work. This declaration is made under specific conditions or circumstances specified in the notice. Before declaring any work to be listed work, the Minister must publish a draft notice in the Gazette and invite interested persons to submit written comments and representations within a specified period. A minimum of three months must elapse between the publication of the draft notice and the final notice.\n\nEmployers who have employees undertaking listed work or who may be exposed to hazards from listed work have certain duties. They must identify the hazards and evaluate the risks as

## Use OpenAI to create a natural language response

In [None]:
# Retrieve the chunks that will serve as context for the OpenAI prompt.

query_text = "What is an inspector supposed to do?"

# Five nearest chunks, each returned with its distance and object id.
context_query = client.query.get(VECTOR_DB_NAME, ["chunk_id", "chunk_text"])
context_query = context_query.with_near_text({"concepts": [query_text]})
context_query = context_query.with_limit(5)
context_query = context_query.with_additional(["distance", "id"])
response = context_query.do()

print(json.dumps(response, indent=4))

In [None]:
# Get the chunks for the context


In [None]:
# Collect the chunk texts from the similarity-search `response` of the query
# cell above. The result is nested as data -> Get -> <class name> -> list of hits.
# (This cell previously read from `df_sorted`, which is never defined in this notebook.)
hits = response["data"]["Get"][VECTOR_DB_NAME]
pred_text_list = [hit["chunk_text"] for hit in hits]

# Keep the first three hits as context — presumably the closest matches, since
# the search is a near-text query; verify the ordering if it matters.
context = pred_text_list[0:3]

# Join the chunks as plain text so the prompt does not contain a Python list repr.
context_text = "\n\n".join(context)

# Prepare the prompt
prompt = f"""
Excerpts from the South African Occupational Health and Safety Act (OHS Act): 
{context_text}
Question: {query_text}

Extract the answer to the question from the text provided. 
If the text doesn't contain the answer, 
reply that the answer is not available."""

In [None]:
from openai import OpenAI

# Use a dedicated name so the Weaviate `client` used by the query cells above
# is not overwritten (re-running those cells would otherwise fail).
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# "gpt-3.5-turbo" is the maintained alias; the dated snapshot
# "gpt-3.5-turbo-0301" has been retired by OpenAI and is rejected by the API.
completion = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful legal assistant who is an expert on the South African OHS Act."},
        {"role": "user", "content": prompt},
    ],
)

# The answer text lives on the first (and only requested) choice.
print(completion.choices[0].message.content)