In [None]:
# Note:
# The database containing the vectors for the OHS Act chunks has already been created on Weaviate.
# The vector database is called: OHS_ACT_VDB

In [None]:
#!pip install -U weaviate-client
!pip install "weaviate-client==3.*" # Version 4 of the client does not work

In [None]:
!pip install openai

In [None]:
import pandas as pd
import numpy as np
import os

import weaviate

In [None]:
# Connection settings and credentials.
# Each value is read from an environment variable of the same name so that real
# keys never have to be typed into (or saved with) the notebook. The original
# placeholder strings are kept as fallbacks when a variable is not set.
WCS_ENDPOINT = os.environ.get("WCS_ENDPOINT", "https://my-sandbox1-1486fdzz.weaviate.network/") # Weaviate
WCS_API_KEY = os.environ.get("WCS_API_KEY", "YOUR-API-KEY") # Weaviate
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", 'YOUR-API-KEY')
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", 'YOUR-API-KEY')

# Name of the vector database on Weaviate that the queries below target
VECTOR_DB_NAME = 'OHS_ACT_VDB'

In [None]:
# Base directory for input files.
# NOTE(review): this points at a movie-summaries dataset, and the variable is not
# referenced by any other cell visible here; confirm whether it is still needed.
base_path = '../input/movie-summaries-cmu/'

In [None]:
# List the contents of the input directory.
# Uses `base_path` rather than repeating the path literal, and evaluates to an
# empty list when the directory is absent so a fresh "Run All" does not stop
# here with FileNotFoundError.
os.listdir(base_path) if os.path.isdir(base_path) else []

## Connect to the Weaviate vector database

In [None]:
import weaviate
import json

# Third-party API keys are passed to the client as additional request headers.
api_headers = {
    "X-OpenAI-Api-Key": OPENAI_API_KEY,
    "X-Cohere-Api-Key": COHERE_API_KEY,
}

# API-key credentials for the Weaviate instance itself.
wcs_auth = weaviate.auth.AuthApiKey(api_key=WCS_API_KEY)

wcs_client = weaviate.Client(
    url=WCS_ENDPOINT,
    auth_client_secret=wcs_auth,
    additional_headers=api_headers,
)

# Last expression of the cell: the readiness flag is shown as the cell output.
wcs_client.is_ready()

## Run a keyword (BM25) search

In [None]:
# Keyword (BM25) search: fetch the two best-matching chunks for a single term.
query_text = "machinery"

keyword_query = wcs_client.query.get(VECTOR_DB_NAME, ["chunk_id", "chunk_text"])
keyword_query = keyword_query.with_bm25(query=query_text).with_limit(2)
response = keyword_query.do()

# Pretty-print the raw response.
print(json.dumps(response, indent=4))

## Run a hybrid search

In [None]:
# Hybrid search for a natural-language question, keeping the top 10 chunks.
query_text = "What is the definition of listed work?"

response = (
    wcs_client.query
    .get(VECTOR_DB_NAME, ["chunk_id", "chunk_text"])
    .with_hybrid(query=query_text, alpha=0.5)
    .with_limit(10)
    # "score" is the relevance value reported for hybrid results, so it is requested
    # alongside the original fields. "distance" and "id" are kept so the response
    # still carries everything it did before.
    # NOTE(review): "distance" may come back empty for hybrid queries; confirm.
    # To also return the stored vector, add "vector" as a separate list item.
    .with_additional(["score", "distance", "id"])
    .do()
)

print(json.dumps(response, indent=4))

## Use OpenAI to create a natural language response

### Extract the context

In [None]:
# Get the chunks for the context.
# The result list is keyed by the same name that was passed to `.get(...)` in the
# query, so index with VECTOR_DB_NAME instead of repeating the literal; the two
# can then never drift apart.
response_list = response['data']['Get'][VECTOR_DB_NAME]

len(response_list)

In [None]:
# Peek at the text of the second retrieved chunk (index 1).
second_chunk = response_list[1]
text = second_chunk['chunk_text']

text

In [None]:
# Extract just the text of every retrieved chunk, preserving the result order.
pred_text_list = [chunk['chunk_text'] for chunk in response_list]

len(pred_text_list)

In [None]:
# Display the text of the third extracted chunk (index 2).
pred_text_list[2]

### Create the natural language response

In [None]:
# ** Here we are selecting the chunk that we know contains the definitions.
# It seems that the natural language output is not good when we provide
# a lot of text in the context.
# NOTE(review): index 5 is tied to the ordering of the current search results;
# confirm it still points at the definitions chunk if the query or data changes.
context = pred_text_list[5]

# Prepare the prompt: the selected excerpt, the question, and the instruction
# to answer only from the excerpt.
prompt = f"""
Excerpts from the South African Occupational Health and Safety Act (OHS Act): 
{context}
Question: {query_text}

Extract the answer to the question from the text provided. 
If the text doesn't contain the answer, 
reply that the answer is not available."""

# Show the question that was placed in the prompt
query_text

In [None]:
from openai import OpenAI

openai_client = OpenAI(api_key=OPENAI_API_KEY)

# Conversation sent to the model: a system persona followed by the prepared prompt.
chat_messages = [
    {"role": "system", "content": "You are a helpful legal assistant who is an expert on the South African OHS Act."},
    {"role": "user", "content": prompt},
]

completion = openai_client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    messages=chat_messages,
)

# Print the text of the first returned choice.
answer_text = completion.choices[0].message.content
print(answer_text)

In [None]:
# Number of extracted chunk texts
len(pred_text_list)

## Apply reranking inside Weaviate

Take note that after reranking the results, the definitions now appear first.

In [None]:
# Ref: https://weaviate.io/developers/weaviate/search/rerank

# *** Notes:
# 1. Here we are doing hybrid search
# 2. If we specify 10 results then the definitions section will appear after reranking.
# 3. If we specify only 3 results then the definitions section will not be selected
# during the hybrid search and as a result the definitions section won't be available for reranking.
# 4. The example in the Weaviate docs writes the query as a literal inside the rerank clause:
#      rerank(property: "chunk_text" query: "What is the definition of listed work?") { score }
#    Below, the same clause is built from the query_text variable instead.

query_text = "What is the definition of listed work?"

# json.dumps wraps the query in double quotes and escapes any quotes or
# backslashes inside it, so the rerank clause stays well-formed for any query
# text. For the query above the resulting clause is identical to the literal form.
rerank_query = json.dumps(query_text)
rerank_clause = f"rerank(property: \"chunk_text\" query: {rerank_query}) {{ score }}"

response = (
    wcs_client.query
    .get(VECTOR_DB_NAME, ["chunk_id", "chunk_text"])
    .with_hybrid(query=query_text, alpha=0.5)
    .with_additional(rerank_clause)
    .with_limit(10)
    .do()
)

print(json.dumps(response, indent=4))
