In [None]:
!pip install -q langchain==0.0.230 eland elasticsearch huggingface-hub tqdm requests ipython GitPython
!pip install "transformers==4.31.0" "datasets[s3]==2.13.0" sagemaker --upgrade --quiet
!pip install "sagemaker>=2.163.0" --upgrade --quiet

In [None]:
import getpass
import json
import re
from pathlib import Path
from urllib.parse import quote

import requests
from elasticsearch import Elasticsearch
from IPython.display import display
from IPython.display import Markdown
from tqdm import tqdm

Now we create a prompt chain that gets the most relevant passage from Elasticsearch using a vector search, and then uses that knowledge in the prompt to the LLM.

!huggingface-cli login --token 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

In [None]:
import sagemaker
import boto3

# Throwaway session: it is only used to look up the default bucket and is
# replaced further down by a session bound to the chosen bucket.
sess = sagemaker.Session()

# Bucket that receives uploaded data, models and logs. Leave it as None to fall
# back to the session's default bucket (SageMaker creates it if it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError the
# role is looked up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Session used from here on.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from datasets import load_dataset
from random import randrange

# The dataset repository on the Hub is private: authenticate first with a Hugging Face
# token that has been added to the Yield organization.
data_files = {
    "train": "qa_pairs_train.jsonl",
    "test": "qa_pairs_test.jsonl",
}
dataset_splits = load_dataset("YieldInc/chatbot_qa_dataset_splitted", data_files=data_files)
# Only the test split is kept for the evaluation cells.
dataset = dataset_splits['test']

print(f"dataset size: {len(dataset)}")

In [None]:
# Collect the Elastic Cloud connection details interactively so that no secret is
# stored in the notebook.
es_cloud_id = getpass.getpass('Enter Elastic Cloud ID:  ')
es_user = getpass.getpass('Enter cluster username:  ')
es_pass = getpass.getpass('Enter cluster password:  ')
endpoint = getpass.getpass('Enter the URL for the Elastic Cloud Deployment Instance: ')
# Percent-encode the credentials: characters such as '@', ':' or '/' inside a username
# or password would otherwise corrupt the URL. Never print es_url, it embeds the password.
es_url = f"https://{quote(es_user, safe='')}:{quote(es_pass, safe='')}@{endpoint}:443"

index_name = getpass.getpass('Enter the document database index:  ')
es = Elasticsearch(cloud_id=es_cloud_id, basic_auth=(es_user, es_pass))
print(es.info())  # should return cluster info

In [None]:
model_id = "baai__bge-base-en" 
# ID of the text-embedding model exactly as it is listed in the Machine Learning models
# section of the Elasticsearch Kibana panel; search_question passes it to the kNN clause
# to embed the query text.

def search_question(query_text, size=5, k=5, num_candidates=20):
    """Run a hybrid (BM25 keyword + kNN vector) search and return the matching passages.

    Relies on the notebook-level globals ``es`` (Elasticsearch client), ``index_name``
    (index to query) and ``model_id`` (embedding model used to vectorise the query).

    Args:
        query_text: Natural-language question to search for.
        size: Maximum number of hits to return.
        k: Number of nearest neighbours returned by the kNN clause.
        num_candidates: Nearest-neighbour candidates considered per shard; should not
            be smaller than ``k``.

    Returns:
        A ``(body, sources)`` tuple of two equally long lists: the first ``text`` value
        of each hit, and the ``metadata.source`` of the same hit (``None`` when the
        document carries no such metadata). Hits without a ``text`` field are skipped
        instead of raising ``KeyError``.
    """
    print("Query text is", query_text)

    # Keyword (BM25) clause matched against the `text` field.
    query = {
        "bool": {
            "must": [{
                "match": {
                    "text": {
                        "query": query_text,
                        "boost": 15
                    }
                }
            }]
        }
    }

    # Vector clause: the query text is embedded server-side by `model_id`.
    # The two boosts set the relative weight of keyword search versus kNN retrieval.
    knn = {
        "field": "title-vector",
        "k": k,
        "num_candidates": num_candidates,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": model_id,
                "model_text": query_text
            }
        },
        "boost": 5
    }

    resp = es.search(index=index_name,
                     query=query,
                     knn=knn,
                     fields=["text"],  # document field returned for each hit
                     size=size,
                     source=True)

    body, sources = [], []
    for hit in resp['hits']['hits']:
        texts = (hit.get('fields') or {}).get('text')
        if not texts:
            # Nothing usable for the prompt; skip so both lists stay aligned.
            continue
        metadata = (hit.get('_source') or {}).get('metadata') or {}
        body.append(texts[0])
        sources.append(metadata.get('source'))
    return body, sources

def format_yield_validate(sample, relevant_documents):
    """Build the evaluation prompt for one QA sample.

    The prompt has three sections separated by blank lines: the question under
    ``%%% Instruction``, the retrieved passages (one per line) under
    ``%%% Context``, and a trailing ``%%% Answer`` header for the model to complete.

    Args:
        sample: Mapping that provides the question under the ``question`` key.
        relevant_documents: Iterable of passage strings to use as context.

    Returns:
        The assembled prompt string.
    """
    instruction_section = "%%% Instruction\n{}".format(sample['question'])
    context_section = "%%% Context\n{}".format('\n'.join(relevant_documents))
    answer_section = "%%% Answer"
    return "\n\n".join((instruction_section, context_section, answer_section))

In [None]:
# Draw a random QA entry, retrieve supporting passages and assemble the prompt.
random_index = randrange(len(dataset))
sample = dataset[random_index]
relevant_docs, relevant_sources = search_question(sample['question'])
prompt = format_yield_validate(sample, relevant_docs)
print(prompt)

# Generation hyperparameters sent to the LLM together with the prompt.
generation_parameters = {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.1,
    "top_k": 50,
    "max_new_tokens": 1024,
    "repetition_penalty": 1.03,
    "stop": ["</s>"],
}
payload = {"inputs": prompt, "parameters": generation_parameters}

# NOTE(review): `llm` is not defined in the visible cells; presumably a deployed
# SageMaker predictor. Confirm it exists before running this cell.
predictions = llm.predict(payload)
output = predictions[0]['generated_text']
print(f"GENERATED ANSWER: {output}\n\n")
print(f"REFERENCE ANSWER: {sample['answer']}\n\n")

In [None]:
# Exercise the deployed Lambda API directly with a random test question.
import urllib.parse
from urllib.parse import quote

# Upper bound, in seconds, on how long to wait for the Lambda response. Without a
# timeout requests.get() can block the notebook indefinitely.
LAMBDA_TIMEOUT_SECONDS = 120

url = getpass.getpass("Enter the Lambda function's URL from function configuration on AWS Lambda panel: ")
sample = dataset[randrange(len(dataset))]
relevant_docs, relevant_srcs = search_question(sample['question'])
prompt = format_yield_validate(sample, relevant_docs)
print(prompt)

# The prompt travels in the query string, so it is percent-encoded. Long prompts
# (around 3k tokens) may be too large for a URL and could make the request fail.
encoded_query = urllib.parse.quote(prompt)
full_url = f"{url}?query={encoded_query}"
# Report only the size: the URL was entered through getpass and should not end up
# in the saved cell output.
print(f"Request URL length: {len(full_url)} characters")

# Send the GET request to the Lambda function.
response = requests.get(full_url, timeout=LAMBDA_TIMEOUT_SECONDS)

if response.status_code == 200:
    try:
        answer = json.loads(response.text)
    except ValueError:
        # The body was not valid JSON; show it verbatim rather than crashing.
        answer = response.text

    # Show the generated answer next to its sources and the reference answer.
    print(f"Answer:{answer}\n")
    print(f"Sources:{relevant_srcs}")
    print(f"Reference Answer:{sample['answer']}\n")
else:
    print(f"{response.status_code} An error occurred while processing your request.")