In [None]:
%pip install azure-identity
%pip install azure-search-documents
%pip install pymongo

In [1]:
import dotenv
import openai
import os
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.core.credentials import AzureKeyCredential
from pymongo import MongoClient
from datetime import datetime
import pytz

# Load variables from a local .env file (if one exists) into the process environment.
dotenv.load_dotenv()


def _optional_int_env(name):
    """Return environment variable ``name`` as an int, or None if it is unset or blank."""
    raw_value = os.getenv(name)
    if raw_value is None or not raw_value.strip():
        return None
    return int(raw_value)


# Initialize Azure search variables
AZURE_AI_SEARCH_ENDPOINT = os.getenv("AZURE_AI_SEARCH_ENDPOINT")
AZURE_AI_SEARCH_API_KEY = os.getenv("AZURE_AI_SEARCH_API_KEY")
AZURE_AI_SEARCH_INDEX = os.getenv("AZURE_AI_SEARCH_INDEX")
# Passed as `top=` to SearchClient.search, which takes an integer; None means
# "do not send a limit".
AZURE_AI_SEARCH_RESULTS = _optional_int_env("AZURE_AI_SEARCH_RESULTS")
# os.getenv already returns a str when the variable is set. Wrapping it in
# str() only had an effect when it was unset, turning None into the literal
# configuration name "None".
AZURE_AI_SEARCH_SEMANTIC = os.getenv("AZURE_AI_SEARCH_SEMANTIC")
AZURE_AI_SEARCH_CONTENT = os.getenv("AZURE_AI_SEARCH_CONTENT")
AZURE_AI_SEARCH_FILEPATH = os.getenv("AZURE_AI_SEARCH_FILEPATH")
AZURE_AI_SEARCH_VECTOR = os.getenv("AZURE_AI_SEARCH_VECTOR")

# Set up OpenAI client based on environment variables
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
# Used as `max_tokens` for chat completions. None (variable unset) leaves the
# response length uncapped instead of failing here with an opaque
# "int() argument must be ... not 'NoneType'" TypeError.
AZURE_OPENAI_MAX_RESPONSE = _optional_int_env("AZURE_OPENAI_MAX_RESPONSE")

# Key-based credential shared by every SearchClient created in this notebook.
azure_credential = AzureKeyCredential(AZURE_AI_SEARCH_API_KEY)

# Single Azure OpenAI client used for both embeddings and chat completions.
openai_client = openai.AzureOpenAI(
    api_version="2023-07-01-preview",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
)

def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai_client.embeddings.create`` using the
    ``AZURE_OPENAI_ADA_DEPLOYMENT`` model and returns the ``embedding``
    attribute of the first item in the response's ``data`` list.
    """
    embeddings_api = openai_client.embeddings
    reply = embeddings_api.create(
        input=text,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    first_item = reply.data[0]
    return first_item.embedding

def save_chat_history(input, content, output):
    """Save one chat exchange to the Azure Cosmos DB (MongoDB API) collection.

    The connection string, database name and collection name are read from
    the ``AZCOSMOS_CONNSTR``, ``AZCOSMOS_CHAT_DATABASE_NAME`` and
    ``AZCOSMOS_CHAT_CONTAINER_NAME`` environment variables on every call.

    Args:
        input: The user's question. The name shadows the builtin ``input``
            inside this function; it is kept so existing callers that pass
            it by keyword keep working.
        content: The source text that accompanied the question.
        output: The generated answer.

    Returns:
        None. The record is inserted with ``type`` fixed to ``"pdf_chat"``
        and a ``timestamp`` string formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    # Cosmos DB settings
    connection_string = os.getenv("AZCOSMOS_CONNSTR")
    database_name = os.getenv("AZCOSMOS_CHAT_DATABASE_NAME")
    collection_name = os.getenv("AZCOSMOS_CHAT_CONTAINER_NAME")

    # Get the current timestamp in UTC+8
    utc_8 = pytz.timezone("Asia/Singapore")
    timestamp = datetime.now(utc_8).strftime("%Y-%m-%d %H:%M:%S")

    # Define the record to insert
    record = {
        "input": input,
        "content": content,
        "output": output,
        "type": "pdf_chat",
        "timestamp": timestamp,
    }

    # A client is created per call, so it must also be closed per call.
    # The context manager does that even when the insert raises; without it
    # every saved message left an open client behind.
    with MongoClient(connection_string) as client:
        client[database_name][collection_name].insert_one(record)
    

In [2]:
# Client for the configured Azure AI Search index.
search_client = SearchClient(AZURE_AI_SEARCH_ENDPOINT, AZURE_AI_SEARCH_INDEX, credential=azure_credential)

# Named `search_query` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
search_query = "What is a Public Key Infrastructure?"
search_vector = get_embedding(search_query)

# Environment values arrive as strings, while `top` takes an integer.
# None or an empty value means "send no limit".
top_results = int(AZURE_AI_SEARCH_RESULTS) if AZURE_AI_SEARCH_RESULTS not in (None, "") else None

# One request carrying the query text, a vector query over the configured
# vector field, and the semantic query type.
r = search_client.search(
        search_query,
        top=top_results,
        vector_queries=[
                VectorizedQuery(vector=search_vector, k_nearest_neighbors=50, fields=AZURE_AI_SEARCH_VECTOR)],
        query_type="semantic",
        semantic_configuration_name=AZURE_AI_SEARCH_SEMANTIC)

# Initialize an empty list to accumulate all content and sources
all_content_list = []

# Iterate over the search results
for doc in r:
    # A null content field would otherwise fail on .replace with AttributeError.
    content = (doc[AZURE_AI_SEARCH_CONTENT] or "").replace("\n", " ")
    source = doc[AZURE_AI_SEARCH_FILEPATH]
    combined_content = f"Content: {content} Source: {source}"
    all_content_list.append(combined_content)  # Append combined content and source to the list

# Join the list into a single string with a space separator
all_content = " ".join(all_content_list)


SYSTEM_MESSAGE = """Assistant helps user questions about the circulars. 
    Answer only with the facts listed in the sources provided.
    If there isn't enough information provided, say you don't know.
    Each source has a Content followed by a colon which contains the actual information including the source name for each fact you use.
    Use square brackets to reference the source, for example [info1.txt]"""

USER_MESSAGE = search_query + "\nSources:" + all_content

response = openai_client.chat.completions.create(
    model=AZURE_OPENAI_DEPLOYMENT,
    temperature=0.0,
    max_tokens=AZURE_OPENAI_MAX_RESPONSE,
    messages=[
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": USER_MESSAGE},
    ],
    top_p=0.5
)

answer = response.choices[0].message.content

# Show the answer before persisting it, so a failure while writing the chat
# history cannot hide an answer that was already generated.
print(f"Answer: {answer}")

save_chat_history(search_query, all_content, answer)

  client = MongoClient(connection_string)


Answer: A Public Key Infrastructure (PKI) is essentially a set of hardware, software, policies, personnel, and procedures needed to create, manage, distribute, use, store, and revoke digital certificates. It is encapsulated in the Department of Information and Communications Technology's (DICT) Philippine National Public Key Infrastructure (PNPKI) services, which include Certificate Authority & Registration Authority, Validation Authority, and Timestamping services [CIRCULAR-LETTER-NO-2024-7-DATED-FEBRUARY-27-2024.pdf].


In [82]:
# Client for the configured Azure AI Search index.
search_client = SearchClient(AZURE_AI_SEARCH_ENDPOINT, AZURE_AI_SEARCH_INDEX, credential=azure_credential)

search_query = "I have a worker who helped me in my project and I want to pay him token for the service he provided as a thank you. how much can i give him?"
search_vector = get_embedding(search_query)

# Environment values arrive as strings, while `top` takes an integer.
# None or an empty value means "send no limit".
top_results = int(AZURE_AI_SEARCH_RESULTS) if AZURE_AI_SEARCH_RESULTS not in (None, "") else None

# NOTE(review): the vector field, semantic configuration, and the "content" /
# "filepath" keys below are hard-coded, whereas the other query cell reads
# them from the AZURE_AI_SEARCH_* settings. Confirm they match the index
# before switching this cell to the shared constants.
r = search_client.search(
        search_query,
        top=top_results,
        vector_queries=[
                VectorizedQuery(vector=search_vector, k_nearest_neighbors=50, fields="contentVector")],
        query_type="semantic",
        semantic_configuration_name="azureml-default")

# Initialize an empty list to accumulate all content and sources
all_content_list = []

# Iterate over the search results
for doc in r:
    # A null content field would otherwise fail on .replace with AttributeError.
    content = (doc["content"] or "").replace("\n", " ")
    source = doc["filepath"]
    combined_content = f"Content: {content} Source: {source}"
    all_content_list.append(combined_content)  # Append combined content and source to the list

# Join the list into a single string with a space separator
all_content = " ".join(all_content_list)

# Dump the retrieved context to a text file for inspection. The encoding is
# explicit because document text may contain characters that the platform's
# default encoding cannot represent.
with open("all_content.txt", "w", encoding="utf-8") as file:
    file.write(all_content)