Reference: https://clemenssiebler.com/posts/chatting-private-data-langchain-azure-openai-service/

# Setup

In [1]:
import os
import re

import langchain
import openai
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import FAISS

In [2]:
# Show the installed LangChain version so readers know which release produced the outputs below.
langchain.__version__

'0.0.154'

In [3]:
# Load settings via python-dotenv (presumably from a local .env file — confirm its location);
# the cells below read them through os.environ. The call's return value is displayed.
load_dotenv()

True

In [4]:
# Deployment names are read from the environment; the embedding model name is fixed.
DEPLOYMENT_GPT35 = os.environ["DEPLOYMENT_MODEL_GPT35"]
DEPLOYMENT_GPT4 = os.environ["DEPLOYMENT_MODEL_GPT4"]
EMBEDDING_MODEL = "text-embedding-ada-002"

# switch between models here
# DEPLOYMENT_NAME = DEPLOYMENT_GPT35 # significantly worse
DEPLOYMENT_NAME = DEPLOYMENT_GPT4

# Map the selected deployment to its model name.
if DEPLOYMENT_NAME == DEPLOYMENT_GPT35:
    MODEL_NAME = "gpt-3.5-turbo"
elif DEPLOYMENT_NAME == DEPLOYMENT_GPT4:
    MODEL_NAME = "gpt-4"
else:
    # Report the deployment that could not be mapped.
    raise ValueError(f"Unmapped model {DEPLOYMENT_NAME}")

In [5]:
# Sanity-check the Azure OpenAI settings before any client is created.
assert os.environ["OPENAI_API_TYPE"] == "azure"
for required_var in ("OPENAI_API_KEY", "OPENAI_API_VERSION"):
    # An empty value counts as missing.
    assert os.environ[required_var]

# The base URL must start with an https://...openai.azure.com/ prefix (match() anchors at the start).
url_pattern = r"https://.*openai\.azure\.com/"
regex = re.compile(url_pattern)
assert regex.match(os.environ["OPENAI_API_BASE"]) is not None

In [6]:
# For chat there is no need to set attributes on the openai module;
# the settings are collected in a dict and passed to AzureChatOpenAI instead.
AD_CREDENTIALS = {"deployment_name": DEPLOYMENT_NAME}
AD_CREDENTIALS.update(
    {
        # Each dict key is the lower-cased name of the environment variable it is read from.
        env_name.lower(): os.environ[env_name]
        for env_name in (
            "OPENAI_API_TYPE",
            "OPENAI_API_KEY",
            "OPENAI_API_BASE",
            "OPENAI_API_VERSION",
        )
    }
)

In [7]:
# For embeddings the settings have to be set as attributes on the openai module.
for attr_name in ("api_type", "api_key", "api_version", "api_base"):
    # openai.api_type <- OPENAI_API_TYPE, openai.api_key <- OPENAI_API_KEY, and so on.
    setattr(openai, attr_name, os.environ["OPENAI_" + attr_name.upper()])

# Talk to your data

## Create LLM and Embeddings

In [8]:
# Chat model client for the selected Azure deployment. MODEL_NAME (derived from the
# deployment in the configuration cell) is passed explicitly so the wrapper's model
# name matches the deployment instead of staying at the library default.
llm = AzureChatOpenAI(model_name=MODEL_NAME, **AD_CREDENTIALS)

In [9]:
# Embedding client for the configured embedding model.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single input,
# which the Azure endpoint is assumed to have required at the time — confirm before raising it.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, chunk_size=1)

## Load text files and split them into chunks

In [10]:
# Load the files in ./data/ that match *.txt, one TextLoader per file.
loader = DirectoryLoader(
    "./data/",
    glob="*.txt",
    loader_cls=TextLoader,
)
documents = loader.load()

# Split the loaded documents into chunks of 1000 tokens, without overlap.
text_splitter = TokenTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

In [11]:
# Number of chunks produced by the splitter.
# 5 chunks x 1000 tokens = 5k tokens, just above the 4k threshold
# (presumably the 4k-token context window of the smaller model — confirm).
len(docs)

5

## Ingest documents into FAISS

In [12]:
# Install the FAISS CPU package into the kernel's environment.
# NOTE(review): the version is unpinned; pin it and move the install to the top of the
# notebook so a fresh "Restart & Run All" is reproducible.
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [13]:
%%time
# Build the FAISS vector store from the chunks, embedding them with the client created above.
# The recorded wall time is far larger than the CPU time, so the cell is presumably
# dominated by the remote embedding calls — confirm.
db = FAISS.from_documents(documents=docs, embedding=embeddings)

CPU times: user 251 ms, sys: 43.2 ms, total: 294 ms
Wall time: 7.01 s


## Create document question-answering chat chain

In [14]:
# Template asking the model to rephrase a follow-up question as a standalone question,
# given the chat history. The text is assembled from adjacent string literals so that
# source-code indentation cannot end up inside the prompt.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(
    "Given the following conversation and a follow up question, "
    "rephrase the follow up question to be a standalone question.\n"
    "\n"
    "Chat History:\n"
    "{chat_history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)

# Conversational question-answering chain: the LLM answers using documents fetched by
# the vector-store retriever, and the source documents are returned with each result.
retriever = db.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    return_source_documents=True,
    verbose=False,
)

## Ask questions

In [15]:
# Opening turn: there is no earlier exchange, so the history starts out empty.
chat_history = list()
query = "Which film won the best picture?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: Which film won the best picture?
Answer: "Everything Everywhere All at Once" won the best picture at the 95th Academy Awards.


In [16]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "Who are the main cast members for that movie?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: Who are the main cast members for that movie?
Answer: The main cast members for the movie "Everything Everywhere All at Once" include Michelle Yeoh, Ke Huy Quan, and Jamie Lee Curtis.


In [17]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "Did Stephanie Hsu win anything?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: Did Stephanie Hsu win anything?
Answer: Based on the provided context, Stephanie Hsu was nominated for Best Supporting Actress for her role in "Everything Everywhere All at Once" at the 95th Academy Awards. However, she did not win the award, as Jamie Lee Curtis won in that category for her role in the same film.


In [18]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "What's the significance of the best actress award this time?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: What's the significance of the best actress award this time?
Answer: At the 95th Academy Awards, Jamie Lee Curtis won the Best Supporting Actress award for her role as an IRS inspector in "Everything Everywhere All at Once." The significance of this award lies in Curtis's long career as a veteran headliner in genre movies, particularly horror films like "Halloween." In her acceptance speech, she dedicated her statue to all the people who had supported her throughout her career in genre films. Additionally, she acknowledged her family history in entertainment, as both her parents, Janet Leigh and Tony Curtis, were also Oscar-nominated actors.


In [19]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "Sorry I am referring to Michell Yeoh."
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: Sorry I am referring to Michell Yeoh.
Answer: The significance of the Best Actress award for Michelle Yeoh this time is that she became the first Asian woman to be recognized as Best Actress at the Oscars. This honor came after a long career in martial arts and action movies like "Crouching Tiger Hidden Dragon" and "Yes, Madam." Her win serves as a beacon of hope and possibilities for little boys and girls who look like her, showing that they too can achieve success in the entertainment industry.


In [20]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "The best actor came from which film? Who are the other nominees of the same movie?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: The best actor came from which film? Who are the other nominees of the same movie?
Answer: Brendan Fraser won Best Actor for his performance in "The Whale." Hong Chau was also nominated from the same movie in the Best Supporting Actress category.


In [21]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "Did RRR win anything?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: Did RRR win anything?
Answer: Yes, the movie "RRR" won an award for Best Original Song at the 95th Academy Awards. The song "Naatu Naatu" from "RRR" won the award, with music by M.M. Keeravaani and lyrics by Chandrabose.


In [22]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "Who else was nominated in the same category?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: Who else was nominated in the same category?
Answer: The other nominees in the Best Original Song category at the 95th Academy Awards were:

1. "Naatu Naatu" from "RRR" — music by M.M. Keeravaani, lyric by Chandrabose
2. "Applause" from "Tell It Like a Woman" — music and lyric by Diane Warren
3. "Hold My Hand" from "Top Gun: Maverick" — music and lyric by Lady Gaga and BloodPop
4. "Lift Me Up" from "Black Panther: Wakanda Forever" — music by Tems, Rihanna, Ryan Coogler, and Ludwig Goransson; lyric by Tems and Ryan Coogler


In [23]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "What was the color theme of the award ceremony?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: What was the color theme of the award ceremony?
Answer: The 95th Academy Awards featured a notable cosmetic change by replacing the usual red carpet with a champagne-colored carpet.


In [24]:
# Only the immediately preceding exchange is carried forward as history.
last_exchange = (query, result["answer"])
chat_history = [last_exchange]
query = "What is the source url of the context article?"
result = qa(dict(question=query, chat_history=chat_history))

print(f"Question: {query}")
print(f"Answer: {result['answer']}")

Question: What is the source url of the context article?
Answer: The source URL of the context article about the color theme of the 95th Academy Awards ceremony is: https://variety.com/2023/awards/awards/oscar-winners-2023-list-1235548935/
