# Download content for FAQ DB

In [1]:
# First, download the JSON document that will serve as our FAQ database.
# To build this JSON yourself, follow the parse_faq_document.ipynb notebook.
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-07-04 12:16:58--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-07-04 12:16:58--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-07-04 12:16:59 (44.6 MB/s) - ‘documents.json’ saved [658332/658332]



In [2]:
# inspect the file
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {


In [3]:
# Let's load the documents

import json

# Read the nested FAQ file. The encoding is pinned to UTF-8 so the cell
# behaves the same on platforms whose default text encoding is different.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course structure into a single list of FAQ entries.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        # Tag every entry with the course it belongs to.
        doc['course'] = course_name
        documents.append(doc)

In [4]:
# inspect created documents list
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
len(documents)

948

# Index these documents / build a small search engine with Elasticsearch


In [6]:
from elasticsearch import Elasticsearch

In [7]:
# Client for the Elasticsearch node at localhost:9200.
es = Elasticsearch("http://localhost:9200")
# Last expression of the cell: the node/cluster info is shown as the cell output.
es.info()

ObjectApiResponse({'name': '5c03b6f80a2a', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'cKaC9MVfQVGFxlfb7R14MQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
# Name of the index that will hold the FAQ entries
# (an index in elasticsearch is like a table in a "usual" database).
index_name = "course-questions"

# Settings and field mappings to use when the index gets created.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},        # actual answer from the FAQ doc
            "section": {"type": "text"},     # topic
            "question": {"type": "text"},    # actual question from the FAQ doc
            "course": {"type": "keyword"},   # keyword used to filter the search
        },
    },
}

In [9]:
# Start from a clean index on every run: if an earlier run (e.g. before a
# notebook restart) left the index behind, drop it so that re-running the
# notebook begins with an empty index.
if es.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists.")
    es.indices.delete(index=index_name)
    print(f"Index '{index_name}' deleted.")

# Single creation path, shared by the "fresh" and the "recreated" case.
response = es.indices.create(index=index_name, body=index_settings)
print("Index created:", response)

Index 'course-questions' already exists.
Index 'course-questions' deleted.
Index created: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'}


In [10]:
# Now we're ready to index all the documents
from tqdm.auto import tqdm

# Index the FAQ entries one at a time; tqdm shows a progress bar.
# NOTE(review): this sends one request per document — a bulk request
# (e.g. elasticsearch.helpers.bulk) would likely be faster; confirm before changing.
for doc in tqdm(documents):

    es.index(index=index_name, 
             document=doc)

  from .autonotebook import tqdm as notebook_tqdm


ERROR! Session/line number was not unique in database. History logging moved to new session 3


100%|██████████████████████| 948/948 [00:22<00:00, 41.81it/s]


In [11]:
# Retrieving from the docs
user_question = "How do I join the course after it has started?"

In [12]:
# Query body: full-text match on the user's question, restricted to one course.
search_query = {
    "size": 5,  # Limit the search results to 5
    "query": {
        "bool": {  # Boolean query to combine multiple conditions
            "must": {  # Condition that must be met
                # Search multiple fields for the query
                "multi_match": {  

                    # The user's question to search for
                    "query": user_question,  
                    # Fields to search in, with "question" field boosted by 3
                    "fields": ["question^3", "text", "section"],  
                    # Use the best matching field for scoring
                    "type": "best_fields"  
                }
            },
            "filter": {  # Filter to narrow down the results
                "term": {  # Term filter to match exact value
                    "course": "data-engineering-zoomcamp"  # Only include docs from this course
                }
            }
        }
    }
}


In [13]:
# ask elasticsearch to get answers for search_query 
response = es.search(index=index_name, 
                     body=search_query)

In [14]:
# Output a formatted view of the response: for every hit print its section,
# its question and the first 60 characters of its answer.
for hit in response['hits']['hits']:
    doc = hit['_source']  # the stored document fields of this hit
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [15]:
# Cleaning it

# We can make it cleaner by putting it into a function
def retrieve_documents(query, index_name="course-questions", max_results=5,
                       course="data-engineering-zoomcamp"):
    """Search the FAQ index and return the best-matching documents.

    Args:
        query: Free-text user question matched against the "question",
            "text" and "section" fields.
        index_name: Name of the Elasticsearch index to search.
        max_results: Maximum number of hits to return.
        course: Exact value the ``course`` keyword field of a hit must have.
            Defaults to the value that used to be hard-coded in the filter.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # matches in "question" are boosted by 3
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    # A client is created per call so the function stays self-contained;
    # close it afterwards so its connections are not left open.
    es = Elasticsearch("http://localhost:9200")
    try:
        response = es.search(index=index_name, body=search_query)
    finally:
        es.close()

    return [hit['_source'] for hit in response['hits']['hits']]

In [16]:
 # print the answers

# Same question as before, now answered through the helper function.
user_question = "How do I join the course after it has started?"

# retrieve_documents returns the hits' documents directly (a list of dicts).
response = retrieve_documents(user_question)

# Print section, question and the first 60 characters of each answer.
for doc in response:
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



# Generation - Answering questions

In [17]:
# Note: Make sure we have the SDK installed and the key is set.

In [18]:
# How to communicate with ChatGPT.
# If you get an error, restart Jupyter Notebook from the terminal:
# the notebook server was probably started before the API key was set.
# You may need to delete documents.json too.
from openai import OpenAI

# No key is passed explicitly, so the client has to pick it up from its
# environment (presumably the OPENAI_API_KEY variable — verify for your setup).
client = OpenAI()

# Ask the model directly, without any FAQ context.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", 
               "content": "The course already started. Can I still join?"}]
)
print(response.choices[0].message.content)

It depends on the policies of the course instructor or institution. Some courses may allow late registration or enrollment, while others may have strict deadlines for joining. It is best to reach out to the instructor or the institution offering the course to inquire about the possibility of joining late.


In [None]:
# Once OpenAI API is set up, let's build a Prompt
# First, we put all the documents together in one string

In [19]:
# Layout of one FAQ entry inside the context: three consecutive lines.
context_template = "Section: {section}\nQuestion: {question}\nAnswer: {text}"

# Fetch the FAQ entries that best match the current user question.
context_docs = retrieve_documents(user_question)

# Render every entry, each one preceded by a blank-line separator.
context_result = ""
for doc in context_docs:
    doc_str = context_template.format(**doc)
    context_result = context_result + "\n\n" + doc_str

# Strip the leading separator (and any surrounding whitespace) before showing it.
context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [20]:
# Now build the actual prompt. Telling the model to return "NONE" when the
# CONTEXT lacks the answer is meant to keep it from hallucinating one.

prompt = f"""
You're a course teaching assistant. 

Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. 

If the CONTEXT doesn't contains the answer, return "NONE".

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [23]:
# Now send the context-augmented prompt to the OpenAI API, so the model's
# answer is based on the documents retrieved from the FAQ index.

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", 
               "content": prompt}]
)
# Keep the text of the first choice and show it as the cell output.
answer = response.choices[0].message.content
answer

'You can still join the course after it has started by submitting the homeworks, but there will be deadlines for turning in the final projects.'

In [28]:
# Test it for hallucination: ask something the FAQ context does not answer
# and check that the model falls back to "NONE".
# NOTE(review): `context` is reused from the earlier cell, i.e. it was
# retrieved for `user_question`, not for `faulty_user_question` — confirm
# that this is intended for the test.
faulty_user_question = "Will there be presents?"

# Same prompt text as before, with only the QUESTION swapped.
faulty_prompt = f"""
You're a course teaching assistant. 

Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. 

If the CONTEXT doesn't contains the answer, return "NONE".

QUESTION: {faulty_user_question}

CONTEXT:

{context}
""".strip()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", 
               "content": faulty_prompt}]
)
# Overwrites the `answer` of the previous cell with the reply to the faulty prompt.
answer = response.choices[0].message.content
answer

'NONE'

In [31]:
# Cleaning
# Now let's put everything together in one function:

# Layout of a single FAQ entry inside the context.
# NOTE(review): in the visible cells build_context formats entries with its
# own f-string and does not reference this template — confirm whether it is
# still needed.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Prompt layout with {user_question} and {context} placeholders.
# NOTE(review): in the visible cells build_prompt does not reference this
# template, and unlike build_prompt's text it has no "NONE" fallback rule —
# confirm which wording is the intended one.
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [42]:
# Concat all docs into one string
def build_context(documents):
    """Render FAQ entries as one text block.

    Every entry becomes three lines (Section / Question / Answer); entries
    are separated by a blank line and the final string is stripped of
    leading and trailing whitespace.
    """
    entries = (
        f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}"
        for doc in documents
    )
    return "\n\n".join(entries).strip()


# takes docs concats them and puts them to prompt
def build_prompt(user_question, documents):
    """Build the full LLM prompt for ``user_question``.

    The FAQ ``documents`` are rendered with ``build_context`` and placed under
    the CONTEXT heading, after the instructions and the QUESTION line. The
    assembled text is stripped of leading and trailing whitespace.
    """
    faq_context = build_context(documents)
    # Trailing spaces on some lines are part of the original prompt text.
    prompt_lines = [
        "You're a course teaching assistant. ",
        "",
        "Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. ",
        "Only use the facts from the CONTEXT. ",
        "",
        "If the CONTEXT doesn't contains the answer, return \"NONE\".",
        "",
        f"QUESTION: {user_question}",
        "",
        "CONTEXT:",
        "",
        faq_context,
    ]
    return "\n".join(prompt_lines).strip()

# sends built prompt to openai  
def ask_openai(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the notebook-level ``client``; ``model`` selects the chat model.
    The text of the first returned choice is the result.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

# combined pipeline 
def qa_bot(user_question):
    """Answer ``user_question``: retrieve FAQ entries, build the prompt, ask the model."""
    return ask_openai(
        build_prompt(
            user_question,
            retrieve_documents(user_question),
        )
    )

In [38]:
# Now we can ask it different questions

qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'NONE'

In [43]:
qa_bot("The course already started. Can I still join?")

"Yes, even if you don't register, you're still eligible to submit the homeworks."