In [1]:
# pip install mistralai

In [2]:
import openai 
from openai import OpenAI
from mistralai import Mistral
import os
from similarDocSearch import *
import json
from credentials import *

# Getting all the required credentials

In [3]:
cred = Credentials()
all_cred = cred.get_credential()

# Reading the data file

In [4]:
with open("data/documents.json", 'rt') as f_in:
    docs_raw = json.load(f_in)

len(docs_raw) # # as it's of 3 courses

3

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course'] 
        documents.append(doc)

In [6]:
documents[:2]

[{'text': "The purpose of this document is to capture frequently asked\ntechnical question\nThe next cohort starts in Jan 2025. More inFo at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - See DE-zoomcamp  prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'}]

# Retrieving the top n documents matching or most similar to the user's passed query

In [7]:
# Initializing the class similarDocSearch
obj = similarDocSearch(text_fields = ["question", "text", "section"])
obj

<similarDocSearch.similarDocSearch at 0x725bb27cd6f0>

In [8]:
# Fitting
obj.fit(documents)

<similarDocSearch.similarDocSearch at 0x725bb27cd6f0>

In [9]:
# Giving a query to get the top n documents matching to our query
q = "The course has already started, can i still enroll?"
boost = {'question': 3, 
         'section': 0.5} # Used to give a field more or less importance 
filter_course = "data-engineering-zoomcamp"
num_results = 5

results = obj.search(query = q, boosts=boost, filter_course=filter_course, num_results=num_results)
results

[{'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - See DE-zoomcamp  prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'All the main videos are stored in the Main “

# Prompt Creation

<b style="color:blue; font-size:1.3em"> We have found the most relevant documents corresponding to our query. So our prompt will be such that the prompt should answer our question (query) based on the context, and context here will be set of Section-Question-Answer from all the similar documents

In [10]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contains the answer, output None and explain the reason for None

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

In [11]:
# Context creation to be passed into the prompt
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\n question: {doc['question']}\n answer: {doc['text']}\n \n"

print(context)

section: General course-related questions
 question: How can we contribute to the course?
 answer: Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.
 
section: General course-related questions
 question: Course - What are the prerequisites for this course?
 answer: GitHub - See DE-zoomcamp  prerequisites
 
section: General course-related questions
 question: Course - What can I do before the course starts?
 answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Git
Look over the prerequisites and syllabus to see if you are comfortable with these subjects.
 
section: General course-related questions
 question: Course - Which playlist on YouTube should I refer to?
 answer: All the main videos are stored in the Main “DATA ENGINEERING” playlist (no year specified). The Github repository ha

In [12]:
# Prompt formatting
prompt = prompt_template.format(question=q, context=context).strip()
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contains the answer, output None and explain the reason for None

QUESTION: The course has already started, can i still enroll?

CONTEXT: 
section: General course-related questions
 question: How can we contribute to the course?
 answer: Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.
 
section: General course-related questions
 question: Course - What are the prerequisites for this course?
 answer: GitHub - See DE-zoomcamp  prerequisites
 
section: General course-related questions
 question: Course - What can I do before the course starts?
 answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Gi

# Passing the PROMPT to the openai using LLM model Apis

# OpenAI API

In [13]:
client = OpenAI(api_key=all_cred["RAG_OPENAI_KEY"])

In [14]:
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role":"user", "content":prompt}]
)

response

ChatCompletion(id='chatcmpl-A5KuyWfQya05OW4W1ZTpATGYppe9r', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='None. \n\nThe CONTEXT does not provide information regarding the enrollment policy or whether enrollment is possible after the course has started.', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1725835412, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_483d39d857', usage=CompletionUsage(completion_tokens=25, prompt_tokens=429, total_tokens=454))

In [15]:
response.choices[0].message.content

'None. \n\nThe CONTEXT does not provide information regarding the enrollment policy or whether enrollment is possible after the course has started.'

# MistralAI API

In [16]:
client = Mistral(api_key=all_cred["MISTRAL_KEY"])

In [17]:
chat_response = client.chat.complete(
    model = "mistral-large-latest",
    messages = [
        {
            "role": "user",
            "content": prompt,
        },
    ]
)

chat_response

ChatCompletionResponse(id='29fc2950d9dc469db095ae83140f44e7', object='chat.completion', model='mistral-large-latest', usage=UsageInfo(prompt_tokens=513, completion_tokens=38, total_tokens=551), created=1725835412, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='None. The CONTEXT does not provide information about whether enrollment is possible after the course has started. It only mentions the start date of the next cohort and registration details.', tool_calls=None, prefix=False, role='assistant'), finish_reason='stop')])

In [18]:
print(chat_response.choices[0].message.content)

None. The CONTEXT does not provide information about whether enrollment is possible after the course has started. It only mentions the start date of the next cohort and registration details.


# <b style="color:red; font-size:1.3em"> Combining the above code and putting it into functions

In [19]:
# Function that would search the most similar documents to our query and will return it
def search(query):
    boost = {'question': 3, 
         'section': 0.5} # Used to give a field more or less importance 
    
    filter_course = "data-engineering-zoomcamp"
    num_results = 10
    
    results = obj.search(query = query, boosts=boost, filter_course=filter_course, num_results=num_results)
    return results

In [20]:
# Function that would build up the prompt
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
    Use only the facts from the CONTEXT when answering the QUESTION.
    If the CONTEXT doesn't contains the answer, output None
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()

    # Context creation to be passed into the prompt
    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\n question: {doc['question']}\n answer: {doc['text']}\n \n"
    
    # Prompt formatting
    prompt = prompt_template.format(question=query, context=context).strip()
    
    return(prompt)

In [21]:
# Function that would use LLM
def llm(prompt):
    client = OpenAI(api_key=all_cred["RAG_OPENAI_KEY"])

    response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role":"user", "content":prompt}]
    )
    
    return(response.choices[0].message.content)

In [22]:
# Function that would make a whole RAG
def RAG(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [23]:
q = "The course has already started, can i still enroll?"
RAG(q)

"Yes, you can still join the course after the start date, even if you don't register. You're still eligible to submit the homeworks, but be aware that there will be deadlines for turning in the final projects."

# Implementing the RAG using ElasticSearch Vector DB

In [31]:
from elasticsearch import Elasticsearch

In [32]:
# Making a connection to elasticsearch
es_client = Elasticsearch("http://localhost:9200")

In [33]:
# Checking the connection
es_client.info()

ObjectApiResponse({'name': '1bb5aea7b28f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'cXXUTh3gS_y3IT27udfIMA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [34]:
# Index Creation
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/XK83O5mVRJ2FJsNgAkAPmg] already exists')

In [37]:
# Checking how our document looks
documents[0]

{'text': "The purpose of this document is to capture frequently asked\ntechnical question\nThe next cohort starts in Jan 2025. More inFo at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [38]:
# Adding progressbar
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
# Creating indexes for the documents
for doc in documents:
    es_client.index(index=index_name, document=doc)

In [58]:
# Function that would search a document index inside the vectorDB
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [59]:
query = "The course has already started, can i still enroll?"
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course /finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homew

In [56]:
# Function that would make a whole RAG
def RAG(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [57]:
RAG(query)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."