In [1]:
# pip install mistralai

In [2]:
import openai 
from openai import OpenAI
from mistralai import Mistral
import os
from similarDocSearch import *
import json
from credentials import *

# Getting all the required credentials

In [3]:
# Load the API keys used later in the notebook: `all_cred` is indexed with
# "RAG_OPENAI_KEY" and "MISTRAL_KEY" when the LLM clients are created.
# NOTE(review): Credentials is presumably provided by the wildcard import from
# the local `credentials` module — confirm where the secrets are actually read from.
cred = Credentials()
all_cred = cred.get_credential()

# Reading the data file

In [4]:
# Load the raw FAQ data: a JSON list with one entry per course.
# The encoding is passed explicitly because the file contains non-ASCII
# characters (curly quotes, emoji) and the platform default encoding is not
# guaranteed to be UTF-8.
with open("data/documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

len(docs_raw)  # one entry per course; this file holds 3 courses

3

In [5]:
# Flatten the per-course structure into a single list of FAQ records, tagging
# each record with the course it belongs to.
# A new dict is built for every record so that `docs_raw` is left untouched
# (the raw data stays exactly as loaded from disk) and re-running this cell
# always produces the same result.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [6]:
# Peek at the first two flattened records to check the added 'course' field.
documents[:2]

[{'text': "The purpose of this document is to capture frequently asked\ntechnical question\nThe next cohort starts in Jan 2025. More inFo at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - See DE-zoomcamp  prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'}]

# Retrieving the top-n documents most similar to the user's query

In [7]:
# Build the search object over the three text fields of each FAQ record.
# NOTE(review): similarDocSearch comes from the local module of the same name;
# how it indexes and scores these fields is not visible here — see that module.
obj = similarDocSearch(text_fields = ["question", "text", "section"])
obj

<similarDocSearch.similarDocSearch at 0x7280d71056c0>

In [8]:
# Fit the search object on the flattened FAQ records.
# The displayed result has the same address as `obj` above, i.e. fit()
# returns the instance itself.
obj.fit(documents)

<similarDocSearch.similarDocSearch at 0x7280d71056c0>

In [9]:
# Ask the fitted search object for the FAQ entries closest to a user question.
q = "The course has already started, can i still enroll?"

# Per-field importance passed to search(): 'question' counts more, 'section' less.
boost = dict(question=3, section=0.5)

# Restrict the search to one course and cap the number of returned records.
filter_course = "data-engineering-zoomcamp"
num_results = 5

results = obj.search(
    query=q,
    boosts=boost,
    filter_course=filter_course,
    num_results=num_results,
)
results

[{'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - See DE-zoomcamp  prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'All the main videos are stored in the Main “

# Prompt Creation

<b style="color:blue; font-size:1.3em">We have found the documents most relevant to our query. The prompt therefore asks the model to answer the question (query) using only the context, where the context is the set of section–question–answer entries taken from those similar documents.</b>

In [20]:
# Prompt skeleton with two placeholders: {question} (the user query) and
# {context} (the retrieved FAQ entries). Written as adjacent string literals
# so the trailing spaces that are part of the text stay visible.
prompt_template = (
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. \n"
    "Use only the facts from the CONTEXT when answering the QUESTION.\n"
    "If the CONTEXT doesn't contains the answer, output None and explain the reason for None\n"
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "CONTEXT: \n"
    "{context}"
)

In [21]:
# Render every retrieved FAQ entry as a section / question / answer snippet
# and glue the snippets together into the CONTEXT part of the prompt.
context_chunks = []

for doc in results:
    context_chunks.append(
        f"section: {doc['section']}\n question: {doc['question']}\n answer: {doc['text']}\n \n"
    )

context = "".join(context_chunks)
print(context)

section: General course-related questions
 question: How can we contribute to the course?
 answer: Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.
 
section: General course-related questions
 question: Course - What are the prerequisites for this course?
 answer: GitHub - See DE-zoomcamp  prerequisites
 
section: General course-related questions
 question: Course - What can I do before the course starts?
 answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Git
Look over the prerequisites and syllabus to see if you are comfortable with these subjects.
 
section: General course-related questions
 question: Course - Which playlist on YouTube should I refer to?
 answer: All the main videos are stored in the Main “DATA ENGINEERING” playlist (no year specified). The Github repository ha

In [22]:
# Fill the template with the user query and the retrieved context, then drop
# surrounding whitespace (the context ends with a trailing "\n \n").
prompt = prompt_template.format(context=context, question=q)
prompt = prompt.strip()
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contains the answer, output None and explain the reason for None

QUESTION: The course has already started, can i still enroll?

CONTEXT: 
section: General course-related questions
 question: How can we contribute to the course?
 answer: Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.
 
section: General course-related questions
 question: Course - What are the prerequisites for this course?
 answer: GitHub - See DE-zoomcamp  prerequisites
 
section: General course-related questions
 question: Course - What can I do before the course starts?
 answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Gi

# Passing the PROMPT to the LLM APIs (OpenAI and Mistral)

# OpenAI API

In [23]:
# Create the OpenAI client with the key stored under "RAG_OPENAI_KEY".
client = OpenAI(api_key=all_cred["RAG_OPENAI_KEY"])

In [24]:
# Send the whole prompt as a single user message to gpt-4o-mini.
user_message = {"role": "user", "content": prompt}

response = client.chat.completions.create(
    messages=[user_message],
    model="gpt-4o-mini",
)

response  # display the full ChatCompletion object

ChatCompletion(id='chatcmpl-A3Y3Dj9N9TptEdEn66Q5NwjM8c0eG', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='None. \n\nThe context does not provide information about whether enrollment is still possible after the course has started. It mentions the course start date and encourages registration before the course starts, but it does not clarify the policies for late enrollment.', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1725409239, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_f33667828e', usage=CompletionUsage(completion_tokens=46, prompt_tokens=429, total_tokens=475))

In [25]:
# Answer text of the first choice in the OpenAI response.
response.choices[0].message.content

'None. \n\nThe context does not provide information about whether enrollment is still possible after the course has started. It mentions the course start date and encourages registration before the course starts, but it does not clarify the policies for late enrollment.'

# MistralAI API

In [26]:
# Create the Mistral client with the key stored under "MISTRAL_KEY".
# NOTE(review): this rebinds `client`, which previously held the OpenAI client;
# re-running the OpenAI request cell after this one would call the Mistral
# client instead. Consider distinct names (e.g. openai_client / mistral_client).
client = Mistral(api_key=all_cred["MISTRAL_KEY"])

In [27]:
# Send the same prompt as a single user message to Mistral's large model.
mistral_messages = [{"role": "user", "content": prompt}]

chat_response = client.chat.complete(
    messages=mistral_messages,
    model="mistral-large-latest",
)

chat_response  # display the full ChatCompletionResponse object

ChatCompletionResponse(id='c0a34ceadd4141669cdfc76e54fa8842', object='chat.completion', model='mistral-large-latest', usage=UsageInfo(prompt_tokens=513, completion_tokens=35, total_tokens=548), created=1725409241, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='None. The CONTEXT provided does not include information about whether enrollment is possible after the course has started. Therefore, I cannot answer the question based on the given information.', tool_calls=None, prefix=False, role='assistant'), finish_reason='stop')])

In [28]:
# Print only the answer text of the first choice in the Mistral response.
print(chat_response.choices[0].message.content)

None. The CONTEXT provided does not include information about whether enrollment is possible after the course has started. Therefore, I cannot answer the question based on the given information.
