In [2]:
import numpy as np
import requests
import pandas as pd

# Base location of the evaluation data set in the course repository.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear exception instead of
# a confusing JSON decoding failure.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Show one sample record to confirm the expected keys are present.
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

In [3]:
# Ground-truth pairs (question -> id of the document that answers it),
# restricted to the machine-learning-zoomcamp course.
ground_truth_url = f'{url_prefix}search_evaluation/ground-truth-data.csv'
df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame['course'] == 'machine-learning-zoomcamp']
)

# One dict per row, which is the shape the retrieval helpers consume.
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [5]:
# Lookup table from document id to the full document record.
# If two documents share an id, the later one in `documents` wins.
doc_idx = {document['id']: document for document in documents}
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [6]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Sentence-embedding model used both for indexing documents and for encoding queries.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)
# Display the module stack (the printed repr shows the embedding dimension and pooling mode).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [8]:
from elasticsearch import Elasticsearch

In [9]:
# Connect to the Elasticsearch node expected to be listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Calling info() doubles as a connectivity check; it fails if the node is unreachable.
es_client.info()

ObjectApiResponse({'name': '20d66b7bd8aa', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'HmfYve9ZSCekVkIW_b3HoQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [10]:
# Build the field mappings by type: free-text fields, exact-match keyword
# fields, and the embedding field used for kNN search.
text_fields = ("text", "section", "question")
keyword_fields = ("course", "id")

field_mappings = {name: {"type": "text"} for name in text_fields}
field_mappings.update({name: {"type": "keyword"} for name in keyword_fields})
field_mappings["question_text_vector"] = {
    "type": "dense_vector",
    "dims": 384,  # matches the word_embedding_dimension printed for the model above
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

index_name = "course-query"

# Start from a clean index so that re-running the notebook does not stack
# documents on top of a previous run.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-query'})

In [11]:
from tqdm.auto import tqdm

# Embed "question + text" for every document and index it one at a time.
# Note: the vector is stored on the document dict itself, so `documents`
# (and anything sharing those dicts) carries the embedding afterwards.
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    embedding = model.encode(' '.join((question, text)))
    # Convert to a plain list of Python floats so the document is JSON-serializable.
    question_text_vector = embedding.astype(float).tolist()
    doc["question_text_vector"] = question_text_vector
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:06<00:00, 14.26it/s]


In [12]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search and return the matching documents.

    Args:
        field: Name of the vector field to search.
        vector: Query embedding as a list of floats.
        course: Value the ``course`` field of a hit must equal (term filter).
        k: Number of nearest neighbours to request. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Passed through as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the hits
        are returned. Each dict is limited to the ``text``, ``section``,
        ``question``, ``course`` and ``id`` fields.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
            "filter": {"term": {"course": course}},
        },
        # Leave the stored embedding out of the response; callers only need the text fields.
        "_source": ["text", "section", "question", "course", "id"],
    }
    es_results = es_client.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in es_results["hits"]["hits"]]

def question_text_vector_knn(q):
    """Embed the question of ``q`` and search the combined question+text vectors.

    Args:
        q: Mapping with a ``question`` string and the ``course`` to filter on.

    Returns:
        Whatever ``elastic_search_knn`` returns for the ``question_text_vector`` field.
    """
    question_text, course_name = q['question'], q['course']
    # Plain Python floats are needed so the vector can be sent in the request body.
    query_vector = model.encode(question_text).astype(float).tolist()
    return elastic_search_knn('question_text_vector', query_vector, course_name)

In [13]:
# Smoke test of the retrieval step with a hand-written query.
question_text_vector_knn({
    "question": "course has already started. can i still join?",
    "course": "machine-learning-zoomcamp",
})

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'},
 {'question': 'How long is the course?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Approximately 4 months, but may take more if you want to do some extra activities (an extra project, an article, etc)',
  'id': '67e2fd13'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp',
  'section': 'Gene

In [14]:
def build_prompt(query, search_results):
    """Render the RAG prompt for a question and its retrieved FAQ entries.

    Args:
        query: The user's question text.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one entry of the CONTEXT part.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    # The template text is kept exactly as-is (including its inner indentation),
    # because every character of it is sent to the model.
    prompt_template = """
        You're a course teaching assistant.Answer the question based on CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION.

        QUESTION: {question}
        CONTEXT: {context}
    """.strip()
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )
    # The final strip() drops the blank lines left after the last context entry.
    return prompt_template.format(question=query, context=context).strip()

In [15]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Never print the secret itself: cell output is saved with the notebook and
# would leak the key to anyone who can read the file. Only report whether the
# variable is present.
print("MISTRAL_API_KEY is set:", bool(os.getenv("MISTRAL_API_KEY")))

[REDACTED: API key removed from saved output; the exposed key should be rotated]


In [16]:
from mistralai import Mistral

# Credentials and model choice come from the environment (populated from .env);
# either value is None when its variable is not set.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
MODEL_NAME = os.environ.get("MODEL_NAME")

# Single shared API client used by llm().
client = Mistral(api_key=MISTRAL_API_KEY)

def llm(prompt):
    """Send ``prompt`` as a single user message and return the model's reply.

    Uses the module-level ``client`` and ``MODEL_NAME``.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.complete(model=MODEL_NAME, messages=[user_message])
    return completion.choices[0].message.content

In [17]:
def rag(query: dict) -> str:
    """Answer a question with retrieval-augmented generation.

    Args:
        query: Record with a ``question`` key; it is also passed unchanged to
            ``question_text_vector_knn`` for retrieval.

    Returns:
        The value returned by ``llm`` for the prompt built from the question
        and the retrieved documents.
    """
    retrieved_docs = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved_docs))

In [18]:
# Re-display the ground-truth record used as the running example in the following cells.
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [19]:
# End-to-end check: retrieve context for the example question and let the LLM answer it.
rag(ground_truth[10])

"Yes, if you miss a session, it will be recorded and you can watch it later. According to the first answer in the context, everything is recorded, including office hours sessions. So, you won't miss any information even if you can't attend a live session."

In [20]:
# Reference answer: text of the document that the example ground-truth record points to.
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [21]:
# Hard-coded pair of answers for a one-off similarity check.
# answer_orig repeats the text of document '5170565b' shown above.
answer_orig = "Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack."
# NOTE(review): this string differs from the rag(ground_truth[10]) output displayed above,
# presumably because it was pasted from an earlier run; confirm which answer is intended.
answer_llm = "Yes, sessions are recorded and available for viewing if you miss one. This includes both the pre-recorded course videos and the office hours live sessions where questions are answered. You can find the recordings in the course playlist on YouTube. If you have a question that isn't covered in a recorded session, you can ask it in Slack."


In [22]:
# Embed both answers with the same model so that they can be compared directly.
v_orig = model.encode(answer_orig)
v_llm = model.encode(answer_llm)

In [23]:
# Dot product of the two answer embeddings. The model repr above ends with a Normalize()
# module, so presumably the vectors are unit-length and this equals cosine similarity (TODO confirm).
v_llm.dot(v_orig)

np.float32(0.6778729)

In [None]:
# Generated answers keyed by position in ground_truth.
# Re-running this cell discards everything collected so far.
answers = {}

In [None]:
# Generate an LLM answer for every ground-truth question. Positions already in
# `answers` are skipped, so the cell can be re-run to resume after an interruption.
for i, rec in enumerate(tqdm(ground_truth)):
    if i not in answers:
        answer_llm = rag(rec)

        # Pair the generated answer with the text of the document the question was derived from.
        doc_id = rec['document']
        original_doc = doc_idx[doc_id]
        answer_orig = original_doc['text']

        answers[i] = dict(
            answer_llm=answer_llm,
            answer_orig=answer_orig,
            document=doc_id,
            question=rec['question'],
            course=rec['course'],
        )

 26%|█████████████████████████████▍                                                                                  | 480/1830 [19:55<43:41,  1.94s/it]