## Homework

### Question 1. Mage version 

Mage version is **0.9.72**

### Question 2. Number of documents

In [1]:
import io
import requests
import docx

def clean_line(line):
    """Strip surrounding whitespace and byte-order marks (U+FEFF) from ``line``.

    Whitespace and BOM characters can be interleaved at either end of the
    text (for example a BOM followed by spaces), so a single pass of
    ``strip()`` then ``strip('\\uFEFF')`` can leave whitespace behind.
    Stripping is therefore repeated until the string stops changing.
    Characters in the interior of the text are never touched.

    Args:
        line: Raw paragraph text.

    Returns:
        The text with no leading or trailing whitespace or U+FEFF characters.
    """
    previous = None
    while line != previous:
        previous = line
        line = line.strip().strip('\uFEFF')
    return line

def read_faq(file_id, timeout=30):
    """Download a Google Docs FAQ as .docx and split it into question records.

    Paragraphs styled "heading 1" set the current section, paragraphs styled
    "heading 2" start a new question, and every other non-empty paragraph is
    appended to the answer text being accumulated.

    Args:
        file_id: Google Docs document id used to build the export URL.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of dicts with the keys ``'text'``, ``'section'`` and
        ``'question'``. A record is only emitted when all three values are
        non-empty.

    Raises:
        requests.HTTPError: If the export request returns an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'

    section_title = ''
    question_title = ''
    answer_text_so_far = ''

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        # Empty paragraphs carry no content and never end a question.
        if len(p_text) == 0:
            continue

        if style == section_heading_style:
            # NOTE(review): the pending answer is not stored here, only at the
            # next question heading, so the last question before a section
            # heading is recorded with the *new* section title. Confirm
            # whether that is intended before changing it.
            section_title = p_text
            continue

        if style == question_heading_style:
            # A new question starts: store the previous one if it is complete.
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''

            question_title = p_text
            continue

        answer_text_so_far += '\n' + p_text

    # The final question has no following heading to trigger its storage.
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [2]:
# Google Docs file id of each course FAQ, keyed by course name.
faq_documents = {'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E'}

# One entry per course: {'course': <name>, 'documents': <parsed FAQ records>}.
documents = []

for course in faq_documents:
    file_id = faq_documents[course]
    print(course)
    course_documents = read_faq(file_id)
    documents.append(dict(course=course, documents=course_documents))

print("\nNumber of FAQ documents processed: " + str(len(documents)))

llm-zoomcamp

Number of FAQ documents processed: 1


### Question 3. Chunking: number of questions

In [3]:
import hashlib

def generate_document_id(doc):
    """Return a short id derived from a record's identifying fields.

    The id is the first 8 hex characters of the MD5 digest of
    ``"<course>-<question>-<first 10 characters of text>"``.

    Args:
        doc: Mapping with the keys ``'course'``, ``'question'`` and ``'text'``.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [4]:
# Flatten the first course entry into a list of chunk dicts, tagging each
# chunk with its course and a generated document id.
# NOTE(review): `documents` is rebound from the per-course list to the flat
# chunk list, so this cell cannot be re-run on its own output; re-run the
# loading cell first.
data = documents[0]
documents = []

for doc in data['documents']:
    doc.update(course=data['course'])
    doc['document_id'] = generate_document_id(doc)
    documents.append(doc)

print("Number of documents (chunks): %d" % len(documents))

Number of documents (chunks): 86


### Question 4. Export: last processed document

In [5]:
from datetime import datetime
from elasticsearch import Elasticsearch
import hashlib

def clean_line(line):
    """Strip surrounding whitespace and byte-order marks (U+FEFF) from ``line``.

    A single ``strip().strip('\\uFEFF')`` pass leaves whitespace behind when a
    BOM is followed (or preceded) by spaces, so the pass is repeated until
    the text no longer changes.

    Args:
        line: Raw paragraph text.

    Returns:
        The text with no leading or trailing whitespace or U+FEFF characters.
    """
    while True:
        stripped = line.strip().strip('\uFEFF')
        if stripped == line:
            return stripped
        line = stripped

def read_faq(file_id, timeout=30):
    """Download a Google Docs FAQ as .docx and split it into question records.

    NOTE(review): this redefines the ``read_faq`` declared in the first code
    cell; whichever definition ran last is the one in effect.

    Paragraphs styled "heading 1" set the current section, paragraphs styled
    "heading 2" start a new question, and every other non-empty paragraph is
    appended to the answer text being accumulated.

    Args:
        file_id: Google Docs document id used to build the export URL.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of dicts with the keys ``'text'``, ``'section'`` and
        ``'question'``. A record is only emitted when all three values are
        non-empty.

    Raises:
        requests.HTTPError: If the export request returns an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []
    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'

    section_title = ''
    question_title = ''
    answer_text_so_far = ''

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        # Empty paragraphs carry no content and never end a question.
        if len(p_text) == 0:
            continue

        if style == section_heading_style:
            # NOTE(review): the pending answer is stored only at the next
            # question heading, so it is recorded under this new section
            # title rather than the one it appeared in. Confirm intent.
            section_title = p_text
            continue

        if style == question_heading_style:
            # A new question starts: store the previous one if it is complete.
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''

            question_title = p_text
            continue

        answer_text_so_far += '\n' + p_text

    # The final question has no following heading to trigger its storage.
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

def generate_document_id(doc):
    """Build an 8-character hex id from course, question and answer prefix.

    The MD5 digest is computed over
    ``"<course>-<question>-<first 10 characters of text>"`` and truncated to
    its first 8 hexadecimal characters.

    Args:
        doc: Mapping with the keys ``'course'``, ``'question'`` and ``'text'``.

    Returns:
        The truncated hexadecimal digest as a string.
    """
    course, question, text = doc['course'], doc['question'], doc['text']
    digest = hashlib.md5()
    digest.update(f"{course}-{question}-{text[:10]}".encode())
    return digest.hexdigest()[:8]

In [6]:
# Elasticsearch client pointed at the instance on localhost, port 9200.
es = Elasticsearch(hosts=['http://localhost:9200'])

In [7]:
# Index name with a timestamp suffix: <prefix>_YYYYMMDD_HHMMSS (local time).
index_name_prefix = 'documents'
current_time = f"{datetime.now():%Y%m%d_%H%M%S}"
index_name = '_'.join((index_name_prefix, current_time))
print(f"index name: {index_name}")

index name: documents_20240816_202103


In [8]:
# Index definition: one shard, no replicas; free-text fields are mapped as
# "text" and the course / document id fields as "keyword".
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            **{field: {"type": "keyword"} for field in ("course", "document_id")},
        }
    },
}

In [9]:
# Create the index named in `index_name` using `index_settings`.
es.indices.create(index=index_name, body=index_settings)

# Google Docs file id of each course FAQ, keyed by course name.
faq_documents = {'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E'}

# Download and parse every FAQ again; one entry per course.
documents = []
for course in faq_documents:
    file_id = faq_documents[course]
    course_documents = read_faq(file_id)
    documents.append(dict(course=course, documents=course_documents))

# Filled in by the indexing loop with the most recently indexed record.
last_document = None

In [10]:
# Tag every parsed record with its course and a generated id, then index it
# under that id (records that produce the same id overwrite each other).
for course_dict in documents:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        doc['document_id'] = generate_document_id(doc)
        es.index(index=index_name, id=doc['document_id'], body=doc)
        last_document = doc

# Newly indexed documents only become searchable after a refresh. Forcing
# one here means a search cell run immediately afterwards (e.g. under
# "Run All") sees every document instead of racing the periodic refresh.
es.indices.refresh(index=index_name)

print(f"Last document: {last_document}")

Last document: {'text': 'Answer', 'section': 'Workshops: X', 'question': 'Question', 'course': 'llm-zoomcamp', 'document_id': 'd8c4c7bb'}


### Question 5. Testing the retrieval: document id 

In [11]:
user_question = "When is the next cohort?"

# Full-text match across three fields; "question" is boosted by a factor of 3.
multi_match = {
    "query": user_question,
    "fields": ["question^3", "text", "section"],
    "type": "best_fields",
}

# Return at most 10 hits for the boolean query wrapping the multi_match clause.
query = {
    "size": 10,
    "query": {"bool": {"must": {"multi_match": multi_match}}},
}

In [13]:
# Search the index created earlier in this notebook. `index_name` is not
# reassigned to a literal from a previous run here: after a kernel restart
# the index gets a new timestamped name, and a stale hard-coded name would
# point at a missing (or outdated) index.
response = es.search(index=index_name, body=query)

# Print the section, id, question and relevance score of each hit.
for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}")
    print(f"ID: {doc['document_id']}")
    print(f"Question: {doc['question']}")
    print(f"Score: {hit['_score']}\n")

Section: General course-related questions
ID: bf024675
Question: When will the course be offered next?
Score: 25.331835

Section: Module 3: X
ID: ee355823
Question: What is the cosine similarity?
Score: 12.660435

Section: Workshops: dlthub
ID: 6cf805ca
Question: There is an error when opening the table using dbtable = db.open_table("notion_pages___homework"): FileNotFoundError: Table notion_pages___homework does not exist.Please first call db.create_table(notion_pages___homework, data)
Score: 12.212482

Section: Workshops: dlthub
ID: e18124d4
Question: There is an error when running main(): FileNotFoundError: Table notion_pages___homework does not exist.Please first call db.create_table(notion_pages___homework, data)
Score: 11.193924

Section: General course-related questions
ID: fb81c6ff
Question: I was working on next week’s homework/content - why does it keep changing?
Score: 10.403244

Section: General course-related questions
ID: a5301a1f
Question: What is the video/zoom link to 